# parser.py

from __future__ import unicode_literals

import markdown
import bleach

from bs4 import BeautifulSoup
from django.utils import six
from htmlmin.minify import html_minify

from .bbcode import blocks, inline
from .md.shortimgs import ShortImagesExtension
from .mentions import add_mentions
from .pipeline import pipeline

__all__ = ['parse']


def parse(text, request, poster, allow_mentions=True, allow_links=True,
          allow_images=True, allow_blocks=True, minify=True):
    """
    Message parser

    Utility for flavours to call

    Breaks text into paragraphs; supports code, spoiler and quote blocks,
    headers, lists, images and text styles

    Returns dict object
    """
    md = md_factory(
        allow_links=allow_links,
        allow_images=allow_images,
        allow_blocks=allow_blocks,
    )

    parsing_result = {
        'original_text': text,
        'parsed_text': '',
        'markdown': md,
        'mentions': [],
        'images': [],
        'outgoing_links': [],
        'inside_links': [],
    }

    # Parse text
    parsed_text = md.convert(text)

    # Clean and store parsed text
    parsing_result['parsed_text'] = parsed_text.strip()

    if allow_links:
        linkify_paragraphs(parsing_result)

    parsing_result = pipeline.process_result(parsing_result)

    if allow_mentions:
        add_mentions(request, parsing_result)

    if allow_links or allow_images:
        clean_links(request, parsing_result)

    if minify:
        minify_result(parsing_result)

    return parsing_result
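
# A minimal usage sketch (hypothetical values; real callers pass Django's
# HttpRequest and a user model instance):
#
#     result = parse('Hello, [b]world[/b]!', request, request.user,
#                    allow_mentions=False)
#     result['parsed_text']     # e.g. '<p>Hello, <strong>world</strong>!</p>'
#     result['outgoing_links']  # external hrefs collected by clean_links()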


def md_factory(allow_links=True, allow_images=True, allow_blocks=True):
    """
    Create and configure markdown object
    """
    md = markdown.Markdown(safe_mode='escape', extensions=['nl2br'])

    # Remove references
    del md.preprocessors['reference']
    del md.inlinePatterns['reference']
    del md.inlinePatterns['image_reference']
    del md.inlinePatterns['short_reference']

    # Add [b], [i], [u]
    md.inlinePatterns.add('bb_b', inline.bold, '<strong')
    md.inlinePatterns.add('bb_i', inline.italics, '<emphasis')
    md.inlinePatterns.add('bb_u', inline.underline, '<emphasis2')
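    # The third argument is a Markdown 2.x OrderedDict position key:
    # '<strong' inserts a pattern just before the built-in 'strong'
    # pattern, so the BBCode tags are tried ahead of their Markdown twins.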

    if allow_links:
        # Add [url]
        pass
    else:
        # Remove links
        del md.inlinePatterns['link']
        del md.inlinePatterns['autolink']
        del md.inlinePatterns['automail']

    if allow_images:
        # Add [img]
        short_images_md = ShortImagesExtension()
        short_images_md.extendMarkdown(md)
    else:
        # Remove images
        del md.inlinePatterns['image_link']

    if allow_blocks:
        # Add [hr], [quote], [spoiler], [list] and [code] blocks
        md.parser.blockprocessors.add('bb_hr', blocks.BBCodeHRProcessor(md.parser), '>hr')
    else:
        # Remove blocks
        del md.parser.blockprocessors['hashheader']
        del md.parser.blockprocessors['setextheader']
        del md.parser.blockprocessors['code']
        del md.parser.blockprocessors['quote']
        del md.parser.blockprocessors['hr']
        del md.parser.blockprocessors['olist']
        del md.parser.blockprocessors['ulist']

    return pipeline.extend_markdown(md)
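
# A minimal sketch of using the factory directly (assumed call pattern;
# parse() above is the normal entry point):
#
#     md = md_factory(allow_links=False, allow_images=False,
#                     allow_blocks=False)
#     html = md.convert('**bold** and [i]italics[/i]')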


def linkify_paragraphs(result):
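    # bleach.linkify() wraps bare URLs in <a> tags; skip_pre=True leaves
    # <pre> blocks (rendered [code]) untouched, and parse_email=True also
    # links e-mail addresses (bleach 1.x keyword API)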
    result['parsed_text'] = bleach.linkify(
        result['parsed_text'], skip_pre=True, parse_email=True)


def clean_links(request, result):
    host = request.get_host()

    soup = BeautifulSoup(result['parsed_text'], 'html5lib')
    for link in soup.find_all('a'):
        if is_internal_link(link['href'], host):
            link['href'] = clean_internal_link(link['href'], host)
            result['inside_links'].append(link['href'])
        else:
            result['outgoing_links'].append(link['href'])

        if link.string:
            link.string = clean_link_prefix(link.string)

    for img in soup.find_all('img'):
        img['alt'] = clean_link_prefix(img['alt'])
        if is_internal_link(img['src'], host):
            img['src'] = clean_internal_link(img['src'], host)
        result['images'].append(img['src'])

    # [6:-7] trims the <body></body> wrap html5lib adds around the fragment
    result['parsed_text'] = six.text_type(soup.body)[6:-7]
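
# Net effect (hypothetical fragment, host == 'example.com'):
#     <a href="http://example.com/t/1/"> becomes <a href="/t/1/">, the
#     cleaned href is recorded in result['inside_links'], and external
#     hrefs land in result['outgoing_links'].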


def is_internal_link(link, host):
    if link.startswith('/') and not link.startswith('//'):
        return True

    link = clean_link_prefix(link).lower()
    if link.startswith('www.'):  # strip the prefix; lstrip('www.') strips chars
        link = link[4:]
    if host.lower().startswith('www.'):
        host = host[4:]
    return link.startswith(host.lower())
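
# Examples (assuming host == 'example.com'):
#
#     is_internal_link('/threads/123/', 'example.com')             # True
#     is_internal_link('http://example.com/t/1/', 'example.com')   # True
#     is_internal_link('https://other.org/', 'example.com')        # False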


def clean_link_prefix(link):
    if link.lower().startswith('https:'):
        link = link[6:]
    if link.lower().startswith('http:'):
        link = link[5:]
    if link.startswith('//'):
        link = link[2:]
    return link
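
# E.g. clean_link_prefix('https://www.example.com/') returns
# 'www.example.com/': only the protocol and leading slashes are dropped.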


def clean_internal_link(link, host):
    link = clean_link_prefix(link)

    if link.lower().startswith('www.'):
        link = link[4:]
    if host.lower().startswith('www.'):
        host = host[4:]

    if link.lower().startswith(host.lower()):
        link = link[len(host):]

    return link or '/'
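
# E.g. clean_internal_link('http://www.example.com/threads/', 'example.com')
# returns '/threads/', and a bare 'example.com' collapses to '/'.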


def minify_result(result):
    result['parsed_text'] = html_minify(result['parsed_text'].encode('utf-8'))
    # [25:-14] trims <html><head></head><body> and </body></html>
    result['parsed_text'] = result['parsed_text'][25:-14]
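
# Note: html_minify() (the django-htmlmin helper) re-parses the fragment
# as a complete document, which is why the fixed-width trim above is
# needed; the utf-8 encode matches the Python 2-era API this module targets.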