parser.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. from __future__ import unicode_literals
  2. import bleach
  3. import markdown
  4. from bs4 import BeautifulSoup
  5. from htmlmin.minify import html_minify
  6. from django.http import Http404
  7. from django.urls import resolve
  8. from django.utils import six
  9. from markdown.extensions.fenced_code import FencedCodeExtension
  10. from .bbcode import blocks, inline
  11. from .md.shortimgs import ShortImagesExtension
  12. from .md.striketrough import StriketroughExtension
  13. from .mentions import add_mentions
  14. from .pipeline import pipeline
  15. __all__ = ['parse']
  16. MISAGO_ATTACHMENT_VIEWS = ('misago:attachment', 'misago:attachment-thumbnail')
  17. def parse(text, request, poster, allow_mentions=True, allow_links=True,
  18. allow_images=True, allow_blocks=True, force_shva=False, minify=True):
  19. """
  20. Message parser
  21. Utility for flavours to call
  22. Breaks text into paragraphs, supports code, spoiler and quote blocks,
  23. headers, lists, images, spoilers, text styles
  24. Returns dict object
  25. """
  26. md = md_factory(
  27. allow_links=allow_links,
  28. allow_images=allow_images,
  29. allow_blocks=allow_blocks,
  30. )
  31. parsing_result = {
  32. 'original_text': text,
  33. 'parsed_text': '',
  34. 'markdown': md,
  35. 'mentions': [],
  36. 'images': [],
  37. 'outgoing_links': [],
  38. 'inside_links': []
  39. }
  40. # Parse text
  41. parsed_text = md.convert(text)
  42. # Clean and store parsed text
  43. parsing_result['parsed_text'] = parsed_text.strip()
  44. if allow_links:
  45. linkify_paragraphs(parsing_result)
  46. parsing_result = pipeline.process_result(parsing_result)
  47. if allow_mentions:
  48. add_mentions(request, parsing_result)
  49. if allow_links or allow_images:
  50. clean_links(request, parsing_result, force_shva)
  51. if minify:
  52. minify_result(parsing_result)
  53. return parsing_result
  54. def md_factory(allow_links=True, allow_images=True, allow_blocks=True):
  55. """
  56. Create and configure markdown object
  57. """
  58. md = markdown.Markdown(safe_mode='escape', extensions=['nl2br'])
  59. # Remove references
  60. del md.preprocessors['reference']
  61. del md.inlinePatterns['reference']
  62. del md.inlinePatterns['image_reference']
  63. del md.inlinePatterns['short_reference']
  64. # Add [b], [i], [u]
  65. md.inlinePatterns.add('bb_b', inline.bold, '<strong')
  66. md.inlinePatterns.add('bb_i', inline.italics, '<emphasis')
  67. md.inlinePatterns.add('bb_u', inline.underline, '<emphasis2')
  68. # Add ~~deleted~~
  69. striketrough_md = StriketroughExtension()
  70. striketrough_md.extendMarkdown(md)
  71. if not allow_links:
  72. # Remove links
  73. del md.inlinePatterns['link']
  74. del md.inlinePatterns['autolink']
  75. del md.inlinePatterns['automail']
  76. if allow_images:
  77. # Add [img]
  78. short_images_md = ShortImagesExtension()
  79. short_images_md.extendMarkdown(md)
  80. else:
  81. # Remove images
  82. del md.inlinePatterns['image_link']
  83. if allow_blocks:
  84. # Add [hr] and [quote] blocks
  85. md.parser.blockprocessors.add('bb_hr', blocks.BBCodeHRProcessor(md.parser), '>hr')
  86. fenced_code = FencedCodeExtension()
  87. fenced_code.extendMarkdown(md, None)
  88. code_bbcode = blocks.CodeBlockExtension()
  89. code_bbcode.extendMarkdown(md)
  90. quote_bbcode = blocks.QuoteExtension()
  91. quote_bbcode.extendMarkdown(md)
  92. else:
  93. # Remove blocks
  94. del md.parser.blockprocessors['hashheader']
  95. del md.parser.blockprocessors['setextheader']
  96. del md.parser.blockprocessors['code']
  97. del md.parser.blockprocessors['quote']
  98. del md.parser.blockprocessors['hr']
  99. del md.parser.blockprocessors['olist']
  100. del md.parser.blockprocessors['ulist']
  101. return pipeline.extend_markdown(md)
  102. def linkify_paragraphs(result):
  103. result['parsed_text'] = bleach.linkify(result['parsed_text'], skip_pre=True, parse_email=True)
  104. def clean_links(request, result, force_shva=False):
  105. host = request.get_host()
  106. site_address = '%s://%s' % (request.scheme, request.get_host())
  107. soup = BeautifulSoup(result['parsed_text'], 'html5lib')
  108. for link in soup.find_all('a'):
  109. if is_internal_link(link['href'], host):
  110. link['href'] = clean_internal_link(link['href'], host)
  111. result['inside_links'].append(link['href'])
  112. link['href'] = clean_attachment_link(link['href'], force_shva)
  113. else:
  114. result['outgoing_links'].append(link['href'])
  115. if link.string:
  116. link.string = clean_link_prefix(link.string)
  117. for img in soup.find_all('img'):
  118. img['alt'] = clean_link_prefix(img['alt'])
  119. if is_internal_link(img['src'], host):
  120. img['src'] = clean_internal_link(img['src'], host)
  121. result['images'].append(img['src'])
  122. img['src'] = clean_attachment_link(img['src'], force_shva)
  123. else:
  124. result['images'].append(img['src'])
  125. # [6:-7] trims <body></body> wrap
  126. result['parsed_text'] = six.text_type(soup.body)[6:-7]
  127. def is_internal_link(link, host):
  128. if link.startswith('/') and not link.startswith('//'):
  129. return True
  130. link = clean_link_prefix(link).lstrip('www.').lower()
  131. return link.lower().startswith(host.lstrip('www.'))
  132. def clean_link_prefix(link):
  133. if link.lower().startswith('https:'):
  134. link = link[6:]
  135. if link.lower().startswith('http:'):
  136. link = link[5:]
  137. if link.startswith('//'):
  138. link = link[2:]
  139. return link
  140. def clean_internal_link(link, host):
  141. link = clean_link_prefix(link)
  142. if link.lower().startswith('www.'):
  143. link = link[4:]
  144. if host.lower().startswith('www.'):
  145. host = host[4:]
  146. if link.lower().startswith(host):
  147. link = link[len(host):]
  148. return link or '/'
  149. def clean_attachment_link(link, force_shva=False):
  150. try:
  151. resolution = resolve(link)
  152. url_name = ':'.join(resolution.namespaces + [resolution.url_name])
  153. except (Http404, ValueError):
  154. return link
  155. if url_name in MISAGO_ATTACHMENT_VIEWS:
  156. if force_shva:
  157. link = '{}?shva=1'.format(link)
  158. elif link.endswith('?shva=1'):
  159. link = link[:-7]
  160. return link
  161. def minify_result(result):
  162. # [25:-14] trims <html><head></head><body> and </body></html>
  163. result['parsed_text'] = html_minify(result['parsed_text'].encode('utf-8'))
  164. result['parsed_text'] = result['parsed_text'][25:-14]