# parser.py
  1. from __future__ import unicode_literals
  2. import warnings
  3. import bleach
  4. import markdown
  5. from bs4 import BeautifulSoup
  6. from htmlmin.minify import html_minify
  7. from django.http import Http404
  8. from django.urls import resolve
  9. from django.utils import six
  10. from markdown.extensions.fenced_code import FencedCodeExtension
  11. from .bbcode import blocks, inline
  12. from .md.shortimgs import ShortImagesExtension
  13. from .md.striketrough import StriketroughExtension
  14. from .mentions import add_mentions
  15. from .pipeline import pipeline
  16. __all__ = ['parse']
  17. MISAGO_ATTACHMENT_VIEWS = ('misago:attachment', 'misago:attachment-thumbnail')
  18. def parse(text, request, poster, allow_mentions=True, allow_links=True,
  19. allow_images=True, allow_blocks=True, force_shva=False, minify=True):
  20. """
  21. Message parser
  22. Utility for flavours to call
  23. Breaks text into paragraphs, supports code, spoiler and quote blocks,
  24. headers, lists, images, spoilers, text styles
  25. Returns dict object
  26. """
  27. md = md_factory(
  28. allow_links=allow_links,
  29. allow_images=allow_images,
  30. allow_blocks=allow_blocks,
  31. )
  32. parsing_result = {
  33. 'original_text': text,
  34. 'parsed_text': '',
  35. 'markdown': md,
  36. 'mentions': [],
  37. 'images': [],
  38. 'outgoing_links': [],
  39. 'inside_links': []
  40. }
  41. # Parse text
  42. parsed_text = md.convert(text)
  43. # Clean and store parsed text
  44. parsing_result['parsed_text'] = parsed_text.strip()
  45. if allow_links:
  46. linkify_paragraphs(parsing_result)
  47. parsing_result = pipeline.process_result(parsing_result)
  48. if allow_mentions:
  49. add_mentions(request, parsing_result)
  50. if allow_links or allow_images:
  51. clean_links(request, parsing_result, force_shva)
  52. if minify:
  53. minify_result(parsing_result)
  54. return parsing_result
def md_factory(allow_links=True, allow_images=True, allow_blocks=True):
    """
    Create and configure markdown object

    Returns a Markdown instance with Misago's BBCode patterns registered
    and, depending on the flags, link/image/block syntax enabled or
    stripped out. The instance is finally passed through
    pipeline.extend_markdown so installed plugins can customise it.
    """
    # safe_mode='escape' HTML-escapes raw markup in the input; 'nl2br'
    # turns single newlines into <br> (legacy python-markdown API)
    md = markdown.Markdown(safe_mode='escape', extensions=['nl2br'])

    # Remove reference-style links/images ([text][id] syntax)
    del md.preprocessors['reference']
    del md.inlinePatterns['reference']
    del md.inlinePatterns['image_reference']
    del md.inlinePatterns['short_reference']

    # Add [b], [i], [u] — the '<name' position strings insert each
    # pattern just before markdown's own strong/emphasis patterns
    md.inlinePatterns.add('bb_b', inline.bold, '<strong')
    md.inlinePatterns.add('bb_i', inline.italics, '<emphasis')
    md.inlinePatterns.add('bb_u', inline.underline, '<emphasis2')

    # Add ~~deleted~~ strikethrough
    striketrough_md = StriketroughExtension()
    striketrough_md.extendMarkdown(md)

    if not allow_links:
        # Remove links
        del md.inlinePatterns['link']
        del md.inlinePatterns['autolink']
        del md.inlinePatterns['automail']

    if allow_images:
        # Add [img]
        short_images_md = ShortImagesExtension()
        short_images_md.extendMarkdown(md)
    else:
        # Remove images
        del md.inlinePatterns['image_link']

    if allow_blocks:
        # Add [hr] and [quote] blocks plus fenced code
        md.parser.blockprocessors.add('bb_hr', blocks.BBCodeHRProcessor(md.parser), '>hr')

        fenced_code = FencedCodeExtension()
        fenced_code.extendMarkdown(md, None)

        code_bbcode = blocks.CodeBlockExtension()
        code_bbcode.extendMarkdown(md)

        quote_bbcode = blocks.QuoteExtension()
        quote_bbcode.extendMarkdown(md)
    else:
        # Remove blocks: headers, code, quotes, rules and lists
        del md.parser.blockprocessors['hashheader']
        del md.parser.blockprocessors['setextheader']
        del md.parser.blockprocessors['code']
        del md.parser.blockprocessors['quote']
        del md.parser.blockprocessors['hr']
        del md.parser.blockprocessors['olist']
        del md.parser.blockprocessors['ulist']

    # Give installed plugins the chance to extend the configured parser
    return pipeline.extend_markdown(md)
  103. def linkify_paragraphs(result):
  104. result['parsed_text'] = bleach.linkify(
  105. result['parsed_text'], skip_pre=True, parse_email=True)
  106. # dirty fix for
  107. if '<code>' in result['parsed_text'] and '<a' in result['parsed_text']:
  108. with warnings.catch_warnings():
  109. warnings.simplefilter("ignore")
  110. soup = BeautifulSoup(result['parsed_text'], 'html5lib')
  111. for link in soup.select('code > a'):
  112. link.replace_with(BeautifulSoup(link.string, 'html.parser'))
  113. # [6:-7] trims <body></body> wrap
  114. result['parsed_text'] = six.text_type(soup.body)[6:-7]
  115. def clean_links(request, result, force_shva=False):
  116. host = request.get_host()
  117. site_address = '%s://%s' % (request.scheme, request.get_host())
  118. soup = BeautifulSoup(result['parsed_text'], 'html5lib')
  119. for link in soup.find_all('a'):
  120. if is_internal_link(link['href'], host):
  121. link['href'] = clean_internal_link(link['href'], host)
  122. result['inside_links'].append(link['href'])
  123. link['href'] = clean_attachment_link(link['href'], force_shva)
  124. else:
  125. result['outgoing_links'].append(link['href'])
  126. if link.string:
  127. link.string = clean_link_prefix(link.string)
  128. for img in soup.find_all('img'):
  129. img['alt'] = clean_link_prefix(img['alt'])
  130. if is_internal_link(img['src'], host):
  131. img['src'] = clean_internal_link(img['src'], host)
  132. result['images'].append(img['src'])
  133. img['src'] = clean_attachment_link(img['src'], force_shva)
  134. else:
  135. result['images'].append(img['src'])
  136. # [6:-7] trims <body></body> wrap
  137. result['parsed_text'] = six.text_type(soup.body)[6:-7]
  138. def is_internal_link(link, host):
  139. if link.startswith('/') and not link.startswith('//'):
  140. return True
  141. link = clean_link_prefix(link).lstrip('www.').lower()
  142. return link.lower().startswith(host.lstrip('www.'))
  143. def clean_link_prefix(link):
  144. if link.lower().startswith('https:'):
  145. link = link[6:]
  146. if link.lower().startswith('http:'):
  147. link = link[5:]
  148. if link.startswith('//'):
  149. link = link[2:]
  150. return link
  151. def clean_internal_link(link, host):
  152. link = clean_link_prefix(link)
  153. if link.lower().startswith('www.'):
  154. link = link[4:]
  155. if host.lower().startswith('www.'):
  156. host = host[4:]
  157. if link.lower().startswith(host):
  158. link = link[len(host):]
  159. return link or '/'
  160. def clean_attachment_link(link, force_shva=False):
  161. try:
  162. resolution = resolve(link)
  163. url_name = ':'.join(resolution.namespaces + [resolution.url_name])
  164. except (Http404, ValueError):
  165. return link
  166. if url_name in MISAGO_ATTACHMENT_VIEWS:
  167. if force_shva:
  168. link = '{}?shva=1'.format(link)
  169. elif link.endswith('?shva=1'):
  170. link = link[:-7]
  171. return link
  172. def minify_result(result):
  173. # [25:-14] trims <html><head></head><body> and </body></html>
  174. result['parsed_text'] = html_minify(result['parsed_text'].encode('utf-8'))
  175. result['parsed_text'] = result['parsed_text'][25:-14]