parser.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. import warnings
  2. import bleach
  3. import markdown
  4. from bs4 import BeautifulSoup
  5. from htmlmin.minify import html_minify
  6. from markdown.extensions.fenced_code import FencedCodeExtension
  7. from django.http import Http404
  8. from django.urls import resolve
  9. from misago.conf import settings
  10. from .bbcode import blocks, inline
  11. from .md.shortimgs import ShortImagesExtension
  12. from .md.striketrough import StriketroughExtension
  13. from .mentions import add_mentions
  14. from .pipeline import pipeline
  15. MISAGO_ATTACHMENT_VIEWS = ('misago:attachment', 'misago:attachment-thumbnail')
  16. def parse(
  17. text,
  18. request,
  19. poster,
  20. allow_mentions=True,
  21. allow_links=True,
  22. allow_images=True,
  23. allow_blocks=True,
  24. force_shva=False,
  25. minify=True
  26. ):
  27. """
  28. Message parser
  29. Utility for flavours to call
  30. Breaks text into paragraphs, supports code, spoiler and quote blocks,
  31. headers, lists, images, spoilers, text styles
  32. Returns dict object
  33. """
  34. md = md_factory(
  35. allow_links=allow_links,
  36. allow_images=allow_images,
  37. allow_blocks=allow_blocks,
  38. )
  39. parsing_result = {
  40. 'original_text': text,
  41. 'parsed_text': '',
  42. 'markdown': md,
  43. 'mentions': [],
  44. 'images': [],
  45. 'internal_links': [],
  46. 'outgoing_links': [],
  47. }
  48. # Parse text
  49. parsed_text = md.convert(text)
  50. # Clean and store parsed text
  51. parsing_result['parsed_text'] = parsed_text.strip()
  52. if allow_links:
  53. linkify_paragraphs(parsing_result)
  54. parsing_result = pipeline.process_result(parsing_result)
  55. if allow_mentions:
  56. add_mentions(request, parsing_result)
  57. if allow_links or allow_images:
  58. clean_links(request, parsing_result, force_shva)
  59. if minify:
  60. minify_result(parsing_result)
  61. return parsing_result
  62. def md_factory(allow_links=True, allow_images=True, allow_blocks=True):
  63. """creates and configures markdown object"""
  64. md = markdown.Markdown(extensions=[
  65. 'markdown.extensions.nl2br',
  66. ])
  67. # Remove HTML allowances
  68. del md.preprocessors['html_block']
  69. del md.inlinePatterns['html']
  70. # Remove references
  71. del md.preprocessors['reference']
  72. del md.inlinePatterns['reference']
  73. del md.inlinePatterns['image_reference']
  74. del md.inlinePatterns['short_reference']
  75. # Add [b], [i], [u]
  76. md.inlinePatterns.add('bb_b', inline.bold, '<strong')
  77. md.inlinePatterns.add('bb_i', inline.italics, '<emphasis')
  78. md.inlinePatterns.add('bb_u', inline.underline, '<emphasis2')
  79. # Add ~~deleted~~
  80. striketrough_md = StriketroughExtension()
  81. striketrough_md.extendMarkdown(md)
  82. if allow_links:
  83. # Add [url]
  84. md.inlinePatterns.add('bb_url', inline.url(md), '<link')
  85. else:
  86. # Remove links
  87. del md.inlinePatterns['link']
  88. del md.inlinePatterns['autolink']
  89. del md.inlinePatterns['automail']
  90. if allow_images:
  91. # Add [img]
  92. md.inlinePatterns.add('bb_img', inline.image(md), '<image_link')
  93. short_images_md = ShortImagesExtension()
  94. short_images_md.extendMarkdown(md)
  95. else:
  96. # Remove images
  97. del md.inlinePatterns['image_link']
  98. if allow_blocks:
  99. # Add [hr] and [quote] blocks
  100. md.parser.blockprocessors.add('bb_hr', blocks.BBCodeHRProcessor(md.parser), '>hr')
  101. fenced_code = FencedCodeExtension()
  102. fenced_code.extendMarkdown(md, None)
  103. code_bbcode = blocks.CodeBlockExtension()
  104. code_bbcode.extendMarkdown(md)
  105. quote_bbcode = blocks.QuoteExtension()
  106. quote_bbcode.extendMarkdown(md)
  107. else:
  108. # Remove blocks
  109. del md.parser.blockprocessors['hashheader']
  110. del md.parser.blockprocessors['setextheader']
  111. del md.parser.blockprocessors['code']
  112. del md.parser.blockprocessors['quote']
  113. del md.parser.blockprocessors['hr']
  114. del md.parser.blockprocessors['olist']
  115. del md.parser.blockprocessors['ulist']
  116. return pipeline.extend_markdown(md)
  117. def linkify_paragraphs(result):
  118. result['parsed_text'] = bleach.linkify(
  119. result['parsed_text'],
  120. callbacks=settings.MISAGO_BLEACH_CALLBACKS,
  121. skip_tags=['a', 'code', 'pre'],
  122. parse_email=True,
  123. )
  124. def clean_links(request, result, force_shva=False):
  125. host = request.get_host()
  126. soup = BeautifulSoup(result['parsed_text'], 'html5lib')
  127. for link in soup.find_all('a'):
  128. if is_internal_link(link['href'], host):
  129. link['href'] = clean_internal_link(link['href'], host)
  130. result['internal_links'].append(link['href'])
  131. link['href'] = clean_attachment_link(link['href'], force_shva)
  132. else:
  133. result['outgoing_links'].append(clean_link_prefix(link['href']))
  134. link['href'] = assert_link_prefix(link['href'])
  135. link['rel'] = 'nofollow noopener'
  136. if link.string:
  137. link.string = clean_link_prefix(link.string)
  138. for img in soup.find_all('img'):
  139. img['alt'] = clean_link_prefix(img['alt'])
  140. if is_internal_link(img['src'], host):
  141. img['src'] = clean_internal_link(img['src'], host)
  142. result['images'].append(img['src'])
  143. img['src'] = clean_attachment_link(img['src'], force_shva)
  144. else:
  145. result['images'].append(clean_link_prefix(img['src']))
  146. img['src'] = assert_link_prefix(img['src'])
  147. # [6:-7] trims <body></body> wrap
  148. result['parsed_text'] = str(soup.body)[6:-7]
  149. def is_internal_link(link, host):
  150. if link.startswith('/') and not link.startswith('//'):
  151. return True
  152. link = clean_link_prefix(link).lstrip('www.').lower()
  153. return link.lower().startswith(host.lstrip('www.'))
  154. def clean_link_prefix(link):
  155. if link.lower().startswith('https:'):
  156. link = link[6:]
  157. if link.lower().startswith('http:'):
  158. link = link[5:]
  159. if link.startswith('//'):
  160. link = link[2:]
  161. return link
  162. def assert_link_prefix(link):
  163. if link.lower().startswith('https:'):
  164. return link
  165. if link.lower().startswith('http:'):
  166. return link
  167. if link.startswith('//'):
  168. return 'http:{}'.format(link)
  169. return 'http://{}'.format(link)
  170. def clean_internal_link(link, host):
  171. link = clean_link_prefix(link)
  172. if link.lower().startswith('www.'):
  173. link = link[4:]
  174. if host.lower().startswith('www.'):
  175. host = host[4:]
  176. if link.lower().startswith(host):
  177. link = link[len(host):]
  178. return link or '/'
  179. def clean_attachment_link(link, force_shva=False):
  180. try:
  181. resolution = resolve(link)
  182. url_name = ':'.join(resolution.namespaces + [resolution.url_name])
  183. except (Http404, ValueError):
  184. return link
  185. if url_name in MISAGO_ATTACHMENT_VIEWS:
  186. if force_shva:
  187. link = '{}?shva=1'.format(link)
  188. elif link.endswith('?shva=1'):
  189. link = link[:-7]
  190. return link
  191. def minify_result(result):
  192. # [25:-14] trims <html><head></head><body> and </body></html>
  193. result['parsed_text'] = html_minify(result['parsed_text'].encode('utf-8'))
  194. result['parsed_text'] = result['parsed_text'][25:-14]