parser.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. from __future__ import unicode_literals
  2. import warnings
  3. import bleach
  4. import markdown
  5. from bs4 import BeautifulSoup
  6. from htmlmin.minify import html_minify
  7. from markdown.extensions.fenced_code import FencedCodeExtension
  8. from django.http import Http404
  9. from django.urls import resolve
  10. from django.utils import six
  11. from misago.conf import settings
  12. from .bbcode import blocks, inline
  13. from .md.shortimgs import ShortImagesExtension
  14. from .md.striketrough import StriketroughExtension
  15. from .mentions import add_mentions
  16. from .pipeline import pipeline
  17. MISAGO_ATTACHMENT_VIEWS = ('misago:attachment', 'misago:attachment-thumbnail')
  18. def parse(
  19. text,
  20. request,
  21. poster,
  22. allow_mentions=True,
  23. allow_links=True,
  24. allow_images=True,
  25. allow_blocks=True,
  26. force_shva=False,
  27. minify=True
  28. ):
  29. """
  30. Message parser
  31. Utility for flavours to call
  32. Breaks text into paragraphs, supports code, spoiler and quote blocks,
  33. headers, lists, images, spoilers, text styles
  34. Returns dict object
  35. """
  36. md = md_factory(
  37. allow_links=allow_links,
  38. allow_images=allow_images,
  39. allow_blocks=allow_blocks,
  40. )
  41. parsing_result = {
  42. 'original_text': text,
  43. 'parsed_text': '',
  44. 'markdown': md,
  45. 'mentions': [],
  46. 'images': [],
  47. 'internal_links': [],
  48. 'outgoing_links': [],
  49. }
  50. # Parse text
  51. parsed_text = md.convert(text)
  52. # Clean and store parsed text
  53. parsing_result['parsed_text'] = parsed_text.strip()
  54. if allow_links:
  55. linkify_paragraphs(parsing_result)
  56. parsing_result = pipeline.process_result(parsing_result)
  57. if allow_mentions:
  58. add_mentions(request, parsing_result)
  59. if allow_links or allow_images:
  60. clean_links(request, parsing_result, force_shva)
  61. if minify:
  62. minify_result(parsing_result)
  63. return parsing_result
  64. def md_factory(allow_links=True, allow_images=True, allow_blocks=True):
  65. """creates and configures markdown object"""
  66. md = markdown.Markdown(extensions=[
  67. 'markdown.extensions.nl2br',
  68. ])
  69. # Remove HTML allowances
  70. del md.preprocessors['html_block']
  71. del md.inlinePatterns['html']
  72. # Remove references
  73. del md.preprocessors['reference']
  74. del md.inlinePatterns['reference']
  75. del md.inlinePatterns['image_reference']
  76. del md.inlinePatterns['short_reference']
  77. # Add [b], [i], [u]
  78. md.inlinePatterns.add('bb_b', inline.bold, '<strong')
  79. md.inlinePatterns.add('bb_i', inline.italics, '<emphasis')
  80. md.inlinePatterns.add('bb_u', inline.underline, '<emphasis2')
  81. # Add ~~deleted~~
  82. striketrough_md = StriketroughExtension()
  83. striketrough_md.extendMarkdown(md)
  84. if allow_links:
  85. # Add [url]
  86. md.inlinePatterns.add('bb_url', inline.url(md), '<link')
  87. else:
  88. # Remove links
  89. del md.inlinePatterns['link']
  90. del md.inlinePatterns['autolink']
  91. del md.inlinePatterns['automail']
  92. if allow_images:
  93. # Add [img]
  94. md.inlinePatterns.add('bb_img', inline.image(md), '<image_link')
  95. short_images_md = ShortImagesExtension()
  96. short_images_md.extendMarkdown(md)
  97. else:
  98. # Remove images
  99. del md.inlinePatterns['image_link']
  100. if allow_blocks:
  101. # Add [hr] and [quote] blocks
  102. md.parser.blockprocessors.add('bb_hr', blocks.BBCodeHRProcessor(md.parser), '>hr')
  103. fenced_code = FencedCodeExtension()
  104. fenced_code.extendMarkdown(md, None)
  105. code_bbcode = blocks.CodeBlockExtension()
  106. code_bbcode.extendMarkdown(md)
  107. quote_bbcode = blocks.QuoteExtension()
  108. quote_bbcode.extendMarkdown(md)
  109. else:
  110. # Remove blocks
  111. del md.parser.blockprocessors['hashheader']
  112. del md.parser.blockprocessors['setextheader']
  113. del md.parser.blockprocessors['code']
  114. del md.parser.blockprocessors['quote']
  115. del md.parser.blockprocessors['hr']
  116. del md.parser.blockprocessors['olist']
  117. del md.parser.blockprocessors['ulist']
  118. return pipeline.extend_markdown(md)
  119. def linkify_paragraphs(result):
  120. result['parsed_text'] = bleach.linkify(
  121. result['parsed_text'],
  122. callbacks=settings.MISAGO_BLEACH_CALLBACKS,
  123. skip_tags=['a', 'code', 'pre'],
  124. parse_email=True,
  125. )
  126. def clean_links(request, result, force_shva=False):
  127. host = request.get_host()
  128. soup = BeautifulSoup(result['parsed_text'], 'html5lib')
  129. for link in soup.find_all('a'):
  130. if is_internal_link(link['href'], host):
  131. link['href'] = clean_internal_link(link['href'], host)
  132. result['internal_links'].append(link['href'])
  133. link['href'] = clean_attachment_link(link['href'], force_shva)
  134. else:
  135. result['outgoing_links'].append(clean_link_prefix(link['href']))
  136. link['href'] = assert_link_prefix(link['href'])
  137. link['rel'] = 'nofollow noopener'
  138. if link.string:
  139. link.string = clean_link_prefix(link.string)
  140. for img in soup.find_all('img'):
  141. img['alt'] = clean_link_prefix(img['alt'])
  142. if is_internal_link(img['src'], host):
  143. img['src'] = clean_internal_link(img['src'], host)
  144. result['images'].append(img['src'])
  145. img['src'] = clean_attachment_link(img['src'], force_shva)
  146. else:
  147. result['images'].append(clean_link_prefix(img['src']))
  148. img['src'] = assert_link_prefix(img['src'])
  149. # [6:-7] trims <body></body> wrap
  150. result['parsed_text'] = six.text_type(soup.body)[6:-7]
  151. def is_internal_link(link, host):
  152. if link.startswith('/') and not link.startswith('//'):
  153. return True
  154. link = clean_link_prefix(link).lstrip('www.').lower()
  155. return link.lower().startswith(host.lstrip('www.'))
  156. def clean_link_prefix(link):
  157. if link.lower().startswith('https:'):
  158. link = link[6:]
  159. if link.lower().startswith('http:'):
  160. link = link[5:]
  161. if link.startswith('//'):
  162. link = link[2:]
  163. return link
  164. def assert_link_prefix(link):
  165. if link.lower().startswith('https:'):
  166. return link
  167. if link.lower().startswith('http:'):
  168. return link
  169. if link.startswith('//'):
  170. return 'http:{}'.format(link)
  171. return 'http://{}'.format(link)
  172. def clean_internal_link(link, host):
  173. link = clean_link_prefix(link)
  174. if link.lower().startswith('www.'):
  175. link = link[4:]
  176. if host.lower().startswith('www.'):
  177. host = host[4:]
  178. if link.lower().startswith(host):
  179. link = link[len(host):]
  180. return link or '/'
  181. def clean_attachment_link(link, force_shva=False):
  182. try:
  183. resolution = resolve(link)
  184. url_name = ':'.join(resolution.namespaces + [resolution.url_name])
  185. except (Http404, ValueError):
  186. return link
  187. if url_name in MISAGO_ATTACHMENT_VIEWS:
  188. if force_shva:
  189. link = '{}?shva=1'.format(link)
  190. elif link.endswith('?shva=1'):
  191. link = link[:-7]
  192. return link
  193. def minify_result(result):
  194. # [25:-14] trims <html><head></head><body> and </body></html>
  195. result['parsed_text'] = html_minify(result['parsed_text'].encode('utf-8'))
  196. result['parsed_text'] = result['parsed_text'][25:-14]