parser.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. import bleach
  2. import markdown
  3. from bs4 import BeautifulSoup
  4. from django.http import Http404
  5. from django.urls import resolve
  6. from htmlmin.minify import html_minify
  7. from markdown.extensions.fenced_code import FencedCodeExtension
  8. from ..conf import settings
  9. from .bbcode.code import CodeBlockExtension
  10. from .bbcode.hr import BBCodeHRProcessor
  11. from .bbcode.inline import bold, image, italics, underline, url
  12. from .bbcode.quote import QuoteExtension
  13. from .md.shortimgs import ShortImagesExtension
  14. from .md.strikethrough import StrikethroughExtension
  15. from .mentions import add_mentions
  16. from .pipeline import pipeline
# Namespaced URL names ("namespace:url_name") of the attachment views.
# clean_attachment_link() adds or removes the "?shva=1" suffix only on links
# that resolve to one of these.
MISAGO_ATTACHMENT_VIEWS = ("misago:attachment", "misago:attachment-thumbnail")
  18. def parse(
  19. text,
  20. request,
  21. poster,
  22. allow_mentions=True,
  23. allow_links=True,
  24. allow_images=True,
  25. allow_blocks=True,
  26. force_shva=False,
  27. minify=True,
  28. ):
  29. """
  30. Message parser
  31. Utility for flavours to call
  32. Breaks text into paragraphs, supports code, spoiler and quote blocks,
  33. headers, lists, images, spoilers, text styles
  34. Returns dict object
  35. """
  36. md = md_factory(
  37. allow_links=allow_links, allow_images=allow_images, allow_blocks=allow_blocks
  38. )
  39. parsing_result = {
  40. "original_text": text,
  41. "parsed_text": "",
  42. "markdown": md,
  43. "mentions": [],
  44. "images": [],
  45. "internal_links": [],
  46. "outgoing_links": [],
  47. }
  48. # Parse text
  49. parsed_text = md.convert(text)
  50. # Clean and store parsed text
  51. parsing_result["parsed_text"] = parsed_text.strip()
  52. if allow_links:
  53. linkify_paragraphs(parsing_result)
  54. parsing_result = pipeline.process_result(parsing_result)
  55. if allow_mentions:
  56. add_mentions(request, parsing_result)
  57. if allow_links or allow_images:
  58. clean_links(request, parsing_result, force_shva)
  59. if minify:
  60. minify_result(parsing_result)
  61. return parsing_result
  62. def md_factory(allow_links=True, allow_images=True, allow_blocks=True):
  63. """creates and configures markdown object"""
  64. md = markdown.Markdown(extensions=["markdown.extensions.nl2br"])
  65. # Remove HTML allowances
  66. del md.preprocessors["html_block"]
  67. del md.inlinePatterns["html"]
  68. # Remove references
  69. del md.preprocessors["reference"]
  70. del md.inlinePatterns["reference"]
  71. del md.inlinePatterns["image_reference"]
  72. del md.inlinePatterns["short_reference"]
  73. # Add [b], [i], [u]
  74. md.inlinePatterns.add("bb_b", bold, "<strong")
  75. md.inlinePatterns.add("bb_i", italics, "<emphasis")
  76. md.inlinePatterns.add("bb_u", underline, "<emphasis2")
  77. # Add ~~deleted~~
  78. striketrough_md = StrikethroughExtension()
  79. striketrough_md.extendMarkdown(md)
  80. if allow_links:
  81. # Add [url]
  82. md.inlinePatterns.add("bb_url", url(md), "<link")
  83. else:
  84. # Remove links
  85. del md.inlinePatterns["link"]
  86. del md.inlinePatterns["autolink"]
  87. del md.inlinePatterns["automail"]
  88. if allow_images:
  89. # Add [img]
  90. md.inlinePatterns.add("bb_img", image(md), "<image_link")
  91. short_images_md = ShortImagesExtension()
  92. short_images_md.extendMarkdown(md)
  93. else:
  94. # Remove images
  95. del md.inlinePatterns["image_link"]
  96. if allow_blocks:
  97. # Add [hr] and [quote] blocks
  98. md.parser.blockprocessors.add("bb_hr", BBCodeHRProcessor(md.parser), ">hr")
  99. fenced_code = FencedCodeExtension()
  100. fenced_code.extendMarkdown(md, None)
  101. code_bbcode = CodeBlockExtension()
  102. code_bbcode.extendMarkdown(md)
  103. quote_bbcode = QuoteExtension()
  104. quote_bbcode.extendMarkdown(md)
  105. else:
  106. # Remove blocks
  107. del md.parser.blockprocessors["hashheader"]
  108. del md.parser.blockprocessors["setextheader"]
  109. del md.parser.blockprocessors["code"]
  110. del md.parser.blockprocessors["quote"]
  111. del md.parser.blockprocessors["hr"]
  112. del md.parser.blockprocessors["olist"]
  113. del md.parser.blockprocessors["ulist"]
  114. return pipeline.extend_markdown(md)
  115. def linkify_paragraphs(result):
  116. result["parsed_text"] = bleach.linkify(
  117. result["parsed_text"],
  118. callbacks=settings.MISAGO_BLEACH_CALLBACKS,
  119. skip_tags=["a", "code", "pre"],
  120. parse_email=True,
  121. )
  122. def clean_links(request, result, force_shva=False):
  123. host = request.get_host()
  124. soup = BeautifulSoup(result["parsed_text"], "html5lib")
  125. for link in soup.find_all("a"):
  126. if is_internal_link(link["href"], host):
  127. link["href"] = clean_internal_link(link["href"], host)
  128. result["internal_links"].append(link["href"])
  129. link["href"] = clean_attachment_link(link["href"], force_shva)
  130. else:
  131. result["outgoing_links"].append(clean_link_prefix(link["href"]))
  132. link["href"] = assert_link_prefix(link["href"])
  133. link["rel"] = "nofollow noopener"
  134. if link.string:
  135. link.string = clean_link_prefix(link.string)
  136. for img in soup.find_all("img"):
  137. img["alt"] = clean_link_prefix(img["alt"])
  138. if is_internal_link(img["src"], host):
  139. img["src"] = clean_internal_link(img["src"], host)
  140. result["images"].append(img["src"])
  141. img["src"] = clean_attachment_link(img["src"], force_shva)
  142. else:
  143. result["images"].append(clean_link_prefix(img["src"]))
  144. img["src"] = assert_link_prefix(img["src"])
  145. # [6:-7] trims <body></body> wrap
  146. result["parsed_text"] = str(soup.body)[6:-7]
  147. def is_internal_link(link, host):
  148. if link.startswith("/") and not link.startswith("//"):
  149. return True
  150. link = clean_link_prefix(link).lstrip("www.").lower()
  151. return link.lower().startswith(host.lstrip("www."))
  152. def clean_link_prefix(link):
  153. if link.lower().startswith("https:"):
  154. link = link[6:]
  155. if link.lower().startswith("http:"):
  156. link = link[5:]
  157. if link.startswith("//"):
  158. link = link[2:]
  159. return link
  160. def assert_link_prefix(link):
  161. if link.lower().startswith("https:"):
  162. return link
  163. if link.lower().startswith("http:"):
  164. return link
  165. if link.startswith("//"):
  166. return "http:%s" % link
  167. return "http://%s" % link
  168. def clean_internal_link(link, host):
  169. link = clean_link_prefix(link)
  170. if link.lower().startswith("www."):
  171. link = link[4:]
  172. if host.lower().startswith("www."):
  173. host = host[4:]
  174. if link.lower().startswith(host):
  175. link = link[len(host) :]
  176. return link or "/"
  177. def clean_attachment_link(link, force_shva=False):
  178. try:
  179. resolution = resolve(link)
  180. if not resolution.namespaces:
  181. return link
  182. url_name = ":".join(resolution.namespaces + [resolution.url_name])
  183. except (Http404, ValueError):
  184. return link
  185. if url_name in MISAGO_ATTACHMENT_VIEWS:
  186. if force_shva:
  187. link = "%s?shva=1" % link
  188. elif link.endswith("?shva=1"):
  189. link = link[:-7]
  190. return link
  191. def minify_result(result):
  192. result["parsed_text"] = html_minify(result["parsed_text"])
  193. result["parsed_text"] = strip_html_head_body(result["parsed_text"])
  194. def strip_html_head_body(parsed_text):
  195. # [25:-14] trims <html><head></head><body> and </body></html>
  196. return parsed_text[25:-14]