parser.py

import bleach
import markdown
from bs4 import BeautifulSoup
from django.http import Http404
from django.urls import resolve
from htmlmin.minify import html_minify
from markdown.extensions.fenced_code import FencedCodeExtension

from ..conf import settings
from .bbcode.code import CodeBlockExtension
from .bbcode.hr import BBCodeHRProcessor
from .bbcode.inline import bold, image, italics, underline, url
from .bbcode.quote import QuoteExtension
from .bbcode.spoiler import SpoilerExtension
from .md.shortimgs import ShortImagesExtension
from .md.strikethrough import StrikethroughExtension
from .mentions import add_mentions
from .pipeline import pipeline

MISAGO_ATTACHMENT_VIEWS = ("misago:attachment", "misago:attachment-thumbnail")


def parse(
    text,
    request,
    poster,
    allow_mentions=True,
    allow_links=True,
    allow_images=True,
    allow_blocks=True,
    force_shva=False,
    minify=True,
):
    """
    Message parser

    Utility for flavours to call

    Breaks text into paragraphs, supports code, spoiler and quote blocks,
    headers, lists, images, spoilers, text styles

    Returns dict object
    """
    md = md_factory(
        allow_links=allow_links, allow_images=allow_images, allow_blocks=allow_blocks
    )

    parsing_result = {
        "original_text": text,
        "parsed_text": "",
        "markdown": md,
        "mentions": [],
        "images": [],
        "internal_links": [],
        "outgoing_links": [],
    }

    # Parse text
    parsed_text = md.convert(text)

    # Clean and store parsed text
    parsing_result["parsed_text"] = parsed_text.strip()

    if allow_links:
        linkify_paragraphs(parsing_result)

    parsing_result = pipeline.process_result(parsing_result)

    if allow_mentions:
        add_mentions(request, parsing_result)

    if allow_links or allow_images:
        clean_links(request, parsing_result, force_shva)

    if minify:
        minify_result(parsing_result)

    return parsing_result
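

# Example usage (a sketch; `request` and `request.user` stand in for whatever
# the calling view already has in scope, and the output values are illustrative):
#
#     result = parse("Hello [b]world[/b], see example.com", request, request.user)
#     result["parsed_text"]     # sanitized, linkified, minified HTML
#     result["outgoing_links"]  # e.g. ["example.com"]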


def md_factory(allow_links=True, allow_images=True, allow_blocks=True):
    """creates and configures markdown object"""
    md = markdown.Markdown(extensions=["markdown.extensions.nl2br"])

    # Remove HTML allowances
    md.preprocessors.deregister("html_block")
    md.inlinePatterns.deregister("html")

    # Remove references
    md.parser.blockprocessors.deregister("reference")
    md.inlinePatterns.deregister("reference")
    md.inlinePatterns.deregister("image_reference")
    md.inlinePatterns.deregister("short_reference")

    # Add [b], [i], [u]
    md.inlinePatterns.register(bold, "bb_b", 55)
    md.inlinePatterns.register(italics, "bb_i", 55)
    md.inlinePatterns.register(underline, "bb_u", 55)

    # Add ~~deleted~~
    strikethrough_md = StrikethroughExtension()
    strikethrough_md.extendMarkdown(md)

    if allow_links:
        # Add [url]
        md.inlinePatterns.register(url(md), "bb_url", 155)
    else:
        # Remove links
        md.inlinePatterns.deregister("link")
        md.inlinePatterns.deregister("autolink")
        md.inlinePatterns.deregister("automail")

    if allow_images:
        # Add [img]
        md.inlinePatterns.register(image(md), "bb_img", 145)
        short_images_md = ShortImagesExtension()
        short_images_md.extendMarkdown(md)
    else:
        # Remove images
        md.inlinePatterns.deregister("image_link")

    if allow_blocks:
        # Add [hr] and [quote] blocks
        md.parser.blockprocessors.register(BBCodeHRProcessor(md.parser), "bb_hr", 45)

        fenced_code = FencedCodeExtension(lang_prefix="")
        fenced_code.extendMarkdown(md)

        code_bbcode = CodeBlockExtension()
        code_bbcode.extendMarkdown(md)

        quote_bbcode = QuoteExtension()
        quote_bbcode.extendMarkdown(md)

        spoiler_bbcode = SpoilerExtension()
        spoiler_bbcode.extendMarkdown(md)
    else:
        # Remove blocks
        md.parser.blockprocessors.deregister("hashheader")
        md.parser.blockprocessors.deregister("setextheader")
        md.parser.blockprocessors.deregister("code")
        md.parser.blockprocessors.deregister("quote")
        md.parser.blockprocessors.deregister("hr")
        md.parser.blockprocessors.deregister("olist")
        md.parser.blockprocessors.deregister("ulist")

    return pipeline.extend_markdown(md)
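

# A minimal sketch of using the factory directly; the flags mirror the
# corresponding parse() arguments:
#
#     md = md_factory(allow_images=False, allow_blocks=False)
#     md.convert("**bold** and [b]bold[/b]")  # inline styles are still rendered
#     md.convert("# Heading")                 # block syntax becomes an ordinary paragraph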


def linkify_paragraphs(result):
    result["parsed_text"] = bleach.linkify(
        result["parsed_text"],
        callbacks=settings.MISAGO_BLEACH_CALLBACKS,
        skip_tags=["a", "code", "pre"],
        parse_email=True,
    )
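

# bleach.linkify turns bare URLs (and, with parse_email=True, e-mail addresses)
# in the already-rendered HTML into anchors, skipping anything inside <a>,
# <code> and <pre>. Roughly:
#
#     "<p>see example.com</p>" -> '<p>see <a href="http://example.com">example.com</a></p>'
#
# The exact anchor attributes depend on the callbacks configured in
# settings.MISAGO_BLEACH_CALLBACKS.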


def clean_links(request, result, force_shva=False):
    host = request.get_host()

    soup = BeautifulSoup(result["parsed_text"], "html5lib")
    for link in soup.find_all("a"):
        if is_internal_link(link["href"], host):
            link["href"] = clean_internal_link(link["href"], host)
            result["internal_links"].append(link["href"])
            link["href"] = clean_attachment_link(link["href"], force_shva)
        else:
            result["outgoing_links"].append(clean_link_prefix(link["href"]))
            link["href"] = assert_link_prefix(link["href"])
            link["rel"] = "external nofollow noopener"
            link["target"] = "_blank"

        if link.string:
            link.string = clean_link_prefix(link.string)

    for img in soup.find_all("img"):
        img["alt"] = clean_link_prefix(img["alt"])
        if is_internal_link(img["src"], host):
            img["src"] = clean_internal_link(img["src"], host)
            result["images"].append(img["src"])
            img["src"] = clean_attachment_link(img["src"], force_shva)
        else:
            result["images"].append(clean_link_prefix(img["src"]))
            img["src"] = assert_link_prefix(img["src"])

    # [6:-7] trims <body></body> wrap
    result["parsed_text"] = str(soup.body)[6:-7]
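

# Sketch of the effect on a single post, assuming the request host is "example.com":
#
#     <a href="https://example.com/t/thread-1/">
#         -> href becomes "/t/thread-1/" and is recorded in result["internal_links"]
#     <a href="https://other-site.com/page">
#         -> gains rel="external nofollow noopener" and target="_blank", while
#            "other-site.com/page" is recorded in result["outgoing_links"]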


def is_internal_link(link, host):
    if link.startswith("/") and not link.startswith("//"):
        return True

    # Compare hosts with the protocol and any literal "www." prefix removed
    link = clean_link_prefix(link).lower()
    link = link[4:] if link.startswith("www.") else link
    host = host.lower()
    host = host[4:] if host.startswith("www.") else host
    return link.startswith(host)
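

# For instance, assuming the forum is served from "example.com":
#
#     is_internal_link("/t/thread-1/", "example.com")                  -> True
#     is_internal_link("https://www.example.com/t/1/", "example.com")  -> True
#     is_internal_link("https://other-site.com/", "example.com")       -> False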


def clean_link_prefix(link):
    if link.lower().startswith("https:"):
        link = link[6:]
    if link.lower().startswith("http:"):
        link = link[5:]
    if link.startswith("//"):
        link = link[2:]
    return link


def assert_link_prefix(link):
    if link.lower().startswith("https:"):
        return link
    if link.lower().startswith("http:"):
        return link
    if link.startswith("//"):
        return "http:%s" % link

    return "http://%s" % link
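

# The two helpers above are roughly inverses of each other:
#
#     clean_link_prefix("https://example.com/a")   -> "example.com/a"
#     clean_link_prefix("//cdn.example.com/x.png") -> "cdn.example.com/x.png"
#     assert_link_prefix("example.com/a")          -> "http://example.com/a"
#     assert_link_prefix("https://example.com/a")  -> "https://example.com/a"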


def clean_internal_link(link, host):
    link = clean_link_prefix(link)

    if link.lower().startswith("www."):
        link = link[4:]
    if host.lower().startswith("www."):
        host = host[4:]

    if link.lower().startswith(host):
        link = link[len(host):]

    return link or "/"
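

# Example of the rewrite, with the host assumed to be "example.com":
#
#     clean_internal_link("https://www.example.com/t/thread-1/", "example.com")
#         -> "/t/thread-1/"
#     clean_internal_link("http://example.com", "example.com")
#         -> "/"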


def clean_attachment_link(link, force_shva=False):
    try:
        resolution = resolve(link)
        if not resolution.namespaces:
            return link
        url_name = ":".join(resolution.namespaces + [resolution.url_name])
    except (Http404, ValueError):
        return link

    if url_name in MISAGO_ATTACHMENT_VIEWS:
        if force_shva:
            link = "%s?shva=1" % link
        elif link.endswith("?shva=1"):
            link = link[:-7]
    return link
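

# Only links that resolve to Misago's attachment views are touched; the "?shva=1"
# marker is appended or dropped depending on force_shva. The attachment path
# below is illustrative, not a guaranteed URL layout:
#
#     clean_attachment_link("/a/image-name/123/", force_shva=True)
#         -> "/a/image-name/123/?shva=1"  (if the path resolves to misago:attachment)
#     clean_attachment_link("/t/thread-1/", force_shva=True)
#         -> "/t/thread-1/"               (non-attachment links pass through unchanged)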


def minify_result(result):
    result["parsed_text"] = html_minify(result["parsed_text"])
    result["parsed_text"] = strip_html_head_body(result["parsed_text"])


def strip_html_head_body(parsed_text):
    # [25:-14] trims <html><head></head><body> and </body></html>
    return parsed_text[25:-14]
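

# For example, html_minify wraps the fragment in a full document, which the
# slice above removes again:
#
#     strip_html_head_body("<html><head></head><body><p>Hi!</p></body></html>")
#         -> "<p>Hi!</p>"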