# parser.py — Misago message parser (markdown + BBCode + link cleaning).
  1. import bleach
  2. import markdown
  3. from bs4 import BeautifulSoup
  4. from django.http import Http404
  5. from django.urls import resolve
  6. from htmlmin.minify import html_minify
  7. from markdown.extensions.fenced_code import FencedCodeExtension
  8. from ..conf import settings
  9. from .bbcode import blocks, inline
  10. from .md.shortimgs import ShortImagesExtension
  11. from .md.striketrough import StriketroughExtension
  12. from .mentions import add_mentions
  13. from .pipeline import pipeline
# URL names of attachment views; links resolving to these get their
# "?shva=1" query appended/stripped by clean_attachment_link() below.
MISAGO_ATTACHMENT_VIEWS = ("misago:attachment", "misago:attachment-thumbnail")
  15. def parse(
  16. text,
  17. request,
  18. poster,
  19. allow_mentions=True,
  20. allow_links=True,
  21. allow_images=True,
  22. allow_blocks=True,
  23. force_shva=False,
  24. minify=True,
  25. ):
  26. """
  27. Message parser
  28. Utility for flavours to call
  29. Breaks text into paragraphs, supports code, spoiler and quote blocks,
  30. headers, lists, images, spoilers, text styles
  31. Returns dict object
  32. """
  33. md = md_factory(
  34. allow_links=allow_links, allow_images=allow_images, allow_blocks=allow_blocks
  35. )
  36. parsing_result = {
  37. "original_text": text,
  38. "parsed_text": "",
  39. "markdown": md,
  40. "mentions": [],
  41. "images": [],
  42. "internal_links": [],
  43. "outgoing_links": [],
  44. }
  45. # Parse text
  46. parsed_text = md.convert(text)
  47. # Clean and store parsed text
  48. parsing_result["parsed_text"] = parsed_text.strip()
  49. if allow_links:
  50. linkify_paragraphs(parsing_result)
  51. parsing_result = pipeline.process_result(parsing_result)
  52. if allow_mentions:
  53. add_mentions(request, parsing_result)
  54. if allow_links or allow_images:
  55. clean_links(request, parsing_result, force_shva)
  56. if minify:
  57. minify_result(parsing_result)
  58. return parsing_result
  59. def md_factory(allow_links=True, allow_images=True, allow_blocks=True):
  60. """creates and configures markdown object"""
  61. md = markdown.Markdown(extensions=["markdown.extensions.nl2br"])
  62. # Remove HTML allowances
  63. del md.preprocessors["html_block"]
  64. del md.inlinePatterns["html"]
  65. # Remove references
  66. del md.preprocessors["reference"]
  67. del md.inlinePatterns["reference"]
  68. del md.inlinePatterns["image_reference"]
  69. del md.inlinePatterns["short_reference"]
  70. # Add [b], [i], [u]
  71. md.inlinePatterns.add("bb_b", inline.bold, "<strong")
  72. md.inlinePatterns.add("bb_i", inline.italics, "<emphasis")
  73. md.inlinePatterns.add("bb_u", inline.underline, "<emphasis2")
  74. # Add ~~deleted~~
  75. striketrough_md = StriketroughExtension()
  76. striketrough_md.extendMarkdown(md)
  77. if allow_links:
  78. # Add [url]
  79. md.inlinePatterns.add("bb_url", inline.url(md), "<link")
  80. else:
  81. # Remove links
  82. del md.inlinePatterns["link"]
  83. del md.inlinePatterns["autolink"]
  84. del md.inlinePatterns["automail"]
  85. if allow_images:
  86. # Add [img]
  87. md.inlinePatterns.add("bb_img", inline.image(md), "<image_link")
  88. short_images_md = ShortImagesExtension()
  89. short_images_md.extendMarkdown(md)
  90. else:
  91. # Remove images
  92. del md.inlinePatterns["image_link"]
  93. if allow_blocks:
  94. # Add [hr] and [quote] blocks
  95. md.parser.blockprocessors.add(
  96. "bb_hr", blocks.BBCodeHRProcessor(md.parser), ">hr"
  97. )
  98. fenced_code = FencedCodeExtension()
  99. fenced_code.extendMarkdown(md, None)
  100. code_bbcode = blocks.CodeBlockExtension()
  101. code_bbcode.extendMarkdown(md)
  102. quote_bbcode = blocks.QuoteExtension()
  103. quote_bbcode.extendMarkdown(md)
  104. else:
  105. # Remove blocks
  106. del md.parser.blockprocessors["hashheader"]
  107. del md.parser.blockprocessors["setextheader"]
  108. del md.parser.blockprocessors["code"]
  109. del md.parser.blockprocessors["quote"]
  110. del md.parser.blockprocessors["hr"]
  111. del md.parser.blockprocessors["olist"]
  112. del md.parser.blockprocessors["ulist"]
  113. return pipeline.extend_markdown(md)
  114. def linkify_paragraphs(result):
  115. result["parsed_text"] = bleach.linkify(
  116. result["parsed_text"],
  117. callbacks=settings.MISAGO_BLEACH_CALLBACKS,
  118. skip_tags=["a", "code", "pre"],
  119. parse_email=True,
  120. )
  121. def clean_links(request, result, force_shva=False):
  122. host = request.get_host()
  123. soup = BeautifulSoup(result["parsed_text"], "html5lib")
  124. for link in soup.find_all("a"):
  125. if is_internal_link(link["href"], host):
  126. link["href"] = clean_internal_link(link["href"], host)
  127. result["internal_links"].append(link["href"])
  128. link["href"] = clean_attachment_link(link["href"], force_shva)
  129. else:
  130. result["outgoing_links"].append(clean_link_prefix(link["href"]))
  131. link["href"] = assert_link_prefix(link["href"])
  132. link["rel"] = "nofollow noopener"
  133. if link.string:
  134. link.string = clean_link_prefix(link.string)
  135. for img in soup.find_all("img"):
  136. img["alt"] = clean_link_prefix(img["alt"])
  137. if is_internal_link(img["src"], host):
  138. img["src"] = clean_internal_link(img["src"], host)
  139. result["images"].append(img["src"])
  140. img["src"] = clean_attachment_link(img["src"], force_shva)
  141. else:
  142. result["images"].append(clean_link_prefix(img["src"]))
  143. img["src"] = assert_link_prefix(img["src"])
  144. # [6:-7] trims <body></body> wrap
  145. result["parsed_text"] = str(soup.body)[6:-7]
  146. def is_internal_link(link, host):
  147. if link.startswith("/") and not link.startswith("//"):
  148. return True
  149. link = clean_link_prefix(link).lstrip("www.").lower()
  150. return link.lower().startswith(host.lstrip("www."))
  151. def clean_link_prefix(link):
  152. if link.lower().startswith("https:"):
  153. link = link[6:]
  154. if link.lower().startswith("http:"):
  155. link = link[5:]
  156. if link.startswith("//"):
  157. link = link[2:]
  158. return link
  159. def assert_link_prefix(link):
  160. if link.lower().startswith("https:"):
  161. return link
  162. if link.lower().startswith("http:"):
  163. return link
  164. if link.startswith("//"):
  165. return "http:%s" % link
  166. return "http://%s" % link
  167. def clean_internal_link(link, host):
  168. link = clean_link_prefix(link)
  169. if link.lower().startswith("www."):
  170. link = link[4:]
  171. if host.lower().startswith("www."):
  172. host = host[4:]
  173. if link.lower().startswith(host):
  174. link = link[len(host) :]
  175. return link or "/"
  176. def clean_attachment_link(link, force_shva=False):
  177. try:
  178. resolution = resolve(link)
  179. url_name = ":".join(resolution.namespaces + [resolution.url_name])
  180. except (Http404, ValueError):
  181. return link
  182. if url_name in MISAGO_ATTACHMENT_VIEWS:
  183. if force_shva:
  184. link = "%s?shva=1" % link
  185. elif link.endswith("?shva=1"):
  186. link = link[:-7]
  187. return link
  188. def minify_result(result):
  189. result["parsed_text"] = html_minify(result["parsed_text"])
  190. result["parsed_text"] = strip_html_head_body(result["parsed_text"])
  191. def strip_html_head_body(parsed_text):
  192. # [25:-14] trims <html><head></head><body> and </body></html>
  193. return parsed_text[25:-14]