- import bleach
- import markdown
- from bs4 import BeautifulSoup
- from django.http import Http404
- from django.urls import resolve
- from htmlmin.minify import html_minify
- from markdown.extensions.fenced_code import FencedCodeExtension
- from ..conf import settings
- from .bbcode import blocks, inline
- from .md.shortimgs import ShortImagesExtension
- from .md.striketrough import StriketroughExtension
- from .mentions import add_mentions
- from .pipeline import pipeline
# URL names that serve attachments; links resolving to these views get
# special "?shva=1" marker handling in clean_attachment_link.
MISAGO_ATTACHMENT_VIEWS = ("misago:attachment", "misago:attachment-thumbnail")
def parse(
    text,
    request,
    poster,
    allow_mentions=True,
    allow_links=True,
    allow_images=True,
    allow_blocks=True,
    force_shva=False,
    minify=True,
):
    """
    Message parser

    Utility for flavours to call

    Breaks text into paragraphs, supports code, spoiler and quote blocks,
    headers, lists, images, spoilers, text styles

    Returns dict object
    """
    md = md_factory(
        allow_links=allow_links, allow_images=allow_images, allow_blocks=allow_blocks
    )

    result = {
        "original_text": text,
        "parsed_text": "",
        "markdown": md,
        "mentions": [],
        "images": [],
        "internal_links": [],
        "outgoing_links": [],
    }

    # Convert the raw text and store it sans leading/trailing whitespace
    result["parsed_text"] = md.convert(text).strip()

    if allow_links:
        linkify_paragraphs(result)

    # Let registered plugins post-process the result
    result = pipeline.process_result(result)

    if allow_mentions:
        add_mentions(request, result)
    if allow_links or allow_images:
        clean_links(request, result, force_shva)
    if minify:
        minify_result(result)

    return result
def md_factory(allow_links=True, allow_images=True, allow_blocks=True):
    """creates and configures markdown object"""
    md = markdown.Markdown(extensions=["markdown.extensions.nl2br"])

    # Strip raw-HTML support and Markdown reference syntax
    for name in ("html_block", "reference"):
        del md.preprocessors[name]
    for name in ("html", "reference", "image_reference", "short_reference"):
        del md.inlinePatterns[name]

    # BBCode inline styles: [b], [i], [u]
    md.inlinePatterns.add("bb_b", inline.bold, "<strong")
    md.inlinePatterns.add("bb_i", inline.italics, "<emphasis")
    md.inlinePatterns.add("bb_u", inline.underline, "<emphasis2")

    # GFM-style ~~deleted~~
    StriketroughExtension().extendMarkdown(md)

    if allow_links:
        # BBCode [url]
        md.inlinePatterns.add("bb_url", inline.url(md), "<link")
    else:
        # No links of any kind
        for name in ("link", "autolink", "automail"):
            del md.inlinePatterns[name]

    if allow_images:
        # BBCode [img] plus bare image URLs
        md.inlinePatterns.add("bb_img", inline.image(md), "<image_link")
        ShortImagesExtension().extendMarkdown(md)
    else:
        del md.inlinePatterns["image_link"]

    if allow_blocks:
        # BBCode [hr] and [quote], fenced code and [code]
        md.parser.blockprocessors.add(
            "bb_hr", blocks.BBCodeHRProcessor(md.parser), ">hr"
        )
        FencedCodeExtension().extendMarkdown(md, None)
        blocks.CodeBlockExtension().extendMarkdown(md)
        blocks.QuoteExtension().extendMarkdown(md)
    else:
        # Text-only mode: drop every block-level construct
        for name in (
            "hashheader",
            "setextheader",
            "code",
            "quote",
            "hr",
            "olist",
            "ulist",
        ):
            del md.parser.blockprocessors[name]

    return pipeline.extend_markdown(md)
def linkify_paragraphs(result):
    """Turn bare URLs and e-mail addresses in parsed text into anchors."""
    linkified = bleach.linkify(
        result["parsed_text"],
        callbacks=settings.MISAGO_BLEACH_CALLBACKS,
        skip_tags=["a", "code", "pre"],
        parse_email=True,
    )
    result["parsed_text"] = linkified
def clean_links(request, result, force_shva=False):
    """Normalize anchors and images in parsed HTML, recording them in result.

    Internal hrefs/srcs are reduced to site-relative paths; external ones
    get an explicit protocol and (for anchors) rel="nofollow noopener".
    """
    host = request.get_host()
    soup = BeautifulSoup(result["parsed_text"], "html5lib")

    for anchor in soup.find_all("a"):
        href = anchor["href"]
        if is_internal_link(href, host):
            href = clean_internal_link(href, host)
            result["internal_links"].append(href)
            href = clean_attachment_link(href, force_shva)
        else:
            result["outgoing_links"].append(clean_link_prefix(href))
            href = assert_link_prefix(href)
            anchor["rel"] = "nofollow noopener"
        anchor["href"] = href

        # Strip protocol from visible link text too
        if anchor.string:
            anchor.string = clean_link_prefix(anchor.string)

    for image in soup.find_all("img"):
        image["alt"] = clean_link_prefix(image["alt"])
        src = image["src"]
        if is_internal_link(src, host):
            src = clean_internal_link(src, host)
            result["images"].append(src)
            src = clean_attachment_link(src, force_shva)
        else:
            result["images"].append(clean_link_prefix(src))
            src = assert_link_prefix(src)
        image["src"] = src

    # str(soup.body) wraps output in <body>...</body>; [6:-7] removes it
    result["parsed_text"] = str(soup.body)[6:-7]
def is_internal_link(link, host):
    """Return True if the link points at this site.

    Site-relative paths ("/foo", but not protocol-relative "//host/foo")
    are always internal; otherwise the link is internal when, after
    stripping any protocol and an optional leading "www.", it starts with
    this site's host (compared case-insensitively).
    """
    if link.startswith("/") and not link.startswith("//"):
        return True

    # BUGFIX: the old code used str.lstrip("www."), which strips the
    # CHARACTER SET {"w", "."} rather than the literal "www." prefix
    # (e.g. "ww2.example.com" would lose its leading "ww"). Strip the
    # exact prefix instead.
    link = clean_link_prefix(link).lower()
    if link.startswith("www."):
        link = link[4:]

    # BUGFIX: the link side was lowercased but the host side was not,
    # so a mixed-case host could never match. Lowercase both.
    host = host.lower()
    if host.startswith("www."):
        host = host[4:]

    return link.startswith(host)
def clean_link_prefix(link):
    """Strip the protocol ("https:"/"http:", case-insensitive) and any
    leading "//" from a link, returning the bare host-and-path form."""
    for scheme in ("https:", "http:"):
        if link.lower().startswith(scheme):
            link = link[len(scheme):]
    if link.startswith("//"):
        link = link[2:]
    return link
def assert_link_prefix(link):
    """Ensure the link carries an explicit protocol, defaulting to http."""
    if link.lower().startswith(("https:", "http:")):
        return link
    if link.startswith("//"):
        return "http:" + link
    return "http://" + link
def clean_internal_link(link, host):
    """Reduce an absolute internal link to a site-relative path."""
    link = clean_link_prefix(link)

    # Compare without the optional "www." on either side
    if link[:4].lower() == "www.":
        link = link[4:]
    if host[:4].lower() == "www.":
        host = host[4:]

    if link.lower().startswith(host):
        link = link[len(host):]

    # Empty remainder means the link pointed at the site root
    return link if link else "/"
def clean_attachment_link(link, force_shva=False):
    """Add or remove the "?shva=1" marker on links to attachment views."""
    try:
        match = resolve(link)
        url_name = ":".join(match.namespaces + [match.url_name])
    except (Http404, ValueError):
        # Not a resolvable local URL; leave it untouched
        return link

    if url_name not in MISAGO_ATTACHMENT_VIEWS:
        return link

    if force_shva:
        return "%s?shva=1" % link
    if link.endswith("?shva=1"):
        return link[: -len("?shva=1")]
    return link
def minify_result(result):
    """Minify parsed text, then drop the html/head/body scaffolding the
    minifier wraps it in."""
    minified = html_minify(result["parsed_text"])
    result["parsed_text"] = strip_html_head_body(minified)
def strip_html_head_body(parsed_text):
    """Trim the "<html><head></head><body>" / "</body></html>" wrapper
    that html_minify puts around the document."""
    head = len("<html><head></head><body>")  # 25 chars
    tail = len("</body></html>")  # 14 chars
    return parsed_text[head:-tail]