parser.py

import bleach
import markdown
from bs4 import BeautifulSoup
from django.http import Http404
from django.urls import resolve
from htmlmin.minify import html_minify
from markdown.extensions.fenced_code import FencedCodeExtension

from ..conf import settings
from .bbcode.code import CodeBlockExtension
from .bbcode.hr import BBCodeHRProcessor
from .bbcode.inline import bold, image, italics, underline, url
from .bbcode.quote import QuoteExtension
from .bbcode.spoiler import SpoilerExtension
from .md.shortimgs import ShortImagesExtension
from .md.strikethrough import StrikethroughExtension
from .mentions import add_mentions
from .pipeline import pipeline

MISAGO_ATTACHMENT_VIEWS = ("misago:attachment", "misago:attachment-thumbnail")


def parse(
    text,
    request,
    poster,
    allow_mentions=True,
    allow_links=True,
    allow_images=True,
    allow_blocks=True,
    force_shva=False,
    minify=True,
):
    """
    Message parser

    Utility for flavours to call

    Breaks text into paragraphs, supports code, spoiler and quote blocks,
    headers, lists, images, spoilers, text styles

    Returns dict object
    """
    md = md_factory(
        allow_links=allow_links, allow_images=allow_images, allow_blocks=allow_blocks
    )

    parsing_result = {
        "original_text": text,
        "parsed_text": "",
        "markdown": md,
        "mentions": [],
        "images": [],
        "internal_links": [],
        "outgoing_links": [],
    }

    # Parse text
    parsed_text = md.convert(text)

    # Clean and store parsed text
    parsing_result["parsed_text"] = parsed_text.strip()

    if allow_links:
        linkify_paragraphs(parsing_result)

    parsing_result = pipeline.process_result(parsing_result)

    if allow_mentions:
        add_mentions(request, parsing_result)

    if allow_links or allow_images:
        clean_links(request, parsing_result, force_shva)

    if minify:
        minify_result(parsing_result)

    return parsing_result
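

# Example usage (a sketch; `request` and `request.user` stand in for whatever
# the calling view already has in scope, and the output values are illustrative):
#
#     result = parse("Hello [b]world[/b], see example.com", request, request.user)
#     result["parsed_text"]     # sanitized, linkified, minified HTML
#     result["outgoing_links"]  # e.g. ["example.com"]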


def md_factory(allow_links=True, allow_images=True, allow_blocks=True):
    """creates and configures markdown object"""
    md = markdown.Markdown(extensions=["markdown.extensions.nl2br"])

    # Remove HTML allowances
    md.preprocessors.deregister("html_block")
    md.inlinePatterns.deregister("html")

    # Remove references
    md.parser.blockprocessors.deregister("reference")
    md.inlinePatterns.deregister("reference")
    md.inlinePatterns.deregister("image_reference")
    md.inlinePatterns.deregister("short_reference")

    # Add [b], [i], [u]
    md.inlinePatterns.register(bold, "bb_b", 55)
    md.inlinePatterns.register(italics, "bb_i", 55)
    md.inlinePatterns.register(underline, "bb_u", 55)

    # Add ~~deleted~~
    strikethrough_md = StrikethroughExtension()
    strikethrough_md.extendMarkdown(md)

    if allow_links:
        # Add [url]
        md.inlinePatterns.register(url(md), "bb_url", 155)
    else:
        # Remove links
        md.inlinePatterns.deregister("link")
        md.inlinePatterns.deregister("autolink")
        md.inlinePatterns.deregister("automail")

    if allow_images:
        # Add [img]
        md.inlinePatterns.register(image(md), "bb_img", 145)
        short_images_md = ShortImagesExtension()
        short_images_md.extendMarkdown(md)
    else:
        # Remove images
        md.inlinePatterns.deregister("image_link")

    if allow_blocks:
        # Add [hr] and [quote] blocks
        md.parser.blockprocessors.register(BBCodeHRProcessor(md.parser), "bb_hr", 45)

        fenced_code = FencedCodeExtension(lang_prefix="")
        fenced_code.extendMarkdown(md)

        code_bbcode = CodeBlockExtension()
        code_bbcode.extendMarkdown(md)

        quote_bbcode = QuoteExtension()
        quote_bbcode.extendMarkdown(md)

        spoiler_bbcode = SpoilerExtension()
        spoiler_bbcode.extendMarkdown(md)
    else:
        # Remove blocks
        md.parser.blockprocessors.deregister("hashheader")
        md.parser.blockprocessors.deregister("setextheader")
        md.parser.blockprocessors.deregister("code")
        md.parser.blockprocessors.deregister("quote")
        md.parser.blockprocessors.deregister("hr")
        md.parser.blockprocessors.deregister("olist")
        md.parser.blockprocessors.deregister("ulist")

    return pipeline.extend_markdown(md)
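

# A minimal sketch of using the factory directly; the flags mirror the
# corresponding parse() arguments:
#
#     md = md_factory(allow_images=False, allow_blocks=False)
#     md.convert("**bold** and [b]bold[/b]")  # inline styles are still rendered
#     md.convert("# Heading")                 # block syntax becomes an ordinary paragraph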


def linkify_paragraphs(result):
    result["parsed_text"] = bleach.linkify(
        result["parsed_text"],
        callbacks=settings.MISAGO_BLEACH_CALLBACKS,
        skip_tags=["a", "code", "pre"],
        parse_email=True,
    )
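

# bleach.linkify turns bare URLs (and, with parse_email=True, e-mail addresses)
# in the already-rendered HTML into anchors, skipping anything inside <a>,
# <code> and <pre>. Roughly:
#
#     "<p>see example.com</p>" -> '<p>see <a href="http://example.com">example.com</a></p>'
#
# The exact anchor attributes depend on the callbacks configured in
# settings.MISAGO_BLEACH_CALLBACKS.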


def clean_links(request, result, force_shva=False):
    host = request.get_host()

    soup = BeautifulSoup(result["parsed_text"], "html5lib")
    for link in soup.find_all("a"):
        if is_internal_link(link["href"], host):
            link["href"] = clean_internal_link(link["href"], host)
            result["internal_links"].append(link["href"])
            link["href"] = clean_attachment_link(link["href"], force_shva)
        else:
            result["outgoing_links"].append(clean_link_prefix(link["href"]))
            link["href"] = assert_link_prefix(link["href"])
            link["rel"] = "external nofollow noopener"
            link["target"] = "_blank"

        if link.string:
            link.string = clean_link_prefix(link.string)

    for img in soup.find_all("img"):
        img["alt"] = clean_link_prefix(img["alt"])
        if is_internal_link(img["src"], host):
            img["src"] = clean_internal_link(img["src"], host)
            result["images"].append(img["src"])
            img["src"] = clean_attachment_link(img["src"], force_shva)
        else:
            result["images"].append(clean_link_prefix(img["src"]))
            img["src"] = assert_link_prefix(img["src"])

    # [6:-7] trims <body></body> wrap
    result["parsed_text"] = str(soup.body)[6:-7]
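

# Sketch of the effect on a single post, assuming the request host is "example.com":
#
#     <a href="https://example.com/t/thread-1/">
#         -> href becomes "/t/thread-1/" and is recorded in result["internal_links"]
#     <a href="https://other-site.com/page">
#         -> gains rel="external nofollow noopener" and target="_blank", while
#            "other-site.com/page" is recorded in result["outgoing_links"]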


def is_internal_link(link, host):
    if link.startswith("/") and not link.startswith("//"):
        return True

    # Compare hosts with the protocol and any literal "www." prefix removed
    link = clean_link_prefix(link).lower()
    link = link[4:] if link.startswith("www.") else link
    host = host.lower()
    host = host[4:] if host.startswith("www.") else host
    return link.startswith(host)
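

# For instance, assuming the forum is served from "example.com":
#
#     is_internal_link("/t/thread-1/", "example.com")                  -> True
#     is_internal_link("https://www.example.com/t/1/", "example.com")  -> True
#     is_internal_link("https://other-site.com/", "example.com")       -> False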


def clean_link_prefix(link):
    if link.lower().startswith("https:"):
        link = link[6:]
    if link.lower().startswith("http:"):
        link = link[5:]
    if link.startswith("//"):
        link = link[2:]
    return link


def assert_link_prefix(link):
    if link.lower().startswith("https:"):
        return link
    if link.lower().startswith("http:"):
        return link
    if link.startswith("//"):
        return "http:%s" % link

    return "http://%s" % link
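

# The two helpers above are roughly inverses of each other:
#
#     clean_link_prefix("https://example.com/a")   -> "example.com/a"
#     clean_link_prefix("//cdn.example.com/x.png") -> "cdn.example.com/x.png"
#     assert_link_prefix("example.com/a")          -> "http://example.com/a"
#     assert_link_prefix("https://example.com/a")  -> "https://example.com/a"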


def clean_internal_link(link, host):
    link = clean_link_prefix(link)

    if link.lower().startswith("www."):
        link = link[4:]
    if host.lower().startswith("www."):
        host = host[4:]

    if link.lower().startswith(host):
        link = link[len(host):]

    return link or "/"
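

# Example of the rewrite, with the host assumed to be "example.com":
#
#     clean_internal_link("https://www.example.com/t/thread-1/", "example.com")
#         -> "/t/thread-1/"
#     clean_internal_link("http://example.com", "example.com")
#         -> "/"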


def clean_attachment_link(link, force_shva=False):
    try:
        resolution = resolve(link)
        if not resolution.namespaces:
            return link
        url_name = ":".join(resolution.namespaces + [resolution.url_name])
    except (Http404, ValueError):
        return link

    if url_name in MISAGO_ATTACHMENT_VIEWS:
        if force_shva:
            link = "%s?shva=1" % link
        elif link.endswith("?shva=1"):
            link = link[:-7]
    return link
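

# Only links that resolve to Misago's attachment views are touched; the "?shva=1"
# marker is appended or dropped depending on force_shva. The attachment path
# below is illustrative, not a guaranteed URL layout:
#
#     clean_attachment_link("/a/image-name/123/", force_shva=True)
#         -> "/a/image-name/123/?shva=1"  (if the path resolves to misago:attachment)
#     clean_attachment_link("/t/thread-1/", force_shva=True)
#         -> "/t/thread-1/"               (non-attachment links pass through unchanged)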


def minify_result(result):
    result["parsed_text"] = html_minify(result["parsed_text"])
    result["parsed_text"] = strip_html_head_body(result["parsed_text"])


def strip_html_head_body(parsed_text):
    # [25:-14] trims <html><head></head><body> and </body></html>
    return parsed_text[25:-14]
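

# For example, html_minify wraps the fragment in a full document, which the
# slice above removes again:
#
#     strip_html_head_body("<html><head></head><body><p>Hi!</p></body></html>")
#         -> "<p>Hi!</p>"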