221V
/
Misago


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
							from __future__ import unicode_literals

import warnings

import bleach
import markdown
from bs4 import BeautifulSoup
from htmlmin.minify import html_minify
from markdown.extensions.fenced_code import FencedCodeExtension

from django.http import Http404
from django.urls import resolve
from django.utils import six

from .bbcode import blocks, inline
from .md.shortimgs import ShortImagesExtension
from .md.striketrough import StriketroughExtension
from .mentions import add_mentions
from .pipeline import pipeline


MISAGO_ATTACHMENT_VIEWS = ('misago:attachment', 'misago:attachment-thumbnail')


def parse(
        text,
        request,
        poster,
        allow_mentions=True,
        allow_links=True,
        allow_images=True,
        allow_blocks=True,
        force_shva=False,
        minify=True
):
    """
    Message parser

    Utility for flavours to call

    Breaks text into paragraphs, supports code, spoiler and quote blocks,
    headers, lists, images, spoilers, text styles

    Returns dict object
    """
    md = md_factory(
        allow_links=allow_links,
        allow_images=allow_images,
        allow_blocks=allow_blocks,
    )

    parsing_result = {
        'original_text': text,
        'parsed_text': '',
        'markdown': md,
        'mentions': [],
        'images': [],
        'outgoing_links': [],
        'inside_links': [],
    }

    # Parse text
    parsed_text = md.convert(text)

    # Clean and store parsed text
    parsing_result['parsed_text'] = parsed_text.strip()

    if allow_links:
        linkify_paragraphs(parsing_result)

    parsing_result = pipeline.process_result(parsing_result)

    if allow_mentions:
        add_mentions(request, parsing_result)

    if allow_links or allow_images:
        clean_links(request, parsing_result, force_shva)

    if minify:
        minify_result(parsing_result)
    return parsing_result


def md_factory(allow_links=True, allow_images=True, allow_blocks=True):
    """
    Create and configure markdown object
    """
    md = markdown.Markdown(safe_mode='escape', extensions=['nl2br'])

    # Remove references
    del md.preprocessors['reference']
    del md.inlinePatterns['reference']
    del md.inlinePatterns['image_reference']
    del md.inlinePatterns['short_reference']

    # Add [b], [i], [u]
    md.inlinePatterns.add('bb_b', inline.bold, '<strong')
    md.inlinePatterns.add('bb_i', inline.italics, '<emphasis')
    md.inlinePatterns.add('bb_u', inline.underline, '<emphasis2')

    # Add ~~deleted~~
    striketrough_md = StriketroughExtension()
    striketrough_md.extendMarkdown(md)

    if not allow_links:
        # Remove links
        del md.inlinePatterns['link']
        del md.inlinePatterns['autolink']
        del md.inlinePatterns['automail']

    if allow_images:
        # Add [img]
        short_images_md = ShortImagesExtension()
        short_images_md.extendMarkdown(md)
    else:
        # Remove images
        del md.inlinePatterns['image_link']

    if allow_blocks:
        # Add [hr] and [quote] blocks
        md.parser.blockprocessors.add('bb_hr', blocks.BBCodeHRProcessor(md.parser), '>hr')

        fenced_code = FencedCodeExtension()
        fenced_code.extendMarkdown(md, None)

        code_bbcode = blocks.CodeBlockExtension()
        code_bbcode.extendMarkdown(md)

        quote_bbcode = blocks.QuoteExtension()
        quote_bbcode.extendMarkdown(md)
    else:
        # Remove blocks
        del md.parser.blockprocessors['hashheader']
        del md.parser.blockprocessors['setextheader']
        del md.parser.blockprocessors['code']
        del md.parser.blockprocessors['quote']
        del md.parser.blockprocessors['hr']
        del md.parser.blockprocessors['olist']
        del md.parser.blockprocessors['ulist']

    return pipeline.extend_markdown(md)


def linkify_paragraphs(result):
    result['parsed_text'] = bleach.linkify(result['parsed_text'], skip_pre=True, parse_email=True)

    # dirty fix for
    if '<code>' in result['parsed_text'] and '<a' in result['parsed_text']:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            soup = BeautifulSoup(result['parsed_text'], 'html5lib')
            for link in soup.select('code > a'):
                link.replace_with(BeautifulSoup(link.string, 'html.parser'))
            # [6:-7] trims <body></body> wrap
            result['parsed_text'] = six.text_type(soup.body)[6:-7]


def clean_links(request, result, force_shva=False):
    host = request.get_host()

    soup = BeautifulSoup(result['parsed_text'], 'html5lib')
    for link in soup.find_all('a'):
        if is_internal_link(link['href'], host):
            link['href'] = clean_internal_link(link['href'], host)
            result['inside_links'].append(link['href'])
            link['href'] = clean_attachment_link(link['href'], force_shva)
        else:
            result['outgoing_links'].append(link['href'])

        if link.string:
            link.string = clean_link_prefix(link.string)

    for img in soup.find_all('img'):
        img['alt'] = clean_link_prefix(img['alt'])
        if is_internal_link(img['src'], host):
            img['src'] = clean_internal_link(img['src'], host)
            result['images'].append(img['src'])
            img['src'] = clean_attachment_link(img['src'], force_shva)
        else:
            result['images'].append(img['src'])

    # [6:-7] trims <body></body> wrap
    result['parsed_text'] = six.text_type(soup.body)[6:-7]


def is_internal_link(link, host):
    if link.startswith('/') and not link.startswith('//'):
        return True

    link = clean_link_prefix(link).lstrip('www.').lower()
    return link.lower().startswith(host.lstrip('www.'))


def clean_link_prefix(link):
    if link.lower().startswith('https:'):
        link = link[6:]
    if link.lower().startswith('http:'):
        link = link[5:]
    if link.startswith('//'):
        link = link[2:]
    return link


def clean_internal_link(link, host):
    link = clean_link_prefix(link)

    if link.lower().startswith('www.'):
        link = link[4:]
    if host.lower().startswith('www.'):
        host = host[4:]

    if link.lower().startswith(host):
        link = link[len(host):]

    return link or '/'


def clean_attachment_link(link, force_shva=False):
    try:
        resolution = resolve(link)
        url_name = ':'.join(resolution.namespaces + [resolution.url_name])
    except (Http404, ValueError):
        return link

    if url_name in MISAGO_ATTACHMENT_VIEWS:
        if force_shva:
            link = '{}?shva=1'.format(link)
        elif link.endswith('?shva=1'):
            link = link[:-7]
    return link


def minify_result(result):
    # [25:-14] trims <html><head></head><body> and </body></html>
    result['parsed_text'] = html_minify(result['parsed_text'].encode('utf-8'))
    result['parsed_text'] = result['parsed_text'][25:-14]