parser.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. from __future__ import unicode_literals
  2. import markdown
  3. import bleach
  4. from bs4 import BeautifulSoup
  5. from django.core.urlresolvers import resolve
  6. from django.http import Http404
  7. from django.utils import six
  8. from htmlmin.minify import html_minify
  9. from .bbcode import blocks, inline
  10. from .md.shortimgs import ShortImagesExtension
  11. from .md.striketrough import StriketroughExtension
  12. from .mentions import add_mentions
  13. from .pipeline import pipeline
  14. __all__ = ['parse']
  # Namespaced URL names that clean_attachment_link() compares a resolved link
  # against before adding or stripping its "?shva=1" suffix.
  MISAGO_ATTACHMENT_VIEWS = (
      'misago:attachment',
      'misago:attachment-thumbnail',
  )
  def parse(text, request, poster, allow_mentions=True, allow_links=True,
            allow_images=True, allow_blocks=True, force_shva=False, minify=True):
      """
      Message parser

      Entry point for the different flavours of markup to call. Converts
      ``text`` with a Markdown instance built by ``md_factory`` and then runs
      the optional post-processing steps in a fixed order: linkify, parsing
      pipeline, mentions, link/image clean-up, minification.

      ``poster`` is accepted for interface compatibility; this function does
      not read it.

      Returns dict with keys ``original_text``, ``parsed_text``, ``markdown``,
      ``mentions``, ``images``, ``outgoing_links`` and ``inside_links``.
      """
      md = md_factory(
          allow_links=allow_links,
          allow_images=allow_images,
          allow_blocks=allow_blocks,
      )

      # Convert first, then seed the result with the stripped HTML
      html = md.convert(text).strip()

      result = {
          'original_text': text,
          'parsed_text': html,
          'markdown': md,
          'mentions': [],
          'images': [],
          'outgoing_links': [],
          'inside_links': []
      }

      if allow_links:
          linkify_paragraphs(result)

      # The pipeline may hand back a different object, so rebind the name
      result = pipeline.process_result(result)

      if allow_mentions:
          add_mentions(request, result)

      # Images carry URLs too, so either flag requires the link clean-up
      if allow_links or allow_images:
          clean_links(request, result, force_shva)

      if minify:
          minify_result(result)

      return result
  def md_factory(allow_links=True, allow_images=True, allow_blocks=True):
      """
      Create and configure markdown object

      Starts from ``markdown.Markdown`` in escaping safe mode with ``nl2br``,
      removes reference-style syntax, registers the BBCode and strikethrough
      additions, and then enables or removes links, images and block-level
      syntax according to the flags. The configured instance is passed through
      ``pipeline.extend_markdown`` and its return value is returned.
      """
      md = markdown.Markdown(safe_mode='escape', extensions=['nl2br'])

      # Remove references
      del md.preprocessors['reference']
      for pattern_name in ('reference', 'image_reference', 'short_reference'):
          del md.inlinePatterns[pattern_name]

      # Add [b], [i], [u]
      bbcode_styles = (
          ('bb_b', inline.bold, '<strong'),
          ('bb_i', inline.italics, '<emphasis'),
          ('bb_u', inline.underline, '<emphasis2'),
      )
      for pattern_name, pattern, location in bbcode_styles:
          md.inlinePatterns.add(pattern_name, pattern, location)

      # Add ~~deleted~~
      StriketroughExtension().extendMarkdown(md)

      if not allow_links:
          # Remove links
          for pattern_name in ('link', 'autolink', 'automail'):
              del md.inlinePatterns[pattern_name]

      if allow_images:
          # Add [img]
          ShortImagesExtension().extendMarkdown(md)
      else:
          # Remove images
          del md.inlinePatterns['image_link']

      if allow_blocks:
          # Add [hr] and [quote] blocks
          md.parser.blockprocessors.add(
              'bb_hr', blocks.BBCodeHRProcessor(md.parser), '>hr')
          blocks.QuoteExtension().extendMarkdown(md)
      else:
          # Remove blocks
          removed_processors = (
              'hashheader',
              'setextheader',
              'code',
              'quote',
              'hr',
              'olist',
              'ulist',
          )
          for processor_name in removed_processors:
              del md.parser.blockprocessors[processor_name]

      return pipeline.extend_markdown(md)
  def linkify_paragraphs(result):
      """
      Run ``bleach.linkify`` over ``result['parsed_text']`` in place

      ``<pre>`` contents are skipped and e-mail addresses are linkified too.
      """
      linkified = bleach.linkify(
          result['parsed_text'],
          skip_pre=True,
          parse_email=True,
      )
      result['parsed_text'] = linkified
  def clean_links(request, result, force_shva=False):
      """
      Normalise links and images in the parsed HTML and record them on result

      Every ``<a href>`` is classified against ``request.get_host()``:
      internal links are reduced to a host-less form, appended to
      ``result['inside_links']`` and passed through ``clean_attachment_link``;
      all others are appended to ``result['outgoing_links']``. Link texts and
      image ``alt`` texts lose their scheme prefix. Every image ``src`` is
      appended to ``result['images']`` (host-less when internal).

      ``result['parsed_text']`` is replaced with the re-serialised markup.
      Tags missing the relevant attribute are left untouched instead of
      raising ``KeyError``.
      """
      host = request.get_host()

      soup = BeautifulSoup(result['parsed_text'], 'html5lib')

      for link in soup.find_all('a'):
          href = link.get('href')
          if href is None:
              # Anchor without a target: nothing to classify or rewrite
              continue

          if is_internal_link(href, host):
              href = clean_internal_link(href, host)
              # Record the link before any "?shva=1" suffix is applied
              result['inside_links'].append(href)
              link['href'] = clean_attachment_link(href, force_shva)
          else:
              result['outgoing_links'].append(href)

          # Only rewrite the text of anchors with a single string child
          if link.string:
              link.string = clean_link_prefix(link.string)

      for img in soup.find_all('img'):
          alt = img.get('alt')
          if alt is not None:
              img['alt'] = clean_link_prefix(alt)

          src = img.get('src')
          if src is None:
              continue

          if is_internal_link(src, host):
              src = clean_internal_link(src, host)
              # Record the source before any "?shva=1" suffix is applied
              result['images'].append(src)
              img['src'] = clean_attachment_link(src, force_shva)
          else:
              result['images'].append(src)

      # [6:-7] trims <body></body> wrap
      result['parsed_text'] = six.text_type(soup.body)[6:-7]
  def is_internal_link(link, host):
      """
      Tell whether ``link`` points at ``host``

      Root-relative paths (``/foo`` but not protocol-relative ``//foo``) are
      always internal. Otherwise the scheme and a leading ``www.`` are
      ignored on both sides, the comparison is case-insensitive, and the host
      has to be followed by the end of the link or by ``/``, ``?`` or ``#``,
      so ``example.com.evil.org`` is not mistaken for ``example.com``.
      """
      if link.startswith('/') and not link.startswith('//'):
          return True

      return _path_after_host(link, host) is not None


  def clean_link_prefix(link):
      """
      Strip leading ``https:``, ``http:`` (any case) and ``//`` from ``link``
      """
      if link.lower().startswith('https:'):
          link = link[6:]
      if link.lower().startswith('http:'):
          link = link[5:]
      if link.startswith('//'):
          link = link[2:]
      return link


  def _strip_www(address):
      """Return ``address`` without a leading ``www.`` (case-insensitive)."""
      # str.lstrip('www.') would strip any run of "w" and "." characters,
      # mangling hosts such as "wiki.example.com", so slice the prefix instead
      if address.lower().startswith('www.'):
          return address[4:]
      return address


  def _path_after_host(link, host):
      """Return what follows ``host`` in ``link``, or None if hosts differ."""
      link = _strip_www(clean_link_prefix(link))
      host = _strip_www(host)

      if not link.lower().startswith(host.lower()):
          return None

      remainder = link[len(host):]
      # The host must end here; otherwise it is only a prefix of a longer name
      if remainder[:1] not in ('', '/', '?', '#'):
          return None
      return remainder


  def clean_internal_link(link, host):
      """
      Reduce an internal ``link`` to its host-less form

      Removes the scheme, a leading ``www.`` and the (case-insensitively
      matched) ``host``. Links that are not on ``host`` only lose the scheme
      and ``www.``. Returns ``'/'`` when nothing is left.
      """
      remainder = _path_after_host(link, host)
      if remainder is None:
          remainder = _strip_www(clean_link_prefix(link))
      return remainder or '/'
  def clean_attachment_link(link, force_shva=False):
      """
      Add or strip the ``?shva=1`` suffix on links to attachment views

      ``link`` is resolved against the URLconf. Links that do not resolve, or
      that resolve to a view not listed in ``MISAGO_ATTACHMENT_VIEWS``, are
      returned unchanged. For attachment views, ``?shva=1`` is appended when
      ``force_shva`` is true and a trailing ``?shva=1`` is removed otherwise.
      """
      try:
          resolution = resolve(link)
      except (Http404, ValueError):
          return link

      # view_name is the namespaced URL name. Joining namespaces with url_name
      # by hand raises TypeError for unnamed patterns, whose url_name is None.
      if resolution.view_name not in MISAGO_ATTACHMENT_VIEWS:
          return link

      if force_shva:
          return '{}?shva=1'.format(link)
      if link.endswith('?shva=1'):
          return link[:-7]
      return link
  def minify_result(result):
      """
      Minify ``result['parsed_text']`` in place with ``html_minify``

      The text is passed as UTF-8 bytes; a fixed-length document wrapper is
      then sliced off both ends of what ``html_minify`` returns.
      """
      # html_minify output is expected to be wrapped in these; only their
      # lengths (25 and 14) are used for trimming
      wrapper_head = '<html><head></head><body>'
      wrapper_tail = '</body></html>'

      minified = html_minify(result['parsed_text'].encode('utf-8'))
      result['parsed_text'] = minified[len(wrapper_head):-len(wrapper_tail)]