parser.py

import markdown
import bleach
from bs4 import BeautifulSoup
from htmlmin.minify import html_minify

from .bbcode import blocks, inline
from .md.shortimgs import ShortImagesExtension
from .mentions import add_mentions
from .pipeline import pipeline


__all__ = ['parse']


def parse(text, request, poster, allow_mentions=True, allow_links=True,
          allow_images=True, allow_blocks=True, minify=True):
    """
    Message parser

    Utility for flavours to call

    Breaks text into paragraphs; supports code, spoiler and quote blocks,
    headers, lists, images and text styles

    Returns dict object
    """
    md = md_factory(
        allow_links=allow_links,
        allow_images=allow_images,
        allow_blocks=allow_blocks,
    )

    parsing_result = {
        'original_text': text,
        'parsed_text': '',
        'markdown': md,
        'mentions': [],
        'images': [],
        'outgoing_links': [],
        'inside_links': [],
    }

    # Parse text
    parsed_text = md.convert(text)

    # Clean and store parsed text
    parsing_result['parsed_text'] = parsed_text.strip()

    if allow_links:
        linkify_paragraphs(parsing_result)

    parsing_result = pipeline.process_result(parsing_result)

    if allow_mentions:
        add_mentions(request, parsing_result)

    if allow_links or allow_images:
        clean_links(request, parsing_result)

    if minify:
        minify_result(parsing_result)

    return parsing_result
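

# A minimal usage sketch (assumption only: called from a Django view where the
# poster is request.user, and `post` is a model with `original` and `parsed`
# fields; those names are hypothetical, not part of this module):
#
#     result = parse(post.original, request, request.user)
#     post.parsed = result['parsed_text']
#     mentioned_users = result['mentions']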


def md_factory(allow_links=True, allow_images=True, allow_blocks=True):
    """
    Create and configure markdown object
    """
    md = markdown.Markdown(safe_mode='escape',
                           extensions=['nl2br'])

    # Remove reference definitions and patterns
    del md.preprocessors['reference']
    del md.inlinePatterns['reference']
    del md.inlinePatterns['image_reference']
    del md.inlinePatterns['short_reference']

    # Add [b], [i], [u]
    md.inlinePatterns.add('bb_b', inline.bold, '<strong')
    md.inlinePatterns.add('bb_i', inline.italics, '<emphasis')
    md.inlinePatterns.add('bb_u', inline.underline, '<emphasis2')

    if allow_links:
        # Add [url]
        pass
    else:
        # Remove links
        del md.inlinePatterns['link']
        del md.inlinePatterns['autolink']
        del md.inlinePatterns['automail']

    if allow_images:
        # Add [img]
        short_images_md = ShortImagesExtension()
        short_images_md.extendMarkdown(md)
    else:
        # Remove images
        del md.inlinePatterns['image_link']

    if allow_blocks:
        # Add [hr], [quote], [spoiler], [list] and [code] blocks
        md.parser.blockprocessors.add(
            'bb_hr', blocks.BBCodeHRProcessor(md.parser), '>hr')
    else:
        # Remove blocks
        del md.parser.blockprocessors['hashheader']
        del md.parser.blockprocessors['setextheader']
        del md.parser.blockprocessors['code']
        del md.parser.blockprocessors['quote']
        del md.parser.blockprocessors['hr']
        del md.parser.blockprocessors['olist']
        del md.parser.blockprocessors['ulist']

    return pipeline.extend_markdown(md)
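

# Standalone sketch (assumption: the factory works outside parse() as well,
# since pipeline.extend_markdown returns the configured converter):
#
#     md = md_factory(allow_images=False)
#     html = md.convert('Hello, **world**!')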


def linkify_paragraphs(result):
    # Turn bare URLs and e-mail addresses into <a> elements,
    # leaving the contents of <pre> blocks untouched
    result['parsed_text'] = bleach.linkify(
        result['parsed_text'], skip_pre=True, parse_email=True)


def clean_links(request, result):
    site_address = '%s://%s' % (request.scheme, request.get_host())

    soup = BeautifulSoup(result['parsed_text'], 'html5lib')

    for link in soup.find_all('a'):
        if link['href'].lower().startswith(site_address):
            result['inside_links'].append(link['href'])
            # Rewrite internal links as relative ones
            if link['href'].lower() == site_address:
                link['href'] = '/'
            else:
                link['href'] = link['href'].lower()[len(site_address):]
        else:
            result['outgoing_links'].append(link['href'])

        # Strip the protocol from visible link text;
        # link.string is None when the anchor wraps other markup
        if link.string and link.string.startswith('http://'):
            link.string = link.string[7:].strip()
        if link.string and link.string.startswith('https://'):
            link.string = link.string[8:].strip()

    for img in soup.find_all('img'):
        result['images'].append(img['src'])
        # Rewrite internal image sources as relative ones
        if img['src'].lower().startswith(site_address):
            if img['src'].lower() == site_address:
                img['src'] = '/'
            else:
                img['src'] = img['src'].lower()[len(site_address):]

        # Strip the protocol from alt texts that are bare URLs
        if img['alt'].startswith('http://'):
            img['alt'] = img['alt'][7:].strip()
        if img['alt'].startswith('https://'):
            img['alt'] = img['alt'][8:].strip()

    if result['outgoing_links'] or result['inside_links'] or result['images']:
        result['parsed_text'] = soup.prettify()
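

# Illustration of the rewrite (hypothetical site address http://example.com):
#
#   <a href="http://example.com/t/test/123/">http://example.com/t/test/123/</a>
#
# is recorded in result['inside_links'] and becomes
#
#   <a href="/t/test/123/">example.com/t/test/123/</a>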


def minify_result(result):
    # [25:-14] trims <html><head></head><body> and </body></html>
    result['parsed_text'] = html_minify(result['parsed_text'].encode('utf-8'))
    result['parsed_text'] = result['parsed_text'][25:-14]