parser.py

import bleach
import markdown
from bs4 import BeautifulSoup
from htmlmin.minify import html_minify

from .bbcode import blocks, inline
from .md.shortimgs import ShortImagesExtension
from .pipeline import pipeline


__all__ = ['parse']
def parse(text, request, poster, allow_mentions=True, allow_links=True,
          allow_images=True, allow_blocks=True, minify=True):
    """
    Message parser

    Utility for flavours to call

    Breaks text into paragraphs; supports code, spoiler and quote blocks,
    headers, lists, images and text styles

    Returns dict object
    """
    md = md_factory(
        allow_links=allow_links,
        allow_images=allow_images,
        allow_blocks=allow_blocks,
    )

    parsing_result = {
        'original_text': text,
        'parsed_text': '',
        'markdown': md,
        'mentions': [],
        'images': [],
        'outgoing_links': [],
        'inside_links': [],
    }

    # Parse text
    parsed_text = md.convert(text)

    # Clean and store parsed text
    parsing_result['parsed_text'] = parsed_text.strip()

    if allow_links:
        linkify_paragraphs(parsing_result)

    parsing_result = pipeline.process_result(parsing_result)

    if allow_links or allow_images:
        clean_links(parsing_result, request)

    if minify:
        minify_result(parsing_result)

    return parsing_result
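# Example call (a sketch; assumes a Django request and a message author taken
# from a view, and the exact output depends on the pipeline's extensions):
#
#   result = parse('Hello, [b]world[/b]!', request, request.user)
#   result['parsed_text']     # roughly '<p>Hello, <strong>world</strong>!</p>'
#   result['outgoing_links']  # external urls found in the message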
def md_factory(allow_links=True, allow_images=True, allow_blocks=True):
    """
    Create and configure markdown object
    """
    # safe_mode='escape' renders raw HTML in messages as plain text
    md = markdown.Markdown(safe_mode='escape',
                           extensions=['nl2br'])

    # Remove reference-style links and images
    del md.preprocessors['reference']
    del md.inlinePatterns['reference']
    del md.inlinePatterns['image_reference']
    del md.inlinePatterns['short_reference']

    # Add [b], [i], [u]; '<name' inserts the new pattern before pattern 'name'
    md.inlinePatterns.add('bb_b', inline.bold, '<strong')
    md.inlinePatterns.add('bb_i', inline.italics, '<emphasis')
    md.inlinePatterns.add('bb_u', inline.underline, '<emphasis2')
    if allow_links:
        # Add [url]
        pass
    else:
        # Remove links
        del md.inlinePatterns['link']
        del md.inlinePatterns['autolink']
        del md.inlinePatterns['automail']

    if allow_images:
        # Add [img]
        short_images_md = ShortImagesExtension()
        short_images_md.extendMarkdown(md)
    else:
        # Remove images
        del md.inlinePatterns['image_link']

    if allow_blocks:
        # Add [hr], [quote], [spoiler], [list] and [code] blocks
        md.parser.blockprocessors.add(
            'bb_hr', blocks.BBCodeHRProcessor(md.parser), '>hr')
    else:
        # Remove blocks
        del md.parser.blockprocessors['hashheader']
        del md.parser.blockprocessors['setextheader']
        del md.parser.blockprocessors['code']
        del md.parser.blockprocessors['quote']
        del md.parser.blockprocessors['hr']
        del md.parser.blockprocessors['olist']
        del md.parser.blockprocessors['ulist']

    return pipeline.extend_markdown(md)
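# md_factory can also be used standalone when only conversion is needed
# (a sketch; the output shape depends on the pipeline's markdown extensions):
#
#   md = md_factory(allow_images=False)
#   html = md.convert('Hello, *world*!')  # roughly '<p>Hello, <em>world</em>!</p>'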
def linkify_paragraphs(result):
    # Wrap bare URLs and e-mail addresses in anchors, skipping <pre> blocks
    result['parsed_text'] = bleach.linkify(
        result['parsed_text'], skip_pre=True, parse_email=True)
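# e.g. bleach.linkify('See https://example.com') returns roughly
# 'See <a href="https://example.com" rel="nofollow">https://example.com</a>';
# exact attributes depend on the bleach version and its default callbacks.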
def clean_links(result, request):
    site_address = '%s://%s' % (request.scheme, request.get_host())

    soup = BeautifulSoup(result['parsed_text'], 'html5lib')
    for link in soup.find_all('a'):
        if link['href'].lower().startswith(site_address):
            result['inside_links'].append(link['href'])
            # Rewrite absolute internal addresses as relative ones
            if link['href'].lower() == site_address:
                link['href'] = '/'
            else:
                link['href'] = link['href'].lower()[len(site_address):]
        else:
            result['outgoing_links'].append(link['href'])

        # Strip the protocol prefix from visible link text;
        # link.string is None when the anchor wraps other tags
        if link.string and link.string.startswith('http://'):
            link.string = link.string[7:].strip()
        if link.string and link.string.startswith('https://'):
            link.string = link.string[8:].strip()
    for img in soup.find_all('img'):
        result['images'].append(img['src'])
        # Rewrite absolute internal addresses as relative ones
        if img['src'].lower().startswith(site_address):
            if img['src'].lower() == site_address:
                img['src'] = '/'
            else:
                img['src'] = img['src'].lower()[len(site_address):]

        # Strip the protocol prefix from alt text
        if img['alt'].startswith('http://'):
            img['alt'] = img['alt'][7:].strip()
        if img['alt'].startswith('https://'):
            img['alt'] = img['alt'][8:].strip()

    # Reserialize only if the soup may have been modified
    if result['outgoing_links'] or result['inside_links'] or result['images']:
        result['parsed_text'] = soup.prettify()
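# e.g. with site_address == 'http://example.com', an anchor pointing at
# 'http://example.com/thread/123/' is appended to result['inside_links'] and
# its href is rewritten to the relative '/thread/123/'; anchors to any other
# host land in result['outgoing_links'] unchanged (addresses illustrative).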
def minify_result(result):
    # html_minify wraps its output in a full HTML document;
    # [25:-14] trims <html><head></head><body> and </body></html>
    result['parsed_text'] = html_minify(result['parsed_text'].encode('utf-8'))
    result['parsed_text'] = result['parsed_text'][25:-14]
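# Worked example of the trim, assuming the wrapper named in the comment above:
#
#   '<html><head></head><body><p>Hi</p></body></html>'[25:-14] == '<p>Hi</p>'
#
# since len('<html><head></head><body>') == 25 and len('</body></html>') == 14.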