parsers.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. from HTMLParser import HTMLParser
  2. from urlparse import urlparse
  3. from django.conf import settings
  4. from misago.utils.strings import random_string
  5. class RemoveHTMLParser(HTMLParser):
  6. def __init__(self):
  7. HTMLParser.__init__(self)
  8. self.clean_text = ''
  9. self.lookback = []
  10. def handle_entityref(self, name):
  11. if name == 'gt':
  12. self.clean_text += '>'
  13. if name == 'lt':
  14. self.clean_text += '<'
  15. def handle_starttag(self, tag, attrs):
  16. if tag == 'img':
  17. self.handle_startendtag(tag, attrs)
  18. else:
  19. self.lookback.append(tag)
  20. def handle_endtag(self, tag):
  21. try:
  22. if self.lookback[-1] == tag:
  23. self.lookback.pop()
  24. except IndexError:
  25. pass
  26. def handle_startendtag(self, tag, attrs):
  27. try:
  28. if tag == 'img':
  29. for attr in attrs:
  30. if attr[0] == 'alt':
  31. self.clean_text += attr[1]
  32. break
  33. except KeyError:
  34. pass
  35. def handle_data(self, data):
  36. # String does not repeat itself
  37. if self.clean_text[-len(data):] != data:
  38. # String is not "QUOTE"
  39. try:
  40. if self.lookback[-1] in ('strong', 'em'):
  41. self.clean_text += data
  42. elif not (data == 'Quote' and self.lookback[-1] == 'h3' and self.lookback[-2] == 'blockquote'):
  43. self.clean_text += data
  44. except IndexError:
  45. self.clean_text += data