parsers.py 1.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  1. from HTMLParser import HTMLParser
  2. from urlparse import urlparse
  3. from misago.utils.strings import random_string
  4. class RemoveHTMLParser(HTMLParser):
  5. def __init__(self):
  6. HTMLParser.__init__(self)
  7. self.clean_text = ''
  8. self.lookback = []
  9. def handle_entityref(self, name):
  10. if name == 'gt':
  11. self.clean_text += '>'
  12. if name == 'lt':
  13. self.clean_text += '<'
  14. def handle_starttag(self, tag, attrs):
  15. if tag == 'img':
  16. self.handle_startendtag(tag, attrs)
  17. else:
  18. self.lookback.append(tag)
  19. def handle_endtag(self, tag):
  20. try:
  21. if self.lookback[-1] == tag:
  22. self.lookback.pop()
  23. except IndexError:
  24. pass
  25. def handle_startendtag(self, tag, attrs):
  26. try:
  27. if tag == 'img':
  28. for attr in attrs:
  29. if attr[0] == 'alt':
  30. self.clean_text += attr[1]
  31. break
  32. except KeyError:
  33. pass
  34. def handle_data(self, data):
  35. # String does not repeat itself
  36. if self.clean_text[-len(data):] != data:
  37. # String is not "QUOTE"
  38. try:
  39. if self.lookback[-1] in ('strong', 'em'):
  40. self.clean_text += data
  41. elif not (data == 'Quote' and self.lookback[-1] == 'h3' and self.lookback[-2] == 'blockquote'):
  42. self.clean_text += data
  43. except IndexError:
  44. self.clean_text += data