htmlparser.py 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110
  1. import html
  2. from dataclasses import dataclass
  3. import html5lib
  4. SINGLETON_TAGS = (
  5. "area",
  6. "base",
  7. "br",
  8. "col",
  9. "command",
  10. "embed",
  11. "hr",
  12. "img",
  13. "input",
  14. "keygen",
  15. "link",
  16. "meta",
  17. "param",
  18. "source",
  19. "track",
  20. "wbr",
  21. )
  22. class Node:
  23. def __str__(self):
  24. raise NotImplementedError("Subclasses of 'Node' need to implement __str__")
  25. @dataclass
  26. class RootNode(Node):
  27. tag = None
  28. children: list
  29. def __str__(self):
  30. return "".join(str(child) for child in self.children)
  31. @dataclass
  32. class ElementNode(Node):
  33. tag: str
  34. attrs: dict
  35. children: list
  36. def __str__(self):
  37. attrs_padding = " " if self.attrs else ""
  38. attrs = " ".join(self.attrs_str())
  39. if self.tag in SINGLETON_TAGS:
  40. return f"<{self.tag}{attrs_padding}{attrs} />"
  41. children = "".join(str(child) for child in self.children)
  42. return f"<{self.tag}{attrs_padding}{attrs}>{children}</{self.tag}>"
  43. def attrs_str(self):
  44. for name, value in self.attrs.items():
  45. if value is True or not value:
  46. yield html.escape(str(name))
  47. else:
  48. yield (f'{html.escape(str(name))}="{html.escape(str(value))}"')
  49. @dataclass
  50. class TextNode(Node):
  51. text: str
  52. def __str__(self):
  53. return html.escape(self.text)
  54. def parse_html_string(string: str) -> RootNode:
  55. element = html5lib.parse(
  56. string,
  57. namespaceHTMLElements=False,
  58. )
  59. body = element.find("body")
  60. root_node = RootNode(children=[])
  61. if body.text:
  62. root_node.children.append(TextNode(text=body.text))
  63. for child in body:
  64. add_child_node(root_node, child)
  65. return root_node
  66. def add_child_node(parent, element):
  67. node = ElementNode(
  68. tag=element.tag,
  69. attrs=element.attrib,
  70. children=[],
  71. )
  72. if element.text:
  73. node.children.append(TextNode(text=element.text))
  74. parent.children.append(node)
  75. if element.tail:
  76. parent.children.append(TextNode(text=element.tail))
  77. for child in element:
  78. add_child_node(node, child)
  79. def print_html_string(root_node: RootNode) -> str:
  80. return str(root_node)