123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110 |
- import html
- from dataclasses import dataclass
- import html5lib
# Tags that ElementNode serializes in self-closing form (``<tag ... />``),
# i.e. without children and without a closing tag.
SINGLETON_TAGS = (
    "area", "base", "br", "col",
    "command", "embed", "hr", "img",
    "input", "keygen", "link", "meta",
    "param", "source", "track", "wbr",
)
class Node:
    """Base class for the nodes of the HTML tree.

    Subclasses are expected to override ``__str__`` so that ``str(node)``
    yields the node's HTML markup; the base implementation always raises.
    """

    def __str__(self):
        """Raise ``NotImplementedError``: the base class cannot be rendered."""
        message = "Subclasses of 'Node' need to implement __str__"
        raise NotImplementedError(message)
@dataclass
class RootNode(Node):
    """Tag-less container for the top-level nodes of a parsed document."""

    # Left un-annotated on purpose: this stays a plain class attribute and is
    # therefore not a dataclass field / constructor argument.
    tag = None
    children: list

    def __str__(self):
        """Return the markup of every child, concatenated in order."""
        return "".join(map(str, self.children))
@dataclass
class ElementNode(Node):
    """An HTML element: a tag name, its attributes and its child nodes.

    Attributes:
        tag: Element name, e.g. ``"div"``.
        attrs: Mapping of attribute name to value (see ``attrs_str`` for how
            the different kinds of value are rendered).
        children: Child nodes in document order.
    """

    tag: str
    attrs: dict
    children: list

    def __str__(self):
        """Serialize the element to HTML.

        Tags listed in ``SINGLETON_TAGS`` are written in self-closing form and
        their children are not rendered; every other tag is written as an
        opening tag, the rendered children, and a closing tag.
        """
        rendered_attrs = " ".join(self.attrs_str())
        # Decide the padding from what was actually rendered rather than from
        # the dict: attributes that are omitted (value ``False``) must not
        # leave a stray space behind in the opening tag.
        opening = f"{self.tag} {rendered_attrs}" if rendered_attrs else self.tag
        if self.tag in SINGLETON_TAGS:
            return f"<{opening} />"
        children = "".join(map(str, self.children))
        return f"<{opening}>{children}</{self.tag}>"

    def attrs_str(self):
        """Yield each attribute rendered as HTML, with names/values escaped.

        Rendering rules, by value:

        * ``False`` -- the attribute is omitted. A bare attribute name means
          "present/true" in HTML, so emitting it would invert the meaning.
        * ``True``, ``None``, ``""`` and other empty non-numeric values --
          the bare attribute name (boolean-attribute form).
        * anything else, including numeric zero -- ``name="value"``.
        """
        for name, value in self.attrs.items():
            if value is False:
                continue
            escaped_name = html.escape(str(name))
            # Numeric zero is falsy but is a real value (e.g. tabindex=0), so
            # it must not be collapsed into the bare-attribute form.
            is_number = isinstance(value, (int, float))
            if value is True or (not value and not is_number):
                yield escaped_name
            else:
                yield f'{escaped_name}="{html.escape(str(value))}"'
@dataclass
class TextNode(Node):
    """A run of plain text inside the tree."""

    text: str

    def __str__(self):
        """Return the text with HTML special characters escaped."""
        escaped = html.escape(self.text)
        return escaped
def parse_html_string(string: str) -> RootNode:
    """Parse an HTML string into a node tree.

    The string is parsed with ``html5lib`` and only the contents of the
    resulting ``<body>`` element are converted: its leading text becomes a
    ``TextNode`` and each child element is added via ``add_child_node``.

    Args:
        string: The HTML markup to parse.

    Returns:
        A ``RootNode`` holding the converted body contents. It is empty when
        the parsed document has no ``<body>`` element.
    """
    document = html5lib.parse(
        string,
        namespaceHTMLElements=False,
    )
    root_node = RootNode(children=[])

    body = document.find("body")
    if body is None:
        # ``find`` returns None when the tree has no <body> (presumably the
        # case for frameset documents); there is nothing to convert then, and
        # touching ``body.text`` would raise AttributeError.
        return root_node

    if body.text:
        root_node.children.append(TextNode(text=body.text))
    for child in body:
        add_child_node(root_node, child)
    return root_node
def add_child_node(parent, element):
    """Convert an etree element (and its subtree) and append it to ``parent``.

    An ``ElementNode`` is created for ``element`` and appended to
    ``parent.children``, followed by a ``TextNode`` for the element's tail
    text (the text between its end and the next sibling), if any. The
    element's own leading text and its child elements are added to the new
    node, recursively.

    Nodes whose ``tag`` is not a string -- comments and processing
    instructions in an ElementTree -- are not elements and are skipped; only
    their tail text is kept. Without this they would be serialized with the
    repr of a function as their tag name.

    Args:
        parent: Node with a ``children`` list to append to.
        element: ElementTree-style element (``tag``, ``attrib``, ``text``,
            ``tail``, iterable over its children).
    """
    if not isinstance(element.tag, str):
        # Comment / processing instruction: drop the node itself, but keep the
        # text that follows it so no document text is lost.
        if element.tail:
            parent.children.append(TextNode(text=element.tail))
        return

    node = ElementNode(
        tag=element.tag,
        # Copy so the node does not share (and cannot mutate) the parser's
        # attribute dict.
        attrs=dict(element.attrib),
        children=[],
    )
    if element.text:
        node.children.append(TextNode(text=element.text))
    parent.children.append(node)

    # The tail belongs to the parent: it is the text following this element.
    if element.tail:
        parent.children.append(TextNode(text=element.tail))

    for child in element:
        add_child_node(node, child)
def print_html_string(root_node: RootNode) -> str:
    """Return the HTML markup for ``root_node``.

    Despite the name nothing is printed: the result of ``str(root_node)`` is
    returned to the caller.
    """
    markup = str(root_node)
    return markup
|