Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions graphtage/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,61 @@ def __init__(self):
'application/xhtml+xml'
)

def build_tree(self, path: str, options: Optional[BuildOptions] = None) -> TreeNode:
    """Parses the HTML file at `path` and builds a tree from it.

    lxml's HTML parser is used rather than xml.etree.ElementTree so that
    HTML5 syntax such as unquoted attributes (``<meta name=foo>``) is
    accepted. When lxml cannot be imported, the parent class's
    implementation is used instead.

    Args:
        path: The path to the HTML file to parse.
        options: Optional build options, forwarded unchanged.

    Returns:
        TreeNode: The tree built from the document's root element.
    """
    try:
        from lxml import html as html_parser
    except ImportError:
        # lxml is unavailable: defer to the inherited (XML-based) parser.
        return super().build_tree(path, options)

    # Read the document as bytes and let lxml's lenient parser handle it.
    with open(path, 'rb') as stream:
        document = html_parser.parse(stream)

    # The module-level build_tree works on standard ElementTree elements,
    # so the lxml root is converted before being handed over.
    converted_root = self._lxml_to_et(document.getroot())
    return build_tree(converted_root, options)

@staticmethod
def _lxml_to_et(lxml_elem) -> ET.Element:
    """Converts an lxml element to a standard xml.etree.ElementTree.Element.

    The tree is walked with an explicit stack instead of recursion, so a very
    deeply nested HTML DOM cannot exhaust Python's call stack (the recursive
    form raised ``RecursionError`` on such input).

    Args:
        lxml_elem: The root element to convert. Any object exposing ``tag``,
            ``attrib``, ``text``, ``tail`` and iteration over its children
            is accepted.

    Returns:
        ET.Element: A new standard ElementTree element holding copies of the
        tag, attributes, text, tail and (in order) all descendants.
    """
    # NOTE(review): lxml reports comments and processing instructions as
    # nodes whose ``tag`` is a factory function rather than a string; they
    # are copied as-is here, exactly as before -- confirm the downstream
    # tree builder copes with them.

    def copy_node(source) -> ET.Element:
        # Copy one node without its children.
        target = ET.Element(source.tag, attrib=dict(source.attrib))
        target.text = source.text
        target.tail = source.tail
        return target

    et_root = copy_node(lxml_elem)

    # Each entry pairs a source node with its already-created copy whose
    # children still have to be filled in.
    pending = [(lxml_elem, et_root)]
    while pending:
        source, target = pending.pop()
        for child in source:
            # Children are appended in iteration order, so sibling order is
            # preserved regardless of the order the stack is drained in.
            child_copy = copy_node(child)
            target.append(child_copy)
            pending.append((child, child_copy))

    return et_root


# Tell JSON how to format XML:
def _json_print_XMLElement(self: JSONFormatter, printer: Printer, node: XMLElement):
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ dependencies = [
"fickling>=0.1.3",
"intervaltree",
"json5==0.9.5",
"lxml>=4.9.0",
"numpy>=1.19.4",
"PyYAML",
"scipy>=1.4.0",
Expand Down
60 changes: 57 additions & 3 deletions test/test_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,9 +221,63 @@ def test_xml_formatting(self):
return orig_obj, str(orig_obj)

def test_html_formatting(self):
    """Checks that HTML with unquoted and quoted attributes parses and formats.

    Each sample document is written to a temporary ``.html`` file, built into
    a tree through the ``'html'`` entry of ``graphtage.FILETYPES_BY_TYPENAME``
    and printed with that filetype's default formatter (issue #25).
    """
    import os
    import tempfile
    from io import StringIO

    from graphtage.printer import Printer

    # Test with unquoted attributes
    html_unquoted = """<!DOCTYPE html>
<html>
<head>
<meta name=foo content=bar>
<meta charset=utf-8>
</head>
<body>
<h1>Test</h1>
</body>
</html>"""

    # Test with quoted attributes
    html_quoted = """<!DOCTYPE html>
<html>
<head>
<meta name="foo" content="bar">
<meta charset="utf-8">
</head>
<body>
<h1>Test</h1>
</body>
</html>"""

    # Looked up once, before any temporary file exists, so a failed lookup
    # cannot leave a file behind.
    filetype = graphtage.FILETYPES_BY_TYPENAME['html']

    for html_content in (html_unquoted, html_quoted):
        # delete=False: the file must outlive the `with` so it can be
        # reopened by path; it is removed in the `finally` below.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.html', delete=False, encoding='utf-8'
        ) as f:
            f.write(html_content)
            temp_file = f.name

        try:
            # Should parse without errors
            tree = filetype.build_tree(temp_file)

            # Verify it's an XMLElement
            self.assertIsInstance(tree, xml.XMLElement)

            # Verify it can be formatted
            formatter = filetype.get_default_formatter()
            output = StringIO()
            printer = Printer(output)
            formatter.print(printer, tree)
            result = output.getvalue()

            # Verify output contains expected elements
            self.assertIn('<html>', result)
            self.assertIn('<meta', result)
            self.assertIn('</html>', result)
        finally:
            os.unlink(temp_file)

def test_json5_formatting(self):
# For now, JSON5 support is implemented using the regular JSON formatter, so we don't need a separate test.
Expand Down
Loading
Loading