diff --git a/graphtage/xml.py b/graphtage/xml.py
index 332d9b4..bf40636 100644
--- a/graphtage/xml.py
+++ b/graphtage/xml.py
@@ -446,6 +446,61 @@ def __init__(self):
'application/xhtml+xml'
)
+ def build_tree(self, path: str, options: Optional[BuildOptions] = None) -> TreeNode:
+     """Build a tree from an HTML file using lxml's lenient HTML parser.
+
+     ``lxml.html`` is used instead of ``xml.etree.ElementTree`` so that HTML5
+     syntax that is not well-formed XML, such as unquoted attribute values
+     (``<meta name=foo>``), can be parsed.
+
+     Args:
+         path: The path to the HTML file to parse.
+         options: Optional build options.
+
+     Returns:
+         TreeNode: The root XMLElement node representing the HTML document.
+
+     Raises:
+         ET.ParseError: If lxml parsed the file but produced no root element
+             (for example, an empty file).
+     """
+     try:
+         from lxml import html as lxml_html
+     except ImportError:
+         # lxml is not importable: defer to the inherited implementation.
+         return super().build_tree(path, options)
+
+     # Binary mode leaves character-encoding detection to the parser.
+     with open(path, 'rb') as f:
+         root = lxml_html.parse(f).getroot()
+
+     if root is None:
+         # getroot() yields None when the parser found no element at all.
+         # Failing here gives a clear message instead of an AttributeError
+         # from inside the conversion helper.
+         # NOTE(review): ET.ParseError is chosen to match what the stdlib
+         # XML parser raises for an empty document -- confirm that callers
+         # of the inherited build_tree handle this type.
+         raise ET.ParseError('no element found in %r' % (path,))
+
+     # The module-level build_tree works on xml.etree elements, so the lxml
+     # tree is converted first.
+     return build_tree(self._lxml_to_et(root), options)
+
+
+ @staticmethod
+ def _lxml_to_et(lxml_elem) -> ET.Element:
+     """Convert an lxml element tree to xml.etree.ElementTree elements.
+
+     Tags, attributes, text and tail text are copied for every element.
+     Nodes whose ``tag`` is not a string (lxml represents comments and
+     processing instructions this way) are dropped, but their tail text is
+     kept by appending it to the preceding kept sibling's tail, or to the
+     parent's text when there is no such sibling.
+
+     Args:
+         lxml_elem: The lxml element at the root of the tree to convert.
+
+     Returns:
+         ET.Element: A standard ElementTree element mirroring ``lxml_elem``.
+     """
+     def _copy_node(src) -> ET.Element:
+         # Shallow copy: children are attached by the loop below.
+         dst = ET.Element(src.tag, attrib=dict(src.attrib))
+         dst.text = src.text
+         dst.tail = src.tail
+         return dst
+
+     et_root = _copy_node(lxml_elem)
+     # An explicit stack instead of recursion, so deeply nested documents
+     # cannot hit Python's recursion limit.
+     pending = [(lxml_elem, et_root)]
+     while pending:
+         src_parent, dst_parent = pending.pop()
+         last_kept = None
+         for child in src_parent:
+             if not isinstance(child.tag, str):
+                 # Comment / processing instruction: skip the node itself
+                 # but keep the text that follows it in the document.
+                 if child.tail:
+                     if last_kept is None:
+                         dst_parent.text = (dst_parent.text or '') + child.tail
+                     else:
+                         last_kept.tail = (last_kept.tail or '') + child.tail
+                 continue
+             last_kept = _copy_node(child)
+             dst_parent.append(last_kept)
+             pending.append((child, last_kept))
+
+     return et_root
+
+
# Tell JSON how to format XML:
def _json_print_XMLElement(self: JSONFormatter, printer: Printer, node: XMLElement):
diff --git a/pyproject.toml b/pyproject.toml
index 318f881..3c7705c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,7 @@ dependencies = [
"fickling>=0.1.3",
"intervaltree",
"json5==0.9.5",
+ "lxml>=4.9.0",
"numpy>=1.19.4",
"PyYAML",
"scipy>=1.4.0",
diff --git a/test/test_formatting.py b/test/test_formatting.py
index 15e37f1..68ff5fa 100644
--- a/test/test_formatting.py
+++ b/test/test_formatting.py
@@ -221,9 +221,63 @@ def test_xml_formatting(self):
return orig_obj, str(orig_obj)
def test_html_formatting(self):
- # For now, HTML support is implemented through XML, so we don't need a separate test.
- # However, test_formatter_coverage will complain unless this function is here!
- pass
+ # Test basic HTML parsing and formatting
+ # This tests both quoted and unquoted attributes (issue #25)
+ # Each fixture is written to a temporary file, parsed through the 'html'
+ # filetype, rendered with that filetype's default formatter, and the file
+ # is removed again in the ``finally`` clause.
+ # NOTE(review): as shown here both fixture literals are identical and hold
+ # only the text "Test" with no markup -- the tags appear to have been lost;
+ # confirm against the intended fixtures.
+ import tempfile
+ import os
+
+ # Test with unquoted attributes
+ html_unquoted = """
+
+
+
+
+
+
+
Test
+
+"""
+
+ # Test with quoted attributes
+ html_quoted = """
+
+
+
+
+
+
+
Test
+
+"""
+
+ for html_content in [html_unquoted, html_quoted]:
+ # delete=False keeps the file on disk after the ``with`` closes it, so it
+ # can be reopened by path; it is unlinked manually below.
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
+ f.write(html_content)
+ temp_file = f.name
+
+ try:
+ # Should parse without errors
+ filetype = graphtage.FILETYPES_BY_TYPENAME['html']
+ tree = filetype.build_tree(temp_file)
+
+ # Verify it's an XMLElement
+ self.assertIsInstance(tree, xml.XMLElement)
+
+ # Verify it can be formatted
+ formatter = filetype.get_default_formatter()
+ from io import StringIO
+ from graphtage.printer import Printer
+ output = StringIO()
+ printer = Printer(output)
+ formatter.print(printer, tree)
+ result = output.getvalue()
+
+ # Verify output contains expected elements
+ # NOTE(review): assertIn('', ...) is true for every string, so these two
+ # checks cannot fail; the expected substrings look lost -- restore them.
+ self.assertIn('', result)
+ self.assertIn('', result)
+ finally:
+ os.unlink(temp_file)
def test_json5_formatting(self):
# For now, JSON5 support is implemented using the regular JSON formatter, so we don't need a separate test.
diff --git a/test/test_html.py b/test/test_html.py
new file mode 100644
index 0000000..e03427e
--- /dev/null
+++ b/test/test_html.py
@@ -0,0 +1,265 @@
+import unittest
+
+from graphtage.utils import Tempfile
+from graphtage.xml import HTML
+
+
+class TestHTML(unittest.TestCase):
+ def test_unquoted_attributes(self):
+     """Reproduces and verifies fix for https://github.com/trailofbits/graphtage/issues/25
+
+     HTML5 allows unquoted attributes like ``<meta name=foo>``, but the XML
+     parser rejected this valid HTML syntax. This test verifies that HTML
+     files with unquoted attributes can be parsed and diffed.
+     """
+     html = HTML.default_instance
+
+     # Two minimal documents written for this test. They differ only in the
+     # unquoted value of the meta ``name`` attribute ('foo' vs 'bar').
+     html_unquoted_1 = b"""<!DOCTYPE html>
+<html>
+<head>
+<meta name=foo>
+<title>Test Page</title>
+</head>
+<body>
+</body>
+</html>
+"""
+
+     html_unquoted_2 = b"""<!DOCTYPE html>
+<html>
+<head>
+<meta name=bar>
+<title>Test Page</title>
+</head>
+<body>
+</body>
+</html>
+"""
+
+     # This should not raise an exception (would previously fail with:
+     # "not well-formed (invalid token)" when using XML parser)
+     with Tempfile(html_unquoted_1) as one, Tempfile(html_unquoted_2) as two:
+         t1 = html.build_tree(one)
+         t2 = html.build_tree(two)
+
+         # Verify trees were built successfully
+         self.assertIsNotNone(t1)
+         self.assertIsNotNone(t2)
+
+         # Verify we can compute edits between them
+         edits = list(t1.get_all_edits(t2))
+         self.assertGreater(len(edits), 0, "Should find edits between the two HTML files")
+
+         # Verify the diff captures the meta name change from 'foo' to 'bar'
+         all_edits_str = ' '.join(str(edit) for edit in edits)
+         self.assertTrue(
+             'foo' in all_edits_str or 'bar' in all_edits_str,
+             "Diff should capture the change in meta name attribute"
+         )
+
+ def test_mixed_quoted_unquoted_attributes(self):
+ """Test HTML files with a mix of quoted and unquoted attributes.
+
+ This ensures backward compatibility - files with quoted attributes should
+ still work correctly, and they can be mixed with unquoted attributes.
+ """
+ html = HTML.default_instance
+
+ # Mix of quoted and unquoted attributes
+ # NOTE(review): as shown here the literal holds only the text
+ # "Mixed quotes" and no tags or attributes -- the markup appears to have
+ # been lost; confirm the fixture really mixes quoting styles.
+ html_mixed = b"""
+
+
+
+
+
+
+
+
Mixed quotes
+
+
+"""
+
+ # Should parse without errors
+ with Tempfile(html_mixed) as temp:
+ tree = html.build_tree(temp)
+ self.assertIsNotNone(tree)
+
+ # Verify we can access the tree structure
+ # The root element of the built tree is expected to be <html>.
+ self.assertEqual(tree.tag.object, 'html')
+
+ def test_quoted_attributes_backward_compatibility(self):
+     """Test that HTML files with quoted attributes still work (backward compatibility).
+
+     This verifies that the fix for unquoted attributes doesn't break existing
+     functionality with quoted attributes.
+     """
+     html = HTML.default_instance
+
+     # Two minimal documents written for this test. They differ only in the
+     # quoted ``content`` value of the meta element ('Alice' vs 'Bob'), which
+     # is what the final assertion looks for.
+     html_quoted_1 = b"""<!DOCTYPE html>
+<html>
+<head>
+<meta name="author" content="Alice">
+</head>
+<body>
+<p>Test</p>
+</body>
+</html>
+"""
+
+     html_quoted_2 = b"""<!DOCTYPE html>
+<html>
+<head>
+<meta name="author" content="Bob">
+</head>
+<body>
+<p>Test</p>
+</body>
+</html>
+"""
+
+     with Tempfile(html_quoted_1) as one, Tempfile(html_quoted_2) as two:
+         t1 = html.build_tree(one)
+         t2 = html.build_tree(two)
+
+         # Verify trees were built successfully
+         self.assertIsNotNone(t1)
+         self.assertIsNotNone(t2)
+
+         # Verify we can compute edits
+         edits = list(t1.get_all_edits(t2))
+         self.assertGreater(len(edits), 0)
+
+         # Verify the diff captures the content change from 'Alice' to 'Bob'
+         all_edits_str = ' '.join(str(edit) for edit in edits)
+         self.assertTrue(
+             'Alice' in all_edits_str or 'Bob' in all_edits_str,
+             "Diff should capture the change in meta content attribute"
+         )
+
+ def test_complex_unquoted_attributes(self):
+ """Test more complex cases with unquoted attributes.
+
+ HTML5 spec allows unquoted attribute values that don't contain spaces,
+ quotes, =, <, >, or `.
+ """
+ html = HTML.default_instance
+
+ # NOTE(review): as shown here the literal holds only the text
+ # "Article Title" and no tags or attributes -- the markup appears to have
+ # been lost; confirm the fixture contains the intended unquoted values.
+ html_complex = b"""
+
+
+
+
+
+
+
+
Article Title
+
+
+"""
+
+ # Should parse without errors
+ with Tempfile(html_complex) as temp:
+ tree = html.build_tree(temp)
+ self.assertIsNotNone(tree)
+ # The root element of the built tree is expected to be <html>.
+ self.assertEqual(tree.tag.object, 'html')
+
+ def test_closing_tags_issue_26(self):
+ """Test for issue #26: Fails to match closing HTML tags.
+
+ The original issue reported that graphtage threw errors when encountering
+ closing tags. With the lxml HTML parser, this should work.
+ """
+ html = HTML.default_instance
+
+ # NOTE(review): as shown here neither fixture below contains any tags; the
+ # two literals differ only in their visible text ("Test Page"/"Some text"
+ # vs "Modified Page"/"Different text"). The markup appears to have been
+ # lost -- confirm the fixtures really contain closing tags.
+ html_with_closing_tags = b"""
+
+
+ Test Page
+
+
+
+
Hello World
+
Some text
+
+"""
+
+ # Should parse without errors (previously would fail on the closing tags)
+ with Tempfile(html_with_closing_tags) as temp:
+ tree = html.build_tree(temp)
+ self.assertIsNotNone(tree)
+ self.assertEqual(tree.tag.object, 'html')
+
+ # Test that we can diff two files with closing tags
+ html_modified = b"""
+
+
+ Modified Page
+
+
+
+
Hello World
+
Different text
+
+"""
+
+ with Tempfile(html_with_closing_tags) as one, Tempfile(html_modified) as two:
+ t1 = html.build_tree(one)
+ t2 = html.build_tree(two)
+
+ # Should compute edits without errors
+ edits = list(t1.get_all_edits(t2))
+ self.assertGreater(len(edits), 0)
+
+ def test_text_between_elements_issue_80(self):
+ """Test for issue #80: Text missing from HTML diff.
+
+ The original issue reported that text nodes between elements (not wrapped
+ in tags) were missing from the diff output. For example, "and more" in:
some
text and more
text
+ """
+ html = HTML.default_instance
+
+ # Example from issue #80
+ # NOTE(review): as shown here both literals contain plain text only; the
+ # element markup that would separate the text nodes appears to have been
+ # lost -- confirm against the issue's example.
+ old_html = b"""
+
+ some
text and more
text
+
+"""
+
+ new_html = b"""
+
+ some
text
and more text
+
+"""
+
+ with Tempfile(old_html) as one, Tempfile(new_html) as two:
+ t1 = html.build_tree(one)
+ t2 = html.build_tree(two)
+
+ # Verify trees were built successfully
+ self.assertIsNotNone(t1)
+ self.assertIsNotNone(t2)
+
+ # Verify we can compute edits
+ edits = list(t1.get_all_edits(t2))
+ self.assertGreater(len(edits), 0)
+
+ # Convert edits to strings to check if text is present
+ edit_strings = [str(edit) for edit in edits]
+ all_edits_str = ' '.join(edit_strings)
+
+ # The key test: verify that "and more" appears somewhere in the output
+ # This would have been missing in the original bug
+ # NOTE(review): any string containing 'and more' also contains 'and', so
+ # the first disjunct is redundant and this only checks for 'and'.
+ self.assertTrue(
+ 'and more' in all_edits_str or 'and' in all_edits_str,
+ "Text 'and more' should be present in the diff output"
+ )