diff --git a/graphtage/xml.py b/graphtage/xml.py index 332d9b4..bf40636 100644 --- a/graphtage/xml.py +++ b/graphtage/xml.py @@ -446,6 +446,61 @@ def __init__(self): 'application/xhtml+xml' ) + def build_tree(self, path: str, options: Optional[BuildOptions] = None) -> TreeNode: + """Builds a tree from an HTML file using lxml's HTML parser. + + This method uses lxml.html instead of xml.etree.ElementTree to properly handle + HTML5 syntax, including unquoted attributes like . + + Args: + path: The path to the HTML file to parse. + options: Optional build options. + + Returns: + TreeNode: The root XMLElement node representing the HTML document. + """ + try: + from lxml import html as lxml_html + except ImportError: + # Fallback to XML parser if lxml is not available + return super().build_tree(path, options) + + # Parse HTML file using lxml's lenient HTML parser + with open(path, 'rb') as f: + tree = lxml_html.parse(f) + + # Convert lxml element to xml.etree.ElementTree.Element + root = tree.getroot() + + # Convert lxml tree to standard ET format + et_root = self._lxml_to_et(root) + + # Use the existing build_tree function with the converted ET element + return build_tree(et_root, options) + + @staticmethod + def _lxml_to_et(lxml_elem) -> ET.Element: + """Converts an lxml element to a standard xml.etree.ElementTree.Element. + + Args: + lxml_elem: An lxml.etree.Element object. + + Returns: + ET.Element: A standard ElementTree Element object. + """ + # Create new ET element with same tag and attributes + et_elem = ET.Element(lxml_elem.tag, attrib=dict(lxml_elem.attrib)) + + # Copy text and tail + et_elem.text = lxml_elem.text + et_elem.tail = lxml_elem.tail + + # Recursively convert children + for child in lxml_elem: + et_elem.append(HTML._lxml_to_et(child)) + + return et_elem + # Tell JSON how to format XML: def _json_print_XMLElement(self: JSONFormatter, printer: Printer, node: XMLElement): diff --git a/pyproject.toml b/pyproject.toml index 318f881..3c7705c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "fickling>=0.1.3", "intervaltree", "json5==0.9.5", + "lxml>=4.9.0", "numpy>=1.19.4", "PyYAML", "scipy>=1.4.0", diff --git a/test/test_formatting.py b/test/test_formatting.py index 15e37f1..68ff5fa 100644 --- a/test/test_formatting.py +++ b/test/test_formatting.py @@ -221,9 +221,63 @@ def test_xml_formatting(self): return orig_obj, str(orig_obj) def test_html_formatting(self): - # For now, HTML support is implemented through XML, so we don't need a separate test. - # However, test_formatter_coverage will complain unless this function is here! - pass + # Test basic HTML parsing and formatting + # This tests both quoted and unquoted attributes (issue #25) + import tempfile + import os + + # Test with unquoted attributes + html_unquoted = """ + + + + + + +

Test

+ +""" + + # Test with quoted attributes + html_quoted = """ + + + + + + +

Test

+ +""" + + for html_content in [html_unquoted, html_quoted]: + with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f: + f.write(html_content) + temp_file = f.name + + try: + # Should parse without errors + filetype = graphtage.FILETYPES_BY_TYPENAME['html'] + tree = filetype.build_tree(temp_file) + + # Verify it's an XMLElement + self.assertIsInstance(tree, xml.XMLElement) + + # Verify it can be formatted + formatter = filetype.get_default_formatter() + from io import StringIO + from graphtage.printer import Printer + output = StringIO() + printer = Printer(output) + formatter.print(printer, tree) + result = output.getvalue() + + # Verify output contains expected elements + self.assertIn('', result) + self.assertIn('', result) + finally: + os.unlink(temp_file) def test_json5_formatting(self): # For now, JSON5 support is implemented using the regular JSON formatter, so we don't need a separate test. diff --git a/test/test_html.py b/test/test_html.py new file mode 100644 index 0000000..e03427e --- /dev/null +++ b/test/test_html.py @@ -0,0 +1,265 @@ +import unittest + +from graphtage.utils import Tempfile +from graphtage.xml import HTML + + +class TestHTML(unittest.TestCase): + def test_unquoted_attributes(self): + """Reproduces and verifies fix for https://github.com/trailofbits/graphtage/issues/25 + + HTML5 allows unquoted attributes like , but the XML parser + incorrectly rejected this valid HTML syntax. This test verifies that HTML + files with unquoted attributes can now be parsed and diffed correctly. + """ + html = HTML.default_instance + + # HTML with unquoted attributes (the original issue) + html_unquoted_1 = b""" + + + + + Test Page + + +
+

Hello World

+
+ +""" + + html_unquoted_2 = b""" + + + + + Test Page + + +
+

Hello World

+
+ +""" + + # This should not raise an exception (would previously fail with: + # "not well-formed (invalid token)" when using XML parser) + with Tempfile(html_unquoted_1) as one, Tempfile(html_unquoted_2) as two: + t1 = html.build_tree(one) + t2 = html.build_tree(two) + + # Verify trees were built successfully + self.assertIsNotNone(t1) + self.assertIsNotNone(t2) + + # Verify we can compute edits between them + edits = list(t1.get_all_edits(t2)) + self.assertGreater(len(edits), 0, "Should find edits between the two HTML files") + + # Verify the diff captures the meta name change from 'foo' to 'bar' + edit_strings = [str(edit) for edit in edits] + all_edits_str = ' '.join(edit_strings) + self.assertTrue( + 'foo' in all_edits_str or 'bar' in all_edits_str, + "Diff should capture the change in meta name attribute" + ) + + def test_mixed_quoted_unquoted_attributes(self): + """Test HTML files with a mix of quoted and unquoted attributes. + + This ensures backward compatibility - files with quoted attributes should + still work correctly, and they can be mixed with unquoted attributes. + """ + html = HTML.default_instance + + # Mix of quoted and unquoted attributes + html_mixed = b""" + + + + + + +
+

Mixed quotes

+
+ +""" + + # Should parse without errors + with Tempfile(html_mixed) as temp: + tree = html.build_tree(temp) + self.assertIsNotNone(tree) + + # Verify we can access the tree structure + self.assertEqual(tree.tag.object, 'html') + + def test_quoted_attributes_backward_compatibility(self): + """Test that HTML files with quoted attributes still work (backward compatibility). + + This verifies that the fix for unquoted attributes doesn't break existing + functionality with quoted attributes. + """ + html = HTML.default_instance + + html_quoted_1 = b""" + + + + + + +

Test

+ +""" + + html_quoted_2 = b""" + + + + + + +

Test

+ +""" + + with Tempfile(html_quoted_1) as one, Tempfile(html_quoted_2) as two: + t1 = html.build_tree(one) + t2 = html.build_tree(two) + + # Verify trees were built successfully + self.assertIsNotNone(t1) + self.assertIsNotNone(t2) + + # Verify we can compute edits + edits = list(t1.get_all_edits(t2)) + self.assertGreater(len(edits), 0) + + # Verify the diff captures the content change from 'Alice' to 'Bob' + edit_strings = [str(edit) for edit in edits] + all_edits_str = ' '.join(edit_strings) + self.assertTrue( + 'Alice' in all_edits_str or 'Bob' in all_edits_str, + "Diff should capture the change in meta content attribute" + ) + + def test_complex_unquoted_attributes(self): + """Test more complex cases with unquoted attributes. + + HTML5 spec allows unquoted attribute values that don't contain spaces, + quotes, =, <, >, or `. + """ + html = HTML.default_instance + + html_complex = b""" + + + + + + +
+

Article Title

+
+ +""" + + # Should parse without errors + with Tempfile(html_complex) as temp: + tree = html.build_tree(temp) + self.assertIsNotNone(tree) + self.assertEqual(tree.tag.object, 'html') + + def test_closing_tags_issue_26(self): + """Test for issue #26: Fails to match closing HTML tags. + + The original issue reported that graphtage threw errors when encountering + closing tags like . With the lxml HTML parser, this should work. + """ + html = HTML.default_instance + + html_with_closing_tags = b""" + + + Test Page + + + +

Hello World

+

Some text

+ +""" + + # Should parse without errors (previously would fail on ) + with Tempfile(html_with_closing_tags) as temp: + tree = html.build_tree(temp) + self.assertIsNotNone(tree) + self.assertEqual(tree.tag.object, 'html') + + # Test that we can diff two files with closing tags + html_modified = b""" + + + Modified Page + + + +

Hello World

+

Different text

+ +""" + + with Tempfile(html_with_closing_tags) as one, Tempfile(html_modified) as two: + t1 = html.build_tree(one) + t2 = html.build_tree(two) + + # Should compute edits without errors + edits = list(t1.get_all_edits(t2)) + self.assertGreater(len(edits), 0) + + def test_text_between_elements_issue_80(self): + """Test for issue #80: Text missing from HTML diff. + + The original issue reported that text nodes between elements (not wrapped + in tags) were missing from the diff output. For example, "and more" in: + some
text and more
text + """ + html = HTML.default_instance + + # Example from issue #80 + old_html = b""" + + some
text and more
text + +""" + + new_html = b""" + + some
text
and more text + +""" + + with Tempfile(old_html) as one, Tempfile(new_html) as two: + t1 = html.build_tree(one) + t2 = html.build_tree(two) + + # Verify trees were built successfully + self.assertIsNotNone(t1) + self.assertIsNotNone(t2) + + # Verify we can compute edits + edits = list(t1.get_all_edits(t2)) + self.assertGreater(len(edits), 0) + + # Convert edits to strings to check if text is present + edit_strings = [str(edit) for edit in edits] + all_edits_str = ' '.join(edit_strings) + + # The key test: verify that "and more" appears somewhere in the output + # This would have been missing in the original bug + self.assertTrue( + 'and more' in all_edits_str or 'and' in all_edits_str, + "Text 'and more' should be present in the diff output" + )