trailofbits · pbottine · Nov 24, 2025 · Nov 24, 2025 · Nov 25, 2025 · ESultanik
@@ -446,6 +446,61 @@ def __init__(self):
             'application/xhtml+xml'
         )
 
+    def build_tree(self, path: str, options: Optional[BuildOptions] = None) -> TreeNode:
+        """Builds a tree from an HTML file using lxml's HTML parser.
+
+        This method uses lxml.html instead of xml.etree.ElementTree to properly handle
+        HTML5 syntax, including unquoted attributes like <meta name=foo>.
+
+        Args:
+            path: The path to the HTML file to parse.
+            options: Optional build options.
+
+        Returns:
+            TreeNode: The root XMLElement node representing the HTML document.
+        """
+        try:
+            from lxml import html as lxml_html
+        except ImportError:
+            # Fallback to XML parser if lxml is not available
+            return super().build_tree(path, options)
+
+        # Parse HTML file using lxml's lenient HTML parser
+        with open(path, 'rb') as f:
+            tree = lxml_html.parse(f)
+
+        # Convert lxml element to xml.etree.ElementTree.Element
+        root = tree.getroot()
+
+        # Convert lxml tree to standard ET format
+        et_root = self._lxml_to_et(root)
+
+        # Use the existing build_tree function with the converted ET element
+        return build_tree(et_root, options)
+
+    @staticmethod
+    def _lxml_to_et(lxml_elem) -> ET.Element:
+        """Converts an lxml element to a standard xml.etree.ElementTree.Element.
+
+        Args:
+            lxml_elem: An lxml.etree.Element object.
+
+        Returns:
+            ET.Element: A standard ElementTree Element object.
+        """
+        # Create new ET element with same tag and attributes
+        et_elem = ET.Element(lxml_elem.tag, attrib=dict(lxml_elem.attrib))
+
+        # Copy text and tail
+        et_elem.text = lxml_elem.text
+        et_elem.tail = lxml_elem.tail
+
+        # Recursively convert children
+        for child in lxml_elem:
+            et_elem.append(HTML._lxml_to_et(child))
+
+        return et_elem
+
 
 # Tell JSON how to format XML:
 def _json_print_XMLElement(self: JSONFormatter, printer: Printer, node: XMLElement):

@@ -30,6 +30,7 @@ dependencies = [
     "fickling>=0.1.3",
     "intervaltree",
     "json5==0.9.5",
+    "lxml>=4.9.0",
     "numpy>=1.19.4",
     "PyYAML",
     "scipy>=1.4.0",

@@ -221,9 +221,63 @@ def test_xml_formatting(self):
         return orig_obj, str(orig_obj)
 
     def test_html_formatting(self):
-        # For now, HTML support is implemented through XML, so we don't need a separate test.
-        # However, test_formatter_coverage will complain unless this function is here!
-        pass
+        # Test basic HTML parsing and formatting
+        # This tests both quoted and unquoted attributes (issue #25)
+        import tempfile
+        import os
+
+        # Test with unquoted attributes
+        html_unquoted = """<!DOCTYPE html>
+<html>
+<head>
+    <meta name=foo content=bar>
+    <meta charset=utf-8>
+</head>
+<body>
+    <h1>Test</h1>
+</body>
+</html>"""
+
+        # Test with quoted attributes
+        html_quoted = """<!DOCTYPE html>
+<html>
+<head>
+    <meta name="foo" content="bar">
+    <meta charset="utf-8">
+</head>
+<body>
+    <h1>Test</h1>
+</body>
+</html>"""
+
+        for html_content in [html_unquoted, html_quoted]:
+            with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as f:
+                f.write(html_content)
+                temp_file = f.name
+
+            try:
+                # Should parse without errors
+                filetype = graphtage.FILETYPES_BY_TYPENAME['html']
+                tree = filetype.build_tree(temp_file)
+
+                # Verify it's an XMLElement
+                self.assertIsInstance(tree, xml.XMLElement)
+
+                # Verify it can be formatted
+                formatter = filetype.get_default_formatter()
+                from io import StringIO
+                from graphtage.printer import Printer
+                output = StringIO()
+                printer = Printer(output)
+                formatter.print(printer, tree)
+                result = output.getvalue()
+
+                # Verify output contains expected elements
+                self.assertIn('<html>', result)
+                self.assertIn('<meta', result)
+                self.assertIn('</html>', result)
+            finally:
+                os.unlink(temp_file)
 
     def test_json5_formatting(self):
         # For now, JSON5 support is implemented using the regular JSON formatter, so we don't need a separate test.