Skip to content

Commit 5368197

Browse files
authored
feat: map <input> tags by type + add coverage (#4068)
Implements type-aware classification of `<input>` elements in `extract_tag_and_ontology_class_from_tag` (checkbox → `Checkbox`, radio → `RadioButton`, else → `FormFieldValue`) and updates/extends the HTML-to-ontology test suite to validate the new behaviour.
1 parent d24dec5 commit 5368197

File tree

4 files changed

+60
-7
lines changed

4 files changed

+60
-7
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1-
## 0.18.11-dev0
1+
## 0.18.11-dev1
22

33
### Enhancements
44

55
### Features
6+
- **Type-aware `<input>` mapping in HTML transformations** Bare `<input>` elements are now classified by their `type` attribute (checkbox → Checkbox, radio → RadioButton, others → FormFieldValue).
7+
68

79
### Fixes
810
- **Recognize '|' as a delimiter** The CSV parser now recognizes '|' as a delimiter in addition to ',' and ';'.

test_unstructured/partition/html/test_html_to_ontology_parsing.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,17 @@
1+
from typing import Optional, Type
2+
3+
import pytest
14
from bs4 import BeautifulSoup
25

3-
from unstructured.documents.ontology import Form, FormFieldValue, Image, OntologyElement, Page
6+
from unstructured.documents.ontology import (
7+
Checkbox,
8+
Form,
9+
FormFieldValue,
10+
Image,
11+
OntologyElement,
12+
Page,
13+
RadioButton,
14+
)
415
from unstructured.partition.html.html_utils import indent_html
516
from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology
617

@@ -330,7 +341,7 @@ def test_when_unknown_element_keyword_only_attributes_are_preserved_during_mappi
330341
<div class="Page">
331342
<form class="Form">
332343
<label class="FormField" for="option1">
333-
<input class="Checkbox" type="radio" name="option1" value="2" checked />
344+
<input class="RadioButton" type="radio" name="option1" value="2" checked />
334345
<span class="UncategorizedText">
335346
Option 1 (Checked)
336347
</span>
@@ -713,3 +724,30 @@ def test_uncategorizedtest_has_image_and_no_text():
713724
element = ontology.children[0].children[0]
714725
assert type(element) is Image
715726
assert element.css_class_name == "Image"
727+
728+
729+
@pytest.mark.parametrize(
730+
("input_type", "expected_class"),
731+
[
732+
("checkbox", Checkbox),
733+
("radio", RadioButton),
734+
("text", FormFieldValue), # explicit non-specialised type
735+
(None, FormFieldValue), # missing type attribute
736+
],
737+
)
738+
def test_input_tag_type_is_mapped_to_correct_ontology_class(
739+
input_type: Optional[str], expected_class: Type[OntologyElement]
740+
) -> None:
741+
"""Ensure bare <input> tags are classified based on their *type* attribute."""
742+
743+
type_attr = f' type="{input_type}"' if input_type is not None else ""
744+
html_snippet = f'<div class="Page"><input{type_attr} name="field" /></div>'
745+
746+
page = parse_html_to_ontology(html_snippet)
747+
assert len(page.children) == 1
748+
element = page.children[0]
749+
750+
# Validate chosen ontology class and preserved HTML semantics
751+
assert isinstance(element, expected_class)
752+
assert element.html_tag_name == "input"
753+
assert element.css_class_name == expected_class.__name__

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.18.11-dev0" # pragma: no cover
1+
__version__ = "0.18.11-dev1" # pragma: no cover

unstructured/partition/html/transformations.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -423,11 +423,24 @@ def extract_tag_and_ontology_class_from_tag(
423423
element_class = CSS_CLASS_TO_ELEMENT_TYPE_MAP.get(soup.attrs["class"][0])
424424
html_tag = element_class().allowed_tags[0]
425425

426-
# Scenario 3: CSS class incorrect, but HTML tag correct and exclusive in ontology
426+
# Scenario 3: <input> elements, handled explicitly based on their 'type' attribute
427+
if not element_class and soup.name == "input":
428+
# Default a missing 'type' to "" *before* converting: str(None) is the truthy string "None",
# so applying `or ""` after str() never takes effect and would yield "none".
input_type = str(soup.get("type") or "").lower()
429+
if input_type == "checkbox":
430+
element_class = ontology.Checkbox
431+
elif input_type == "radio":
432+
element_class = ontology.RadioButton
433+
else:
434+
# Any other input (including missing type or text/number/etc.) is considered
435+
# a generic form field value.
436+
element_class = ontology.FormFieldValue
437+
html_tag = "input"
438+
439+
# Scenario 4: CSS class incorrect, but HTML tag correct and exclusive in ontology
427440
if not element_class and soup.name in HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP:
428441
html_tag, element_class = soup.name, HTML_TAG_TO_DEFAULT_ELEMENT_TYPE_MAP[soup.name]
429442

430-
# Scenario 4: CSS class incorrect, HTML tag incorrect
443+
# Scenario 5: CSS class incorrect, HTML tag incorrect
431444
# Fallback to default UncategorizedText
432445
if not element_class:
433446
# TODO (Pluto): Sometimes we could infer that from parent type and soup.name
@@ -436,7 +449,7 @@ def extract_tag_and_ontology_class_from_tag(
436449
html_tag = "span"
437450
element_class = ontology.UncategorizedText
438451

439-
# Scenario 5: UncategorizedText has image and no text
452+
# Scenario 6: UncategorizedText has image and no text
440453
# Typically, this happens with a span or div tag with an image inside
441454
if element_class == ontology.UncategorizedText and soup.find("img") and not soup.text.strip():
442455
element_class = ontology.Image

0 commit comments

Comments
 (0)