|
| 1 | +from typing import Optional, Type |
| 2 | + |
| 3 | +import pytest |
1 | 4 | from bs4 import BeautifulSoup |
2 | 5 |
|
3 | | -from unstructured.documents.ontology import Form, FormFieldValue, Image, OntologyElement, Page |
| 6 | +from unstructured.documents.ontology import ( |
| 7 | + Checkbox, |
| 8 | + Form, |
| 9 | + FormFieldValue, |
| 10 | + Image, |
| 11 | + OntologyElement, |
| 12 | + Page, |
| 13 | + RadioButton, |
| 14 | +) |
4 | 15 | from unstructured.partition.html.html_utils import indent_html |
5 | 16 | from unstructured.partition.html.transformations import RECURSION_LIMIT, parse_html_to_ontology |
6 | 17 |
|
@@ -330,7 +341,7 @@ def test_when_unknown_element_keyword_only_attributes_are_preserved_during_mappi |
330 | 341 | <div class="Page"> |
331 | 342 | <form class="Form"> |
332 | 343 | <label class="FormField" for="option1"> |
333 | | - <input class="Checkbox" type="radio" name="option1" value="2" checked /> |
| 344 | + <input class="RadioButton" type="radio" name="option1" value="2" checked /> |
334 | 345 | <span class="UncategorizedText"> |
335 | 346 | Option 1 (Checked) |
336 | 347 | </span> |
@@ -713,3 +724,30 @@ def test_uncategorizedtest_has_image_and_no_text(): |
713 | 724 | element = ontology.children[0].children[0] |
714 | 725 | assert type(element) is Image |
715 | 726 | assert element.css_class_name == "Image" |
| 727 | + |
| 728 | + |
@pytest.mark.parametrize(
    ("input_type", "expected_class"),
    [
        ("checkbox", Checkbox),
        ("radio", RadioButton),
        ("text", FormFieldValue),  # a type value that is neither checkbox nor radio
        (None, FormFieldValue),  # the type attribute is omitted entirely
    ],
)
def test_input_tag_type_is_mapped_to_correct_ontology_class(
    input_type: Optional[str], expected_class: Type[OntologyElement]
) -> None:
    """Check that an <input> tag without a class is mapped according to its ``type``.

    Each case wraps one ``<input>`` tag in a ``Page`` div, parses the markup with
    ``parse_html_to_ontology`` and verifies that the only child of the result is
    an instance of ``expected_class`` which still reports ``input`` as its HTML
    tag and the expected class name as its CSS class.

    Args:
        input_type: Value for the ``type`` attribute, or ``None`` to leave the
            attribute out of the tag altogether.
        expected_class: Ontology class the parsed ``<input>`` should map to.
    """
    # Build the tag with or without the type attribute; the rest is constant.
    if input_type is None:
        input_tag = '<input name="field" />'
    else:
        input_tag = f'<input type="{input_type}" name="field" />'
    markup = f'<div class="Page">{input_tag}</div>'

    ontology = parse_html_to_ontology(markup)
    parsed_children = ontology.children
    assert len(parsed_children) == 1
    mapped = parsed_children[0]

    # The ontology class must match, and the original tag name must survive.
    assert isinstance(mapped, expected_class)
    assert mapped.html_tag_name == "input"
    assert mapped.css_class_name == expected_class.__name__
0 commit comments