fix: add empty string case for language metadata (#4062)

shreyanid · web-flow · commit 446826885bd3 · 2025-07-16T21:35:00.000Z
Add an empty string edge case for when the element text field is None or
not a string.

Most of the diff is formatting-only changes produced by `make tidy`.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,12 @@
-## 0.18.9-dev0
+## 0.18.9
 
 ### Enhancements
 
 ### Features
 - **Convert elements to markdown for output** Added function to convert elements to markdown format for easy viewing.
 
 ### Fixes
+- **Language detection nit** Handle empty text
 
 ## 0.18.8
 
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -992,7 +992,6 @@ def test_json_content_type_is_disambiguated_for_ndjson():
 
 
 def test_office_files_when_document_archive_has_non_standard_prefix():
-
     predicted_type = detect_filetype(
         file_path=input_path("file_type/test_document_from_office365.docx")
     )
diff --git a/test_unstructured/metrics/test_evaluate.py b/test_unstructured/metrics/test_evaluate.py
@@ -203,7 +203,6 @@ def test_process_document_returns_the_correct_amount_of_values(
 def test_TextExtractionMetricsCalculator_process_document_returns_the_correct_doctype(
     mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
 ):
-
     output_dir = Path(TESTING_FILE_DIR) / output_dirname
     source_dir = Path(TESTING_FILE_DIR) / source_dirname
     mock_calculate_accuracy = mock_dependencies["mock_calculate_accuracy"]
@@ -241,7 +240,6 @@ def test_TextExtractionMetricsCalculator_process_document_returns_the_correct_do
 def test_TableStructureMetricsCalculator_process_document_returns_the_correct_doctype(
     mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
 ):
-
     output_dir = Path(TESTING_FILE_DIR) / output_dirname
     source_dir = Path(TESTING_FILE_DIR) / source_dirname
     calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
@@ -278,7 +276,6 @@ def test_TableStructureMetricsCalculator_process_document_returns_the_correct_do
 def test_ElementTypeMetricsCalculator_process_document_returns_the_correct_doctype(
     mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
 ):
-
     output_dir = Path(TESTING_FILE_DIR) / output_dirname
     source_dir = Path(TESTING_FILE_DIR) / source_dirname
     calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
diff --git a/test_unstructured/partition/common/test_common.py b/test_unstructured/partition/common/test_common.py
@@ -331,7 +331,6 @@ def test_normalize_layout_element_bulleted_list():
 
 
 class MockRunOutput:
-
     def __init__(self, returncode, stdout, stderr):
         self.returncode = returncode
         self.stdout = stdout
diff --git a/test_unstructured/partition/common/test_metadata.py b/test_unstructured/partition/common/test_metadata.py
@@ -63,7 +63,6 @@ def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathl
 
 
 class Describe_set_element_hierarchy:
-
     def it_applies_default_ruleset(self):
         elements = [
             Title(element_id="0", text="Title0"),
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -1600,7 +1600,6 @@ def _test(result):
 
 
 def test_partition_pdf_with_specified_ocr_agents(mock_ocr_get_instance, mocker):
-
     pdf.partition_pdf(
         filename=example_doc_path("pdf/layout-parser-paper-with-table.pdf"),
         strategy=PartitionStrategy.HI_RES,
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.18.9-dev0"  # pragma: no cover
+__version__ = "0.18.9"  # pragma: no cover
diff --git a/unstructured/metrics/evaluate.py b/unstructured/metrics/evaluate.py
@@ -780,7 +780,6 @@ def _generate_dataframes(self, rows) -> tuple[pd.DataFrame, pd.DataFrame]:
 
 
 class ObjectDetectionPerClassMetricsCalculator(ObjectDetectionMetricsCalculatorBase):
-
     def __post_init__(self):
         super().__post_init__()
         self.per_class_metric_names: list[str] | None = None
diff --git a/unstructured/metrics/table/table_eval.py b/unstructured/metrics/table/table_eval.py
@@ -288,11 +288,13 @@ def process_file(self) -> TableEvaluation:
                 cutoff=self.cutoff,
             )
 
-            table_detection_recall, table_detection_precision, table_detection_f1 = (
-                calculate_table_detection_metrics(
-                    matched_indices=matched_indices,
-                    ground_truth_tables_number=len(ground_truth_table_data),
-                )
+            (
+                table_detection_recall,
+                table_detection_precision,
+                table_detection_f1,
+            ) = calculate_table_detection_metrics(
+                matched_indices=matched_indices,
+                ground_truth_tables_number=len(ground_truth_table_data),
             )
 
             evaluation = TableEvaluation(
diff --git a/unstructured/partition/common/lang.py b/unstructured/partition/common/lang.py
@@ -487,7 +487,7 @@ def apply_lang_metadata(
     if not isinstance(elements, list):
         elements = list(elements)
 
-    full_text = " ".join(e.text for e in elements if hasattr(e, "text"))
+    full_text = " ".join(str(e.text) for e in elements if hasattr(e, "text") and e.text)
     detected_languages = detect_languages(text=full_text, languages=languages)
     if (
         detected_languages is not None
@@ -501,7 +501,8 @@ def apply_lang_metadata(
     else:
         for e in elements:
             if hasattr(e, "text"):
-                e.metadata.languages = detect_languages(e.text)
+                text_value = str(e.text) if e.text is not None else ""
+                e.metadata.languages = detect_languages(text_value)
                 yield e
             else:
                 yield e
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -1235,7 +1235,6 @@ def document_to_element_list(
                 translation_mapping.extend([(layout_element, el) for el in element])
                 continue
             else:
-
                 element.metadata.links = (
                     get_links_in_element(links, layout_element.bbox) if links else []
                 )
diff --git a/unstructured/partition/pdf_image/analysis/bbox_visualisation.py b/unstructured/partition/pdf_image/analysis/bbox_visualisation.py
@@ -535,7 +535,6 @@ def get_element_type_color(self, element_type: str) -> str:
 
 
 class AnalysisDrawer(AnalysisProcessor):
-
     def __init__(
         self,
         filename: Optional[Union[str, Path]],
@@ -561,7 +560,6 @@ def add_drawer(self, drawer: LayoutDrawer):
         self.drawers.append(drawer)
 
     def process(self):
-
         filename_stem = Path(self.filename).stem
         analysis_save_dir = Path(self.save_dir) / "analysis" / filename_stem / "bboxes"
         analysis_save_dir.mkdir(parents=True, exist_ok=True)
diff --git a/unstructured/partition/pdf_image/analysis/layout_dump.py b/unstructured/partition/pdf_image/analysis/layout_dump.py
@@ -104,7 +104,6 @@ def extract_text_regions_info(layout: List[List[TextRegion]]) -> dict:
 
 
 class ExtractedLayoutDumper(LayoutDumper):
-
     layout_source = "pdfminer"
 
     def __init__(self, layout: List[List[TextRegion]]):
@@ -115,7 +114,6 @@ def dump(self) -> dict:
 
 
 class OCRLayoutDumper(LayoutDumper):
-
     layout_source = "ocr"
 
     def __init__(self):
@@ -162,7 +160,6 @@ def _extract_final_element_page_size(element: Element) -> dict:
 
 
 class FinalLayoutDumper(LayoutDumper):
-
     layout_source = "final"
 
     def __init__(self, layout: List[Element]):
diff --git a/unstructured/partition/pdf_image/analysis/processor.py b/unstructured/partition/pdf_image/analysis/processor.py
@@ -4,7 +4,6 @@
 
 
 class AnalysisProcessor(ABC):
-
     def __init__(
         self,
         filename: Union[str, Path],
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -737,7 +737,6 @@ def aggregate_embedded_text_by_block(
 
 
 def get_links_in_element(page_links: list, region: Rectangle) -> list:
-
     links_bboxes = [Rectangle(*link.get("bbox")) for link in page_links]
     results = bboxes1_is_almost_subregion_of_bboxes2(links_bboxes, [region])
     links = [
diff --git a/unstructured/partition/utils/ocr_models/google_vision_ocr.py b/unstructured/partition/utils/ocr_models/google_vision_ocr.py
@@ -58,7 +58,6 @@ def get_layout_from_image(self, image: PILImage.Image) -> TextRegions:
         return regions
 
     def get_layout_elements_from_image(self, image: PILImage.Image) -> LayoutElements:
-
         from unstructured.partition.pdf_image.inference_utils import (
             build_layout_elements_from_ocr_regions,
         )
diff --git a/unstructured/partition/utils/ocr_models/paddle_ocr.py b/unstructured/partition/utils/ocr_models/paddle_ocr.py
@@ -81,7 +81,6 @@ def get_layout_from_image(self, image: PILImage.Image) -> TextRegions:
 
     @requires_dependencies("unstructured_inference")
     def get_layout_elements_from_image(self, image: PILImage.Image) -> LayoutElements:
-
         ocr_regions = self.get_layout_from_image(image)
 
         # NOTE(christine): For paddle, there is no difference in `ocr_layout` and `ocr_text` in
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -108,7 +108,6 @@ def image_to_data_with_character_confidence_filter(
     def hocr_to_dataframe(
         self, hocr: str, character_confidence_threshold: float = 0.0
     ) -> pd.DataFrame:
-
         df_entries = []
 
         if not hocr:
diff --git a/unstructured/partition/utils/sorting.py b/unstructured/partition/utils/sorting.py
@@ -225,7 +225,6 @@ def sort_text_regions(
     bboxes = elements.element_coords
 
     def _bboxes_ok(strict_points: bool):
-
         if np.isnan(bboxes).any():
             trace_logger.detail(  # type: ignore
                 "some or all elements are missing bboxes, skipping sort",

Original file line number	Diff line number	Diff line change
`@@ -992,7 +992,6 @@ def test_json_content_type_is_disambiguated_for_ndjson():`
`992`	`992`
`993`	`993`
`994`	`994`	`def test_office_files_when_document_archive_has_non_standard_prefix():`
`995`		`-`
`996`	`995`	`predicted_type = detect_filetype(`
`997`	`996`	`file_path=input_path("file_type/test_document_from_office365.docx")`
`998`	`997`	`)`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.18.9-dev0" # pragma: no cover`
	`1`	`+__version__ = "0.18.9" # pragma: no cover`