Skip to content

Commit 4468268

Browse files
authored
fix: add empty string case for language metadata (#4062)
Add an empty string edge case for when the element text field is None or not a string. Most of the diff is `make tidy`.
1 parent c7c3e3c commit 4468268

File tree

19 files changed

+13
-29
lines changed

19 files changed

+13
-29
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
## 0.18.9-dev0
1+
## 0.18.9
22

33
### Enhancements
44

55
### Features
66
- **Convert elements to markdown for output** Added function to convert elements to markdown format for easy viewing.
77

88
### Fixes
9+
- **Language detection nit** Handle empty text
910

1011
## 0.18.8
1112

test_unstructured/file_utils/test_filetype.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -992,7 +992,6 @@ def test_json_content_type_is_disambiguated_for_ndjson():
992992

993993

994994
def test_office_files_when_document_archive_has_non_standard_prefix():
995-
996995
predicted_type = detect_filetype(
997996
file_path=input_path("file_type/test_document_from_office365.docx")
998997
)

test_unstructured/metrics/test_evaluate.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,6 @@ def test_process_document_returns_the_correct_amount_of_values(
203203
def test_TextExtractionMetricsCalculator_process_document_returns_the_correct_doctype(
204204
mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
205205
):
206-
207206
output_dir = Path(TESTING_FILE_DIR) / output_dirname
208207
source_dir = Path(TESTING_FILE_DIR) / source_dirname
209208
mock_calculate_accuracy = mock_dependencies["mock_calculate_accuracy"]
@@ -241,7 +240,6 @@ def test_TextExtractionMetricsCalculator_process_document_returns_the_correct_do
241240
def test_TableStructureMetricsCalculator_process_document_returns_the_correct_doctype(
242241
mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
243242
):
244-
245243
output_dir = Path(TESTING_FILE_DIR) / output_dirname
246244
source_dir = Path(TESTING_FILE_DIR) / source_dirname
247245
calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)
@@ -278,7 +276,6 @@ def test_TableStructureMetricsCalculator_process_document_returns_the_correct_do
278276
def test_ElementTypeMetricsCalculator_process_document_returns_the_correct_doctype(
279277
mock_dependencies, calculator_class, output_dirname, source_dirname, path, kwargs
280278
):
281-
282279
output_dir = Path(TESTING_FILE_DIR) / output_dirname
283280
source_dir = Path(TESTING_FILE_DIR) / source_dirname
284281
calculator = calculator_class(documents_dir=output_dir, ground_truths_dir=source_dir, **kwargs)

test_unstructured/partition/common/test_common.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,6 @@ def test_normalize_layout_element_bulleted_list():
331331

332332

333333
class MockRunOutput:
334-
335334
def __init__(self, returncode, stdout, stderr):
336335
self.returncode = returncode
337336
self.stdout = stdout

test_unstructured/partition/common/test_metadata.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ def but_it_returns_None_when_there_is_no_file_at_that_path(self, tmp_path: pathl
6363

6464

6565
class Describe_set_element_hierarchy:
66-
6766
def it_applies_default_ruleset(self):
6867
elements = [
6968
Title(element_id="0", text="Title0"),

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1600,7 +1600,6 @@ def _test(result):
16001600

16011601

16021602
def test_partition_pdf_with_specified_ocr_agents(mock_ocr_get_instance, mocker):
1603-
16041603
pdf.partition_pdf(
16051604
filename=example_doc_path("pdf/layout-parser-paper-with-table.pdf"),
16061605
strategy=PartitionStrategy.HI_RES,

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.18.9-dev0" # pragma: no cover
1+
__version__ = "0.18.9" # pragma: no cover

unstructured/metrics/evaluate.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -780,7 +780,6 @@ def _generate_dataframes(self, rows) -> tuple[pd.DataFrame, pd.DataFrame]:
780780

781781

782782
class ObjectDetectionPerClassMetricsCalculator(ObjectDetectionMetricsCalculatorBase):
783-
784783
def __post_init__(self):
785784
super().__post_init__()
786785
self.per_class_metric_names: list[str] | None = None

unstructured/metrics/table/table_eval.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -288,11 +288,13 @@ def process_file(self) -> TableEvaluation:
288288
cutoff=self.cutoff,
289289
)
290290

291-
table_detection_recall, table_detection_precision, table_detection_f1 = (
292-
calculate_table_detection_metrics(
293-
matched_indices=matched_indices,
294-
ground_truth_tables_number=len(ground_truth_table_data),
295-
)
291+
(
292+
table_detection_recall,
293+
table_detection_precision,
294+
table_detection_f1,
295+
) = calculate_table_detection_metrics(
296+
matched_indices=matched_indices,
297+
ground_truth_tables_number=len(ground_truth_table_data),
296298
)
297299

298300
evaluation = TableEvaluation(

unstructured/partition/common/lang.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -487,7 +487,7 @@ def apply_lang_metadata(
487487
if not isinstance(elements, list):
488488
elements = list(elements)
489489

490-
full_text = " ".join(e.text for e in elements if hasattr(e, "text"))
490+
full_text = " ".join(str(e.text) for e in elements if hasattr(e, "text") and e.text)
491491
detected_languages = detect_languages(text=full_text, languages=languages)
492492
if (
493493
detected_languages is not None
@@ -501,7 +501,8 @@ def apply_lang_metadata(
501501
else:
502502
for e in elements:
503503
if hasattr(e, "text"):
504-
e.metadata.languages = detect_languages(e.text)
504+
text_value = str(e.text) if e.text is not None else ""
505+
e.metadata.languages = detect_languages(text_value)
505506
yield e
506507
else:
507508
yield e

0 commit comments

Comments
 (0)