Skip to content

Commit 37800c3

Browse files
feat: added new exception type to epub conversions (#4052)
Added UnprocessableEntityError to better handle the case where the incoming EPUB file is actually damaged, which makes the pandoc library crash with exit code 64.
1 parent 73d239f commit 37800c3

File tree

7 files changed

+47
-6
lines changed

7 files changed

+47
-6
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,3 +212,5 @@ annotated/
212212
.aider*
213213
pcaps
214214
python-output
215+
216+
.vs/

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## 0.18.6-dev0
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
- **Improved epub partition errors** EPUB partitioning now raises a new error type, `UnprocessableEntityError`, for unprocessable (e.g. damaged) files.
9+
110
## 0.18.5
211

312
### Enhancements

test_unstructured/partition/test_api.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,9 @@ def test_partition_via_api_with_filename_correctly_calls_sdk(
4343

4444
elements = partition_via_api(filename=example_doc_path("eml/fake-email.eml"))
4545

46-
partition_mock_.assert_called_once_with(*expected_call_)
46+
partition_mock_.assert_called_once_with(
47+
expected_call_[0], request=expected_call_[1], retries=expected_call_[2]
48+
)
4749
assert isinstance(partition_mock_.call_args_list[0].args[0], General)
4850
assert len(elements) == 1
4951
assert elements[0] == NarrativeText("This is a test email to use for unit tests.")
@@ -66,7 +68,11 @@ def test_partition_via_api_with_file_correctly_calls_sdk(
6668
modified_expected_call = expected_call_[:]
6769
modified_expected_call[1].partition_parameters.files.content = f
6870

69-
partition_mock_.assert_called_once_with(*modified_expected_call)
71+
partition_mock_.assert_called_once_with(
72+
modified_expected_call[0],
73+
request=modified_expected_call[1],
74+
retries=modified_expected_call[2],
75+
)
7076
assert isinstance(partition_mock_.call_args_list[0].args[0], General)
7177
assert len(elements) == 1
7278
assert elements[0] == NarrativeText("This is a test email to use for unit tests.")
@@ -87,7 +93,11 @@ def test_partition_via_api_warns_with_file_and_filename_and_calls_sdk(
8793
modified_expected_call = expected_call_[:]
8894
modified_expected_call[1].partition_parameters.files.content = f
8995

90-
partition_mock_.assert_called_once_with(*modified_expected_call)
96+
partition_mock_.assert_called_once_with(
97+
modified_expected_call[0],
98+
request=modified_expected_call[1],
99+
retries=modified_expected_call[2],
100+
)
91101
assert "WARNING" in caplog.text
92102
assert "The file_filename kwarg will be deprecated" in caplog.text
93103

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.18.5" # pragma: no cover
1+
__version__ = "0.18.6-dev0" # pragma: no cover

unstructured/errors.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,9 @@ def __init__(self, document_pages: int, pdf_hi_res_max_pages: int):
99
f"pages={document_pages}, maximum={pdf_hi_res_max_pages}."
1010
)
1111
super().__init__(self.message)
12+
13+
14+
class UnprocessableEntityError(Exception):
15+
"""Error raised when a file is not valid."""
16+
17+
pass

unstructured/file_utils/file_conversion.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
from __future__ import annotations
22

33
import os
4+
import re
45
import tempfile
56
from typing import IO
67

8+
from unstructured.errors import UnprocessableEntityError
79
from unstructured.partition.common.common import exactly_one
810
from unstructured.utils import requires_dependencies
911

@@ -14,7 +16,9 @@ def convert_file_to_text(filename: str, source_format: str, target_format: str)
1416
import pypandoc
1517

1618
try:
17-
text = pypandoc.convert_file(filename, target_format, format=source_format, sandbox=True)
19+
text: str = pypandoc.convert_file(
20+
filename, target_format, format=source_format, sandbox=True
21+
)
1822
except FileNotFoundError as err:
1923
msg = (
2024
f"Error converting the file to text. Ensure you have the pandoc package installed on"
@@ -23,6 +27,14 @@ def convert_file_to_text(filename: str, source_format: str, target_format: str)
2327
)
2428
raise FileNotFoundError(msg)
2529
except RuntimeError as err:
30+
err_str = str(err)
31+
if source_format == "epub" and (
32+
"Couldn't extract ePub file" in err_str
33+
or "No entry on path" in err_str
34+
or re.search(r"exitcode ['\"]?64['\"]?", err_str)
35+
):
36+
raise UnprocessableEntityError(f"Invalid EPUB file: {err_str}")
37+
2638
supported_source_formats, _ = pypandoc.get_pandoc_formats()
2739

2840
if source_format == "rtf" and source_format not in supported_source_formats:

unstructured/partition/epub.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@ def partition_epub(
4646
last_modified = get_last_modified_date(filename) if filename else None
4747

4848
html_text = convert_file_to_html_text_using_pandoc(
49-
source_format="epub", filename=filename, file=file
49+
source_format="epub",
50+
filename=filename,
51+
file=file,
5052
)
5153

5254
return partition_html(

0 commit comments

Comments
 (0)