Skip to content

Commit f66562b

Browse files
ds-filipknefel and Filip Knefel
authored
fix: properly handle password protected xlsx (#4057)
### Issue An attempt at partitioning a password-protected XLSX file results in an obscure exception > Can't find workbook in OLE2 compound document ### Solution Utilize the [msoffcrypto-tool](https://pypi.org/project/msoffcrypto-tool/) package (MIT License) to load the XLSX file and check whether it's encrypted; if it is, throw an `UnprocessableEntityError` exception detailing the reason for rejecting the file. --------- Co-authored-by: Filip Knefel <[email protected]>
1 parent 344202f commit f66562b

File tree

7 files changed

+58
-10
lines changed

7 files changed

+58
-10
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## 0.18.8
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
- **Properly handle password protected xlsx** - detect password protection on XLSX files and raise an appropriate `UnprocessableEntityError`.
9+
110
## 0.18.7
211

312
### Enhancements
8 KB
Binary file not shown.

requirements/extra-xlsx.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ openpyxl
55
pandas
66
xlrd
77
networkx
8+
msoffcrypto-tool

requirements/extra-xlsx.txt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,36 @@
44
#
55
# pip-compile ./extra-xlsx.in
66
#
7+
cffi==1.17.1
8+
# via
9+
# -c requirements/base.txt
10+
# cryptography
11+
cryptography==45.0.5
12+
# via
13+
# -c requirements/base.txt
14+
# msoffcrypto-tool
715
et-xmlfile==2.0.0
816
# via openpyxl
17+
msoffcrypto-tool==5.4.2
18+
# via -r ./extra-xlsx.in
919
networkx==3.4.2
1020
# via -r ./extra-xlsx.in
1121
numpy==2.2.6
1222
# via
1323
# -c requirements/base.txt
1424
# pandas
25+
olefile==0.47
26+
# via
27+
# -c requirements/base.txt
28+
# msoffcrypto-tool
1529
openpyxl==3.1.5
1630
# via -r ./extra-xlsx.in
1731
pandas==2.3.1
1832
# via -r ./extra-xlsx.in
33+
pycparser==2.22
34+
# via
35+
# -c requirements/base.txt
36+
# cffi
1937
python-dateutil==2.9.0.post0
2038
# via
2139
# -c requirements/base.txt

test_unstructured/partition/test_xlsx.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
)
2929
from unstructured.cleaners.core import clean_extra_whitespace
3030
from unstructured.documents.elements import ListItem, Table, Text, Title
31+
from unstructured.errors import UnprocessableEntityError
3132
from unstructured.partition.xlsx import (
3233
_ConnectedComponent,
3334
_SubtableParser,
@@ -168,6 +169,11 @@ def test_partition_xlsx_from_file_with_header():
168169
assert e.metadata.text_as_html is not None
169170

170171

172+
def test_partition_xlsx_password_protected_raises_exception():
173+
with pytest.raises(UnprocessableEntityError):
174+
partition_xlsx(filename="example-docs/password_protected.xlsx")
175+
176+
171177
# -- .metadata.last_modified ---------------------------------------------------------------------
172178

173179

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.18.7" # pragma: no cover
1+
__version__ = "0.18.8" # pragma: no cover

unstructured/partition/xlsx.py

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,13 @@
33
from __future__ import annotations
44

55
import io
6-
from tempfile import SpooledTemporaryFile
76
from typing import IO, Any, Iterator, Optional
87

98
import networkx as nx
109
import numpy as np
1110
import pandas as pd
11+
from msoffcrypto import OfficeFile
12+
from msoffcrypto.exceptions import FileFormatError
1213
from typing_extensions import Self, TypeAlias
1314

1415
from unstructured.chunking import add_chunking_strategy
@@ -23,6 +24,7 @@
2324
Text,
2425
Title,
2526
)
27+
from unstructured.errors import UnprocessableEntityError
2628
from unstructured.file_utils.model import FileType
2729
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
2830
from unstructured.partition.text_type import (
@@ -187,16 +189,28 @@ def metadata_file_path(self) -> str | None:
187189
@lazyproperty
188190
def sheets(self) -> dict[str, pd.DataFrame]:
189191
"""The spreadsheet worksheets, each as a data-frame mapped by sheet-name."""
190-
if file_path := self._file_path:
191-
return pd.read_excel(file_path, sheet_name=None, header=self.header_row_idx)
192+
try:
193+
office_file = OfficeFile(io.BytesIO(self._file_bytes))
194+
except FileFormatError as e:
195+
raise UnprocessableEntityError("Not a valid XLSX file.") from e
192196

193-
if f := self._file:
194-
if isinstance(f, SpooledTemporaryFile):
195-
f.seek(0)
196-
f = io.BytesIO(f.read())
197-
return pd.read_excel(f, sheet_name=None, header=self.header_row_idx)
197+
if office_file.is_encrypted():
198+
raise UnprocessableEntityError("XLSX file is password protected.")
198199

199-
raise ValueError("Either 'filename' or 'file' argument must be specified.")
200+
return pd.read_excel(
201+
io.BytesIO(self._file_bytes), sheet_name=None, header=self.header_row_idx
202+
)
203+
204+
@lazyproperty
205+
def _file_bytes(self) -> bytes:
206+
if file := self._file:
207+
file.seek(0)
208+
return file.read()
209+
elif self._file_path:
210+
with open(self._file_path, "rb") as file:
211+
return file.read()
212+
else:
213+
raise ValueError("Either 'filename' or 'file' argument must be specified.")
200214

201215

202216
class _ConnectedComponent:

0 commit comments

Comments
 (0)