|
3 | 3 | from __future__ import annotations |
4 | 4 |
|
5 | 5 | import io |
6 | | -from tempfile import SpooledTemporaryFile |
7 | 6 | from typing import IO, Any, Iterator, Optional |
8 | 7 |
|
9 | 8 | import networkx as nx |
10 | 9 | import numpy as np |
11 | 10 | import pandas as pd |
| 11 | +from msoffcrypto import OfficeFile |
| 12 | +from msoffcrypto.exceptions import FileFormatError |
12 | 13 | from typing_extensions import Self, TypeAlias |
13 | 14 |
|
14 | 15 | from unstructured.chunking import add_chunking_strategy |
|
23 | 24 | Text, |
24 | 25 | Title, |
25 | 26 | ) |
| 27 | +from unstructured.errors import UnprocessableEntityError |
26 | 28 | from unstructured.file_utils.model import FileType |
27 | 29 | from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date |
28 | 30 | from unstructured.partition.text_type import ( |
@@ -187,16 +189,28 @@ def metadata_file_path(self) -> str | None: |
@lazyproperty
def sheets(self) -> dict[str, pd.DataFrame]:
    """The spreadsheet worksheets, each as a data-frame mapped by sheet-name.

    The document bytes are first handed to `msoffcrypto`'s `OfficeFile` so that unusable
    input is rejected before pandas is asked to parse it.

    Raises:
        UnprocessableEntityError: when `OfficeFile` raises `FileFormatError` for the bytes,
            or when it reports the file as encrypted.
    """
    # -- this stream is used only for the format/encryption probe --
    probe_stream = io.BytesIO(self._file_bytes)
    try:
        container = OfficeFile(probe_stream)
    except FileFormatError as exc:
        raise UnprocessableEntityError("Not a valid XLSX file.") from exc

    if container.is_encrypted():
        raise UnprocessableEntityError("XLSX file is password protected.")

    # -- pandas gets its own stream, positioned at 0 regardless of what the probe read --
    workbook_stream = io.BytesIO(self._file_bytes)
    # -- `sheet_name=None` asks pandas for every worksheet, returned as {sheet-name: frame} --
    return pd.read_excel(workbook_stream, sheet_name=None, header=self.header_row_idx)
| 203 | + |
@lazyproperty
def _file_bytes(self) -> bytes:
    """The raw bytes of the source document.

    A file-like object in `self._file` takes precedence over `self._file_path`; it is rewound
    to its start before being read. Otherwise the file at `self._file_path` is read in binary
    mode.

    Raises:
        ValueError: when neither a file-like object nor a file path is available.
    """
    source = self._file
    if source:
        # -- rewind first so the whole document is captured --
        source.seek(0)
        return source.read()

    path = self._file_path
    if not path:
        raise ValueError("Either 'filename' or 'file' argument must be specified.")

    with open(path, "rb") as stream:
        return stream.read()
200 | 214 |
|
201 | 215 |
|
202 | 216 | class _ConnectedComponent: |
|
0 commit comments