Skip to content
This repository was archived by the owner on Apr 11, 2025. It is now read-only.

Commit bdc2ae7

Browse files
cscanlin-kwhbosd
authored and committed
[IMP]: add support for file_bytes argument with managed_file_context()
1 parent e3c1115 commit bdc2ae7

File tree

5 files changed

+185
-91
lines changed

5 files changed

+185
-91
lines changed

camelot/handlers.py

Lines changed: 112 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,46 +1,72 @@
1+
import io
12
import multiprocessing as mp
23
import os
34
import sys
5+
from contextlib import contextmanager
46
from pathlib import Path
7+
from typing import IO
8+
from typing import Any
9+
from typing import TypeVar
510
from typing import Union
611

712
from pypdf import PdfReader
813
from pypdf import PdfWriter
9-
from pypdf._utils import StrByteType
1014

1115
from .core import TableList
1216
from .parsers import Lattice
1317
from .parsers import Stream
18+
from .utils import InvalidArguments
1419
from .utils import TemporaryDirectory
15-
from .utils import download_url
1620
from .utils import get_page_layout
1721
from .utils import get_rotation
1822
from .utils import get_text_objects
23+
from .utils import get_url_bytes
1924
from .utils import is_url
2025

2126

27+
FilePathType = TypeVar("FilePathType", str, IO[Any], Path, None)
28+
29+
2230
class PDFHandler:
2331
"""Handles all operations like temp directory creation, splitting
2432
file into single page PDFs, parsing each PDF and then removing the
2533
temp directory.
2634
2735
Parameters
2836
----------
29-
filepath : str
30-
Filepath or URL of the PDF file.
37+
filepath : str | pathlib.Path, optional (default: None)
38+
Filepath or URL of the PDF file. Required if file_bytes is not given
3139
pages : str, optional (default: '1')
3240
Comma-separated page numbers.
3341
Example: '1,3,4' or '1,4-end' or 'all'.
3442
password : str, optional (default: None)
3543
Password for decryption.
44+
file_bytes : io.IOBase, optional (default: None)
45+
A file-like stream. Required if filepath is not given
3646
3747
"""
3848

39-
def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None):
49+
def __init__(
50+
self, filepath: FilePathType = None, pages="1", password=None, file_bytes=None
51+
):
4052
if is_url(filepath):
41-
filepath = download_url(filepath)
42-
self.filepath: Union[StrByteType, Path] = filepath
53+
file_bytes = get_url_bytes(filepath)
54+
55+
if not filepath and not file_bytes:
56+
raise InvalidArguments("Either `filepath` or `file_bytes` is required")
57+
if not filepath:
58+
# filepath must either be passed, or taken from the name attribute
59+
try:
60+
filepath = getattr(file_bytes, "name")
61+
except AttributeError:
62+
msg = (
63+
"Either pass a `filepath`, or give the "
64+
"`file_bytes` argument a name attribute"
65+
)
66+
raise InvalidArguments(msg)
67+
self.file_bytes = file_bytes # ok to be None
4368

69+
self.filepath = filepath
4470
if isinstance(filepath, str) and not filepath.lower().endswith(".pdf"):
4571
raise NotImplementedError("File format not supported")
4672

@@ -52,13 +78,35 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
5278
self.password = self.password.encode("ascii")
5379
self.pages = self._get_pages(pages)
5480

81+
@contextmanager
def managed_file_context(self):
    """Yield a readable, seekable binary stream for this handler's PDF.

    Uses the `file_bytes` attribute of this instance when it is set,
    and falls back to opening the `filepath` attribute otherwise.

    Yields
    ------
    file_bytes : io.IOBase
        A readable, seekable, file-like object positioned at offset 0.

    Notes
    -----
    A handle opened here from `filepath` is closed on exit or error.
    A caller-supplied `file_bytes` stream is never closed here; its
    lifetime stays with the caller.

    If the supplied stream cannot seek, its remaining content is read
    into an `io.BytesIO` that replaces `self.file_bytes`, so that later
    calls can re-read the same data.

    """
    if self.file_bytes:
        stream = self.file_bytes
        # Every io.IOBase subclass exposes a `seek` attribute, even when
        # the stream (pipe, socket, HTTP response) cannot actually seek,
        # so prefer the `seekable()` protocol and only fall back to the
        # attribute check for duck-typed objects that lack it.
        seekable = getattr(stream, "seekable", None)
        if callable(seekable):
            can_seek = bool(seekable())
        else:
            can_seek = hasattr(stream, "seek")
        if not can_seek:
            # Buffer once in memory and keep the copy on the instance;
            # the original stream cannot be rewound for a second pass.
            stream = io.BytesIO(stream.read())
            self.file_bytes = stream
        stream.seek(0)
        yield stream
    else:
        with open(self.filepath, "rb") as file_bytes:
            yield file_bytes
102+
55103
def _get_pages(self, pages):
56104
"""Converts pages string to list of ints.
57105
58106
Parameters
59107
----------
60-
filepath : str
61-
Filepath or URL of the PDF file.
108+
managed_file_context : io.IOBase
109+
A readable, seekable, file-like object
62110
pages : str, optional (default: '1')
63111
Comma-separated page numbers.
64112
Example: '1,3,4' or '1,4-end' or 'all'.
@@ -74,82 +122,85 @@ def _get_pages(self, pages):
74122
if pages == "1":
75123
page_numbers.append({"start": 1, "end": 1})
76124
else:
77-
infile = PdfReader(self.filepath, strict=False)
125+
with self.managed_file_context() as f:
126+
infile = PdfReader(f, strict=False)
78127

79-
if infile.is_encrypted:
80-
infile.decrypt(self.password)
128+
if infile.is_encrypted:
129+
infile.decrypt(self.password)
81130

82-
if pages == "all":
83-
page_numbers.append({"start": 1, "end": len(infile.pages)})
84-
else:
85-
for r in pages.split(","):
86-
if "-" in r:
87-
a, b = r.split("-")
88-
if b == "end":
89-
b = len(infile.pages)
90-
page_numbers.append({"start": int(a), "end": int(b)})
91-
else:
92-
page_numbers.append({"start": int(r), "end": int(r)})
131+
if pages == "all":
132+
page_numbers.append({"start": 1, "end": len(infile.pages)})
133+
else:
134+
for r in pages.split(","):
135+
if "-" in r:
136+
a, b = r.split("-")
137+
if b == "end":
138+
b = len(infile.pages)
139+
page_numbers.append({"start": int(a), "end": int(b)})
140+
else:
141+
page_numbers.append({"start": int(r), "end": int(r)})
93142

94143
result = []
95144
for p in page_numbers:
96145
result.extend(range(p["start"], p["end"] + 1))
97146
return sorted(set(result))
98147

99-
def _save_page(self, filepath: Union[StrByteType, Path], page, temp):
148+
def _save_page(self, filepath: FilePathType, page, temp):
100149
"""Saves specified page from PDF into a temporary directory.
101150
102151
Parameters
103152
----------
104-
filepath : str
105-
Filepath or URL of the PDF file.
153+
managed_file_context : io.IOBase
154+
A readable, seekable, file-like object
106155
page : int
107156
Page number.
108157
temp : str
109158
Tmp directory.
110159
111160
"""
112-
infile = PdfReader(filepath, strict=False)
113-
if infile.is_encrypted:
114-
infile.decrypt(self.password)
115-
fpath = os.path.join(temp, f"page-{page}.pdf")
116-
froot, fext = os.path.splitext(fpath)
117-
p = infile.pages[page - 1]
118-
outfile = PdfWriter()
119-
outfile.add_page(p)
120-
with open(fpath, "wb") as f:
121-
outfile.write(f)
122-
layout, dim = get_page_layout(fpath)
123-
# fix rotated PDF
124-
chars = get_text_objects(layout, ltype="char")
125-
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
126-
vertical_text = get_text_objects(layout, ltype="vertical_text")
127-
rotation = get_rotation(chars, horizontal_text, vertical_text)
128-
if rotation != "":
129-
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
130-
os.rename(fpath, fpath_new)
131-
instream = open(fpath_new, "rb")
132-
infile = PdfReader(instream, strict=False)
161+
162+
with self.managed_file_context() as fileobj:
163+
infile = PdfReader(fileobj, strict=False)
133164
if infile.is_encrypted:
134165
infile.decrypt(self.password)
166+
fpath = os.path.join(temp, f"page-{page}.pdf")
167+
froot, fext = os.path.splitext(fpath)
168+
p = infile.pages[page - 1]
135169
outfile = PdfWriter()
136-
p = infile.pages[0]
137-
if rotation == "anticlockwise":
138-
p.rotate(90)
139-
elif rotation == "clockwise":
140-
p.rotate(-90)
141170
outfile.add_page(p)
142171
with open(fpath, "wb") as f:
143172
outfile.write(f)
144-
instream.close()
173+
layout, dim = get_page_layout(fpath)
174+
# fix rotated PDF
175+
chars = get_text_objects(layout, ltype="char")
176+
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
177+
vertical_text = get_text_objects(layout, ltype="vertical_text")
178+
rotation = get_rotation(chars, horizontal_text, vertical_text)
179+
if rotation != "":
180+
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
181+
os.rename(fpath, fpath_new)
182+
instream = open(fpath_new, "rb")
183+
infile = PdfReader(instream, strict=False)
184+
if infile.is_encrypted:
185+
infile.decrypt(self.password)
186+
outfile = PdfWriter()
187+
p = infile.pages[0]
188+
if rotation == "anticlockwise":
189+
p.rotate(90)
190+
elif rotation == "clockwise":
191+
p.rotate(-90)
192+
outfile.add_page(p)
193+
with open(fpath, "wb") as f:
194+
outfile.write(f)
195+
instream.close()
145196

146197
def parse(
147198
self,
148199
flavor="lattice",
149200
suppress_stdout=False,
150201
parallel=False,
151202
layout_kwargs=None,
152-
**kwargs
203+
**kwargs,
153204
):
154205
"""Extracts tables by calling parser.get_tables on all single
155206
page PDFs.
@@ -189,7 +240,8 @@ def parse(
189240
jobs = []
190241
for p in self.pages:
191242
j = pool.apply_async(
192-
self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs)
243+
self._parse_page,
244+
(p, tempdir, parser, suppress_stdout, layout_kwargs),
193245
)
194246
jobs.append(j)
195247

@@ -198,14 +250,14 @@ def parse(
198250
tables.extend(t)
199251
else:
200252
for p in self.pages:
201-
t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
253+
t = self._parse_page(
254+
p, tempdir, parser, suppress_stdout, layout_kwargs
255+
)
202256
tables.extend(t)
203257

204258
return TableList(sorted(tables))
205259

206-
def _parse_page(
207-
self, page, tempdir, parser, suppress_stdout, layout_kwargs
208-
):
260+
def _parse_page(self, page, tempdir, parser, suppress_stdout, layout_kwargs):
209261
"""Extracts tables by calling parser.get_tables on a single
210262
page PDF.
211263
@@ -224,7 +276,7 @@ def _parse_page(
224276
-------
225277
tables : camelot.core.TableList
226278
List of tables found in PDF.
227-
279+
228280
"""
229281
self._save_page(self.filepath, page, tempdir)
230282
page_path = os.path.join(tempdir, f"page-{page}.pdf")

camelot/io.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,24 @@
11
import warnings
22
from pathlib import Path
3-
from typing import Union
43

5-
from pypdf._utils import StrByteType
4+
from .handlers import PDFHandler, FilePathType
65

7-
from .handlers import PDFHandler
8-
from .utils import remove_extra
9-
from .utils import validate_input
6+
from .utils import (
7+
InvalidArguments,
8+
validate_input,
9+
remove_extra,
10+
)
1011

1112

1213
def read_pdf(
13-
filepath: Union[StrByteType, Path],
14+
filepath: FilePathType = None,
1415
pages="1",
1516
password=None,
1617
flavor="lattice",
1718
suppress_stdout=False,
1819
parallel=False,
1920
layout_kwargs=None,
21+
file_bytes=None,
2022
**kwargs
2123
):
2224
"""Read PDF and return extracted tables.
@@ -26,8 +28,8 @@ def read_pdf(
2628
2729
Parameters
2830
----------
29-
filepath : str, Path, IO
30-
Filepath or URL of the PDF file.
31+
filepath : str | pathlib.Path, optional (default: None)
32+
Filepath or URL of the PDF file. Required if file_bytes is not given
3133
pages : str, optional (default: '1')
3234
Comma-separated page numbers.
3335
Example: '1,3,4' or '1,4-end' or 'all'.
@@ -40,6 +42,8 @@ def read_pdf(
4042
Print all logs and warnings.
4143
parallel : bool, optional (default: False)
4244
Process pages in parallel using all available cpu cores.
45+
file_bytes : io.IOBase, optional (default: None)
46+
A file-like stream. Required if filepath is not given
4347
layout_kwargs : dict, optional (default: {})
4448
A dict of `pdfminer.layout.LAParams
4549
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
@@ -115,12 +119,15 @@ def read_pdf(
115119
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
116120
)
117121

122+
if not filepath and not file_bytes:
123+
raise InvalidArguments('Either `filepath` or `file_bytes` is required')
124+
118125
with warnings.catch_warnings():
119126
if suppress_stdout:
120127
warnings.simplefilter("ignore")
121128

122129
validate_input(kwargs, flavor=flavor)
123-
p = PDFHandler(filepath, pages=pages, password=password)
130+
p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes)
124131
kwargs = remove_extra(kwargs, flavor=flavor)
125132
tables = p.parse(
126133
flavor=flavor,

0 commit comments

Comments
 (0)