1+ import io
12import multiprocessing as mp
23import os
34import sys
5+ from contextlib import contextmanager
46from pathlib import Path
7+ from typing import IO
8+ from typing import Any
9+ from typing import TypeVar
510from typing import Union
611
712from pypdf import PdfReader
813from pypdf import PdfWriter
9- from pypdf ._utils import StrByteType
1014
1115from .core import TableList
1216from .parsers import Lattice
1317from .parsers import Stream
18+ from .utils import InvalidArguments
1419from .utils import TemporaryDirectory
15- from .utils import download_url
1620from .utils import get_page_layout
1721from .utils import get_rotation
1822from .utils import get_text_objects
23+ from .utils import get_url_bytes
1924from .utils import is_url
2025
2126
# Types accepted for the `filepath` argument of PDFHandler.
# This is a plain Union alias rather than a constrained TypeVar: a TypeVar is
# meant for generic signatures that tie several positions to the same type,
# not as shorthand for "any one of these types".
FilePathType = Union[str, IO[Any], Path, None]
28+
29+
2230class PDFHandler :
2331 """Handles all operations like temp directory creation, splitting
2432 file into single page PDFs, parsing each PDF and then removing the
2533 temp directory.
2634
2735 Parameters
2836 ----------
29- filepath : str
30- Filepath or URL of the PDF file.
37+ filepath : str | pathlib.Path, optional (default: None)
38+ Filepath or URL of the PDF file. Required if file_bytes is not given
3139 pages : str, optional (default: '1')
3240 Comma-separated page numbers.
3341 Example: '1,3,4' or '1,4-end' or 'all'.
3442 password : str, optional (default: None)
3543 Password for decryption.
44+ file_bytes : io.IOBase, optional (default: None)
45+ A file-like stream. Required if filepath is not given
3646
3747 """
3848
39- def __init__ (self , filepath : Union [StrByteType , Path ], pages = "1" , password = None ):
49+ def __init__ (
50+ self , filepath : FilePathType = None , pages = "1" , password = None , file_bytes = None
51+ ):
4052 if is_url (filepath ):
41- filepath = download_url (filepath )
42- self .filepath : Union [StrByteType , Path ] = filepath
53+ file_bytes = get_url_bytes (filepath )
54+
55+ if not filepath and not file_bytes :
56+ raise InvalidArguments ("Either `filepath` or `file_bytes` is required" )
57+ if not filepath :
58+ # filepath must either be passed, or taken from the name attribute
59+ try :
60+ filepath = getattr (file_bytes , "name" )
61+ except AttributeError :
62+ msg = (
63+ "Either pass a `filepath`, or give the "
64+ "`file_bytes` argument a name attribute"
65+ )
66+ raise InvalidArguments (msg )
67+ self .file_bytes = file_bytes # ok to be None
4368
69+ self .filepath = filepath
4470 if isinstance (filepath , str ) and not filepath .lower ().endswith (".pdf" ):
4571 raise NotImplementedError ("File format not supported" )
4672
@@ -52,13 +78,35 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
5278 self .password = self .password .encode ("ascii" )
5379 self .pages = self ._get_pages (pages )
5480
@contextmanager
def managed_file_context(self):
    """Yield a readable, seekable binary stream for this instance's PDF.

    When ``self.file_bytes`` is set (truthy) that stream is used; otherwise
    ``self.filepath`` is opened in binary mode.

    A stream that cannot seek is read in full into an ``io.BytesIO``, which
    then replaces ``self.file_bytes`` so later calls can rewind it again.
    The stream is always positioned at offset 0 before it is yielded.

    Only a file opened here from ``self.filepath`` is closed on exit or
    error; a stream stored in ``self.file_bytes`` is left open.

    Yields
    ------
    file_bytes : io.IOBase
        A readable, seekable, file-like object

    """
    source = self.file_bytes
    if not source:
        # No stream available: open the path and let `with` close the
        # handle when the caller's block exits or raises.
        with open(self.filepath, "rb") as handle:
            yield handle
        return

    # Every io.IOBase subclass exposes a `seek` attribute, even when calling
    # it raises io.UnsupportedOperation, so ask `seekable()` when it exists
    # and fall back to the attribute check only for duck-typed objects.
    probe = getattr(source, "seekable", None)
    can_seek = probe() if callable(probe) else hasattr(source, "seek")
    if not can_seek:
        source = self.file_bytes = io.BytesIO(source.read())

    source.seek(0)
    yield source
102+
55103 def _get_pages (self , pages ):
56104 """Converts pages string to list of ints.
57105
58106 Parameters
59107 ----------
60- filepath : str
61- Filepath or URL of the PDF file.
108+ managed_file_context : io.IOBase
109+ A readable, seekable, file-like object
62110 pages : str, optional (default: '1')
63111 Comma-separated page numbers.
64112 Example: '1,3,4' or '1,4-end' or 'all'.
@@ -74,82 +122,85 @@ def _get_pages(self, pages):
74122 if pages == "1" :
75123 page_numbers .append ({"start" : 1 , "end" : 1 })
76124 else :
77- infile = PdfReader (self .filepath , strict = False )
125+ with self .managed_file_context () as f :
126+ infile = PdfReader (f , strict = False )
78127
79- if infile .is_encrypted :
80- infile .decrypt (self .password )
128+ if infile .is_encrypted :
129+ infile .decrypt (self .password )
81130
82- if pages == "all" :
83- page_numbers .append ({"start" : 1 , "end" : len (infile .pages )})
84- else :
85- for r in pages .split ("," ):
86- if "-" in r :
87- a , b = r .split ("-" )
88- if b == "end" :
89- b = len (infile .pages )
90- page_numbers .append ({"start" : int (a ), "end" : int (b )})
91- else :
92- page_numbers .append ({"start" : int (r ), "end" : int (r )})
131+ if pages == "all" :
132+ page_numbers .append ({"start" : 1 , "end" : len (infile .pages )})
133+ else :
134+ for r in pages .split ("," ):
135+ if "-" in r :
136+ a , b = r .split ("-" )
137+ if b == "end" :
138+ b = len (infile .pages )
139+ page_numbers .append ({"start" : int (a ), "end" : int (b )})
140+ else :
141+ page_numbers .append ({"start" : int (r ), "end" : int (r )})
93142
94143 result = []
95144 for p in page_numbers :
96145 result .extend (range (p ["start" ], p ["end" ] + 1 ))
97146 return sorted (set (result ))
98147
def _save_page(self, filepath: FilePathType, page, temp):
    """Saves specified page from PDF into a temporary directory.

    The page is written to ``<temp>/page-<page>.pdf``. If the text on the
    page is detected as rotated, the file is rewritten at the same path
    with the page rotated by 90 degrees in the opposite direction.

    Parameters
    ----------
    filepath : str | pathlib.Path | file-like
        Not read by this method: the source PDF is obtained from
        ``self.managed_file_context()``. Kept in the signature so
        existing callers keep working.
    page : int
        Page number (1-based).
    temp : str
        Tmp directory.

    """
    with self.managed_file_context() as fileobj:
        infile = PdfReader(fileobj, strict=False)
        if infile.is_encrypted:
            infile.decrypt(self.password)
        fpath = os.path.join(temp, f"page-{page}.pdf")
        froot, fext = os.path.splitext(fpath)
        outfile = PdfWriter()
        outfile.add_page(infile.pages[page - 1])
        with open(fpath, "wb") as f:
            outfile.write(f)

        # fix rotated PDF
        layout, _ = get_page_layout(fpath)
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)
        if rotation != "":
            # Move the single-page file aside so the corrected page can be
            # written back to the original `page-<n>.pdf` path.
            fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
            os.rename(fpath, fpath_new)
            # `with` guarantees the handle is closed even if reading,
            # decrypting or writing raises.
            with open(fpath_new, "rb") as instream:
                infile = PdfReader(instream, strict=False)
                if infile.is_encrypted:
                    infile.decrypt(self.password)
                outfile = PdfWriter()
                p = infile.pages[0]
                if rotation == "anticlockwise":
                    p.rotate(90)
                elif rotation == "clockwise":
                    p.rotate(-90)
                outfile.add_page(p)
                with open(fpath, "wb") as f:
                    outfile.write(f)
145196
146197 def parse (
147198 self ,
148199 flavor = "lattice" ,
149200 suppress_stdout = False ,
150201 parallel = False ,
151202 layout_kwargs = None ,
152- ** kwargs
203+ ** kwargs ,
153204 ):
154205 """Extracts tables by calling parser.get_tables on all single
155206 page PDFs.
@@ -189,7 +240,8 @@ def parse(
189240 jobs = []
190241 for p in self .pages :
191242 j = pool .apply_async (
192- self ._parse_page ,(p , tempdir , parser , suppress_stdout , layout_kwargs )
243+ self ._parse_page ,
244+ (p , tempdir , parser , suppress_stdout , layout_kwargs ),
193245 )
194246 jobs .append (j )
195247
@@ -198,14 +250,14 @@ def parse(
198250 tables .extend (t )
199251 else :
200252 for p in self .pages :
201- t = self ._parse_page (p , tempdir , parser , suppress_stdout , layout_kwargs )
253+ t = self ._parse_page (
254+ p , tempdir , parser , suppress_stdout , layout_kwargs
255+ )
202256 tables .extend (t )
203257
204258 return TableList (sorted (tables ))
205259
206- def _parse_page (
207- self , page , tempdir , parser , suppress_stdout , layout_kwargs
208- ):
260+ def _parse_page (self , page , tempdir , parser , suppress_stdout , layout_kwargs ):
209261 """Extracts tables by calling parser.get_tables on a single
210262 page PDF.
211263
@@ -224,7 +276,7 @@ def _parse_page(
224276 -------
225277 tables : camelot.core.TableList
226278 List of tables found in PDF.
227-
279+
228280 """
229281 self ._save_page (self .filepath , page , tempdir )
230282 page_path = os .path .join (tempdir , f"page-{ page } .pdf" )
0 commit comments