11import multiprocessing as mp
2+ from contextlib import contextmanager
3+ import io
24import os
35import sys
46from pathlib import Path
5- from typing import Union
7+ from typing import Union , Any , IO , TypeVar
68
79from pypdf import PdfReader
810from pypdf import PdfWriter
9- from pypdf ._utils import StrByteType
1011
1112from .core import TableList
1213from .parsers import Lattice
1314from .parsers import Stream
1415from .utils import TemporaryDirectory
15- from .utils import download_url
16+ from .utils import InvalidArguments
17+ from .utils import get_url_bytes
1618from .utils import get_page_layout
1719from .utils import get_rotation
1820from .utils import get_text_objects
1921from .utils import is_url
2022
# Type alias (not a TypeVar) for the ``filepath`` argument of PDFHandler:
# a string, an open file-like object, a pathlib.Path, or None.
# TypeVar expects a string name as its first argument, so wrapping the
# Union in TypeVar(...) was incorrect; a plain alias is what the
# annotation needs.
FilePathType = Union[str, IO[Any], Path, None]
2124
2225class PDFHandler :
2326 """Handles all operations like temp directory creation, splitting
@@ -26,21 +29,35 @@ class PDFHandler:
2629
2730 Parameters
2831 ----------
29- filepath : str
30- Filepath or URL of the PDF file.
32+ filepath : str | pathlib.Path, optional (default: None)
33+ Filepath or URL of the PDF file. Required if file_bytes is not given
3134 pages : str, optional (default: '1')
3235 Comma-separated page numbers.
3336 Example: '1,3,4' or '1,4-end' or 'all'.
3437 password : str, optional (default: None)
3538 Password for decryption.
39+ file_bytes : io.IOBase, optional (default: None)
40+ A file-like stream. Required if filepath is not given
3641
3742 """
3843
39- def __init__ (self , filepath : Union [ StrByteType , Path ] , pages = "1" , password = None ):
44+ def __init__ (self , filepath : FilePathType = None , pages = "1" , password = None , file_bytes = None ):
4045 if is_url (filepath ):
41- filepath = download_url (filepath )
42- self .filepath : Union [StrByteType , Path ] = filepath
46+ file_bytes = get_url_bytes (filepath )
4347
48+ if not filepath and not file_bytes :
49+ raise InvalidArguments ('Either `filepath` or `file_bytes` is required' )
50+ if not filepath :
51+ # filepath must either be passed, or taken from the name attribute
52+ try :
53+ filepath = getattr (file_bytes , 'name' )
54+ except AttributeError :
55+ msg = ('Either pass a `filepath`, or give the '
56+ '`file_bytes` argument a name attribute' )
57+ raise InvalidArguments (msg )
58+ self .file_bytes = file_bytes # ok to be None
59+
60+ self .filepath = filepath
4461 if isinstance (filepath , str ) and not filepath .lower ().endswith (".pdf" ):
4562 raise NotImplementedError ("File format not supported" )
4663
@@ -52,13 +69,35 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
5269 self .password = self .password .encode ("ascii" )
5370 self .pages = self ._get_pages (pages )
5471
@contextmanager
def managed_file_context(self):
    """Yield a readable, seekable binary stream for this instance's PDF.

    Reads from the `file_bytes` attribute when it is set, otherwise
    opens the `filepath` attribute in binary mode.

    A file opened here from `filepath` is closed on exit or error.
    A caller-supplied `file_bytes` stream is rewound to the start but
    is not closed by this context manager.

    Yields
    ------
    file_bytes : io.IOBase
        A readable, seekable, file-like object positioned at offset 0.
    """
    if not self.file_bytes:
        with open(self.filepath, "rb") as handle:
            yield handle
        return

    stream = self.file_bytes
    # Every io.IOBase subclass exposes a `seek` attribute even when the
    # underlying stream (pipe, socket, HTTP body) cannot seek, so ask
    # `seekable()` when it exists and only fall back to the attribute
    # check for duck-typed objects that lack it.
    seekable = getattr(stream, "seekable", None)
    can_seek = seekable() if callable(seekable) else hasattr(stream, "seek")
    if not can_seek:
        # Buffer the whole stream in memory so it can be re-read;
        # keep the buffer on the instance for subsequent calls.
        stream = io.BytesIO(stream.read())
        self.file_bytes = stream
    stream.seek(0)
    yield stream
93+
5594 def _get_pages (self , pages ):
5695 """Converts pages string to list of ints.
5796
5897 Parameters
5998 ----------
60- filepath : str
61- Filepath or URL of the PDF file.
99+ managed_file_context : io.IOBase
100+ A readable, seekable, file-like object
62101 pages : str, optional (default: '1')
63102 Comma-separated page numbers.
64103 Example: '1,3,4' or '1,4-end' or 'all'.
@@ -74,74 +113,77 @@ def _get_pages(self, pages):
74113 if pages == "1" :
75114 page_numbers .append ({"start" : 1 , "end" : 1 })
76115 else :
77- infile = PdfReader (self .filepath , strict = False )
116+ with self .managed_file_context () as f :
117+ infile = PdfReader (f , strict = False )
78118
79- if infile .is_encrypted :
80- infile .decrypt (self .password )
119+ if infile .is_encrypted :
120+ infile .decrypt (self .password )
81121
82- if pages == "all" :
83- page_numbers .append ({"start" : 1 , "end" : len (infile .pages )})
84- else :
85- for r in pages .split ("," ):
86- if "-" in r :
87- a , b = r .split ("-" )
88- if b == "end" :
89- b = len (infile .pages )
90- page_numbers .append ({"start" : int (a ), "end" : int (b )})
91- else :
92- page_numbers .append ({"start" : int (r ), "end" : int (r )})
122+ if pages == "all" :
123+ page_numbers .append ({"start" : 1 , "end" : len (infile .pages )})
124+ else :
125+ for r in pages .split ("," ):
126+ if "-" in r :
127+ a , b = r .split ("-" )
128+ if b == "end" :
129+ b = len (infile .pages )
130+ page_numbers .append ({"start" : int (a ), "end" : int (b )})
131+ else :
132+ page_numbers .append ({"start" : int (r ), "end" : int (r )})
93133
94134 result = []
95135 for p in page_numbers :
96136 result .extend (range (p ["start" ], p ["end" ] + 1 ))
97137 return sorted (set (result ))
98138
def _save_page(self, page, temp):
    """Saves specified page from PDF into a temporary directory.

    The page is read through `managed_file_context` and written to
    ``<temp>/page-<page>.pdf``. If `get_rotation` reports a non-empty
    rotation for the page's text, the file is renamed to
    ``<temp>/p-<page>_rotated.pdf`` and a rotated copy is written back
    to the original ``page-<page>.pdf`` path.

    Parameters
    ----------
    page : int
        Page number (1-based; index ``page - 1`` is read).
    temp : str
        Tmp directory.

    """
    fpath = os.path.join(temp, f"page-{page}.pdf")
    froot, fext = os.path.splitext(fpath)

    # The source stream only needs to stay open until the single page
    # has been written out; everything after works on the temp file.
    with self.managed_file_context() as fileobj:
        infile = PdfReader(fileobj, strict=False)
        if infile.is_encrypted:
            infile.decrypt(self.password)
        outfile = PdfWriter()
        outfile.add_page(infile.pages[page - 1])
        with open(fpath, "wb") as f:
            outfile.write(f)

    layout, _ = get_page_layout(fpath)
    # fix rotated PDF
    chars = get_text_objects(layout, ltype="char")
    horizontal_text = get_text_objects(layout, ltype="horizontal_text")
    vertical_text = get_text_objects(layout, ltype="vertical_text")
    rotation = get_rotation(chars, horizontal_text, vertical_text)
    if rotation == "":
        return

    fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
    os.rename(fpath, fpath_new)
    # Open the renamed file in a `with` block so the handle is released
    # even if reading, rotating or writing raises.
    with open(fpath_new, "rb") as instream:
        infile = PdfReader(instream, strict=False)
        if infile.is_encrypted:
            infile.decrypt(self.password)
        outfile = PdfWriter()
        p = infile.pages[0]
        if rotation == "anticlockwise":
            p.rotate(90)
        elif rotation == "clockwise":
            p.rotate(-90)
        outfile.add_page(p)
        with open(fpath, "wb") as f:
            outfile.write(f)
145187
146188 def parse (
147189 self ,
@@ -181,6 +223,7 @@ def parse(
181223 tables = []
182224 parser = Lattice (** kwargs ) if flavor == "lattice" else Stream (** kwargs )
183225 with TemporaryDirectory () as tempdir :
226+ < << << << HEAD
184227 cpu_count = mp .cpu_count ()
185228 # Using multiprocessing only when cpu_count > 1 to prevent a stallness issue
186229 # when cpu_count is 1
0 commit comments