1+ from contextlib import contextmanager
2+ import io
13import os
24import sys
35from pathlib import Path
1113from .parsers import Lattice
1214from .parsers import Stream
1315from .utils import TemporaryDirectory
14- from .utils import download_url
16+ from .utils import InvalidArguments
17+ from .utils import get_url_bytes
1518from .utils import get_page_layout
1619from .utils import get_rotation
1720from .utils import get_text_objects
@@ -25,21 +28,36 @@ class PDFHandler:
2528
2629 Parameters
2730 ----------
28- filepath : str
29- Filepath or URL of the PDF file.
31+ filepath : str | pathlib.Path, optional (default: None)
32+ Filepath or URL of the PDF file. Required if file_bytes is not given
3033 pages : str, optional (default: '1')
3134 Comma-separated page numbers.
3235 Example: '1,3,4' or '1,4-end' or 'all'.
3336 password : str, optional (default: None)
3437 Password for decryption.
38+ file_bytes : io.IOBase, optional (default: None)
39+ A file-like stream. Required if filepath is not given
3540
3641 """
3742
38- def __init__ (self , filepath : Union [StrByteType , Path ], pages = "1" , password = None ):
43+ def __init__ (self , filepath : Union [StrByteType , Path , None ], pages = "1" , password = None , file_bytes = None ):
3944 if is_url (filepath ):
40- filepath = download_url (filepath )
45+ file_bytes = get_url_bytes (filepath )
46+
47+ if not filepath and not file_bytes :
48+ raise InvalidArguments ('Either `filepath` or `file_bytes` is required' )
49+ if not filepath :
50+ # filepath must either be passed, or taken from the name attribute
51+ filepath = getattr (file_bytes , 'name' )
52+ if not filepath :
53+ msg = ('Either pass a `filepath`, or give the '
54+ '`file_bytes` argument a name attribute' )
55+ raise InvalidArguments (msg )
56+ self .file_bytes = file_bytes # ok to be None
57+
58+ # self.filepath = filepath
59+ # or
4160 self .filepath : Union [StrByteType , Path ] = filepath
42-
4361 if isinstance (filepath , str ) and not filepath .lower ().endswith (".pdf" ):
4462 raise NotImplementedError ("File format not supported" )
4563
@@ -51,6 +69,28 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
5169 self .password = self .password .encode ("ascii" )
5270 self .pages = self ._get_pages (pages )
5371
@contextmanager
def managed_file_context(self):
    """Yield a readable, seekable binary stream for this instance's PDF.

    Reads from the `file_bytes` attribute when it is set, otherwise opens
    the `filepath` attribute in binary mode. A handle opened here from
    `filepath` is closed on exit or error. A stream held in `file_bytes`
    is NOT closed, so that later calls can rewind and reuse it.

    Yields
    ------
    file_bytes : io.IOBase
        A readable, seekable, file-like object positioned at offset 0
        (for the `file_bytes` branch it is explicitly rewound).

    """
    stream = self.file_bytes
    if not stream:
        with open(self.filepath, "rb") as handle:
            yield handle
        return

    # The mere presence of a `seek` attribute proves nothing: every
    # io.IOBase subclass has one, and on pipes or HTTP responses it
    # raises. Ask `seekable()` when the object offers it and only fall
    # back to the attribute check for minimal duck-typed streams.
    seekable = getattr(stream, "seekable", None)
    can_seek = seekable() if callable(seekable) else hasattr(stream, "seek")
    if not can_seek:
        # Buffer the whole stream once and keep the buffer on the
        # instance, because the source cannot be read a second time.
        stream = io.BytesIO(stream.read())
        self.file_bytes = stream

    stream.seek(0)
    yield stream
93+
5494 def _get_pages (self , pages ):
5595 """Converts pages string to list of ints.
5696
@@ -73,29 +113,30 @@ def _get_pages(self, pages):
73113 if pages == "1" :
74114 page_numbers .append ({"start" : 1 , "end" : 1 })
75115 else :
76- infile = PdfReader (self .filepath , strict = False )
77-
78- if infile .is_encrypted :
79- infile .decrypt (self .password )
80-
81- if pages == "all" :
82- page_numbers .append ({"start" : 1 , "end" : len (infile .pages )})
83- else :
84- for r in pages .split ("," ):
85- if "-" in r :
86- a , b = r .split ("-" )
87- if b == "end" :
88- b = len (infile .pages )
89- page_numbers .append ({"start" : int (a ), "end" : int (b )})
90- else :
91- page_numbers .append ({"start" : int (r ), "end" : int (r )})
116+ with self .managed_file_context () as f :
117+ infile = PdfReader (f , strict = False )
118+
119+ if infile .is_encrypted :
120+ infile .decrypt (self .password )
121+
122+ if pages == "all" :
123+ page_numbers .append ({"start" : 1 , "end" : len (infile .pages )})
124+ else :
125+ for r in pages .split ("," ):
126+ if "-" in r :
127+ a , b = r .split ("-" )
128+ if b == "end" :
129+ b = len (infile .pages )
130+ page_numbers .append ({"start" : int (a ), "end" : int (b )})
131+ else :
132+ page_numbers .append ({"start" : int (r ), "end" : int (r )})
92133
93134 result = []
94135 for p in page_numbers :
95136 result .extend (range (p ["start" ], p ["end" ] + 1 ))
96137 return sorted (set (result ))
97138
def _save_page(self, filepath: Union[StrByteType, Path, None], page, temp):
    """Saves specified page from PDF into a temporary directory.

    The page is written to ``page-<page>.pdf`` inside `temp`. If
    `get_rotation` reports a rotation for the extracted page, that file
    is renamed to ``p-<page>_rotated.pdf`` and a rotated copy is written
    back under the original ``page-<page>.pdf`` name.

    Parameters
    ----------
    filepath : str | pathlib.Path | None
        Not read by this method: the source PDF is obtained through
        `managed_file_context`. Kept in the signature so existing
        callers keep working.
    page : int
        Page number, 1-based (it indexes ``infile.pages[page - 1]``).
    temp : str
        Tmp directory.

    """
    with self.managed_file_context() as fileobj:
        infile = PdfReader(fileobj, strict=False)
        if infile.is_encrypted:
            infile.decrypt(self.password)
        fpath = os.path.join(temp, f"page-{page}.pdf")
        froot, fext = os.path.splitext(fpath)

        # Extract the single requested page into its own PDF file.
        outfile = PdfWriter()
        outfile.add_page(infile.pages[page - 1])
        with open(fpath, "wb") as f:
            outfile.write(f)

        # Only the layout is needed here; the page dimensions that
        # get_page_layout also returns are ignored.
        layout, _ = get_page_layout(fpath)
        # fix rotated PDF
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)
        if rotation != "":
            fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
            os.rename(fpath, fpath_new)
            # `with` guarantees the renamed file's handle is released
            # even when reading, rotating or writing raises.
            with open(fpath_new, "rb") as instream:
                infile = PdfReader(instream, strict=False)
                if infile.is_encrypted:
                    infile.decrypt(self.password)
                outfile = PdfWriter()
                p = infile.pages[0]
                if rotation == "anticlockwise":
                    p.rotate(90)
                elif rotation == "clockwise":
                    p.rotate(-90)
                outfile.add_page(p)
                with open(fpath, "wb") as f:
                    outfile.write(f)
144187
145188 def parse (
146189 self , flavor = "lattice" , suppress_stdout = False , layout_kwargs = None , ** kwargs
0 commit comments