1818from .utils import get_text_objects
1919from .utils import is_url
2020
21+ import warnings
2122
2223class PDFHandler :
2324 """Handles all operations like temp directory creation, splitting
@@ -36,7 +37,7 @@ class PDFHandler:
3637
3738 """
3839
39- def __init__ (self , filepath : Union [StrByteType , Path ], pages = "1" , password = None , multi = [] ):
40+ def __init__ (self , filepath : Union [StrByteType , Path ], pages = "1" , password = None , multi = {} ):
4041 if is_url (filepath ):
4142 filepath = download_url (filepath )
4243 self .filepath : Union [StrByteType , Path ] = filepath
@@ -188,35 +189,39 @@ def parse(
188189 if parallel and len (self .pages ) > 1 and cpu_count > 1 :
189190 with mp .get_context ("spawn" ).Pool (processes = cpu_count ) as pool :
190191 jobs = []
191- for p in self .pages :
192-
192+ for i , p in enumerate ( self .pages , 1 ) :
193+ p_no = str ( i ) # [start] # [-5]
193194 page_kwargs = kwargs
194195 page_parser = parser
195-
196- if p in self .multi :
196+ # assert p == 0
197+ # print("test")
198+ # warnings.warn(UserWarning("{}".format(p)))
199+ if p_no in self .multi :
200+ print (p + " is found in " + self .multi )
197201 page_kwargs .update (self .multi [p_no ])
198202 page_parser = Lattice (** page_kwargs ) if flavor == 'lattice' else Stream (** page_kwargs )
199203
200204 j = pool .apply_async (
201- self ._parse_page ,(p , tempdir , parser , suppress_stdout , layout_kwargs )
205+ self ._parse_page ,(p , tempdir , page_parser , suppress_stdout , layout_kwargs )
202206 )
203207 jobs .append (j )
204208
205209 for j in jobs :
206210 t = j .get ()
207211 tables .extend (t )
208212 else :
209- for p in self .pages :
210- # p_no = p
213+ for i , p in enumerate ( self .pages , 1 ) :
214+ p_no = str ( i ) # [start] # [-5]
211215
212216 page_kwargs = kwargs
213217 page_parser = parser
214218
215- if p in self .multi :
219+ if p_no in self .multi :
220+ print (i ,p ) # debug
216221 page_kwargs .update (self .multi [p_no ])
217222 page_parser = Lattice (** page_kwargs ) if flavor == 'lattice' else Stream (** page_kwargs )
218223
219- t = self ._parse_page (p , tempdir , parser , suppress_stdout , layout_kwargs )
224+ t = self ._parse_page (p , tempdir , page_parser , suppress_stdout , layout_kwargs )
220225 tables .extend (t )
221226
222227 return TableList (sorted (tables ))
0 commit comments