@@ -129,14 +129,32 @@ def scan_pdf_for_summary(pdf_path):
129129
130130
def parse_client_info(page_text):
    """Extract the labelled client fields from a page of text.

    Every line of ``page_text`` that starts with one of the recognized
    labels immediately followed by a colon contributes one entry to the
    result. Lines that start with anything else are ignored.

    Args:
        page_text: Text of the page, one ``Label: value`` pair per line.

    Returns:
        A dict mapping each label found (without the trailing colon) to the
        text after the label, stripped of surrounding whitespace. Labels
        that do not appear in ``page_text`` are absent from the dict. If a
        label appears on more than one line, the last occurrence wins.
    """
    labels = (
        "Alleged Onset",
        "Application",
        "Claim Type",
        "Claimant",
        "Last Change",
        "Last Insured",
        "SSN",
    )
    prefixes = [(label, label + ":") for label in labels]

    client_info = {}
    for line in page_text.splitlines():
        for label, prefix in prefixes:
            if line.startswith(prefix):
                # Slice the label off instead of splitting on ":" so that a
                # value which itself contains a colon (e.g. a time of day)
                # is kept whole rather than cut at its first colon.
                client_info[label] = line[len(prefix):].strip()
                break
    return client_info
141159
142160
@@ -152,41 +170,49 @@ def parse_work_history(page_text):
152170 'job_title' : e .split (": " )[1 ],
153171 'intensity' : '' ,
154172 'skill_level' : '' ,
155- }
173+ }
156174 return work_history
157175
158176
def get_exhibits_from_pdf(doc):
    """Collect the exhibits listed in a PDF's outline.

    Every level-2 outline entry is parsed with ``parse_title`` and stored
    as an ``Exhibit`` with an empty comment list. Entries at other levels
    are ignored.

    Args:
        doc: Object exposing ``get_outlines()``, which must yield 5-tuples
            of ``(level, title, dest, a, se)``.
            NOTE(review): presumably a pdfminer ``PDFDocument`` - confirm.

    Returns:
        A dict mapping exhibit id (as returned by ``parse_title``) to its
        ``Exhibit``. Empty when the outline has no level-2 entries.

    Raises:
        Whatever ``doc.get_outlines()`` or ``parse_title`` raises; nothing
        is caught here.
    """
    exhibits = {}

    # The limit is raised only for the duration of the outline traversal and
    # is always put back to whatever it was before, even if the traversal
    # fails part-way through.
    previous_limit = sys.getrecursionlimit()
    sys.setrecursionlimit(999999999)
    try:
        for (level, title, dest, a, se) in doc.get_outlines():
            if level == 2:
                exhibit_id, provider_name, from_date, to_date = parse_title(title)
                exhibits[exhibit_id] = Exhibit(
                    provider_name=provider_name,
                    from_date=from_date,
                    to_date=to_date,
                    comments=[],
                )
    finally:
        sys.setrecursionlimit(previous_limit)

    return exhibits
188191
189192
def parse_title(title):
    """Split an outline title into exhibit id, provider name and dates.

    The title is split on ``":"``. The first piece is the id, the second
    is the provider name (with the ``"Doc. Dt."`` / ``"Tmt. Dt."`` markers
    removed), and the third, when present, holds one date or a
    ``from-to`` date range. A trailing ``"(N page..."`` suffix is dropped.

    Args:
        title: Outline title string.

    Returns:
        A tuple ``(id, provider_name, from_date, to_date)`` of strings.
        Both dates are ``""`` when the title has no date piece, and both
        are the same value when only a single date is given. A title that
        contains no colon at all is returned as the id with the other three
        fields empty.
    """
    page_suffix = r"\(\d* page.*"
    split_title = title.split(":")
    exhibit_id = split_title[0]

    # No colon at all: there is nothing to parse beyond the id itself.
    if len(split_title) == 1:
        return (exhibit_id, "", "", "")

    provider_name = (
        split_title[1].replace("Doc. Dt.", "").replace("Tmt. Dt.", "").strip()
    )

    # No date piece: the page-count suffix is attached to the name instead.
    if len(split_title) == 2:
        provider_name = re.sub(page_suffix, "", provider_name).strip()
        return (exhibit_id, provider_name, "", "")

    # Strip each piece so a range written as "a - b" does not leave stray
    # spaces on either date.
    provider_dates = [
        piece.strip()
        for piece in re.sub(page_suffix, "", split_title[2]).strip().split("-")
    ]

    # A single date serves as both ends of the range.
    if len(provider_dates) == 1:
        return (exhibit_id, provider_name, provider_dates[0], provider_dates[0])

    return (exhibit_id, provider_name, provider_dates[0], provider_dates[1])
214+
215+
190216def parse_page_comments (annots ):
191217
192218 page_comments = []
0 commit comments