@@ -41,6 +41,10 @@ def write_csv(file_path: str, data: list, headers: list = None):
4141 for row in data :
4242 headers .update (row .keys ())
4343 if headers :
44+ if "abstract" in headers :
45+ headers .remove ("abstract" )
46+ if "primary_class" in headers :
47+ headers .remove ("primary_class" )
4448 if "id" in headers :
4549 headers .remove ("id" ) # Remove "id" if it's in the headers
4650 headers = ["id" ] + sorted (
@@ -49,21 +53,29 @@ def write_csv(file_path: str, data: list, headers: list = None):
4953 else :
5054 headers = sorted (headers ) # Just sort if "id" is not present
5155 header_set = set (headers )
52- for row in data :
53- missing_fields = header_set - row .keys () # Find missing fields in the row
54- if missing_fields :
55- # Add missing fields with empty values in one go
56- row .update ({field : "" for field in missing_fields })
57-
56+ else :
57+ header_set = None
5858 if len (data ) > 0 :
5959 with open (file_path , "w" , newline = "" ) as file :
6060 writer = csv .DictWriter (
6161 file , fieldnames = headers , delimiter = DEFAULT_DELIMITER
6262 )
6363 writer .writeheader ()
6464 for row in data :
65+ if header_set :
66+ missing_fields = (
67+ header_set - row .keys ()
68+ ) # Find missing fields in the row
69+ redundant_fields = row .keys () - header_set
70+ if missing_fields :
71+ # Add missing fields with empty values in one go
72+ row .update ({field : "" for field in missing_fields })
73+ for field in redundant_fields :
74+ del row [field ] # Remove field from the row
75+ if "authors" in row and "author" in header_set :
76+ if len (row ["authors" ]) > 0 :
77+ row ["author" ] = row ["authors" ][0 ]
6578 try :
66- # Exclude the 'embedding' field from being written to the CSV
6779 writer .writerow (row )
6880 except Exception as e :
6981 print (f"Error: { e } " )
0 commit comments