Usage:
    # Extract from single run
    python extract_futurex_results.py logs/futurex-online-test

    # Aggregate multiple runs (if run_* subdirectories exist)
    python extract_futurex_results.py logs/futurex-online-multi-runs

    # Specify output file
    python extract_futurex_results.py logs/futurex-online-test -o my_submission.jsonl
"""
@@ -82,25 +82,27 @@ def discover_runs(results_dir: str) -> List[str]:
8282def extract_predictions_from_file (file_path : str ) -> Dict [str , str ]:
8383 """
8484 Extract predictions from a single benchmark_results.jsonl file.
85-
85+
8686 Args:
8787 file_path: Path to benchmark_results.jsonl file
88-
88+
8989 Returns:
9090 Dictionary mapping task_id to prediction
9191 """
9292 predictions = {}
93-
93+
9494 with open (file_path , "r" , encoding = "utf-8" ) as fin :
9595 for line_num , line in enumerate (fin , 1 ):
9696 line = line .strip ()
9797 if not line :
9898 continue
99-
99+
100100 try :
101101 rec = json .loads (line )
102102 except json .JSONDecodeError as e :
103- print (f"Warning: Skipping malformed JSON at line { line_num } in { file_path } : { e } " )
103+ print (
104+ f"Warning: Skipping malformed JSON at line { line_num } in { file_path } : { e } "
105+ )
104106 continue
105107
106108 task_id = rec .get ("task_id" )
@@ -110,17 +112,19 @@ def extract_predictions_from_file(file_path: str) -> Dict[str, str]:
110112 if task_id and pred is not None and str (pred ).strip ():
111113 pred_str = str (pred ).strip ()
112114 predictions [task_id ] = pred_str
113-
115+
114116 return predictions
115117
116118
117- def aggregate_multiple_runs (results_dir : str ) -> Tuple [Dict [str , List [str ]], Dict [str , int ]]:
119+ def aggregate_multiple_runs (
120+ results_dir : str ,
121+ ) -> Tuple [Dict [str , List [str ]], Dict [str , int ]]:
118122 """
119123 Aggregate predictions from multiple runs in subdirectories.
120-
124+
121125 Args:
122126 results_dir: Directory containing run_* subdirectories
123-
127+
124128 Returns:
125129 Tuple of (predictions_by_task, first_seen_order)
126130 """
@@ -145,7 +149,7 @@ def aggregate_multiple_runs(results_dir: str) -> Tuple[Dict[str, List[str]], Dic
145149 for run_dir in runs :
146150 fpath = os .path .join (run_dir , "benchmark_results.jsonl" )
147151 print (f"Reading: { fpath } " )
148-
152+
149153 with open (fpath , "r" , encoding = "utf-8" ) as fin :
150154 for line in fin :
151155 total_lines += 1
@@ -172,41 +176,41 @@ def aggregate_multiple_runs(results_dir: str) -> Tuple[Dict[str, List[str]], Dic
172176
173177 print (f"Collected from { len (runs )} run(s)." )
174178 print (f"Read { total_lines } line(s), accepted { used_lines } record(s)." )
175-
179+
176180 return preds_by_task , first_seen_order
177181
178182
def process_single_run(results_dir: str) -> Dict[str, str]:
    """
    Process a single run (direct benchmark_results.jsonl file).

    Args:
        results_dir: Directory containing benchmark_results.jsonl

    Returns:
        Dictionary mapping task_id to prediction

    Raises:
        FileNotFoundError: If benchmark_results.jsonl is absent from results_dir.
    """
    results_file = os.path.join(results_dir, "benchmark_results.jsonl")

    # Fail fast with a descriptive error rather than letting open() fail later.
    if not os.path.isfile(results_file):
        raise FileNotFoundError(f"benchmark_results.jsonl not found in: {results_dir}")

    print(f"Reading single run: {results_file}")
    extracted = extract_predictions_from_file(results_file)
    print(f"Extracted {len(extracted)} predictions from single run.")
    return extracted
199203
200204
201205def write_submission_file (
202- predictions : Dict [str , str ],
203- output_file : str ,
206+ predictions : Dict [str , str ],
207+ output_file : str ,
204208 is_aggregated : bool = False ,
205- vote_counts : Dict [str , Dict [str , int ]] = None
209+ vote_counts : Dict [str , Dict [str , int ]] = None ,
206210) -> None :
207211 """
208212 Write predictions to FutureX submission format.
209-
213+
210214 Args:
211215 predictions: Dictionary mapping task_id to prediction
212216 output_file: Output file path
@@ -217,14 +221,14 @@ def write_submission_file(
217221 with open (output_file , "w" , encoding = "utf-8" ) as out :
218222 for task_id in sorted (predictions .keys ()):
219223 prediction = predictions [task_id ]
220-
224+
221225 # Create submission record
222226 record = {"id" : task_id , "prediction" : prediction }
223-
227+
224228 # Add vote information for aggregated runs
225229 if is_aggregated and vote_counts and task_id in vote_counts :
226230 record ["vote_counts" ] = vote_counts [task_id ]
227-
231+
228232 out .write (json .dumps (record , ensure_ascii = False ) + "\n " )
229233 num_tasks += 1
230234
@@ -238,7 +242,7 @@ def write_submission_file(
238242def parse_args () -> argparse .Namespace :
239243 parser = argparse .ArgumentParser (
240244 description = "Extract predictions from MiroFlow benchmark results and create FutureX submission files. "
241- "Supports both single runs and multi-run aggregation with majority voting."
245+ "Supports both single runs and multi-run aggregation with majority voting."
242246 )
243247 parser .add_argument (
244248 "results_dir" ,
@@ -257,7 +261,7 @@ def parse_args() -> argparse.Namespace:
257261 )
258262 parser .add_argument (
259263 "--single" ,
260- action = "store_true" ,
264+ action = "store_true" ,
261265 help = "Force single run mode (look for direct benchmark_results.jsonl)" ,
262266 )
263267 return parser .parse_args ()
@@ -279,7 +283,7 @@ def main() -> None:
279283 # Determine processing mode
280284 runs = discover_runs (results_dir )
281285 single_file = os .path .join (results_dir , "benchmark_results.jsonl" )
282-
286+
283287 if args .aggregate :
284288 if not runs :
285289 raise FileNotFoundError (
@@ -312,18 +316,20 @@ def main() -> None:
312316 if mode == "aggregate" :
313317 # Multi-run aggregation with majority voting
314318 preds_by_task , first_seen_order = aggregate_multiple_runs (results_dir )
315-
319+
316320 # Apply majority voting
317321 final_predictions = {}
318322 vote_counts = {}
319-
323+
320324 for task_id in preds_by_task :
321325 voted_pred , counts = majority_vote (preds_by_task [task_id ], first_seen_order )
322326 final_predictions [task_id ] = voted_pred
323327 vote_counts [task_id ] = counts
324-
325- write_submission_file (final_predictions , output_file , is_aggregated = True , vote_counts = vote_counts )
326-
328+
329+ write_submission_file (
330+ final_predictions , output_file , is_aggregated = True , vote_counts = vote_counts
331+ )
332+
327333 else :
328334 # Single run extraction
329335 predictions = process_single_run (results_dir )
0 commit comments