
Commit 55a27bd

fix(lint): to pass lint for some files (#45)
to pass lint for some files
1 parent 28366a7 commit 55a27bd

File tree

4 files changed (+93, -54 lines):

utils/extract_futurex_results.py
utils/prepare_benchmark/gen_futurex.py
utils/prepare_benchmark/gen_xbench_ds.py
utils/progress_check/check_futurex_progress.py


utils/extract_futurex_results.py

Lines changed: 39 additions & 33 deletions
@@ -15,10 +15,10 @@
 Usage:
     # Extract from single run
     python extract_futurex_results.py logs/futurex-online-test
-
+
     # Aggregate multiple runs (if run_* subdirectories exist)
    python extract_futurex_results.py logs/futurex-online-multi-runs
-
+
     # Specify output file
     python extract_futurex_results.py logs/futurex-online-test -o my_submission.jsonl
 """
@@ -82,25 +82,27 @@ def discover_runs(results_dir: str) -> List[str]:
 def extract_predictions_from_file(file_path: str) -> Dict[str, str]:
     """
     Extract predictions from a single benchmark_results.jsonl file.
-
+
     Args:
         file_path: Path to benchmark_results.jsonl file
-
+
     Returns:
         Dictionary mapping task_id to prediction
     """
     predictions = {}
-
+
     with open(file_path, "r", encoding="utf-8") as fin:
         for line_num, line in enumerate(fin, 1):
             line = line.strip()
             if not line:
                 continue
-
+
             try:
                 rec = json.loads(line)
             except json.JSONDecodeError as e:
-                print(f"Warning: Skipping malformed JSON at line {line_num} in {file_path}: {e}")
+                print(
+                    f"Warning: Skipping malformed JSON at line {line_num} in {file_path}: {e}"
+                )
                 continue

             task_id = rec.get("task_id")
@@ -110,17 +112,19 @@ def extract_predictions_from_file(file_path: str) -> Dict[str, str]:
             if task_id and pred is not None and str(pred).strip():
                 pred_str = str(pred).strip()
                 predictions[task_id] = pred_str
-
+
     return predictions


-def aggregate_multiple_runs(results_dir: str) -> Tuple[Dict[str, List[str]], Dict[str, int]]:
+def aggregate_multiple_runs(
+    results_dir: str,
+) -> Tuple[Dict[str, List[str]], Dict[str, int]]:
     """
     Aggregate predictions from multiple runs in subdirectories.
-
+
     Args:
         results_dir: Directory containing run_* subdirectories
-
+
     Returns:
         Tuple of (predictions_by_task, first_seen_order)
     """
@@ -145,7 +149,7 @@ def aggregate_multiple_runs(results_dir: str) -> Tuple[Dict[str, List[str]], Dic
     for run_dir in runs:
         fpath = os.path.join(run_dir, "benchmark_results.jsonl")
         print(f"Reading: {fpath}")
-
+
         with open(fpath, "r", encoding="utf-8") as fin:
             for line in fin:
                 total_lines += 1
@@ -172,41 +176,41 @@ def aggregate_multiple_runs(results_dir: str) -> Tuple[Dict[str, List[str]], Dic

     print(f"Collected from {len(runs)} run(s).")
     print(f"Read {total_lines} line(s), accepted {used_lines} record(s).")
-
+
     return preds_by_task, first_seen_order


 def process_single_run(results_dir: str) -> Dict[str, str]:
     """
     Process a single run (direct benchmark_results.jsonl file).
-
+
     Args:
         results_dir: Directory containing benchmark_results.jsonl
-
+
     Returns:
         Dictionary mapping task_id to prediction
     """
     file_path = os.path.join(results_dir, "benchmark_results.jsonl")
-
+
     if not os.path.isfile(file_path):
         raise FileNotFoundError(f"benchmark_results.jsonl not found in: {results_dir}")
-
+
     print(f"Reading single run: {file_path}")
     predictions = extract_predictions_from_file(file_path)
     print(f"Extracted {len(predictions)} predictions from single run.")
-
+
     return predictions


 def write_submission_file(
-    predictions: Dict[str, str],
-    output_file: str,
+    predictions: Dict[str, str],
+    output_file: str,
     is_aggregated: bool = False,
-    vote_counts: Dict[str, Dict[str, int]] = None
+    vote_counts: Dict[str, Dict[str, int]] = None,
 ) -> None:
     """
     Write predictions to FutureX submission format.
-
+
     Args:
         predictions: Dictionary mapping task_id to prediction
         output_file: Output file path
@@ -217,14 +221,14 @@ def write_submission_file(
     with open(output_file, "w", encoding="utf-8") as out:
         for task_id in sorted(predictions.keys()):
             prediction = predictions[task_id]
-
+
             # Create submission record
             record = {"id": task_id, "prediction": prediction}
-
+
             # Add vote information for aggregated runs
             if is_aggregated and vote_counts and task_id in vote_counts:
                 record["vote_counts"] = vote_counts[task_id]
-
+
             out.write(json.dumps(record, ensure_ascii=False) + "\n")
             num_tasks += 1

@@ -238,7 +242,7 @@ def write_submission_file(
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         description="Extract predictions from MiroFlow benchmark results and create FutureX submission files. "
-        "Supports both single runs and multi-run aggregation with majority voting."
+        "Supports both single runs and multi-run aggregation with majority voting."
     )
     parser.add_argument(
         "results_dir",
@@ -257,7 +261,7 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--single",
-        action="store_true",
+        action="store_true",
         help="Force single run mode (look for direct benchmark_results.jsonl)",
     )
     return parser.parse_args()
@@ -279,7 +283,7 @@ def main() -> None:
     # Determine processing mode
     runs = discover_runs(results_dir)
     single_file = os.path.join(results_dir, "benchmark_results.jsonl")
-
+
     if args.aggregate:
         if not runs:
             raise FileNotFoundError(
@@ -312,18 +316,20 @@ def main() -> None:
     if mode == "aggregate":
         # Multi-run aggregation with majority voting
         preds_by_task, first_seen_order = aggregate_multiple_runs(results_dir)
-
+
         # Apply majority voting
         final_predictions = {}
         vote_counts = {}
-
+
         for task_id in preds_by_task:
             voted_pred, counts = majority_vote(preds_by_task[task_id], first_seen_order)
             final_predictions[task_id] = voted_pred
             vote_counts[task_id] = counts
-
-        write_submission_file(final_predictions, output_file, is_aggregated=True, vote_counts=vote_counts)
-
+
+        write_submission_file(
+            final_predictions, output_file, is_aggregated=True, vote_counts=vote_counts
+        )
+
     else:
         # Single run extraction
         predictions = process_single_run(results_dir)
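
Note: the aggregation path above calls a majority_vote helper that is not part of this diff. As a rough illustration only, here is a minimal sketch of what such a helper could look like, assuming it returns the most frequent prediction for a task plus the raw counts; the tie-break via first_seen_order is a guess at the real behavior, not taken from the repository.

from collections import Counter
from typing import Dict, List, Tuple


def majority_vote(
    predictions: List[str], first_seen_order: Dict[str, int]
) -> Tuple[str, Dict[str, int]]:
    # Count how often each distinct prediction appears across runs.
    counts = Counter(predictions)
    best = max(counts.values())
    # Break ties by whichever candidate was seen first (assumed semantics of
    # first_seen_order; the actual helper may differ).
    tied = [p for p, c in counts.items() if c == best]
    winner = min(tied, key=lambda p: first_seen_order.get(p, len(first_seen_order)))
    return winner, dict(counts)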

utils/prepare_benchmark/gen_futurex.py

Lines changed: 8 additions & 8 deletions
@@ -12,16 +12,16 @@
 def gen_futurex(hf_token: str) -> Generator[Task, None, None]:
     """
     Generate Futurex-Online dataset tasks in MiroFlow format
-
+
     Args:
         hf_token: Hugging Face token for dataset access
-
+
     Yields:
         Task: Standardized task objects
     """
     # Load the Futurex-Online dataset
     dataset = load_dataset("futurex-ai/Futurex-Online")
-
+
     # Process each split in the dataset
     for split_name, split_data in dataset.items():
         for idx, sample in enumerate(split_data):
@@ -30,26 +30,26 @@ def gen_futurex(hf_token: str) -> Generator[Task, None, None]:
             task_question = sample.get("prompt", "")
             end_time = sample.get("end_time", "")
             level = sample.get("level", "")
-
+
             # Create metadata dictionary
             metadata: MutableMapping = {
                 "level": level,
                 "end_time": end_time,
                 "source": "futurex-ai/Futurex-Online",
                 "split": split_name,
                 "original_id": sample.get("id", ""),
-                "dataset_name": "Futurex-Online"
+                "dataset_name": "Futurex-Online",
             }
-
+
             # Create standardized Task object
             task = Task(
                 task_id=task_id,
                 task_question=task_question,
                 ground_truth="",  # Futurex-Online doesn't have ground truth
-                file_path=None, # No file attachments
+                file_path=None,  # No file attachments
                 metadata=metadata,
             )
-
+
             yield task

     return
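
For context, a hypothetical driver for the generator above might look like the following. The import path simply mirrors the file location, attribute access on Task assumes a dataclass-like object, and HF_TOKEN is an assumed environment variable; none of this is prescribed by the diff.

import os

from utils.prepare_benchmark.gen_futurex import gen_futurex

# Iterate lazily and inspect the first generated task (illustrative only).
for task in gen_futurex(hf_token=os.environ.get("HF_TOKEN", "")):
    print(task.task_id, task.metadata["split"], task.metadata["end_time"])
    break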

utils/prepare_benchmark/gen_xbench_ds.py

Lines changed: 11 additions & 4 deletions
@@ -14,10 +14,11 @@ def xor_decrypt(data, key):
     """
     XOR decrypt data with a key
     """
-    key_bytes = key.encode('utf-8')
+    key_bytes = key.encode("utf-8")
     key_length = len(key_bytes)
     return bytes([data[i] ^ key_bytes[i % key_length] for i in range(len(data))])

+
 def gen_xbench_ds(hf_token: str) -> Generator[Task, None, None]:
     dataset = load_dataset(
         "xbench/DeepSearch",
@@ -28,9 +29,15 @@ def gen_xbench_ds(hf_token: str) -> Generator[Task, None, None]:
         task_id = metadata.pop("id")

         key = metadata.pop("canary")
-        prompt = xor_decrypt(base64.b64decode(metadata.pop("prompt")), key).decode('utf-8')
-        answer = xor_decrypt(base64.b64decode(metadata.pop("answer")), key).decode('utf-8')
-        reference_steps = xor_decrypt(base64.b64decode(metadata.pop("reference_steps")), key).decode('utf-8')
+        prompt = xor_decrypt(base64.b64decode(metadata.pop("prompt")), key).decode(
+            "utf-8"
+        )
+        answer = xor_decrypt(base64.b64decode(metadata.pop("answer")), key).decode(
+            "utf-8"
+        )
+        reference_steps = xor_decrypt(
+            base64.b64decode(metadata.pop("reference_steps")), key
+        ).decode("utf-8")
         task = Task(
             task_id=task_id,
             task_question=prompt,
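
Since XOR is symmetric, the same helper both encrypts and decrypts. A quick round-trip sketch (with a made-up key and prompt; real canary keys come from the dataset records) shows how the base64-encoded fields above are recovered:

import base64


def xor_decrypt(data, key):
    # Same helper as in the diff above, repeated so the snippet is self-contained.
    key_bytes = key.encode("utf-8")
    key_length = len(key_bytes)
    return bytes([data[i] ^ key_bytes[i % key_length] for i in range(len(data))])


plaintext = "What is the capital of France?"
key = "example-canary"  # hypothetical; the real key is the record's "canary" field

encrypted = xor_decrypt(plaintext.encode("utf-8"), key)  # XOR "encrypt"
stored = base64.b64encode(encrypted).decode("ascii")     # as it would be stored
restored = xor_decrypt(base64.b64decode(stored), key).decode("utf-8")
assert restored == plaintext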

utils/progress_check/check_futurex_progress.py

Lines changed: 35 additions & 9 deletions
@@ -4,7 +4,7 @@

 This script analyzes Futurex-Online benchmark results in a log folder to count:
 - Total files processed
-- Files with status "completed"
+- Files with status "completed"
 - Files with predictions (final_boxed_answer)
 - Files with errors

@@ -68,7 +68,6 @@ def analyze_futurex_results(log_folder: str) -> Dict[str, int]:
             status = data.get("status", "").lower()
             final_answer = data.get("final_boxed_answer", "")
             error_msg = data.get("error", "")
-            judge_result = data.get("judge_result", "")

             # Count by status
             if status == "completed":
@@ -88,7 +87,14 @@ def analyze_futurex_results(log_folder: str) -> Dict[str, int]:
             # Count by prediction availability
             if final_answer and final_answer.strip():
                 results["with_predictions"] += 1
-                prediction_files.append((json_file.name, final_answer[:100] + "..." if len(final_answer) > 100 else final_answer))
+                prediction_files.append(
+                    (
+                        json_file.name,
+                        final_answer[:100] + "..."
+                        if len(final_answer) > 100
+                        else final_answer,
+                    )
+                )
             else:
                 results["without_predictions"] += 1

@@ -136,11 +142,17 @@ def display_results(
     with_errors = results["with_errors"]

     print(f"Total files processed: {total:3d}")
-    print(f"Files with status 'completed': {completed:3d} ({completed/total*100:.1f}%)")
+    print(
+        f"Files with status 'completed': {completed:3d} ({completed/total*100:.1f}%)"
+    )
     print(f"Files with status 'running': {running:3d} ({running/total*100:.1f}%)")
     print(f"Files with status 'failed': {failed:3d} ({failed/total*100:.1f}%)")
-    print(f"Files with predictions: {with_predictions:3d} ({with_predictions/total*100:.1f}%)")
-    print(f"Files with errors: {with_errors:3d} ({with_errors/total*100:.1f}%)")
+    print(
+        f"Files with predictions: {with_predictions:3d} ({with_predictions/total*100:.1f}%)"
+    )
+    print(
+        f"Files with errors: {with_errors:3d} ({with_errors/total*100:.1f}%)"
+    )
     print(f"Files with parse errors: {results['parse_errors']:3d}")

     if completed > 0:
@@ -200,10 +212,24 @@ def main():

     try:
         print(f"Analyzing Futurex-Online benchmark results in: {log_folder}")
-        results, completed_files, running_files, failed_files, prediction_files, error_files, parse_error_files = analyze_futurex_results(
-            log_folder
+        (
+            results,
+            completed_files,
+            running_files,
+            failed_files,
+            prediction_files,
+            error_files,
+            parse_error_files,
+        ) = analyze_futurex_results(log_folder)
+        display_results(
+            results,
+            completed_files,
+            running_files,
+            failed_files,
+            prediction_files,
+            error_files,
+            parse_error_files,
         )
-        display_results(results, completed_files, running_files, failed_files, prediction_files, error_files, parse_error_files)

     except Exception as e:
         print(f"Error: {e}")
