Skip to content

Commit 3c06966

Browse files
committed
feat(config): Use configs to replace the multi-round testing functionality of scripts.
1 parent 119b2d9 commit 3c06966

File tree

4 files changed

+81
-31
lines changed

4 files changed

+81
-31
lines changed

common_benchmark.py

Lines changed: 75 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212
from pathlib import Path
1313
from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict
1414
import random
15-
15+
from datetime import datetime
16+
from copy import deepcopy
17+
from concurrent.futures import ProcessPoolExecutor, as_completed
1618
import dotenv
1719
import hydra
1820
import openai
@@ -691,6 +693,76 @@ def signal_handler(signum, frame):
691693
os._exit(1) # Force immediate exit
692694

693695

696+
def preprocess_config(cfg: DictConfig, chosen_config_name: str) -> DictConfig:
    """Normalize a composed Hydra config before launching any runs.

    Ensures ``cfg.num_runs`` exists (defaulting to 1) and expands the
    default ``output_dir`` of ``logs/`` into
    ``logs/<benchmark.name>/<config_name>/<timestamp>``.

    Args:
        cfg: The composed Hydra/OmegaConf configuration.
        chosen_config_name: Name of the selected config, used as a path
            component of the derived output directory.

    Returns:
        The same ``cfg`` object, mutated in place.
    """
    # Default num_runs to 1 when the config does not declare it; the
    # struct flag must be relaxed temporarily so a new key can be added.
    if "num_runs" not in cfg:
        OmegaConf.set_struct(cfg, False)
        cfg.num_runs = 1
        OmegaConf.set_struct(cfg, True)
    # Only derive a run-specific output dir when the user kept the
    # default 'logs/' value; an explicitly set path is left untouched.
    if cfg.output_dir == 'logs/':
        timestamp = datetime.now().strftime("%Y%m%d_%H%M")
        # Store as str: OmegaConf rejects non-primitive values such as
        # pathlib.Path when they are assigned into a config node.
        cfg.output_dir = str(
            Path(cfg.output_dir) / cfg.benchmark.name / chosen_config_name / timestamp
        )
    return cfg
707+
708+
709+
def _run_one_process(cfg, args, run_id: int, total_runs: int):
    """Execute a single benchmark run inside a worker process.

    Clones the config, redirects its output into a per-run subdirectory
    (``run_<run_id>``), and drives ``entrypoint`` to completion.

    Args:
        cfg: Base configuration shared by all runs (deep-copied here).
        args: CLI override arguments forwarded to ``setup_hydra_output_dir``.
        run_id: 1-based index of this run.
        total_runs: Total number of runs (used for progress logging only).

    Returns:
        A result dict consumed by ``main_runs_multiprocess``:
        ``{"ok": bool, "run_id": int}``, plus an ``"error"`` repr on failure.
    """
    cfg_new = deepcopy(cfg)
    # str(): OmegaConf config nodes cannot store pathlib.Path values.
    cfg_new.output_dir = str(Path(cfg_new.output_dir) / f"run_{run_id}")
    cfg_new = setup_hydra_output_dir(cfg_new, list(args))

    print("==========================================")
    print(f"🚀 Launching experiment {run_id}/{total_runs}")
    print(f"📝 Output dir: {cfg_new.output_dir}")
    print("==========================================")
    try:
        asyncio.run(entrypoint(cfg_new))
        print(f"Run {run_id} finished.")
        # The caller tallies successes via res["ok"]; returning None here
        # (as the previous version did) would crash it with a TypeError.
        return {"ok": True, "run_id": run_id}
    except Exception as e:
        print(f"Run {run_id} failed: {e!r}")
        return {"ok": False, "run_id": run_id, "error": repr(e)}
725+
726+
727+
def main_runs_multiprocess(cfg, args, max_workers: int | None = None):
    """Run the benchmark ``cfg.num_runs`` times in parallel processes.

    A single run executes inline without a process pool. Multiple runs are
    fanned out to a ``ProcessPoolExecutor``; each worker writes to its own
    ``run_<i>`` subdirectory via ``_run_one_process``.

    Args:
        cfg: Composed configuration; must define ``num_runs``.
        args: CLI override arguments forwarded to every worker.
        max_workers: Pool size cap; defaults to ``min(num_runs, cpu count)``.
    """
    num_runs = int(cfg.num_runs)
    if num_runs <= 1:
        asyncio.run(entrypoint(cfg))
        return

    if max_workers is None:
        max_workers = min(num_runs, os.cpu_count() or 1)

    # Force the 'spawn' start method: each worker runs its own asyncio
    # event loop, which is unsafe in a forked interpreter.
    import multiprocessing as mp
    try:
        mp.set_start_method("spawn", force=True)
    except RuntimeError:
        pass  # start method was already configured by an earlier call

    ok_count, fail_count = 0, 0
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        futures = [
            ex.submit(_run_one_process, cfg, list(args), i + 1, num_runs)
            for i in range(num_runs)
        ]
        # Tally results as workers finish (inside the pool context so
        # completions are observed incrementally rather than after the
        # implicit shutdown wait).
        for fut in as_completed(futures):
            res = fut.result()
            # Defensive: tolerate a worker that returned None instead of
            # the expected {"ok": bool, ...} result dict.
            if res and res.get("ok"):
                ok_count += 1
            else:
                fail_count += 1

    print("==========================================")
    print(f"All {num_runs} runs finished. OK={ok_count}, FAIL={fail_count}")
    print("==========================================")
764+
765+
694766
def main(*args, config_file_name: str = ""):
695767
# Register signal handlers for immediate response to Ctrl+C
696768
signal.signal(signal.SIGINT, signal_handler)
@@ -708,8 +780,8 @@ def main(*args, config_file_name: str = ""):
708780
config_dir=os.path.abspath(config_path()), version_base=None
709781
):
710782
cfg = hydra.compose(config_name=chosen_config_name, overrides=list(args))
711-
cfg = setup_hydra_output_dir(cfg, list(args))
783+
cfg = preprocess_config(cfg, chosen_config_name)
712784

713785
_ = bootstrap_logger(level=LOGGER_LEVEL)
714786
# Tracing functionality removed - miroflow-contrib deleted
715-
asyncio.run(entrypoint(cfg))
787+
main_runs_multiprocess(cfg, args)

docs/mkdocs/docs/futurex.md

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -137,33 +137,18 @@ Expected Format: \boxed{A}, \boxed{B}, or \boxed{C}
137137

138138
### Step 1: Run Multiple Evaluations
139139

140-
Use the multiple runs script to execute several independent evaluations:
140+
Set `num_runs` in the relevant config to the desired number of runs; multiple independent evaluations will be executed and parallel thinking enabled automatically for enhanced performance.
141141

142-
```bash title="Run Multiple Evaluations"
143-
./scripts/run_evaluate_multiple_runs_futurex.sh
144-
```
145-
146-
This script will:
142+
It will:
147143

148-
- Run 3 independent evaluations by default (configurable with `NUM_RUNS`)
144+
- Run multiple independent evaluations by default (configurable with `num_runs`)
149145
- Execute all tasks in parallel for efficiency
150146
- Generate separate result files for each run in `run_1/`, `run_2/`, etc.
151-
- Create a consolidated `futurex_submission.jsonl` file with voting results
152147

153148
### Step 2: Customize Multiple Runs
154149

155-
You can customize the evaluation parameters:
156-
157-
```bash title="Custom Multiple Runs"
158-
# Run 5 evaluations with limited tasks for testing
159-
NUM_RUNS=5 MAX_TASKS=10 ./scripts/run_evaluate_multiple_runs_futurex.sh
150+
You can customize the evaluation parameters in the relevant config, including `num_runs`, `max_tasks`, `max_concurrent`, etc.
160151

161-
# Use different agent configuration
162-
AGENT_SET=agent_gaia-validation ./scripts/run_evaluate_multiple_runs_futurex.sh
163-
164-
# Adjust concurrency for resource management
165-
MAX_CONCURRENT=3 ./scripts/run_evaluate_multiple_runs_futurex.sh
166-
```
167152

168153
### Step 3: Voting and Aggregation
169154

docs/mkdocs/docs/gaia_validation_mirothinker.md

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,8 @@ Follow the [progress monitoring instructions](gaia_validation_prerequisites.md#p
6161

6262
Due to performance variance in MiroThinker models, it's recommended to run multiple evaluations for more reliable results.
6363

64-
```bash title="Run Multiple MiroThinker Evaluations"
65-
bash ./scripts/run_evaluate_multiple_runs_mirothinker_gaia-validation.sh
66-
```
6764

68-
This script runs 3 evaluations in parallel and calculates average scores. You can modify `NUM_RUNS` in the script to change the number of runs.
65+
Set `num_runs` in the relevant config to the desired number of runs; multiple evaluations will be executed and parallel thinking enabled automatically for enhanced performance.
6966

7067
---
7168

docs/mkdocs/docs/xbench_ds.md

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,11 +87,7 @@ uv run main.py common-benchmark \
8787
- Uses another agent (o3 by default) to make final decisions based on equivalence and source reliability criteria
8888
- Provides more robust and accurate final answers
8989

90-
Execute the following command to run multiple xbench-DeepSearch evaluations and automatically enable parallel thinking for enhanced performance.
91-
92-
```bash title="Multiple runs with parallel thinking post-processing"
93-
bash scripts/run_evaluate_mulitple_runs_xbench-ds.sh
94-
```
90+
Set `num_runs` in the relevant config to the desired number of runs; multiple xbench-DeepSearch evaluations will be executed and parallel thinking enabled automatically for enhanced performance.
9591

9692
### Running Parallel Thinking Analysis alone
9793

0 commit comments

Comments
 (0)