diff --git a/common_benchmark.py b/common_benchmark.py
index 77c1f32..c175480 100644
--- a/common_benchmark.py
+++ b/common_benchmark.py
@@ -12,7 +12,9 @@ from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict
 import random
-
+from datetime import datetime
+from copy import deepcopy
+from concurrent.futures import ProcessPoolExecutor, as_completed
 
 import dotenv
 import hydra
 import openai
@@ -691,6 +693,76 @@ def signal_handler(signum, frame):
     os._exit(1)  # Force immediate exit
 
 
+def preprocess_config(cfg: DictConfig, chosen_config_name: str) -> DictConfig:
+    # set num_runs to 1 if not set
+    if "num_runs" not in cfg:
+        OmegaConf.set_struct(cfg, False)
+        cfg.num_runs = 1
+        OmegaConf.set_struct(cfg, True)
+    # set output_dir to logs/benchmark.name/agent_set/timestamp if not set
+    if cfg.output_dir == "logs/":
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M")
+        cfg.output_dir = (
+            Path(cfg.output_dir) / cfg.benchmark.name / chosen_config_name / timestamp
+        )
+    return cfg
+
+
+def _run_one_process(cfg, args, run_id: int, total_runs: int):
+    # Give each run its own sub-directory so parallel runs never collide.
+    cfg_new = deepcopy(cfg)
+    cfg_new.output_dir = Path(cfg_new.output_dir) / f"run_{run_id}"
+    cfg_new = setup_hydra_output_dir(cfg_new, list(args))
+
+    print("==========================================")
+    print(f"🚀 Launching experiment {run_id}/{total_runs}")
+    print(f"📝 Output dir: {cfg_new.output_dir}")
+    print("==========================================")
+    try:
+        asyncio.run(entrypoint(cfg_new))
+        print(f"Run {run_id} finished.")
+        return {"ok": True}
+    except Exception as e:
+        print(f"Run {run_id} failed: {e!r}")
+        return {"ok": False}
+
+
+def main_runs_multiprocess(cfg, args, max_workers: int | None = None):
+    num_runs = int(cfg.num_runs)
+    if num_runs <= 1:
+        asyncio.run(entrypoint(cfg))
+        return
+
+    if max_workers is None:
+        hw = os.cpu_count() or 1
+        max_workers = min(num_runs, hw)
+
+    futures = []
+    import multiprocessing as mp
+
+    try:
+        mp.set_start_method("spawn", force=True)
+    except RuntimeError:
+        pass
+
+    with ProcessPoolExecutor(max_workers=max_workers) as ex:
+        for i in range(num_runs):
+            run_id = i + 1
+            fut = ex.submit(_run_one_process, cfg, list(args), run_id, num_runs)
+            futures.append(fut)
+
+        ok_count, fail_count = 0, 0
+        for fut in as_completed(futures):
+            res = fut.result()
+            if res["ok"]:
+                ok_count += 1
+            else:
+                fail_count += 1
+
+    print("==========================================")
+    print(f"All {num_runs} runs finished. OK={ok_count}, FAIL={fail_count}")
+    print("==========================================")
+
+
 def main(*args, config_file_name: str = ""):
     # Register signal handlers for immediate response to Ctrl+C
     signal.signal(signal.SIGINT, signal_handler)
@@ -708,8 +780,8 @@ def main(*args, config_file_name: str = ""):
         config_dir=os.path.abspath(config_path()), version_base=None
     ):
         cfg = hydra.compose(config_name=chosen_config_name, overrides=list(args))
-        cfg = setup_hydra_output_dir(cfg, list(args))
+        cfg = preprocess_config(cfg, chosen_config_name)
         _ = bootstrap_logger(level=LOGGER_LEVEL)
         # Tracing functionality removed - miroflow-contrib deleted
-        asyncio.run(entrypoint(cfg))
+        main_runs_multiprocess(cfg, args)
 
diff --git a/docs/mkdocs/docs/futurex.md b/docs/mkdocs/docs/futurex.md
index 97a3da0..8d84f20 100644
--- a/docs/mkdocs/docs/futurex.md
+++ b/docs/mkdocs/docs/futurex.md
@@ -137,33 +137,18 @@ Expected Format: \boxed{A}, \boxed{B}, or \boxed{C}
 
 ### Step 1: Run Multiple Evaluations
 
-Use the multiple runs script to execute several independent evaluations:
-
-```bash title="Run Multiple Evaluations"
-./scripts/run_evaluate_multiple_runs_futurex.sh
-```
-
-This script will:
+Set `num_runs` in the relevant config to the desired number of runs. This runs multiple evaluations and automatically enables parallel thinking for enhanced performance.
+
+This will:
 
-- Run 3 independent evaluations by default (configurable with `NUM_RUNS`)
+- Run multiple independent evaluations (the number is configurable with `num_runs`)
 - Execute all tasks in parallel for efficiency
 - Generate separate result files for each run in `run_1/`, `run_2/`, etc.
-- Create a consolidated `futurex_submission.jsonl` file with voting results
 
 ### Step 2: Customize Multiple Runs
 
-You can customize the evaluation parameters:
-
-```bash title="Custom Multiple Runs"
-# Run 5 evaluations with limited tasks for testing
-NUM_RUNS=5 MAX_TASKS=10 ./scripts/run_evaluate_multiple_runs_futurex.sh
-
-# Use different agent configuration
-AGENT_SET=agent_gaia-validation ./scripts/run_evaluate_multiple_runs_futurex.sh
-
-# Adjust concurrency for resource management
-MAX_CONCURRENT=3 ./scripts/run_evaluate_multiple_runs_futurex.sh
-```
+You can customize the evaluation parameters in the relevant config, including `num_runs`, `max_tasks`, and `max_concurrent`.
 
 
 ### Step 3: Voting and Aggregation
 
diff --git a/docs/mkdocs/docs/gaia_validation_mirothinker.md b/docs/mkdocs/docs/gaia_validation_mirothinker.md
index 959cdc4..400183b 100644
--- a/docs/mkdocs/docs/gaia_validation_mirothinker.md
+++ b/docs/mkdocs/docs/gaia_validation_mirothinker.md
@@ -61,11 +61,8 @@ Follow the [progress monitoring instructions](gaia_validation_prerequisites.md#p
 
 Due to performance variance in MiroThinker models, it's recommended to run multiple evaluations for more reliable results.
 
-```bash title="Run Multiple MiroThinker Evaluations"
-bash ./scripts/run_evaluate_multiple_runs_mirothinker_gaia-validation.sh
-```
 
-This script runs 3 evaluations in parallel and calculates average scores. You can modify `NUM_RUNS` in the script to change the number of runs.
+Set `num_runs` in the relevant config to the desired number of runs. This runs multiple evaluations and automatically enables parallel thinking for enhanced performance.
 
 ---
 
diff --git a/docs/mkdocs/docs/xbench_ds.md b/docs/mkdocs/docs/xbench_ds.md
index c1f9bf7..6ccf224 100644
--- a/docs/mkdocs/docs/xbench_ds.md
+++ b/docs/mkdocs/docs/xbench_ds.md
@@ -87,11 +87,7 @@ uv run main.py common-benchmark \
 
 - Uses another agent (o3 by default) to make final decisions based on equivalence and source reliability criteria
 - Provides more robust and accurate final answers
 
-Execute the following command to run multiple xbench-DeepSearch evaluations and automatically enable parallel thinking for enhanced performance.
-
-```bash title="Multiple runs with parallel thinking post-processing"
-bash scripts/run_evaluate_mulitple_runs_xbench-ds.sh
-```
+Set `num_runs` in the relevant config to the desired number of runs. This runs multiple xbench-DeepSearch evaluations and automatically enables parallel thinking for enhanced performance.
 
 ### Running Parallel Thinking Analysis alone
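
For illustration, a minimal sketch of how `num_runs` might be supplied, assuming the documented `uv run main.py common-benchmark` entrypoint forwards Hydra-style overrides to `hydra.compose` (the exact config layout and override plumbing are assumptions; adjust to your setup):

```bash
# Option A (assumed): declare the key in the relevant benchmark/agent config YAML, e.g.
#   num_runs: 3
# Option B (assumed): pass it as a Hydra override on the command line.
# The leading "+" is only needed if num_runs is not already declared in the config.
uv run main.py common-benchmark +num_runs=3
```

With `num_runs` set, each run writes into its own `run_1/`, `run_2/`, ... subdirectory under the timestamped output directory created by `preprocess_config`, and `main_runs_multiprocess` prints the final OK/FAIL counts when all runs complete.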