Skip to content

Commit 3c06966

Browse files
committed
feat(config): Use configs to replace the multi-round testing functionality of scripts.
1 parent 119b2d9 commit 3c06966

File tree

4 files changed

+81
-31
lines changed

4 files changed

+81
-31
lines changed

common_benchmark.py

Lines changed: 75 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212
from pathlib import Path
1313
from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict
1414
import random
15-
15+
from datetime import datetime
16+
from copy import deepcopy
17+
from concurrent.futures import ProcessPoolExecutor, as_completed
1618
import dotenv
1719
import hydra
1820
import openai
@@ -691,6 +693,76 @@ def signal_handler(signum, frame):
691693
os._exit(1) # Force immediate exit
692694

693695

696+
def preprocess_config(cfg: DictConfig, chosen_config_name: str) -> DictConfig:
    """Normalize a composed Hydra config before launching any runs.

    Ensures ``cfg.num_runs`` exists (defaulting to 1) and expands the
    default ``output_dir`` of ``logs/`` into
    ``logs/<benchmark.name>/<config_name>/<timestamp>``.

    Args:
        cfg: The composed Hydra/OmegaConf configuration.
        chosen_config_name: Name of the selected config, used as a path
            component of the derived output directory.

    Returns:
        The same ``cfg`` object, mutated in place.
    """
    # Default num_runs to 1 when the config does not declare it; the
    # struct flag must be relaxed temporarily so a new key can be added.
    if "num_runs" not in cfg:
        OmegaConf.set_struct(cfg, False)
        cfg.num_runs = 1
        OmegaConf.set_struct(cfg, True)
    # Only derive a run-specific output dir when the user kept the
    # default 'logs/' value; an explicitly set path is left untouched.
    if cfg.output_dir == 'logs/':
        timestamp = datetime.now().strftime("%Y%m%d_%H%M")
        # Store as str: OmegaConf rejects non-primitive values such as
        # pathlib.Path when they are assigned into a config node.
        cfg.output_dir = str(
            Path(cfg.output_dir) / cfg.benchmark.name / chosen_config_name / timestamp
        )
    return cfg
707+
708+
709+
def _run_one_process(cfg, args, run_id: int, total_runs: int):
    """Execute a single benchmark run inside a worker process.

    Clones the config, redirects its output into a per-run subdirectory
    (``run_<run_id>``), and drives ``entrypoint`` to completion.

    Args:
        cfg: Base configuration shared by all runs (deep-copied here).
        args: CLI override arguments forwarded to ``setup_hydra_output_dir``.
        run_id: 1-based index of this run.
        total_runs: Total number of runs (used for progress logging only).

    Returns:
        A result dict consumed by ``main_runs_multiprocess``:
        ``{"ok": bool, "run_id": int}``, plus an ``"error"`` repr on failure.
    """
    cfg_new = deepcopy(cfg)
    # str(): OmegaConf config nodes cannot store pathlib.Path values.
    cfg_new.output_dir = str(Path(cfg_new.output_dir) / f"run_{run_id}")
    cfg_new = setup_hydra_output_dir(cfg_new, list(args))

    print("==========================================")
    print(f"🚀 Launching experiment {run_id}/{total_runs}")
    print(f"📝 Output dir: {cfg_new.output_dir}")
    print("==========================================")
    try:
        asyncio.run(entrypoint(cfg_new))
        print(f"Run {run_id} finished.")
        # The caller tallies successes via res["ok"]; returning None here
        # (as the previous version did) would crash it with a TypeError.
        return {"ok": True, "run_id": run_id}
    except Exception as e:
        print(f"Run {run_id} failed: {e!r}")
        return {"ok": False, "run_id": run_id, "error": repr(e)}
725+
726+
727+
def main_runs_multiprocess(cfg, args, max_workers: int | None = None):
    """Run the benchmark ``cfg.num_runs`` times in parallel processes.

    A single run executes inline without a process pool. Multiple runs are
    fanned out to a ``ProcessPoolExecutor``; each worker writes to its own
    ``run_<i>`` subdirectory via ``_run_one_process``.

    Args:
        cfg: Composed configuration; must define ``num_runs``.
        args: CLI override arguments forwarded to every worker.
        max_workers: Pool size cap; defaults to ``min(num_runs, cpu count)``.
    """
    num_runs = int(cfg.num_runs)
    if num_runs <= 1:
        asyncio.run(entrypoint(cfg))
        return

    if max_workers is None:
        max_workers = min(num_runs, os.cpu_count() or 1)

    # Force the 'spawn' start method: each worker runs its own asyncio
    # event loop, which is unsafe in a forked interpreter.
    import multiprocessing as mp
    try:
        mp.set_start_method("spawn", force=True)
    except RuntimeError:
        pass  # start method was already configured by an earlier call

    ok_count, fail_count = 0, 0
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        futures = [
            ex.submit(_run_one_process, cfg, list(args), i + 1, num_runs)
            for i in range(num_runs)
        ]
        # Tally results as workers finish (inside the pool context so
        # completions are observed incrementally rather than after the
        # implicit shutdown wait).
        for fut in as_completed(futures):
            res = fut.result()
            # Defensive: tolerate a worker that returned None instead of
            # the expected {"ok": bool, ...} result dict.
            if res and res.get("ok"):
                ok_count += 1
            else:
                fail_count += 1

    print("==========================================")
    print(f"All {num_runs} runs finished. OK={ok_count}, FAIL={fail_count}")
    print("==========================================")
764+
765+
694766
def main(*args, config_file_name: str = ""):
695767
# Register signal handlers for immediate response to Ctrl+C
696768
signal.signal(signal.SIGINT, signal_handler)
@@ -708,8 +780,8 @@ def main(*args, config_file_name: str = ""):
708780
config_dir=os.path.abspath(config_path()), version_base=None
709781
):
710782
cfg = hydra.compose(config_name=chosen_config_name, overrides=list(args))
711-
cfg = setup_hydra_output_dir(cfg, list(args))
783+
cfg = preprocess_config(cfg, chosen_config_name)
712784

713785
_ = bootstrap_logger(level=LOGGER_LEVEL)
714786
# Tracing functionality removed - miroflow-contrib deleted
715-
asyncio.run(entrypoint(cfg))
787+
main_runs_multiprocess(cfg, args)

docs/mkdocs/docs/futurex.md

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -137,33 +137,18 @@ Expected Format: \boxed{A}, \boxed{B}, or \boxed{C}
137137

138138
### Step 1: Run Multiple Evaluations
139139

140-
Use the multiple runs script to execute several independent evaluations:
140+
Set `num_runs` in the relevant config to the desired number of runs; multiple independent evaluations will be executed and parallel thinking enabled automatically for enhanced performance.
141141

142-
```bash title="Run Multiple Evaluations"
143-
./scripts/run_evaluate_multiple_runs_futurex.sh
144-
```
145-
146-
This script will:
142+
It will:
147143

148-
- Run 3 independent evaluations by default (configurable with `NUM_RUNS`)
144+
- Run multiple independent evaluations by default (configurable with `num_runs`)
149145
- Execute all tasks in parallel for efficiency
150146
- Generate separate result files for each run in `run_1/`, `run_2/`, etc.
151-
- Create a consolidated `futurex_submission.jsonl` file with voting results
152147

153148
### Step 2: Customize Multiple Runs
154149

155-
You can customize the evaluation parameters:
156-
157-
```bash title="Custom Multiple Runs"
158-
# Run 5 evaluations with limited tasks for testing
159-
NUM_RUNS=5 MAX_TASKS=10 ./scripts/run_evaluate_multiple_runs_futurex.sh
150+
You can customize the evaluation parameters in the relevant config, including `num_runs`, `max_tasks`, `max_concurrent`, etc.
160151

161-
# Use different agent configuration
162-
AGENT_SET=agent_gaia-validation ./scripts/run_evaluate_multiple_runs_futurex.sh
163-
164-
# Adjust concurrency for resource management
165-
MAX_CONCURRENT=3 ./scripts/run_evaluate_multiple_runs_futurex.sh
166-
```
167152

168153
### Step 3: Voting and Aggregation
169154

docs/mkdocs/docs/gaia_validation_mirothinker.md

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,8 @@ Follow the [progress monitoring instructions](gaia_validation_prerequisites.md#p
6161

6262
Due to performance variance in MiroThinker models, it's recommended to run multiple evaluations for more reliable results.
6363

64-
```bash title="Run Multiple MiroThinker Evaluations"
65-
bash ./scripts/run_evaluate_multiple_runs_mirothinker_gaia-validation.sh
66-
```
6764

68-
This script runs 3 evaluations in parallel and calculates average scores. You can modify `NUM_RUNS` in the script to change the number of runs.
65+
Set `num_runs` in the relevant config to the desired number of runs; multiple evaluations will be executed and parallel thinking enabled automatically for enhanced performance.
6966

7067
---
7168

docs/mkdocs/docs/xbench_ds.md

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,11 +87,7 @@ uv run main.py common-benchmark \
8787
- Uses another agent (o3 by default) to make final decisions based on equivalence and source reliability criteria
8888
- Provides more robust and accurate final answers
8989

90-
Execute the following command to run multiple xbench-DeepSearch evaluations and automatically enable parallel thinking for enhanced performance.
91-
92-
```bash title="Multiple runs with parallel thinking post-processing"
93-
bash scripts/run_evaluate_mulitple_runs_xbench-ds.sh
94-
```
90+
Set `num_runs` in the relevant config to the desired number of runs; multiple xbench-DeepSearch evaluations will be executed and parallel thinking enabled automatically for enhanced performance.
9591

9692
### Running Parallel Thinking Analysis alone
9793

0 commit comments

Comments
 (0)