78 changes: 75 additions & 3 deletions common_benchmark.py
@@ -12,7 +12,9 @@
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict
import random

from datetime import datetime
from copy import deepcopy
from concurrent.futures import ProcessPoolExecutor, as_completed
import dotenv
import hydra
import openai
@@ -691,6 +693,76 @@ def signal_handler(signum, frame):
    os._exit(1)  # Force immediate exit


def preprocess_config(cfg: DictConfig, chosen_config_name: str) -> DictConfig:
    # Default num_runs to 1 if the config does not set it
    if "num_runs" not in cfg:
        OmegaConf.set_struct(cfg, False)
        cfg.num_runs = 1
        OmegaConf.set_struct(cfg, True)
    # Default output_dir to logs/<benchmark.name>/<config_name>/<timestamp>
    if cfg.output_dir == "logs/":
        timestamp = datetime.now().strftime("%Y%m%d_%H%M")
        # Store as str so the value stays a plain OmegaConf primitive
        cfg.output_dir = str(
            Path(cfg.output_dir) / cfg.benchmark.name / chosen_config_name / timestamp
        )
    return cfg


def _run_one_process(cfg, args, run_id: int, total_runs: int):
    cfg_new = deepcopy(cfg)
    cfg_new.output_dir = str(Path(cfg_new.output_dir) / f"run_{run_id}")
    cfg_new = setup_hydra_output_dir(cfg_new, list(args))

    print("==========================================")
    print(f"🚀 Launching experiment {run_id}/{total_runs}")
    print(f"📝 Output dir: {cfg_new.output_dir}")
    print("==========================================")
    try:
        asyncio.run(entrypoint(cfg_new))
        print(f"Run {run_id} finished.")
        # Return a status dict so the parent process can tally results
        return {"ok": True, "run_id": run_id}
    except Exception as e:
        print(f"Run {run_id} failed: {e!r}")
        return {"ok": False, "run_id": run_id, "error": repr(e)}


def main_runs_multiprocess(cfg, args, max_workers: int | None = None):
    num_runs = int(cfg.num_runs)
    if num_runs <= 1:
        # Single run: set up the output dir here, as _run_one_process does per run
        cfg = setup_hydra_output_dir(cfg, list(args))
        asyncio.run(entrypoint(cfg))
        return

    if max_workers is None:
        hw = os.cpu_count() or 1
        max_workers = min(num_runs, hw)

    import multiprocessing as mp

    # Use "spawn" so each worker starts from a clean interpreter state
    try:
        mp.set_start_method("spawn", force=True)
    except RuntimeError:
        pass

    futures = []
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        for i in range(num_runs):
            run_id = i + 1
            fut = ex.submit(_run_one_process, cfg, list(args), run_id, num_runs)
            futures.append(fut)

        ok_count, fail_count = 0, 0
        for fut in as_completed(futures):
            res = fut.result()
            if res["ok"]:
                ok_count += 1
            else:
                fail_count += 1

    print("==========================================")
    print(f"All {num_runs} runs finished. OK={ok_count}, FAIL={fail_count}")
    print("==========================================")


def main(*args, config_file_name: str = ""):
    # Register signal handlers for immediate response to Ctrl+C
    signal.signal(signal.SIGINT, signal_handler)
@@ -708,8 +780,8 @@ def main(*args, config_file_name: str = ""):
        config_dir=os.path.abspath(config_path()), version_base=None
    ):
        cfg = hydra.compose(config_name=chosen_config_name, overrides=list(args))
        cfg = setup_hydra_output_dir(cfg, list(args))
        cfg = preprocess_config(cfg, chosen_config_name)

        _ = bootstrap_logger(level=LOGGER_LEVEL)
        # Tracing functionality removed - miroflow-contrib deleted
        asyncio.run(entrypoint(cfg))
        main_runs_multiprocess(cfg, args)
23 changes: 4 additions & 19 deletions docs/mkdocs/docs/futurex.md
@@ -137,33 +137,18 @@ Expected Format: \boxed{A}, \boxed{B}, or \boxed{C}

### Step 1: Run Multiple Evaluations

Use the multiple runs script to execute several independent evaluations:
Set `num_runs` in the relevant config to the desired number of runs; this launches multiple independent evaluations and automatically enables parallel thinking for enhanced performance (see the sketch after the list below).

```bash title="Run Multiple Evaluations"
./scripts/run_evaluate_multiple_runs_futurex.sh
```

This script will:
With `num_runs` set, the run will:

- Run 3 independent evaluations by default (configurable with `NUM_RUNS`)
- Run the configured number of independent evaluations (set via `num_runs`)
- Execute all tasks in parallel for efficiency
- Generate separate result files for each run in `run_1/`, `run_2/`, etc.
- Create a consolidated `futurex_submission.jsonl` file with voting results
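
For example, a minimal sketch, assuming the `common-benchmark` entrypoint used elsewhere in these docs and that `num_runs` is accepted as a Hydra-style command-line override (adjust to your config layout):

```bash title="Multiple runs via num_runs (sketch)"
# Launch 3 independent runs; per-run outputs land in run_1/, run_2/, run_3/
uv run main.py common-benchmark num_runs=3
```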

### Step 2: Customize Multiple Runs

You can customize the evaluation parameters:

```bash title="Custom Multiple Runs"
# Run 5 evaluations with limited tasks for testing
NUM_RUNS=5 MAX_TASKS=10 ./scripts/run_evaluate_multiple_runs_futurex.sh
You can customize the evaluation parameters in the relevant config, including `num_runs`, `max_tasks`, `max_concurrent`, etc., as sketched below.

# Use different agent configuration
AGENT_SET=agent_gaia-validation ./scripts/run_evaluate_multiple_runs_futurex.sh

# Adjust concurrency for resource management
MAX_CONCURRENT=3 ./scripts/run_evaluate_multiple_runs_futurex.sh
```
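
As a sketch, the same parameters could also be supplied as command-line overrides; the exact key paths for `max_tasks` and `max_concurrent` are assumptions here and may differ in your config:

```bash title="Custom multiple runs (sketch)"
# 5 runs over a 10-task subset with reduced concurrency; adjust key names to your config
uv run main.py common-benchmark num_runs=5 max_tasks=10 max_concurrent=3
```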

### Step 3: Voting and Aggregation

5 changes: 1 addition & 4 deletions docs/mkdocs/docs/gaia_validation_mirothinker.md
@@ -61,11 +61,8 @@ Follow the [progress monitoring instructions](gaia_validation_prerequisites.md#p

Due to performance variance in MiroThinker models, it's recommended to run multiple evaluations for more reliable results.

```bash title="Run Multiple MiroThinker Evaluations"
bash ./scripts/run_evaluate_multiple_runs_mirothinker_gaia-validation.sh
```

This script runs 3 evaluations in parallel and calculates average scores. You can modify `NUM_RUNS` in the script to change the number of runs.
Set `num_runs` in the relevant config to the desired number of runs; this launches multiple evaluations and automatically enables parallel thinking for enhanced performance, as sketched below.
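
A minimal sketch, assuming the same `common-benchmark` entrypoint used for the other benchmarks; combine it with the GAIA-specific overrides from your usual invocation:

```bash title="Multiple MiroThinker evaluations (sketch)"
# Three independent runs; aggregate the per-run scores afterwards
uv run main.py common-benchmark num_runs=3
```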

---

6 changes: 1 addition & 5 deletions docs/mkdocs/docs/xbench_ds.md
@@ -87,11 +87,7 @@ uv run main.py common-benchmark \
- Uses another agent (o3 by default) to make final decisions based on equivalence and source reliability criteria
- Provides more robust and accurate final answers

Execute the following command to run multiple xbench-DeepSearch evaluations and automatically enable parallel thinking for enhanced performance.

```bash title="Multiple runs with parallel thinking post-processing"
bash scripts/run_evaluate_mulitple_runs_xbench-ds.sh
```
Set `num_runs` in the relevant config to the desired number of runs; this launches multiple xbench-DeepSearch evaluations and automatically enables parallel thinking for enhanced performance, as sketched below.
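
For example, a sketch reusing the single-run command shown earlier on this page, with only the `num_runs` override added (assuming `num_runs` is accepted on the command line):

```bash title="Multiple runs with parallel thinking (sketch)"
# Same invocation as the single-run example above, plus num_runs
uv run main.py common-benchmark num_runs=3
```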

### Running Parallel Thinking Analysis alone
