44#
55# SPDX-License-Identifier: Apache-2.0
66
7- # Configuration parameters - dual model configuration
7+ # Configuration parameters
88NUM_RUNS=3
9- MAX_CONCURRENT=20
9+ AGENT_SET="agent_quickstart_1"
1010BENCHMARK_NAME="xbench-ds"
11- AGENT_SET="claude03_claude_dual"
12- ADD_MESSAGE_ID="true" # Set to true to add random message ID to all messages sent to LLM
13- MAX_TURNS=-1
11+ MAX_CONCURRENT=5
12+ export CHINESE_CONTEXT="true"
1413
15- # Automatically set Chinese context - if BENCHMARK_NAME contains xbench or -zh
16- if [[ $BENCHMARK_NAME == "xbench-ds" ]] || [[ $BENCHMARK_NAME == "browsecomp-zh" ]]; then
17- export CHINESE_CONTEXT="true"
18- echo "检测到中文相关基准测试,已启用中文上下文:CHINESE_CONTEXT=true"
19- fi
20-
21- # export REMOVE_SNIPPETS="true"
22- # export REMOVE_KNOWLEDGE_GRAPH="true"
23- # export REMOVE_ANSWER_BOX="true"
14+ # Set results directory with timestamp
15+ TIMESTAMP=$(date +%Y%m%d_%H%M)
16+ RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"}
2417
2518export LOGGER_LEVEL="INFO"
2619
27- RESULTS_DIR="logs/${BENCHMARK_NAME}/${AGENT_SET}"
28-
2920echo "Starting $NUM_RUNS runs of the evaluation..."
3021echo "Results will be saved in: $RESULTS_DIR"
3122
23+ # Create results directory
3224mkdir -p "$RESULTS_DIR"
3325
3426for i in $(seq 1 $NUM_RUNS); do
@@ -40,11 +32,8 @@ for i in $(seq 1 $NUM_RUNS); do
4032
4133 (
4234 uv run main.py common-benchmark \
35+ --config_file_name=$AGENT_SET \
4336 benchmark=$BENCHMARK_NAME \
44- agent=$AGENT_SET \
45- agent.add_message_id=$ADD_MESSAGE_ID \
46- agent.main_agent.max_turns=$MAX_TURNS \
47- agent.sub_agents.agent-worker.max_turns=$MAX_TURNS \
4837 benchmark.execution.max_tasks=null \
4938 benchmark.execution.max_concurrent=$MAX_CONCURRENT \
5039 benchmark.execution.pass_at_k=1 \
@@ -84,4 +73,18 @@ echo "=========================================="
8473echo "Multiple runs evaluation completed!"
8574echo "Check results in: $RESULTS_DIR"
8675echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
87- echo "=========================================="
76+ echo "=========================================="
77+
78+
79+ echo "=========================================="
80+ echo "Parallel thinking post-processing"
81+ echo "=========================================="
82+
83+ # Parallel thinking post-processing
84+ uv run utils/util_llm_parallel_thinking.py \
85+ --benchmark xbench-ds \
86+ --results_dir "$RESULTS_DIR"
87+
88+ echo "=========================================="
89+ echo "Parallel thinking post-processing completed!"
90+ echo "=========================================="
0 commit comments