Skip to content

Commit 128d4ba

Browse files
update xbench-ds docs
1 parent 8c1846c commit 128d4ba

File tree

3 files changed

+60
-29
lines changed

3 files changed

+60
-29
lines changed

docs/mkdocs/docs/xbench_ds.md

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,15 +50,23 @@ OPENAI_BASE_URL="https://api.openai.com/v1"
5050

5151
### Step 3: Run the Evaluation
5252

53-
!!! note "Chinese Context Configuration"
53+
```bash
54+
bash scripts/run_evaluate_single_run_xbench-ds.sh
55+
```
56+
57+
!!! note "Script Contents"
5458
Since xbench-DeepSearch operates in a Chinese context, the script enables Chinese prompts by setting the environment variable `CHINESE_CONTEXT="true"`.
5559

56-
```bash title="Run xbench-DeepSearch Evaluation"
60+
```bash title="scripts/run_evaluate_single_run_xbench-ds.sh"
61+
RESULTS_DIR=${RESULTS_DIR:-"logs/xbench-ds/$(date +"%Y%m%d_%H%M")"}
62+
echo "Results will be saved in: $RESULTS_DIR"
63+
5764
export CHINESE_CONTEXT="true"
65+
5866
uv run main.py common-benchmark \
5967
--config_file_name=agent_quickstart_1 \
6068
benchmark=xbench-ds \
61-
output_dir="logs/xbench-ds/$(date +"%Y%m%d_%H%M")"
69+
output_dir=$RESULTS_DIR
6270
```
6371

6472
### Step 4: Monitor Progress and Resume
@@ -76,10 +84,7 @@ Replace `$PATH_TO_LOG` with your actual output directory path.
7684
If the evaluation is interrupted, you can resume from where it left off by specifying the same output directory:
7785

7886
```bash title="Resume Interrupted Evaluation"
79-
uv run main.py common-benchmark \
80-
--config_file_name=agent_quickstart_1 \
81-
benchmark=xbench-ds \
82-
output_dir="logs/xbench-ds/20250922_1430"
87+
RESULTS_DIR=$PATH_TO_LOG bash scripts/run_evaluate_single_run_xbench-ds.sh
8388
```
8489

8590
---
@@ -93,7 +98,15 @@ uv run main.py common-benchmark \
9398
- Uses another agent (o3 by default) to make final decisions based on equivalence and source reliability criteria
9499
- Provides more robust and accurate final answers
95100

96-
### Running Parallel Thinking Analysis
101+
Execute the following command to run multiple xbench-DeepSearch evaluations and automatically enable parallel thinking for enhanced performance.
102+
103+
```bash title="Multiple runs with parallel thinking post-processing"
104+
bash scripts/run_evaluate_multiple_runs_xbench-ds.sh
105+
```
106+
107+
### Running Parallel Thinking Analysis alone
108+
109+
After completing evaluations (single or multiple runs), you can apply parallel thinking post-processing to aggregate and generate the final result.
97110

98111
```bash title="Parallel Thinking Post-Processing"
99112
uv run utils/util_llm_parallel_thinking.py \

scripts/run_evaluate_multiple_runs_xbench-ds.sh

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,31 +4,23 @@
44
#
55
# SPDX-License-Identifier: Apache-2.0
66

7-
# Configuration parameters - dual model configuration
7+
# Configuration parameters
88
NUM_RUNS=3
9-
MAX_CONCURRENT=20
9+
AGENT_SET="agent_quickstart_1"
1010
BENCHMARK_NAME="xbench-ds"
11-
AGENT_SET="claude03_claude_dual"
12-
ADD_MESSAGE_ID="true" # Set to true to add random message ID to all messages sent to LLM
13-
MAX_TURNS=-1
11+
MAX_CONCURRENT=5
12+
export CHINESE_CONTEXT="true"
1413

15-
# Automatically set Chinese context - if BENCHMARK_NAME contains xbench or -zh
16-
if [[ $BENCHMARK_NAME == "xbench-ds" ]] || [[ $BENCHMARK_NAME == "browsecomp-zh" ]]; then
17-
export CHINESE_CONTEXT="true"
18-
echo "检测到中文相关基准测试,已启用中文上下文:CHINESE_CONTEXT=true"
19-
fi
20-
21-
# export REMOVE_SNIPPETS="true"
22-
# export REMOVE_KNOWLEDGE_GRAPH="true"
23-
# export REMOVE_ANSWER_BOX="true"
14+
# Set results directory with timestamp
15+
TIMESTAMP=$(date +%Y%m%d_%H%M)
16+
RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"}
2417

2518
export LOGGER_LEVEL="INFO"
2619

27-
RESULTS_DIR="logs/${BENCHMARK_NAME}/${AGENT_SET}"
28-
2920
echo "Starting $NUM_RUNS runs of the evaluation..."
3021
echo "Results will be saved in: $RESULTS_DIR"
3122

23+
# Create results directory
3224
mkdir -p "$RESULTS_DIR"
3325

3426
for i in $(seq 1 $NUM_RUNS); do
@@ -40,11 +32,8 @@ for i in $(seq 1 $NUM_RUNS); do
4032

4133
(
4234
uv run main.py common-benchmark \
35+
--config_file_name=$AGENT_SET \
4336
benchmark=$BENCHMARK_NAME \
44-
agent=$AGENT_SET \
45-
agent.add_message_id=$ADD_MESSAGE_ID \
46-
agent.main_agent.max_turns=$MAX_TURNS \
47-
agent.sub_agents.agent-worker.max_turns=$MAX_TURNS \
4837
benchmark.execution.max_tasks=null \
4938
benchmark.execution.max_concurrent=$MAX_CONCURRENT \
5039
benchmark.execution.pass_at_k=1 \
@@ -84,4 +73,18 @@ echo "=========================================="
8473
echo "Multiple runs evaluation completed!"
8574
echo "Check results in: $RESULTS_DIR"
8675
echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
87-
echo "=========================================="
76+
echo "=========================================="
77+
78+
79+
echo "=========================================="
80+
echo "Parallel thinking post-processing"
81+
echo "=========================================="
82+
83+
# Parallel thinking post-processing
84+
uv run utils/util_llm_parallel_thinking.py \
85+
--benchmark xbench-ds \
86+
--results_dir "$RESULTS_DIR"
87+
88+
echo "=========================================="
89+
echo "Parallel thinking post-processing completed!"
90+
echo "=========================================="
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/bin/bash
2+
3+
# SPDX-FileCopyrightText: 2025 MiromindAI
4+
#
5+
# SPDX-License-Identifier: Apache-2.0
6+
7+
RESULTS_DIR=${RESULTS_DIR:-"logs/xbench-ds/$(date +"%Y%m%d_%H%M")"}
8+
echo "Results will be saved in: $RESULTS_DIR"
9+
10+
export CHINESE_CONTEXT="true"
11+
12+
uv run main.py common-benchmark \
13+
--config_file_name=agent_quickstart_1 \
14+
benchmark=xbench-ds \
15+
output_dir=$RESULTS_DIR

0 commit comments

Comments
 (0)