update xbench-ds docs

xuan-dong-shanda · xuan-dong-shanda · commit 128d4ba6dd98 · 2025-09-24T01:19:03.000+08:00
diff --git a/docs/mkdocs/docs/xbench_ds.md b/docs/mkdocs/docs/xbench_ds.md
@@ -50,15 +50,23 @@ OPENAI_BASE_URL="https://api.openai.com/v1"
 
 ### Step 3: Run the Evaluation
 
-!!! note "Chinese Context Configuration"
+```bash
+bash scripts/run_evaluate_single_run_xbench-ds.sh
+```
+
+!!! note "Script Contents"
     Since xbench-DeepSearch operates in a Chinese context, enable Chinese prompts by setting the environment variable `CHINESE_CONTEXT="true"`
 
-```bash title="Run xbench-DeepSearch Evaluation"
+```bash title="scripts/run_evaluate_single_run_xbench-ds.sh"
+RESULTS_DIR=${RESULTS_DIR:-"logs/xbench-ds/$(date +"%Y%m%d_%H%M")"}
+echo "Results will be saved in: $RESULTS_DIR"
+
 export CHINESE_CONTEXT="true"
+
 uv run main.py common-benchmark \
   --config_file_name=agent_quickstart_1 \
   benchmark=xbench-ds \
-  output_dir="logs/xbench-ds/$(date +"%Y%m%d_%H%M")"
+  output_dir="$RESULTS_DIR"
 ```
 
 ### Step 4: Monitor Progress and Resume
@@ -76,10 +84,7 @@ Replace `$PATH_TO_LOG` with your actual output directory path.
     If the evaluation is interrupted, you can resume from where it left off by specifying the same output directory:
 
 ```bash title="Resume Interrupted Evaluation"
-uv run main.py common-benchmark \
-  --config_file_name=agent_quickstart_1 \
-  benchmark=xbench-ds \
-  output_dir="logs/xbench-ds/20250922_1430"
+RESULTS_DIR=$PATH_TO_LOG bash scripts/run_evaluate_single_run_xbench-ds.sh
 ```
 
 ---
@@ -93,7 +98,15 @@ uv run main.py common-benchmark \
     - Uses another agent (o3 by default) to make final decisions based on equivalence and source reliability criteria
     - Provides more robust and accurate final answers
 
-### Running Parallel Thinking Analysis
+Execute the following command to run multiple xbench-DeepSearch evaluations and automatically enable parallel thinking for enhanced performance.
+
+```bash title="Multiple runs with parallel thinking post-processing"
+bash scripts/run_evaluate_multiple_runs_xbench-ds.sh
+```
+
+### Running Parallel Thinking Analysis Alone
+
+After completing evaluations (single or multiple runs), you can apply parallel thinking post-processing to aggregate and generate the final result.
 
 ```bash title="Parallel Thinking Post-Processing"
 uv run utils/util_llm_parallel_thinking.py \
diff --git a/scripts/run_evaluate_multiple_runs_xbench-ds.sh b/scripts/run_evaluate_multiple_runs_xbench-ds.sh
@@ -4,31 +4,23 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-# Configuration parameters - dual model configuration
+# Configuration parameters
 NUM_RUNS=3
-MAX_CONCURRENT=20
+AGENT_SET="agent_quickstart_1"
 BENCHMARK_NAME="xbench-ds"
-AGENT_SET="claude03_claude_dual"
-ADD_MESSAGE_ID="true"  # Set to true to add random message ID to all messages sent to LLM
-MAX_TURNS=-1
+MAX_CONCURRENT=5
+export CHINESE_CONTEXT="true"
 
-# Automatically set Chinese context - if BENCHMARK_NAME contains xbench or -zh
-if [[ $BENCHMARK_NAME == "xbench-ds" ]] || [[ $BENCHMARK_NAME == "browsecomp-zh" ]]; then
-    export CHINESE_CONTEXT="true"
-    echo "检测到中文相关基准测试，已启用中文上下文：CHINESE_CONTEXT=true"
-fi
-
-# export REMOVE_SNIPPETS="true"
-# export REMOVE_KNOWLEDGE_GRAPH="true"
-# export REMOVE_ANSWER_BOX="true"
+# Set results directory with timestamp
+TIMESTAMP=$(date +%Y%m%d_%H%M)
+RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"}
 
 export LOGGER_LEVEL="INFO"
 
-RESULTS_DIR="logs/${BENCHMARK_NAME}/${AGENT_SET}"
-
 echo "Starting $NUM_RUNS runs of the evaluation..."
 echo "Results will be saved in: $RESULTS_DIR"
 
+# Create results directory
 mkdir -p "$RESULTS_DIR"
 
 for i in $(seq 1 $NUM_RUNS); do
@@ -40,11 +32,8 @@ for i in $(seq 1 $NUM_RUNS); do
     
     (
         uv run main.py common-benchmark \
+            --config_file_name=$AGENT_SET \
             benchmark=$BENCHMARK_NAME \
-            agent=$AGENT_SET \
-            agent.add_message_id=$ADD_MESSAGE_ID \
-            agent.main_agent.max_turns=$MAX_TURNS \
-            agent.sub_agents.agent-worker.max_turns=$MAX_TURNS \
             benchmark.execution.max_tasks=null \
             benchmark.execution.max_concurrent=$MAX_CONCURRENT \
             benchmark.execution.pass_at_k=1 \
@@ -84,4 +73,18 @@ echo "=========================================="
 echo "Multiple runs evaluation completed!"
 echo "Check results in: $RESULTS_DIR"
 echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
-echo "==========================================" 
+echo "==========================================" 
+
+
+echo "=========================================="
+echo "Parallel thinking post-processing"
+echo "=========================================="
+
+# Post-process the runs saved under $RESULTS_DIR; reuse BENCHMARK_NAME so the benchmark is set in one place
+uv run utils/util_llm_parallel_thinking.py \
+    --benchmark "$BENCHMARK_NAME" \
+    --results_dir "$RESULTS_DIR"
+
+echo "=========================================="
+echo "Parallel thinking post-processing completed!"
+echo "=========================================="
diff --git a/scripts/run_evaluate_single_run_xbench-ds.sh b/scripts/run_evaluate_single_run_xbench-ds.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: 2025 MiromindAI
+#
+# SPDX-License-Identifier: Apache-2.0
+
+RESULTS_DIR=${RESULTS_DIR:-"logs/xbench-ds/$(date +"%Y%m%d_%H%M")"}
+echo "Results will be saved in: $RESULTS_DIR"
+# Exported so the evaluation process started below sees CHINESE_CONTEXT=true.
+export CHINESE_CONTEXT="true"
+# Quote RESULTS_DIR so a caller-supplied path with spaces or globs stays a single argument.
+uv run main.py common-benchmark \
+  --config_file_name=agent_quickstart_1 \
+  benchmark=xbench-ds \
+  output_dir="$RESULTS_DIR"