update xbench-ds docs, fix small read_file bug, set llm_as_judge temp to 0

xuan-dong-shanda · xuan-dong-shanda · commit 41c42e2f4026 · 2025-09-24T17:12:37.000+08:00
diff --git a/config/agent_xbench-ds.yaml b/config/agent_xbench-ds.yaml
@@ -0,0 +1,75 @@
+defaults:
+  - benchmark: xbench-ds
+  - override hydra/job_logging: none
+  - _self_  # Allow defining variables at the top of this file
+
+
+main_agent:
+  prompt_class: MainAgentPrompt_GAIA
+  llm: 
+    provider_class: "ClaudeOpenRouterClient"
+    model_name: "anthropic/claude-3.7-sonnet"
+    async_client: true
+    temperature: 0.3
+    top_p: 0.95
+    min_p: 0.0
+    top_k: -1
+    max_tokens: 32000
+    openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
+    openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
+    openrouter_provider: "anthropic"
+    disable_cache_control: false
+    keep_tool_result: -1
+    oai_tool_thinking: false
+  
+  tool_config:
+    - tool-reasoning
+
+  max_turns: -1  # Maximum number of turns for main agent execution
+  max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn
+  
+  input_process:
+    o3_hint: true
+  output_process:
+    o3_final_answer: true
+
+  openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+  add_message_id: true
+  keep_tool_result: -1
+  chinese_context: "true"
+
+
+sub_agents:
+  agent-worker:
+    prompt_class: SubAgentWorkerPrompt
+    llm: 
+      provider_class: "ClaudeOpenRouterClient"
+      model_name: "anthropic/claude-3.7-sonnet"
+      async_client: true
+      temperature: 0.3
+      top_p: 0.95
+      min_p: 0.0
+      top_k: -1
+      max_tokens: 32000
+      openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
+      openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
+      openrouter_provider: "anthropic"
+      disable_cache_control: false
+      keep_tool_result: -1
+      oai_tool_thinking: false
+    
+    tool_config:
+      - tool-searching
+      - tool-image-video
+      - tool-reading
+      - tool-code
+      - tool-audio
+
+    max_turns: -1  # Maximum number of turns for sub-agent execution
+    max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn
+
+
+# Top-level or default parameters can be defined here
+output_dir: logs/
+data_dir: "${oc.env:DATA_DIR,data}"  # Points to where data is stored
+
diff --git a/docs/mkdocs/docs/xbench_ds.md b/docs/mkdocs/docs/xbench_ds.md
@@ -51,22 +51,9 @@ OPENAI_BASE_URL="https://api.openai.com/v1"
 ### Step 3: Run the Evaluation
 
 ```bash
-bash scripts/run_evaluate_single_run_xbench-ds.sh
-```
-
-!!! note "Script Contents"
-    Since xbench-DeepSearch operates in a Chinese context, enable Chinese prompts by setting the environment variable `CHINESE_CONTEXT="true"`
-
-```bash title="scripts/run_evaluate_single_run_xbench-ds.sh"
-RESULTS_DIR=${RESULTS_DIR:-"logs/xbench-ds/$(date +"%Y%m%d_%H%M")"}
-echo "Results will be saved in: $RESULTS_DIR"
-
-export CHINESE_CONTEXT="true"
-
 uv run main.py common-benchmark \
-  --config_file_name=agent_quickstart_1 \
-  benchmark=xbench-ds \
-  output_dir=$RESULTS_DIR
+  --config_file_name=agent_xbench-ds \
+  output_dir="logs/xbench-ds/$(date +"%Y%m%d_%H%M")"
 ```
 
 ### Step 4: Monitor Progress and Resume
@@ -84,7 +71,9 @@ Replace `$PATH_TO_LOG` with your actual output directory path.
     If the evaluation is interrupted, you can resume from where it left off by specifying the same output directory:
 
 ```bash title="Resume Interrupted Evaluation"
-RESULTS_DIR=$PATH_TO_LOG bash scripts/run_evaluate_single_run_xbench-ds.sh
+uv run main.py common-benchmark \
+  --config_file_name=agent_xbench-ds \
+  output_dir="logs/xbench-ds/20250922_1430"
 ```
 
 ---
@@ -110,8 +99,8 @@ After completing evaluations (single or multiple runs), you can apply parallel t
 
 ```bash title="Parallel Thinking Post-Processing"
 uv run utils/util_llm_parallel_thinking.py \
-    --benchmark xbench-ds \
-    --results_dir "logs/xbench-ds/20250922_1430"
+  --benchmark xbench-ds \
+  --results_dir "logs/xbench-ds/20250922_1430"
 ```
 
 The program automatically reads results from each run in the specified directory and performs aggregated analysis. The final output files are generated in the `results_dir`:
diff --git a/src/tool/mcp_servers/reading_mcp_server.py b/src/tool/mcp_servers/reading_mcp_server.py
@@ -16,7 +16,8 @@
 
 # Initialize FastMCP server
 mcp = FastMCP("reading-mcp-server")
-
+SERPER_API_KEY = os.environ.get("SERPER_API_KEY", "")
+JINA_API_KEY = os.environ.get("JINA_API_KEY", "")
 
 @mcp.tool()
 async def read_file(uri: str) -> str:
@@ -64,7 +65,7 @@ async def read_file(uri: str) -> str:
                 if retry_count > 3:
                     # Try scrape_website tool as fallback
                     try:
-                        scrape_result = await smart_request(uri)
+                        scrape_result = await smart_request(uri, env={"SERPER_API_KEY": SERPER_API_KEY, "JINA_API_KEY": JINA_API_KEY})
                         return f"[INFO]: Download failed, automatically tried `scrape_website` tool instead.\n\n{scrape_result}"
                     except Exception as scrape_error:
                         return f"[ERROR]: Failed to download {uri}: {e}. Also failed to scrape with `scrape_website` tool: {scrape_error}"
@@ -91,7 +92,8 @@ def _cleanup_tempfile(path):
     arguments = {"uri": uri}
 
     server_params = StdioServerParameters(
-        command="markitdown-mcp",
+        command="uv",
+        args=["run", "--active", "--", "markitdown-mcp"],
     )
 
     result_content = ""
diff --git a/utils/eval_utils.py b/utils/eval_utils.py
@@ -112,7 +112,7 @@ async def verify_answer_llm_simpleqa(
     CHOICE_MAP = {"A": "CORRECT", "B": "INCORRECT", "C": "NOT_ATTEMPTED"}
 
     llm_response = await openai_client.chat.completions.create(
-        model="gpt-4o-mini", messages=messages, max_completion_tokens=2
+        model="gpt-4o-mini", messages=messages, max_completion_tokens=2, temperature=0.0
     )
     content = llm_response.choices[0].message.content
     match = re.search(r"(A|B|C)", content)

Original file line number	Diff line number	Diff line change
`@@ -112,7 +112,7 @@ async def verify_answer_llm_simpleqa(`
`112`	`112`	`CHOICE_MAP = {"A": "CORRECT", "B": "INCORRECT", "C": "NOT_ATTEMPTED"}`
`113`	`113`
`114`	`114`	`llm_response = await openai_client.chat.completions.create(`
`115`		`- model="gpt-4o-mini", messages=messages, max_completion_tokens=2`
	`115`	`+ model="gpt-4o-mini", messages=messages, max_completion_tokens=2, temperature=0.0`
`116`	`116`	`)`
`117`	`117`	`content = llm_response.choices[0].message.content`
`118`	`118`	`match = re.search(r"(A\|B\|C)", content)`