Skip to content

Commit 41c42e2

Browse files
update xbench-ds docs, fix small read_file bug, set llm_as_judge temp to 0
1 parent 128d4ba commit 41c42e2

File tree

4 files changed

+88
-22
lines changed

4 files changed

+88
-22
lines changed

config/agent_xbench-ds.yaml

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
defaults:
2+
- benchmark: xbench-ds
3+
- override hydra/job_logging: none
4+
- _self_ # Allow defining variables at the top of this file
5+
6+
7+
main_agent:
8+
prompt_class: MainAgentPrompt_GAIA
9+
llm:
10+
provider_class: "ClaudeOpenRouterClient"
11+
model_name: "anthropic/claude-3.7-sonnet"
12+
async_client: true
13+
temperature: 0.3
14+
top_p: 0.95
15+
min_p: 0.0
16+
top_k: -1
17+
max_tokens: 32000
18+
openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
19+
openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
20+
openrouter_provider: "anthropic"
21+
disable_cache_control: false
22+
keep_tool_result: -1
23+
oai_tool_thinking: false
24+
25+
tool_config:
26+
- tool-reasoning
27+
28+
max_turns: -1 # Maximum number of turns for main agent execution
29+
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
30+
31+
input_process:
32+
o3_hint: true
33+
output_process:
34+
o3_final_answer: true
35+
36+
openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
37+
add_message_id: true
38+
keep_tool_result: -1
39+
chinese_context: "true"
40+
41+
42+
sub_agents:
43+
agent-worker:
44+
prompt_class: SubAgentWorkerPrompt
45+
llm:
46+
provider_class: "ClaudeOpenRouterClient"
47+
model_name: "anthropic/claude-3.7-sonnet"
48+
async_client: true
49+
temperature: 0.3
50+
top_p: 0.95
51+
min_p: 0.0
52+
top_k: -1
53+
max_tokens: 32000
54+
openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
55+
openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
56+
openrouter_provider: "anthropic"
57+
disable_cache_control: false
58+
keep_tool_result: -1
59+
oai_tool_thinking: false
60+
61+
tool_config:
62+
- tool-searching
63+
- tool-image-video
64+
- tool-reading
65+
- tool-code
66+
- tool-audio
67+
68+
max_turns: -1 # Maximum number of turns for sub-agent execution
69+
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
70+
71+
72+
# Top-level or default parameters can be defined here
73+
output_dir: logs/
74+
data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored
75+

docs/mkdocs/docs/xbench_ds.md

Lines changed: 7 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -51,22 +51,9 @@ OPENAI_BASE_URL="https://api.openai.com/v1"
5151
### Step 3: Run the Evaluation
5252

5353
```bash
54-
bash scripts/run_evaluate_single_run_xbench-ds.sh
55-
```
56-
57-
!!! note "Script Contents"
58-
Since xbench-DeepSearch operates in a Chinese context, enable Chinese prompts by setting the environment variable `CHINESE_CONTEXT="true"`
59-
60-
```bash title="scripts/run_evaluate_single_run_xbench-ds.sh"
61-
RESULTS_DIR=${RESULTS_DIR:-"logs/xbench-ds/$(date +"%Y%m%d_%H%M")"}
62-
echo "Results will be saved in: $RESULTS_DIR"
63-
64-
export CHINESE_CONTEXT="true"
65-
6654
uv run main.py common-benchmark \
67-
--config_file_name=agent_quickstart_1 \
68-
benchmark=xbench-ds \
69-
output_dir=$RESULTS_DIR
55+
--config_file_name=agent_xbench-ds \
56+
output_dir="logs/xbench-ds/$(date +"%Y%m%d_%H%M")"
7057
```
7158

7259
### Step 4: Monitor Progress and Resume
@@ -84,7 +71,9 @@ Replace `$PATH_TO_LOG` with your actual output directory path.
8471
If the evaluation is interrupted, you can resume from where it left off by specifying the same output directory:
8572

8673
```bash title="Resume Interrupted Evaluation"
87-
RESULTS_DIR=$PATH_TO_LOG bash scripts/run_evaluate_single_run_xbench-ds.sh
74+
uv run main.py common-benchmark \
75+
--config_file_name=agent_xbench-ds \
76+
output_dir="logs/xbench-ds/20250922_1430"
8877
```
8978

9079
---
@@ -110,8 +99,8 @@ After completing evaluations (single or multiple runs), you can apply parallel t
11099

111100
```bash title="Parallel Thinking Post-Processing"
112101
uv run utils/util_llm_parallel_thinking.py \
113-
--benchmark xbench-ds \
114-
--results_dir "logs/xbench-ds/20250922_1430"
102+
--benchmark xbench-ds \
103+
--results_dir "logs/xbench-ds/20250922_1430"
115104
```
116105

117106
The program automatically reads results from each run in the specified directory and performs aggregated analysis. The final output files are generated in the `results_dir`:

src/tool/mcp_servers/reading_mcp_server.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@
1616

1717
# Initialize FastMCP server
1818
mcp = FastMCP("reading-mcp-server")
19-
19+
SERPER_API_KEY = os.environ.get("SERPER_API_KEY", "")
20+
JINA_API_KEY = os.environ.get("JINA_API_KEY", "")
2021

2122
@mcp.tool()
2223
async def read_file(uri: str) -> str:
@@ -64,7 +65,7 @@ async def read_file(uri: str) -> str:
6465
if retry_count > 3:
6566
# Try scrape_website tool as fallback
6667
try:
67-
scrape_result = await smart_request(uri)
68+
scrape_result = await smart_request(uri, env={"SERPER_API_KEY": SERPER_API_KEY, "JINA_API_KEY": JINA_API_KEY})
6869
return f"[INFO]: Download failed, automatically tried `scrape_website` tool instead.\n\n{scrape_result}"
6970
except Exception as scrape_error:
7071
return f"[ERROR]: Failed to download {uri}: {e}. Also failed to scrape with `scrape_website` tool: {scrape_error}"
@@ -91,7 +92,8 @@ def _cleanup_tempfile(path):
9192
arguments = {"uri": uri}
9293

9394
server_params = StdioServerParameters(
94-
command="markitdown-mcp",
95+
command="uv",
96+
args=["run", "--active", "--", "markitdown-mcp"],
9597
)
9698

9799
result_content = ""

utils/eval_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ async def verify_answer_llm_simpleqa(
112112
CHOICE_MAP = {"A": "CORRECT", "B": "INCORRECT", "C": "NOT_ATTEMPTED"}
113113

114114
llm_response = await openai_client.chat.completions.create(
115-
model="gpt-4o-mini", messages=messages, max_completion_tokens=2
115+
model="gpt-4o-mini", messages=messages, max_completion_tokens=2, temperature=0.0
116116
)
117117
content = llm_response.choices[0].message.content
118118
match = re.search(r"(A|B|C)", content)

0 commit comments

Comments
 (0)