feat(benchmark): add browsecomp_zh (#88)

ntudy · Yue Deng · web-flow · commit a0f47dfe45cc · 2025-10-16T18:52:25.000+08:00
add browsecomp_zh

Co-authored-by: Yue Deng <yue.deng@miromind.ai>
diff --git a/config/agent_browsecomp-zh_claude37sonnet.yaml b/config/agent_browsecomp-zh_claude37sonnet.yaml
@@ -0,0 +1,79 @@
+defaults:
+  - benchmark: browsecomp-zh  # Selects config/benchmark/browsecomp-zh.yaml
+  - override hydra/job_logging: none  # Keeps Hydra from applying its own job logging configuration
+  - _self_  # Listed last so values defined in this file override the defaults above
+
+
+main_agent:  # Main agent: Claude 3.7 Sonnet via OpenRouter; it lists only tool-reasoning, the other tools are configured under sub_agents
+  prompt_class: MainAgentPrompt_GAIA  # NOTE(review): GAIA prompt class reused for BrowseComp-ZH — confirm this is intended
+  llm: 
+    provider_class: "ClaudeOpenRouterClient"
+    model_name: "anthropic/claude-3.7-sonnet"
+    async_client: true
+    temperature: 0.3
+    top_p: 0.95
+    min_p: 0.0
+    top_k: -1
+    max_tokens: 32000
+    openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"  # Read from the environment; the "???" default presumably makes it mandatory — verify
+    openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"  # Uses the public OpenRouter endpoint when the env var is unset
+    openrouter_provider: "anthropic"
+    disable_cache_control: false
+    keep_tool_result: -1  # NOTE(review): also set at main_agent level below — confirm which one the code reads
+    oai_tool_thinking: false
+  
+  tool_config:
+    - tool-reasoning
+
+  max_turns: 50  # Maximum number of turns for main agent execution
+  max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn
+  
+  input_process:
+    hint_generation: true
+    hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"  # Uses the OpenAI endpoint when the env var is unset
+  output_process:
+    final_answer_extraction: true
+    final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"  # Uses the OpenAI endpoint when the env var is unset
+
+  openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
+  add_message_id: true
+  keep_tool_result: -1
+  chinese_context: "${oc.env:CHINESE_CONTEXT,true}"  # NOTE(review): oc.env yields a string, not a bool — confirm the consumer parses "true"/"false"
+
+
+sub_agents:
+  agent-worker:  # Worker sub-agent: same llm settings as main_agent.llm, plus the searching/image-video/reading/code/audio tools
+    prompt_class: SubAgentWorkerPrompt
+    llm: 
+      provider_class: "ClaudeOpenRouterClient"
+      model_name: "anthropic/claude-3.7-sonnet"
+      async_client: true
+      temperature: 0.3
+      top_p: 0.95
+      min_p: 0.0
+      top_k: -1
+      max_tokens: 32000
+      openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"  # Read from the environment; the "???" default presumably makes it mandatory — verify
+      openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"  # Uses the public OpenRouter endpoint when the env var is unset
+      openrouter_provider: "anthropic"
+      disable_cache_control: false
+      keep_tool_result: -1
+      oai_tool_thinking: false
+    
+    tool_config:
+      - tool-searching
+      - tool-image-video
+      - tool-reading
+      - tool-code
+      - tool-audio
+
+    max_turns: 50  # Maximum number of turns for sub-agent execution
+    max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn
+
+
+# Can define some top-level or default parameters here
+output_dir: logs/  # Default output directory; can be overridden per run (e.g. output_dir=... on the command line)
+data_dir: "${oc.env:DATA_DIR,data}"  # Points to where data is stored; uses "data" when DATA_DIR is unset
+
+
+
diff --git a/config/agent_browsecomp-zh_mirothinker.yaml b/config/agent_browsecomp-zh_mirothinker.yaml
@@ -0,0 +1,56 @@
+defaults:
+  - benchmark: browsecomp-zh  # Selects config/benchmark/browsecomp-zh.yaml
+  - override hydra/job_logging: none  # Keeps Hydra from applying its own job logging configuration
+  - _self_  # Listed last so values defined in this file override the defaults above
+
+
+main_agent:  # Single-agent setup: the main agent lists every tool itself and sub_agents is null in this file
+  prompt_class: MainAgentPrompt_GAIA  # NOTE(review): GAIA prompt class reused for BrowseComp-ZH — confirm this is intended
+  llm: 
+    provider_class: "MiroThinkerSGLangClient"
+    model_name: "DUMMY_MODEL_NAME"  # Placeholder value — NOTE(review): presumably overridden or ignored by the serving endpoint; confirm
+    async_client: true
+    temperature: 0.3
+    top_p: 0.95
+    min_p: 0.0
+    top_k: -1
+    max_tokens: 4096
+    oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"  # Uses "dummy_key" when the env var is unset
+    oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"  # Defaults to a local server on port 61005
+    keep_tool_result: -1  # NOTE(review): also set at main_agent level below — confirm which one the code reads
+    oai_tool_thinking: false
+  
+  tool_config:
+    - tool-reasoning
+    - tool-searching
+    - tool-image-video
+    - tool-reading
+    - tool-code
+    - tool-audio
+
+  max_turns: 50  # Maximum number of turns for main agent execution
+  max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn
+  
+  input_process:
+    hint_generation: false  # Hint generation is turned off in this config
+    hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"  # NOTE(review): presumably unused while hint_generation is false
+
+  output_process:
+    final_answer_extraction: true
+    final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"  # Uses the OpenAI endpoint when the env var is unset
+
+  openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
+  add_message_id: true
+  keep_tool_result: -1
+  chinese_context: "${oc.env:CHINESE_CONTEXT,true}"  # NOTE(review): oc.env yields a string, not a bool — confirm the consumer parses "true"/"false"
+
+
+sub_agents: null  # No sub-agents in this config; main_agent.tool_config lists all tools directly
+
+
+# Can define some top-level or default parameters here
+output_dir: logs/  # Default output directory; can be overridden per run (e.g. output_dir=... on the command line)
+data_dir: "${oc.env:DATA_DIR,data}"  # Points to where data is stored; uses "data" when DATA_DIR is unset
+
+
+
diff --git a/config/benchmark/browsecomp-zh.yaml b/config/benchmark/browsecomp-zh.yaml
@@ -0,0 +1,21 @@
+# config/benchmark/browsecomp-zh.yaml
+defaults:
+  - default  # NOTE(review): presumably the shared benchmark defaults (benchmark/default.yaml) — not visible in this change
+  - _self_  # Listed last so the values below take precedence over the defaults
+
+name: "browsecomp-zh"
+
+data:
+  data_dir: "${data_dir}/browsecomp-zh-test"  # Path to browsecomp-zh-test (Chinese) dataset, resolved under the top-level data_dir
+  metadata_file: "standardized_data.jsonl"  # Metadata filename inside data_dir
+  whitelist: []  # Optional: List of specific task_ids to run
+
+execution:
+  max_tasks: null      # null = no limit, or specify a number
+  max_concurrent: 5    # Number of parallel tasks
+  pass_at_k: 1         # Number of attempts per task
+
+# OpenAI API key for evaluation (required for browsecomp-zh since it has ground truth)
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}"  # Read from the OPENAI_API_KEY environment variable
+
+
diff --git a/docs/mkdocs/docs/browsecomp_zh.md b/docs/mkdocs/docs/browsecomp_zh.md
@@ -0,0 +1,94 @@
+# BrowseComp-ZH (Chinese)
+
+MiroFlow's evaluation on the BrowseComp-ZH benchmark demonstrates advanced web browsing and information retrieval capabilities in the Chinese information ecosystem.
+
+More details: [BrowseComp-ZH: Benchmarking Web Browsing Ability of Large Language Models in Chinese](https://github.com/PALIN2018/BrowseComp-ZH)
+
+---
+
+## Dataset Overview
+
+!!! abstract "Key Dataset Characteristics"
+
+    - **Total Tasks**: 289 complex multi-hop retrieval questions in the test split
+    - **Language**: Chinese (Simplified)
+    - **Task Types**: Web browsing, search, and information retrieval with multi-hop reasoning
+    - **Domains**: 11 domains including Film & TV, Technology, Medicine, History, Sports, and Arts
+    - **Evaluation**: Automated comparison with ground truth answers
+    - **Difficulty**: High-difficulty benchmark designed to test real-world Chinese web browsing capabilities
+
+---
+
+## Quick Start Guide
+
+### Step 1: Prepare the BrowseComp-ZH Dataset
+
+```bash title="Download BrowseComp-ZH Dataset"
+uv run main.py prepare-benchmark get browsecomp-zh-test
+```
+
+This will create the standardized dataset at `data/browsecomp-zh-test/standardized_data.jsonl`.
+
+### Step 2: Configure API Keys
+
+```env title=".env Configuration"
+# Search and web scraping (recommended for Chinese web)
+SERPER_API_KEY="xxx"
+JINA_API_KEY="xxx"
+
+# Code execution
+E2B_API_KEY="xxx"
+
+# LLM (Claude 3.7 Sonnet via OpenRouter)
+OPENROUTER_API_KEY="xxx"
+OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
+
+# Evaluation, hint generation, and final answer extraction
+OPENAI_API_KEY="xxx"
+
+# Vision capabilities
+ANTHROPIC_API_KEY="xxx"
+GEMINI_API_KEY="xxx"
+
+# Optional: Chinese context mode (the provided agent configs default to "true" when unset)
+CHINESE_CONTEXT="true"
+```
+
+### Step 3: Run the Evaluation
+
+```bash title="Run BrowseComp-ZH Evaluation"
+uv run main.py common-benchmark --config_file_name=agent_browsecomp-zh_claude37sonnet output_dir="logs/browsecomp-zh/$(date +"%Y%m%d_%H%M")"
+```
+
+Results are automatically generated in the output directory:
+`benchmark_results.jsonl` contains detailed results for each task, and
+`benchmark_results_pass_at_1_accuracy.txt` contains summary accuracy statistics.
+
+---
+
+## Usage Examples
+
+```bash title="Limited Task Testing"
+# Test with 10 tasks only
+uv run main.py common-benchmark --config_file_name=agent_browsecomp-zh_claude37sonnet benchmark.execution.max_tasks=10 output_dir="logs/browsecomp-zh/$(date +"%Y%m%d_%H%M")"
+```
+
+```bash title="Using MiroThinker Model"
+uv run main.py common-benchmark --config_file_name=agent_browsecomp-zh_mirothinker output_dir="logs/browsecomp-zh/$(date +"%Y%m%d_%H%M")"
+```
+
+---
+
+## Available Agent Configurations
+
+| Agent Configuration | Model | Use Case |
+|-------------------|-------|----------|
+| `agent_browsecomp-zh_claude37sonnet` | Claude 3.7 Sonnet | Recommended for better performance on Chinese tasks |
+| `agent_browsecomp-zh_mirothinker` | MiroThinker | For local deployment |
+
+---
+
+!!! info "Documentation Info"
+    **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI
+
+
diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml
@@ -61,6 +61,7 @@ nav:
     - GAIA-Val-Text: gaia_validation_text_only.md
     - GAIA-Test: gaia_test.md
     - BrowseComp-EN: browsecomp_en.md
+    - BrowseComp-ZH: browsecomp_zh.md
     - WebWalkerQA: webwalkerqa.md
     - FutureX: futurex.md
     - xBench-DeepSearch: xbench_ds.md