diff --git a/config/agent_browsecomp-zh_claude37sonnet.yaml b/config/agent_browsecomp-zh_claude37sonnet.yaml new file mode 100644 index 0000000..92fd3b2 --- /dev/null +++ b/config/agent_browsecomp-zh_claude37sonnet.yaml @@ -0,0 +1,79 @@ +defaults: + - benchmark: browsecomp-zh + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPrompt_GAIA + llm: + provider_class: "ClaudeOpenRouterClient" + model_name: "anthropic/claude-3.7-sonnet" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 + openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" + openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" + openrouter_provider: "anthropic" + disable_cache_control: false + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-reasoning + + max_turns: 50 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + hint_generation: true + hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + output_process: + final_answer_extraction: true + final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction + add_message_id: true + keep_tool_result: -1 + chinese_context: "${oc.env:CHINESE_CONTEXT,true}" + + +sub_agents: + agent-worker: + prompt_class: SubAgentWorkerPrompt + llm: + provider_class: "ClaudeOpenRouterClient" + model_name: "anthropic/claude-3.7-sonnet" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 + openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" + openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" + openrouter_provider: "anthropic" + disable_cache_control: false + 
keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-searching + - tool-image-video + - tool-reading + - tool-code + - tool-audio + + max_turns: 50 # Maximum number of turns for sub-agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + + + diff --git a/config/agent_browsecomp-zh_mirothinker.yaml b/config/agent_browsecomp-zh_mirothinker.yaml new file mode 100644 index 0000000..f3edf13 --- /dev/null +++ b/config/agent_browsecomp-zh_mirothinker.yaml @@ -0,0 +1,56 @@ +defaults: + - benchmark: browsecomp-zh + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPrompt_GAIA + llm: + provider_class: "MiroThinkerSGLangClient" + model_name: "DUMMY_MODEL_NAME" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 4096 + oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}" + oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}" + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-reasoning + - tool-searching + - tool-image-video + - tool-reading + - tool-code + - tool-audio + + max_turns: 50 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + hint_generation: false + hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + + output_process: + final_answer_extraction: true + final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction + add_message_id: true + keep_tool_result: -1 + chinese_context: "${oc.env:CHINESE_CONTEXT,true}" + + 
+sub_agents: null + + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + + + diff --git a/config/benchmark/browsecomp-zh.yaml b/config/benchmark/browsecomp-zh.yaml new file mode 100644 index 0000000..8dfb1e5 --- /dev/null +++ b/config/benchmark/browsecomp-zh.yaml @@ -0,0 +1,21 @@ +# config/benchmark/browsecomp-zh.yaml +defaults: + - default + - _self_ + +name: "browsecomp-zh" + +data: + data_dir: "${data_dir}/browsecomp-zh-test" # Path to browsecomp-zh-test (Chinese) dataset + metadata_file: "standardized_data.jsonl" # Metadata filename + whitelist: [] # Optional: List of specific task_ids to run + +execution: + max_tasks: null # null = no limit, or specify a number + max_concurrent: 5 # Number of parallel tasks + pass_at_k: 1 # Number of attempts per task + +# OpenAI API key for evaluation (required for browsecomp-zh since it has ground truth) +openai_api_key: "${oc.env:OPENAI_API_KEY,???}" + + diff --git a/docs/mkdocs/docs/browsecomp_zh.md b/docs/mkdocs/docs/browsecomp_zh.md new file mode 100644 index 0000000..b9f237d --- /dev/null +++ b/docs/mkdocs/docs/browsecomp_zh.md @@ -0,0 +1,94 @@ +# BrowseComp-ZH (Chinese) + +MiroFlow's evaluation on the BrowseComp-ZH benchmark demonstrates advanced web browsing and information retrieval capabilities in the Chinese information ecosystem. + +More details: [BrowseComp-ZH: Benchmarking Web Browsing Ability of Large Language Models in Chinese](https://github.com/PALIN2018/BrowseComp-ZH) + +--- + +## Dataset Overview + +!!! 
abstract "Key Dataset Characteristics" + + - **Total Tasks**: 289 complex multi-hop retrieval questions in the test split + - **Language**: Chinese (Simplified) + - **Task Types**: Web browsing, search, and information retrieval with multi-hop reasoning + - **Domains**: 11 domains including Film & TV, Technology, Medicine, History, Sports, and Arts + - **Evaluation**: Automated comparison with ground truth answers + - **Difficulty**: High-difficulty benchmark designed to test real-world Chinese web browsing capabilities + +--- + +## Quick Start Guide + +### Step 1: Prepare the BrowseComp-ZH Dataset + +```bash title="Download BrowseComp-ZH Dataset" +uv run main.py prepare-benchmark get browsecomp-zh-test +``` + +This will create the standardized dataset at `data/browsecomp-zh-test/standardized_data.jsonl`. + +### Step 2: Configure API Keys + +```env title=".env Configuration" +# Search and web scraping (recommended for Chinese web) +SERPER_API_KEY="xxx" +JINA_API_KEY="xxx" + +# Code execution +E2B_API_KEY="xxx" + +# LLM (Claude 3.7 Sonnet via OpenRouter) +OPENROUTER_API_KEY="xxx" +OPENROUTER_BASE_URL="https://openrouter.ai/api/v1" + +# Evaluation and hint generation +OPENAI_API_KEY="xxx" + +# Vision capabilities +ANTHROPIC_API_KEY="xxx" +GEMINI_API_KEY="xxx" + +# Optional: Set Chinese context mode +CHINESE_CONTEXT="true" +``` + +### Step 3: Run the Evaluation + +```bash title="Run BrowseComp-ZH Evaluation" +uv run main.py common-benchmark --config_file_name=agent_browsecomp-zh_claude37sonnet output_dir="logs/browsecomp-zh/$(date +"%Y%m%d_%H%M")" +``` + +Results are automatically generated in the output directory: +- `benchmark_results.jsonl` - Detailed results for each task +- `benchmark_results_pass_at_1_accuracy.txt` - Summary accuracy statistics + +--- + +## Usage Examples + +```bash title="Limited Task Testing" +# Test with 10 tasks only +uv run main.py common-benchmark --config_file_name=agent_browsecomp-zh_claude37sonnet benchmark.execution.max_tasks=10 
output_dir="logs/browsecomp-zh/$(date +"%Y%m%d_%H%M")" +``` + +```bash title="Using MiroThinker Model" +uv run main.py common-benchmark --config_file_name=agent_browsecomp-zh_mirothinker output_dir="logs/browsecomp-zh/$(date +"%Y%m%d_%H%M")" +``` + +--- + +## Available Agent Configurations + +| Agent Configuration | Model | Use Case | +|-------------------|-------|----------| +| `agent_browsecomp-zh_claude37sonnet` | Claude 3.7 Sonnet | Recommended for better performance on Chinese tasks | +| `agent_browsecomp-zh_mirothinker` | MiroThinker | For local deployment | + +--- + +!!! info "Documentation Info" + **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI + + diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml index 7b33d6b..f35b7b3 100644 --- a/docs/mkdocs/mkdocs.yml +++ b/docs/mkdocs/mkdocs.yml @@ -61,6 +61,7 @@ nav: - GAIA-Val-Text: gaia_validation_text_only.md - GAIA-Test: gaia_test.md - BrowseComp-EN: browsecomp_en.md + - BrowseComp-ZH: browsecomp_zh.md - WebWalkerQA: webwalkerqa.md - FutureX: futurex.md - xBench-DeepSearch: xbench_ds.md