
Commit 60df257

feat(benchmark): add support for WebWalkerQA dataset (#84)
add support for webwalkerqa
Parent: 7652386 · Commit: 60df257

6 files changed · +267 −0 lines
config/agent_webwalkerqa_claude37sonnet.yaml

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
```yaml
defaults:
  - benchmark: webwalkerqa
  - override hydra/job_logging: none
  - _self_  # Allow defining variables at the top of this file


main_agent:
  prompt_class: MainAgentPrompt_GAIA
  llm:
    provider_class: "ClaudeOpenRouterClient"
    model_name: "anthropic/claude-3.7-sonnet"
    async_client: true
    temperature: 0.3
    top_p: 0.95
    min_p: 0.0
    top_k: -1
    max_tokens: 32000
    openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
    openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
    openrouter_provider: "anthropic"
    disable_cache_control: false
    keep_tool_result: -1
    oai_tool_thinking: false

  tool_config:
    - tool-searching
    - tool-image-video
    - tool-reading
    - tool-code
    - tool-audio
    - tool-reasoning

  max_turns: 50  # Maximum number of turns for main agent execution
  max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn

  input_process:
    hint_generation: true
    hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
  output_process:
    final_answer_extraction: true
    final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

  openai_api_key: "${oc.env:OPENAI_API_KEY,???}"  # used for hint generation and final answer extraction
  add_message_id: true
  keep_tool_result: -1
  chinese_context: "${oc.env:CHINESE_CONTEXT,false}"


sub_agents: null


# Can define some top-level or default parameters here
output_dir: logs/
data_dir: "${oc.env:DATA_DIR,data}"  # Points to where data is stored
```
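Any value in this file can also be overridden per run with Hydra's dotted-key syntax instead of editing the YAML. A minimal sketch, assuming the file is loaded as `agent_webwalkerqa_claude37sonnet` and the key nesting shown above (e.g. `main_agent.llm.temperature`):

```bash
# Sketch: override sampling and turn limits at launch time (key paths follow the layout above)
uv run main.py common-benchmark \
  --config_file_name=agent_webwalkerqa_claude37sonnet \
  main_agent.llm.temperature=0.1 \
  main_agent.max_turns=30 \
  output_dir="logs/webwalkerqa/$(date +"%Y%m%d_%H%M")"
```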
config/agent_webwalkerqa_mirothinker.yaml

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
```yaml
defaults:
  - benchmark: webwalkerqa
  - override hydra/job_logging: none
  - _self_  # Allow defining variables at the top of this file


main_agent:
  prompt_class: MainAgentPrompt_GAIA
  llm:
    provider_class: "MiroThinkerSGLangClient"
    model_name: "DUMMY_MODEL_NAME"
    async_client: true
    temperature: 0.3
    top_p: 0.95
    min_p: 0.0
    top_k: -1
    max_tokens: 4096
    oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
    oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
    keep_tool_result: -1
    oai_tool_thinking: false

  tool_config:
    - tool-searching
    - tool-image-video
    - tool-reading
    - tool-code
    - tool-audio
    - tool-reasoning

  max_turns: 50  # Maximum number of turns for main agent execution
  max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn

  input_process:
    hint_generation: false
    hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"

  output_process:
    final_answer_extraction: true
    final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

  openai_api_key: "${oc.env:OPENAI_API_KEY,???}"  # used for hint generation and final answer extraction
  add_message_id: true
  keep_tool_result: -1
  chinese_context: "${oc.env:CHINESE_CONTEXT,false}"


sub_agents: null


# Can define some top-level or default parameters here
output_dir: logs/
data_dir: "${oc.env:DATA_DIR,data}"  # Points to where data is stored
```
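Because `model_name` is shipped as `DUMMY_MODEL_NAME`, it is expected to be overridden when launching against a locally served MiroThinker model. A hedged sketch, assuming this config is loaded as `agent_webwalkerqa_mirothinker` and the key nesting shown above; the model name is a placeholder:

```bash
# Sketch: run against a locally served MiroThinker model (model name is a placeholder)
export OAI_MIROTHINKER_BASE_URL="http://localhost:61005/v1"
uv run main.py common-benchmark \
  --config_file_name=agent_webwalkerqa_mirothinker \
  main_agent.llm.model_name="your-served-model-name" \
  output_dir="logs/webwalkerqa/$(date +"%Y%m%d_%H%M")"
```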

config/benchmark/webwalkerqa.yaml

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
```yaml
# config/benchmark/webwalkerqa.yaml
defaults:
  - default
  - _self_

name: "webwalkerqa"

data:
  data_dir: "${data_dir}/webwalkerqa"  # Path to webwalkerqa dataset
  metadata_file: "standardized_data.jsonl"  # Metadata filename
  whitelist: []  # Optional: List of specific task_ids to run

execution:
  max_tasks: null  # null = no limit, or specify a number
  max_concurrent: 5  # Number of parallel tasks
  pass_at_k: 1  # Number of attempts per task

# OpenAI API key for evaluation (required for webwalkerqa since it has ground truth)
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
```
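The `data` and `execution` settings above can likewise be overridden per run. A sketch using standard Hydra override syntax for the keys defined above; the task IDs in the whitelist are hypothetical placeholders:

```bash
# Sketch: run only two specific tasks with reduced concurrency (task IDs are placeholders)
uv run main.py common-benchmark \
  --config_file_name=agent_webwalkerqa_claude37sonnet \
  'benchmark.data.whitelist=[task_0001,task_0002]' \
  benchmark.execution.max_concurrent=2 \
  output_dir="logs/webwalkerqa/whitelist_test"
```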

docs/mkdocs/docs/mirothinker.md

Lines changed: 1 addition & 0 deletions
````diff
@@ -60,6 +60,7 @@ uv run main.py common-benchmark --config_file_name=agent_llm_mirothinker output_
 ```
 
 This command will:
+
 - Use the `agent_llm_mirothinker` configuration with the dedicated MiroThinkerSGLangClient
 - Run the example dataset benchmark (configured in the YAML file)
 - Test the model's question-answering capabilities
````

docs/mkdocs/docs/webwalkerqa.md

Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
# WebWalkerQA

MiroFlow's evaluation on the WebWalkerQA benchmark demonstrates web navigation and question-answering capabilities across diverse domains.

More details: [WebWalkerQA on HuggingFace](https://huggingface.co/datasets/MiromindAI/WebWalkerQA)

---

## Dataset Overview

!!! abstract "Key Dataset Characteristics"

    - **Total Tasks**: 680 tasks in the main split
    - **Language**: English
    - **Domains**: Conference, game, academic, business, and more
    - **Task Types**: Web navigation, information retrieval, multi-hop reasoning
    - **Difficulty Levels**: Easy, medium, hard
    - **Evaluation**: Automated comparison with ground truth answers

---

## Quick Start Guide

### Step 1: Prepare the WebWalkerQA Dataset

```bash title="Download WebWalkerQA Dataset"
uv run main.py prepare-benchmark get webwalkerqa
```

This will create the standardized dataset at `data/webwalkerqa/standardized_data.jsonl`.
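To sanity-check the download before a full run, you can count the tasks and pretty-print the first record; the commands below only read the file and make no assumptions about its schema.

```bash title="Inspect the Prepared Dataset"
# Count tasks and pretty-print the first record of the standardized file
wc -l data/webwalkerqa/standardized_data.jsonl
head -n 1 data/webwalkerqa/standardized_data.jsonl | python -m json.tool
```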
### Step 2: Configure API Keys

=== "Claude 3.7 Sonnet"

    ```env title=".env Configuration"
    # Search and web scraping
    SERPER_API_KEY="xxx"
    JINA_API_KEY="xxx"

    # Code execution
    E2B_API_KEY="xxx"

    # LLM (Claude 3.7 Sonnet via OpenRouter)
    OPENROUTER_API_KEY="xxx"
    OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"

    # Evaluation and hint generation
    OPENAI_API_KEY="xxx"

    # Vision capabilities
    ANTHROPIC_API_KEY="xxx"
    GEMINI_API_KEY="xxx"
    ```

=== "MiroThinker"

    ```env title=".env Configuration"
    # Search and web scraping
    SERPER_API_KEY="xxx"
    JINA_API_KEY="xxx"

    # Code execution
    E2B_API_KEY="xxx"

    # LLM (MiroThinker via SGLang)
    OAI_MIROTHINKER_API_KEY="dummy_key"
    OAI_MIROTHINKER_BASE_URL="http://localhost:61005/v1"

    # Evaluation and final answer extraction
    OPENAI_API_KEY="xxx"

    # Vision capabilities
    ANTHROPIC_API_KEY="xxx"
    GEMINI_API_KEY="xxx"
    ```
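The MiroThinker tab assumes an OpenAI-compatible endpoint is already serving the model at `OAI_MIROTHINKER_BASE_URL`. As a rough sketch (not the project's official launch command), a local SGLang server could be started along these lines; the checkpoint path is a placeholder and flags may vary with your SGLang version:

```bash title="Start a Local SGLang Server (sketch)"
# Placeholder checkpoint path; adjust to your local MiroThinker weights
python -m sglang.launch_server \
  --model-path /path/to/mirothinker-checkpoint \
  --host 0.0.0.0 \
  --port 61005
```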
### Step 3: Run the Evaluation

```bash title="Run WebWalkerQA Evaluation"
uv run main.py common-benchmark --config_file_name=agent_webwalkerqa_claude37sonnet output_dir="logs/webwalkerqa/$(date +"%Y%m%d_%H%M")"
```

!!! tip "Progress Monitoring and Resume"
    To check the progress while running:

    ```bash title="Check Progress"
    ls -lh logs/webwalkerqa/YOUR_RUN_DIR/
    ```

    If you need to resume an interrupted evaluation, specify the same output directory:

    ```bash title="Resume Evaluation"
    uv run main.py common-benchmark --config_file_name=agent_webwalkerqa_claude37sonnet output_dir=${PATH_TO_LOG}
    ```

Results are automatically generated in the output directory (see the quick check below):

- `benchmark_results.jsonl` - Detailed results for each task
- `benchmark_results_pass_at_1_accuracy.txt` - Summary accuracy statistics
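Once the run finishes (or while it is still in progress), both files can be checked directly from the shell. A minimal sketch, assuming the default filenames listed above:

```bash title="Quick Results Check"
RUN_DIR="logs/webwalkerqa/YOUR_RUN_DIR"

# Summary accuracy written by the evaluator
cat "${RUN_DIR}/benchmark_results_pass_at_1_accuracy.txt"

# Number of task records produced so far
wc -l "${RUN_DIR}/benchmark_results.jsonl"
```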
---

## Usage Examples

```bash title="Limited Task Testing"
# Test with 10 tasks only
uv run main.py common-benchmark --config_file_name=agent_webwalkerqa_claude37sonnet benchmark.execution.max_tasks=10 output_dir="logs/webwalkerqa/test"
```

```bash title="Custom Concurrency"
# Run with 10 concurrent tasks
uv run main.py common-benchmark --config_file_name=agent_webwalkerqa_claude37sonnet benchmark.execution.max_concurrent=10 output_dir="logs/webwalkerqa/$(date +"%Y%m%d_%H%M")"
```

```bash title="Using MiroThinker Model"
uv run main.py common-benchmark --config_file_name=agent_webwalkerqa_mirothinker output_dir="logs/webwalkerqa/$(date +"%Y%m%d_%H%M")"
```

---

## Available Agent Configurations

| Agent Configuration | Model | Use Case |
|---------------------|-------|----------|
| `agent_webwalkerqa_claude37sonnet` | Claude 3.7 Sonnet | Recommended for best performance |
| `agent_webwalkerqa_mirothinker` | MiroThinker | For local deployment |

---

!!! info "Documentation Info"
    **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI

docs/mkdocs/mkdocs.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -61,6 +61,7 @@ nav:
     - GAIA-Val-Text: gaia_validation_text_only.md
     - GAIA-Test: gaia_test.md
     - BrowseComp-EN: browsecomp_en.md
+    - WebWalkerQA: webwalkerqa.md
     - FutureX: futurex.md
     - xBench-DeepSearch: xbench_ds.md
     - FinSearchComp: finsearchcomp.md
```
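To confirm the new nav entry renders correctly, the docs site can be previewed locally. A sketch, assuming `mkdocs` with the Material theme is available in the project environment:

```bash
# Sketch: serve the docs locally and check the WebWalkerQA page in the nav
uv run mkdocs serve -f docs/mkdocs/mkdocs.yml
```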
