From 4a7326c927f65e4678fecbdc6cd243a647823d3d Mon Sep 17 00:00:00 2001 From: BinWang28 Date: Wed, 15 Oct 2025 14:00:34 +0800 Subject: [PATCH] add support for webwalkerqa --- config/agent_webwalkerqa_claude37sonnet.yaml | 56 ++++++++ config/agent_webwalkerqa_mirothinker.yaml | 56 ++++++++ config/benchmark/webwalkerqa.yaml | 21 +++ docs/mkdocs/docs/mirothinker.md | 1 + docs/mkdocs/docs/webwalkerqa.md | 132 +++++++++++++++++++ docs/mkdocs/mkdocs.yml | 1 + 6 files changed, 267 insertions(+) create mode 100644 config/agent_webwalkerqa_claude37sonnet.yaml create mode 100644 config/agent_webwalkerqa_mirothinker.yaml create mode 100644 config/benchmark/webwalkerqa.yaml create mode 100644 docs/mkdocs/docs/webwalkerqa.md diff --git a/config/agent_webwalkerqa_claude37sonnet.yaml b/config/agent_webwalkerqa_claude37sonnet.yaml new file mode 100644 index 0000000..df76d98 --- /dev/null +++ b/config/agent_webwalkerqa_claude37sonnet.yaml @@ -0,0 +1,56 @@ +defaults: + - benchmark: webwalkerqa + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPrompt_GAIA + llm: + provider_class: "ClaudeOpenRouterClient" + model_name: "anthropic/claude-3.7-sonnet" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 + openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" + openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" + openrouter_provider: "anthropic" + disable_cache_control: false + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-searching + - tool-image-video + - tool-reading + - tool-code + - tool-audio + - tool-reasoning + + max_turns: 50 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + hint_generation: true + hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + output_process: + 
final_answer_extraction: true + final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction + add_message_id: true + keep_tool_result: -1 + chinese_context: "${oc.env:CHINESE_CONTEXT,false}" + + +sub_agents: null + + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + + diff --git a/config/agent_webwalkerqa_mirothinker.yaml b/config/agent_webwalkerqa_mirothinker.yaml new file mode 100644 index 0000000..282e493 --- /dev/null +++ b/config/agent_webwalkerqa_mirothinker.yaml @@ -0,0 +1,56 @@ +defaults: + - benchmark: webwalkerqa + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPrompt_GAIA + llm: + provider_class: "MiroThinkerSGLangClient" + model_name: "DUMMY_MODEL_NAME" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 4096 + oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}" + oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}" + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-searching + - tool-image-video + - tool-reading + - tool-code + - tool-audio + - tool-reasoning + + max_turns: 50 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + hint_generation: false + hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + + output_process: + final_answer_extraction: true + final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction + add_message_id: true + keep_tool_result: -1 + 
chinese_context: "${oc.env:CHINESE_CONTEXT,false}" + + +sub_agents: null + + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + + + diff --git a/config/benchmark/webwalkerqa.yaml b/config/benchmark/webwalkerqa.yaml new file mode 100644 index 0000000..3a7ad71 --- /dev/null +++ b/config/benchmark/webwalkerqa.yaml @@ -0,0 +1,21 @@ +# config/benchmark/webwalkerqa.yaml +defaults: + - default + - _self_ + +name: "webwalkerqa" + +data: + data_dir: "${data_dir}/webwalkerqa" # Path to webwalkerqa dataset + metadata_file: "standardized_data.jsonl" # Metadata filename + whitelist: [] # Optional: List of specific task_ids to run + +execution: + max_tasks: null # null = no limit, or specify a number + max_concurrent: 5 # Number of parallel tasks + pass_at_k: 1 # Number of attempts per task + +# OpenAI API key for evaluation (required for webwalkerqa since it has ground truth) +openai_api_key: "${oc.env:OPENAI_API_KEY,???}" + + diff --git a/docs/mkdocs/docs/mirothinker.md b/docs/mkdocs/docs/mirothinker.md index 93213d8..ec284ea 100644 --- a/docs/mkdocs/docs/mirothinker.md +++ b/docs/mkdocs/docs/mirothinker.md @@ -60,6 +60,7 @@ uv run main.py common-benchmark --config_file_name=agent_llm_mirothinker output_ ``` This command will: + - Use the `agent_llm_mirothinker` configuration with the dedicated MiroThinkerSGLangClient - Run the example dataset benchmark (configured in the YAML file) - Test the model's question-answering capabilities diff --git a/docs/mkdocs/docs/webwalkerqa.md b/docs/mkdocs/docs/webwalkerqa.md new file mode 100644 index 0000000..020c3bd --- /dev/null +++ b/docs/mkdocs/docs/webwalkerqa.md @@ -0,0 +1,132 @@ +# WebWalkerQA + +MiroFlow's evaluation on the WebWalkerQA benchmark demonstrates web navigation and question-answering capabilities across diverse domains. 
+ +More details: [WebWalkerQA on HuggingFace](https://huggingface.co/datasets/MiromindAI/WebWalkerQA) + +--- + +## Dataset Overview + +!!! abstract "Key Dataset Characteristics" + + - **Total Tasks**: 680 tasks in the main split + - **Language**: English + - **Domains**: Conference, game, academic, business, and more + - **Task Types**: Web navigation, information retrieval, multi-hop reasoning + - **Difficulty Levels**: Easy, medium, hard + - **Evaluation**: Automated comparison with ground truth answers + +--- + +## Quick Start Guide + +### Step 1: Prepare the WebWalkerQA Dataset + +```bash title="Download WebWalkerQA Dataset" +uv run main.py prepare-benchmark get webwalkerqa +``` + +This will create the standardized dataset at `data/webwalkerqa/standardized_data.jsonl`. + +### Step 2: Configure API Keys + +=== "Claude 3.7 Sonnet" + + ```env title=".env Configuration" + # Search and web scraping + SERPER_API_KEY="xxx" + JINA_API_KEY="xxx" + + # Code execution + E2B_API_KEY="xxx" + + # LLM (Claude 3.7 Sonnet via OpenRouter) + OPENROUTER_API_KEY="xxx" + OPENROUTER_BASE_URL="https://openrouter.ai/api/v1" + + # Evaluation and hint generation + OPENAI_API_KEY="xxx" + + # Vision capabilities + ANTHROPIC_API_KEY="xxx" + GEMINI_API_KEY="xxx" + ``` + +=== "MiroThinker" + + ```env title=".env Configuration" + # Search and web scraping + SERPER_API_KEY="xxx" + JINA_API_KEY="xxx" + + # Code execution + E2B_API_KEY="xxx" + + # LLM (MiroThinker via SGLang) + OAI_MIROTHINKER_API_KEY="dummy_key" + OAI_MIROTHINKER_BASE_URL="http://localhost:61005/v1" + + # Evaluation and final answer extraction + OPENAI_API_KEY="xxx" + + # Vision capabilities + ANTHROPIC_API_KEY="xxx" + GEMINI_API_KEY="xxx" + ``` + +### Step 3: Run the Evaluation + +```bash title="Run WebWalkerQA Evaluation" +uv run main.py common-benchmark --config_file_name=agent_webwalkerqa_claude37sonnet output_dir="logs/webwalkerqa/$(date +"%Y%m%d_%H%M")" +``` + +!!! 
tip "Progress Monitoring and Resume" + To check the progress while running: + + ```bash title="Check Progress" + ls -lh logs/webwalkerqa/YOUR_RUN_DIR/ + ``` + + If you need to resume an interrupted evaluation, specify the same output directory: + + ```bash title="Resume Evaluation" + uv run main.py common-benchmark --config_file_name=agent_webwalkerqa_claude37sonnet output_dir=${PATH_TO_LOG} + ``` + +Results are automatically generated in the output directory: +- `benchmark_results.jsonl` - Detailed results for each task +- `benchmark_results_pass_at_1_accuracy.txt` - Summary accuracy statistics + +--- + +## Usage Examples + +```bash title="Limited Task Testing" +# Test with 10 tasks only +uv run main.py common-benchmark --config_file_name=agent_webwalkerqa_claude37sonnet benchmark.execution.max_tasks=10 output_dir="logs/webwalkerqa/test" +``` + +```bash title="Custom Concurrency" +# Run with 10 concurrent tasks +uv run main.py common-benchmark --config_file_name=agent_webwalkerqa_claude37sonnet benchmark.execution.max_concurrent=10 output_dir="logs/webwalkerqa/$(date +"%Y%m%d_%H%M")" +``` + +```bash title="Using MiroThinker Model" +uv run main.py common-benchmark --config_file_name=agent_webwalkerqa_mirothinker output_dir="logs/webwalkerqa/$(date +"%Y%m%d_%H%M")" +``` + +--- + +## Available Agent Configurations + +| Agent Configuration | Model | Use Case | +|-------------------|-------|----------| +| `agent_webwalkerqa_claude37sonnet` | Claude 3.7 Sonnet | Recommended for best performance | +| `agent_webwalkerqa_mirothinker` | MiroThinker | For local deployment | + +--- + +!!! 
info "Documentation Info"
+    **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI
+
diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml
index b98703d..7b33d6b 100644
--- a/docs/mkdocs/mkdocs.yml
+++ b/docs/mkdocs/mkdocs.yml
@@ -61,6 +61,7 @@ nav:
     - GAIA-Val-Text: gaia_validation_text_only.md
     - GAIA-Test: gaia_test.md
     - BrowseComp-EN: browsecomp_en.md
+    - WebWalkerQA: webwalkerqa.md
     - FutureX: futurex.md
     - xBench-DeepSearch: xbench_ds.md
     - FinSearchComp: finsearchcomp.md