
Commit d041bbc

feat(doc&benchmark): add tool-searching doc and gaia test config (#41)

* Add GAIA test configuration and documentation
* fix
* add python tool doc

1 parent d9a29ba commit d041bbc

File tree

7 files changed: +351 −2 lines changed

config/agent_gaia-test.yaml

Lines changed: 75 additions & 0 deletions
```yaml
defaults:
  - benchmark: gaia-test
  - override hydra/job_logging: none
  - _self_  # Allow defining variables at the top of this file


main_agent:
  prompt_class: MainAgentPrompt_GAIA
  llm:
    provider_class: "ClaudeOpenRouterClient"
    model_name: "anthropic/claude-3.7-sonnet"
    async_client: true
    temperature: 0.3
    top_p: 0.95
    min_p: 0.0
    top_k: -1
    max_tokens: 32000
    openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
    openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
    openrouter_provider: "anthropic"
    disable_cache_control: false
    keep_tool_result: -1
    oai_tool_thinking: false

  tool_config:
    - tool-reasoning

  max_turns: -1  # Maximum number of turns for main agent execution
  max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn

  input_process:
    o3_hint: true
  output_process:
    o3_final_answer: true

  openai_api_key: "${oc.env:OPENAI_API_KEY,???}"  # used for o3 hints and final answer extraction
  add_message_id: true
  keep_tool_result: -1
  chinese_context: "${oc.env:CHINESE_CONTEXT,false}"


sub_agents:
  agent-worker:
    prompt_class: SubAgentWorkerPrompt
    llm:
      provider_class: "ClaudeOpenRouterClient"
      model_name: "anthropic/claude-3.7-sonnet"
      async_client: true
      temperature: 0.3
      top_p: 0.95
      min_p: 0.0
      top_k: -1
      max_tokens: 32000
      openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
      openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
      openrouter_provider: "anthropic"
      disable_cache_control: false
      keep_tool_result: -1
      oai_tool_thinking: false

    tool_config:
      - tool-searching
      - tool-image-video
      - tool-reading
      - tool-code
      - tool-audio

    max_turns: -1  # Maximum number of turns for sub-agent execution
    max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn


# Can define some top-level or default parameters here
output_dir: logs/
data_dir: "${oc.env:DATA_DIR,data}"  # Points to where data is stored
```
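The `${oc.env:VAR,default}` entries are OmegaConf/Hydra environment-variable interpolations: the value is read from the environment when the config is resolved, falling back to the text after the comma (a `???` default effectively makes the value mandatory). A minimal sketch of this behaviour, assuming only the `omegaconf` package:

```python
# Minimal sketch of ${oc.env:VAR,default} resolution, using omegaconf's
# built-in oc.env resolver (the same machinery Hydra uses).
import os
from omegaconf import OmegaConf

os.environ["OPENROUTER_API_KEY"] = "sk-or-demo"  # pretend the key is exported

cfg = OmegaConf.create(
    "openrouter_api_key: ${oc.env:OPENROUTER_API_KEY,???}\n"
    "data_dir: ${oc.env:DATA_DIR,data}\n"
)
print(cfg.openrouter_api_key)  # "sk-or-demo" -- read from the environment
print(cfg.data_dir)            # "data" -- DATA_DIR unset, falls back to default
```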

config/benchmark/gaia-test.yaml

Lines changed: 16 additions & 0 deletions
```yaml
# config/benchmark/gaia-test.yaml
defaults:
  - default
  - _self_

name: "gaia-test"

data:
  data_dir: "${data_dir}/gaia-test"

execution:
  max_tasks: null  # null means no limit
  max_concurrent: 10
  pass_at_k: 1

openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
```
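Under `execution`, `max_tasks: null` runs every task in the split, `max_concurrent: 10` caps how many tasks are in flight at once, and `pass_at_k: 1` means a single attempt per task. A hypothetical sketch of what the concurrency cap implies (`run_task` is a stand-in, not the repository's actual runner):

```python
# Hypothetical illustration of `max_concurrent: 10`: a semaphore keeps at
# most ten benchmark tasks running at any moment.
import asyncio

async def run_task(task_id: int) -> str:
    await asyncio.sleep(0.1)  # stand-in for running one GAIA task
    return f"task {task_id} done"

async def main(max_concurrent: int = 10) -> None:
    sem = asyncio.Semaphore(max_concurrent)

    async def bounded(task_id: int) -> str:
        async with sem:  # waits while ten tasks are already in flight
            return await run_task(task_id)

    results = await asyncio.gather(*(bounded(i) for i in range(25)))
    print(f"{len(results)} tasks finished")

asyncio.run(main())
```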

docs/mkdocs/docs/gaia_test.md

Lines changed: 46 additions & 1 deletion
The new content below replaces the former "# - Coming Soon -" placeholder:

# GAIA Test

This document provides step-by-step instructions for evaluating the GAIA test benchmark.

### Step 1: Prepare the GAIA Test Dataset

First, download and prepare the GAIA test dataset:

```bash
cd data
wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/gaia-test.zip
unzip gaia-test.zip
# The unzip passcode is: `pf4*`
```

### Step 2: Configure API Keys

Set up the required API keys for model access and tool functionality. Update the `.env` file to include the following keys:

```
# For searching and scraping
SERPER_API_KEY="xxx"
JINA_API_KEY="xxx"

# For Linux sandbox (code execution environment)
E2B_API_KEY="xxx"

# We use Claude-3.7-Sonnet with the OpenRouter backend to initialize the LLM, mainly because OpenRouter provides better response rates
OPENROUTER_API_KEY="xxx"
OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"

# Used for Claude vision understanding
ANTHROPIC_API_KEY="xxx"

# Used for Gemini vision
GEMINI_API_KEY="xxx"

# Used for LLM judge, reasoning, o3 hints, etc.
OPENAI_API_KEY="xxx"
OPENAI_BASE_URL="https://api.openai.com/v1"
```
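Before launching a run, it can help to confirm the keys are actually visible to the process. A quick, optional check, assuming the `python-dotenv` package (the key names are the ones listed above):

```python
# Optional sanity check: load .env and report any missing keys.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current directory
required = [
    "SERPER_API_KEY", "JINA_API_KEY", "E2B_API_KEY",
    "OPENROUTER_API_KEY", "ANTHROPIC_API_KEY",
    "GEMINI_API_KEY", "OPENAI_API_KEY",
]
missing = [key for key in required if not os.getenv(key)]
print("missing keys:", ", ".join(missing) if missing else "none")
```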
### Step 3: Run the Evaluation

Execute the following command to run a single evaluation pass on the GAIA test dataset:

```bash
uv run main.py common-benchmark --config_file_name=agent_gaia-test output_dir="logs/gaia-test/$(date +"%Y%m%d_%H%M")"
```

---

**Last Updated:** Sep 2025

docs/mkdocs/docs/gaia_validation.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -57,7 +57,7 @@ JINA_API_KEY="xxx"
 # For Linux sandbox (code execution environment)
 E2B_API_KEY="xxx"
 
-# We use Claude-3.5-Sonnet with OpenRouter backend to initialize the LLM. The main reason is that OpenRouter provides better response rates
+# We use Claude-3.7-Sonnet with OpenRouter backend to initialize the LLM. The main reason is that OpenRouter provides better response rates
 OPENROUTER_API_KEY="xxx"
 OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
```
docs/mkdocs/docs/tool_python.md

Lines changed: 86 additions & 0 deletions
# Python Tools (`python_server.py`)

The Python Execution Server provides a secure, sandboxed environment for running Python code and shell commands, backed by E2B.

### `create_sandbox()`
Creates a Linux sandbox for safely executing commands and running Python code.

**Returns:**
- `str`: The `sandbox_id` of the newly created sandbox

**Usage Notes:**
- This tool must be called before any other tool in this MCP server
- The sandbox may time out and shut down automatically
- The sandbox comes pre-installed with common packages for data science and document processing. For a detailed list and advanced usage information, see [E2B Extension](./e2b_extension.md)

### `run_command(sandbox_id: str, command: str)`
Executes shell commands in the Linux sandbox.

**Parameters:**
- `sandbox_id`: ID of an existing sandbox (must be created first)
- `command`: Shell command to execute

**Returns:**
- `str`: Command execution result (stderr, stdout, exit_code, error)

**Features:**
- Automatic retry mechanism
- Permission hints for sudo commands

### `run_python_code(sandbox_id: str, code_block: str)`
Runs Python code in the sandbox and returns the execution results.

**Parameters:**
- `sandbox_id`: ID of an existing sandbox
- `code_block`: Python code to execute

**Returns:**
- `str`: Code execution result (stderr, stdout, exit_code, error)

**Features:**
- Automatic retry mechanism

### `upload_file_from_local_to_sandbox(sandbox_id: str, local_file_path: str, sandbox_file_path: str = "/home/user")`
Uploads local files to the sandbox environment.

When a local file is provided to the agent, the agent needs to call this tool to copy the file from local storage into the sandbox for further processing.

**Parameters:**
- `sandbox_id`: ID of an existing sandbox
- `local_file_path`: Local path of the file to upload
- `sandbox_file_path`: Target directory in the sandbox (default: `/home/user`)

**Returns:**
- `str`: Path of the uploaded file in the sandbox, or an error message

### `download_file_from_internet_to_sandbox(sandbox_id: str, url: str, sandbox_file_path: str = "/home/user")`
Downloads files from the internet directly into the sandbox.

**Parameters:**
- `sandbox_id`: ID of an existing sandbox
- `url`: URL of the file to download
- `sandbox_file_path`: Target directory in the sandbox (default: `/home/user`)

**Returns:**
- `str`: Path of the downloaded file in the sandbox, or an error message

**Features:**
- Automatic retry mechanism

### `download_file_from_sandbox_to_local(sandbox_id: str, sandbox_file_path: str, local_filename: str = None)`
Downloads files from the sandbox to the local system for processing by other tools.

Other MCP tools (such as visual question answering) cannot access files inside a sandbox, so this tool should be called whenever the agent wants another tool to analyze a sandbox file.

**Parameters:**
- `sandbox_id`: ID of the sandbox
- `sandbox_file_path`: Path of the file in the sandbox
- `local_filename`: Optional local filename (uses the original name if not provided)

**Returns:**
- `str`: Local path of the downloaded file, or an error message
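Taken together, a typical session creates the sandbox once and threads its `sandbox_id` through every later call. The following is a hypothetical client-side sketch of that ordering; `call_tool` and the file paths are illustrative stand-ins, not APIs or paths from this repository:

```python
# Hypothetical sketch of the expected call order; `call_tool` stands in for
# whatever MCP client invocation the agent framework provides.
def call_tool(name: str, **kwargs) -> str:
    print(f"[tool call] {name} {kwargs}")
    return "stub-result"

# 1. A sandbox must exist before any other tool in this server is used.
sandbox_id = call_tool("create_sandbox")

# 2. Copy a local input file into the sandbox for processing.
call_tool("upload_file_from_local_to_sandbox", sandbox_id=sandbox_id,
          local_file_path="data/report.xlsx", sandbox_file_path="/home/user")

# 3. Run shell commands and Python code against the uploaded file.
call_tool("run_command", sandbox_id=sandbox_id, command="ls -l /home/user")
call_tool("run_python_code", sandbox_id=sandbox_id,
          code_block="print(open('/home/user/report.xlsx', 'rb').read(4))")

# 4. Bring an artifact back out so non-sandbox tools (e.g. VQA) can read it.
local_path = call_tool("download_file_from_sandbox_to_local",
                       sandbox_id=sandbox_id,
                       sandbox_file_path="/home/user/report.xlsx")
```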
---

**Last Updated:** Sep 2025
**Doc Contributor:** Team @ MiroMind AI

docs/mkdocs/docs/tool_searching.md

Lines changed: 125 additions & 0 deletions
# Searching Tools (`searching_mcp_server.py`)

The Searching MCP Server provides comprehensive search capabilities, including Google search, Wikipedia content retrieval, archive searching, and web scraping.

## Environment Variables Used in Tools

- `SERPER_API_KEY`: Required API key for the Serper service. Used by `google_search` and as a fallback for `scrape_website`
- `JINA_API_KEY`: Required API key for the Jina service. Default choice for scraping websites in `scrape_website`
- `REMOVE_SNIPPETS`: Set to "true" to filter snippets out of results. Used in `google_search` to filter the search results returned by Serper
- `REMOVE_KNOWLEDGE_GRAPH`: Set to "true" to remove knowledge graph data. Used in `google_search` to filter the search results returned by Serper
- `REMOVE_ANSWER_BOX`: Set to "true" to remove answer box content. Used in `google_search` to filter the search results returned by Serper (the filtering these three flags imply is sketched below)
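The filtering code itself is not shown in this commit; the following is a plausible sketch of what the three `REMOVE_*` flags do, assuming Serper's usual response keys (`organic`, `answerBox`, `knowledgeGraph`):

```python
# Plausible post-processing sketch (not the server's actual code): drop the
# response sections that the REMOVE_* environment variables ask to remove.
import json
import os

def filter_serper_results(raw: dict) -> str:
    results = dict(raw)
    if os.getenv("REMOVE_ANSWER_BOX", "").lower() == "true":
        results.pop("answerBox", None)
    if os.getenv("REMOVE_KNOWLEDGE_GRAPH", "").lower() == "true":
        results.pop("knowledgeGraph", None)
    if os.getenv("REMOVE_SNIPPETS", "").lower() == "true":
        for item in results.get("organic", []):
            item.pop("snippet", None)
    return json.dumps(results, indent=2)

print(filter_serper_results({
    "organic": [{"title": "GAIA", "snippet": "a benchmark for ..."}],
    "answerBox": {"answer": "..."},
}))
```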
### `google_search(q: str, gl: str = "us", hl: str = "en", location: str = None, num: int = 10, tbs: str = None, page: int = 1)`
Performs Google searches via the Serper API and retrieves rich search results, including organic results, "people also ask", related searches, and the knowledge graph.

**Parameters:**

- `q`: Search query string
- `gl`: Country context for the search (e.g., 'us' for United States, 'cn' for China, 'uk' for United Kingdom). Default: 'us'
- `hl`: Google interface language (e.g., 'en' for English, 'zh' for Chinese, 'es' for Spanish). Default: 'en'
- `location`: City-level location for search results (e.g., 'SoHo, New York, United States', 'California, United States')
- `num`: Number of results to return. Default: 10
- `tbs`: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 'qdr:m' for past month, 'qdr:y' for past year)
- `page`: Page number of results to return. Default: 1

**Returns:**

- `str`: JSON-formatted search results with organic results and related information

**Features:**

- Automatic retry mechanism (up to 5 attempts)
- Configurable result filtering via environment variables
- Support for regional and language-specific searches

### `wiki_get_page_content(entity: str, first_sentences: int = 10)`
Gets the Wikipedia page content for an entity (person, place, concept, or event) and returns structured information.

**Parameters:**

- `entity`: The entity to search for in Wikipedia
- `first_sentences`: Number of opening sentences to return from the page. Set to 0 to return the full content. Default: 10

**Returns:**

- `str`: Formatted content containing the page title, introduction or full content, and URL

**Features:**

- Handles disambiguation pages automatically
- Provides clean, structured output
- Falls back to search suggestions when a page is not found
- Automatic content truncation for manageable output

### `search_wiki_revision(entity: str, year: int, month: int, max_revisions: int = 50)`
Searches for an entity in Wikipedia and returns its revision history for a specific month.

**Parameters:**

- `entity`: The entity to search for in Wikipedia
- `year`: The year of the revision (e.g., 2024)
- `month`: The month of the revision (1-12)
- `max_revisions`: Maximum number of revisions to return. Default: 50

**Returns:**

- `str`: Formatted revision history with timestamps, revision IDs, and URLs

**Features:**

- Automatic date validation and adjustment
- Supports dates from 2000 to the current year
- Detailed revision metadata, including timestamps and direct links
- Clear error handling for invalid dates or missing pages

### `search_archived_webpage(url: str, year: int, month: int, day: int)`
Searches the Wayback Machine (archive.org) for archived versions of a webpage on a specific date.

**Parameters:**

- `url`: The URL to search for in the Wayback Machine
- `year`: The target year (e.g., 2023)
- `month`: The target month (1-12)
- `day`: The target day (1-31)

**Returns:**

- `str`: Formatted archive information, including the archived URL, timestamp, and availability status

**Features:**

- Automatic URL protocol detection and correction
- Date validation and adjustment (1995 to present)
- Falls back to the most recent archive if no snapshot exists for the given date
- Special handling for Wikipedia URLs, with tool suggestions
- Automatic retry mechanism for reliable results

### `scrape_website(url: str)`
Scrapes website content, with support for regular websites as well as YouTube video information.

**Parameters:**

- `url`: The URL of the website to scrape

**Returns:**

- `str`: Scraped website content, including text, metadata, and structured information

**Features:**

- Support for various website types
- YouTube video information extraction (subtitles, titles, descriptions, key moments)
- Automatic content parsing and cleaning
- Integration with the Jina API for enhanced scraping

**Usage Notes:**

- Search engines are not supported by this tool
- For YouTube videos, only non-visual information is provided
- Content may be incomplete for some complex websites
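A hypothetical end-to-end lookup tying these tools together; as in the Python tools doc, `call_tool` is a stand-in for the framework's MCP client, not an API from this repository, and the URLs are placeholders:

```python
# Hypothetical usage sketch of the searching tools documented above.
def call_tool(name: str, **kwargs) -> str:
    print(f"[tool call] {name} {kwargs}")
    return "stub-result"

# Search first, optionally narrowed by region, recency, and page.
results = call_tool("google_search", q="GAIA benchmark leaderboard",
                    gl="us", hl="en", tbs="qdr:m", num=10)

# Then scrape a promising hit for full text (search engines themselves
# are not valid targets for scrape_website).
page = call_tool("scrape_website", url="https://example.com/some-result")

# Wikipedia-specific lookups have dedicated tools.
intro = call_tool("wiki_get_page_content", entity="Alan Turing",
                  first_sentences=5)
revisions = call_tool("search_wiki_revision", entity="Alan Turing",
                      year=2024, month=3)

# Historical snapshots go through the Wayback Machine helper.
archived = call_tool("search_archived_webpage",
                     url="https://example.com", year=2023, month=6, day=1)
```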
---

**Last Updated:** Sep 2025
**Doc Contributor:** Team @ MiroMind AI

docs/mkdocs/mkdocs.yml

Lines changed: 2 additions & 0 deletions
```diff
@@ -60,6 +60,8 @@ nav:
   - Tools:
     - tool-reasoning: tool_reasoning.md
     - tool-vqa: tool_vqa.md
+    - tool-searching: tool_searching.md
+    - tool-python: tool_python.md
   - Advanced Features:
     - E2B Advanced Features: e2b_advanced_features.md
     - Add New Tools: contribute_tools.md
```
