8 changes: 5 additions & 3 deletions config/agent_finsearchcomp.yaml
@@ -27,11 +27,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
-o3_hint: true
+hint_generation: true
+hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
-o3_final_answer: true
+final_answer_extraction: true
+final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

-openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
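
For orientation, a minimal sketch of the environment variables that the renamed keys resolve via `${oc.env:...}` interpolation. The variable names are taken from the diff above; the values are placeholders, and the project docs elsewhere in this PR typically set them through a `.env` file rather than shell exports:

```bash
# Placeholder values only; substitute real endpoints and keys.
export HINT_LLM_BASE_URL="https://api.openai.com/v1"          # read by hint_llm_base_url
export FINAL_ANSWER_LLM_BASE_URL="https://api.openai.com/v1"  # read by final_answer_llm_base_url
export OPENAI_API_KEY="sk-..."                                # hint generation and final answer extraction
export CHINESE_CONTEXT="false"                                # read by chinese_context
```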
8 changes: 5 additions & 3 deletions config/agent_gaia-test.yaml
@@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
-o3_hint: true
+hint_generation: true
+hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
-o3_final_answer: true
+final_answer_extraction: true
+final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

-openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
8 changes: 5 additions & 3 deletions config/agent_gaia-validation-text-only.yaml
@@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
-o3_hint: true
+hint_generation: true
+hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
-o3_final_answer: true
+final_answer_extraction: true
+final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

-openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
@@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
-o3_hint: true
+hint_generation: true
+hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
-o3_final_answer: true
+final_answer_extraction: true
+final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

-openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
73 changes: 73 additions & 0 deletions config/agent_gaia-validation_mirothinker.yaml
@@ -0,0 +1,73 @@
defaults:
- benchmark: gaia-validation
- override hydra/job_logging: none
- _self_ # Allow defining variables at the top of this file


main_agent:
prompt_class: MainAgentPrompt_GAIA
llm:
provider_class: "MiroThinkerSGLangClient"
model_name: "MODEL_NAME"
async_client: true
temperature: 0.3
top_p: 1.0
min_p: 0.0
top_k: -1
max_tokens: 4096
oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
keep_tool_result: -1
oai_tool_thinking: false

tool_config:
- tool-reasoning

max_turns: 50 # Maximum number of turns for main agent execution
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
hint_generation: false
hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"

output_process:
final_answer_extraction: true
final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"


sub_agents:
agent-worker:
prompt_class: SubAgentWorkerPrompt
llm:
provider_class: "MiroThinkerSGLangClient"
model_name: "MODEL_NAME"
async_client: true
temperature: 0.3
top_p: 1.0
min_p: 0.0
top_k: -1
max_tokens: 4096
oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
keep_tool_result: -1
oai_tool_thinking: false

tool_config:
- tool-searching
- tool-image-video
- tool-reading
- tool-code
- tool-audio

max_turns: 50 # Maximum number of turns for sub-agent execution
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn


# Can define some top-level or default parameters here
output_dir: logs/
data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored
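
As a usage sketch, the new config could presumably be launched the same way as the other benchmark configs in this PR; the `--config_file_name` value below is an assumption derived from the file name added here:

```bash
# Assumed invocation, mirroring the pattern shown in the Claude 3.7 Sonnet docs below.
uv run main.py common-benchmark \
    --config_file_name=agent_gaia-validation_mirothinker \
    output_dir="logs/gaia-validation-mirothinker/$(date +"%Y%m%d_%H%M")"
```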
8 changes: 5 additions & 3 deletions config/agent_mirothinker.yaml
@@ -26,11 +26,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
-o3_hint: false
+hint_generation: false
+hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
-o3_final_answer: false
+final_answer_extraction: false
+final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

-openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
8 changes: 5 additions & 3 deletions config/agent_quickstart_1.yaml
@@ -28,11 +28,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
-o3_hint: false
+hint_generation: false
+hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
-o3_final_answer: false
+final_answer_extraction: false
+final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

-openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
8 changes: 5 additions & 3 deletions config/agent_xbench-ds.yaml
@@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
-o3_hint: true
+hint_generation: true
+hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
-o3_final_answer: true
+final_answer_extraction: true
+final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

-openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "true"
3 changes: 3 additions & 0 deletions docs/mkdocs/docs/all_about_agents.md
@@ -103,6 +103,9 @@ Welcome to our comprehensive resource collection for AI agents. This page curate
- **Terminal-Bench**: the benchmark for testing AI agents in real terminal environments
- [:material-github: GitHub](https://github.com/laude-institute/terminal-bench)

- **Gaia2 and ARE**: Empowering the Community to Evaluate Agents
- [:material-file-document: Blog Post](https://huggingface.co/blog/gaia2)

---

!!! info "Documentation Info"
2 changes: 1 addition & 1 deletion docs/mkdocs/docs/finsearchcomp.md
@@ -63,7 +63,7 @@ E2B_API_KEY="xxx"
OAI_MIROTHINKER_API_KEY="xxx"
OAI_MIROTHINKER_BASE_URL="http://localhost:61005/v1"

-# Used for o3 hints and final answer extraction
+# Used for hint generation and final answer extraction
OPENAI_API_KEY="xxx"
OPENAI_BASE_URL="https://api.openai.com/v1"

2 changes: 1 addition & 1 deletion docs/mkdocs/docs/futurex.md
@@ -64,7 +64,7 @@ ANTHROPIC_API_KEY="xxx"
# Used for Gemini vision
GEMINI_API_KEY="xxx"

-# Use for llm judge, reasoning, o3 hints, etc.
+# Use for llm judge, reasoning, hint generation, etc.
OPENAI_API_KEY="xxx"
OPENAI_BASE_URL="https://api.openai.com/v1"
```
2 changes: 1 addition & 1 deletion docs/mkdocs/docs/gaia_test.md
@@ -41,7 +41,7 @@ OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
ANTHROPIC_API_KEY="your-anthropic-api-key"
GEMINI_API_KEY="your-gemini-api-key"

-# LLM judge, reasoning, and O3 hints
+# LLM judge, reasoning, and hint generation
OPENAI_API_KEY="your-openai-api-key"
OPENAI_BASE_URL="https://api.openai.com/v1"
```
93 changes: 93 additions & 0 deletions docs/mkdocs/docs/gaia_validation_claude37sonnet.md
@@ -0,0 +1,93 @@
# GAIA Validation - Claude 3.7 Sonnet

MiroFlow demonstrates state-of-the-art performance on the GAIA validation benchmark using Claude 3.7 Sonnet models, showcasing exceptional capabilities in complex reasoning tasks that require multi-step problem solving, information synthesis, and tool usage.

!!! info "Prerequisites"
Before proceeding, please review the [GAIA Validation Prerequisites](gaia_validation_prerequisites.md) document, which covers common setup requirements, dataset preparation, and API key configuration.

---

## Performance Comparison

!!! success "State-of-the-Art Performance with Claude 3.7 Sonnet"
MiroFlow achieves **state-of-the-art (SOTA) performance** among open-source agent frameworks on the GAIA validation set using Claude 3.7 Sonnet.

<div align="center" markdown="1">
![GAIA Validation Performance](../assets/gaia_score.png){ width="100%" }
</div>

!!! abstract "Key Performance Metrics"
- **Pass@3**: **81.8%**
- **Majority Vote**: **82.4%**
- **Pass@1 (best@3)**: **74.5%**
- **Pass@1 (avg@3)**: **72.2%**

!!! info "Reproducibility Guarantee"
Unlike frameworks whose evaluation methods are unclear, MiroFlow produces **fully reproducible** results. Note that Hugging Face access was disabled during inference to prevent direct answer retrieval.

---

## Running the Evaluation

### Step 1: Dataset Preparation

Follow the [dataset preparation instructions](gaia_validation_prerequisites.md#dataset-preparation) in the prerequisites document.

### Step 2: API Keys Configuration

Configure the following API keys in your `.env` file:

```env title="Claude 3.7 Sonnet .env Configuration"
# Primary LLM provider (Claude-3.7-Sonnet via OpenRouter)
OPENROUTER_API_KEY="your-openrouter-api-key"
OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"

# Search and web scraping capabilities
SERPER_API_KEY="your-serper-api-key"
JINA_API_KEY="your-jina-api-key"

# Code execution environment
E2B_API_KEY="your-e2b-api-key"

# Vision understanding capabilities
ANTHROPIC_API_KEY="your-anthropic-api-key"
GEMINI_API_KEY="your-gemini-api-key"

# LLM judge, reasoning, and hint generation
OPENAI_API_KEY="your-openai-api-key"
OPENAI_BASE_URL="https://api.openai.com/v1"
```

### Step 3: Run the Evaluation

Execute the evaluation using the Claude 3.7 Sonnet configuration:

```bash title="Run GAIA Validation with Claude 3.7 Sonnet"
uv run main.py common-benchmark \
--config_file_name=agent_gaia-validation_claude37sonnet \
output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")"
```

### Step 4: Monitor Progress

Follow the [progress monitoring instructions](gaia_validation_prerequisites.md#progress-monitoring-and-resume) in the prerequisites document.

---

## Execution Traces

!!! info "Complete Execution Traces"
We have released our complete execution traces for the `gaia-validation` dataset using Claude 3.7 Sonnet on Hugging Face. This comprehensive collection includes a full run of 165 tasks with an overall accuracy of 73.94% and detailed reasoning traces.

You can download them using the following command:

```bash title="Download Execution Traces"
wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/gaia_validation_miroflow_trace_public_20250825.zip
unzip gaia_validation_miroflow_trace_public_20250825.zip
# Unzip passcode: pf4*
```

---

!!! info "Documentation Info"
**Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI