Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions config/agent_finsearchcomp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
o3_hint: true
hint_generation: true
hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
o3_final_answer: true
final_answer_extraction: true
final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
Expand Down
8 changes: 5 additions & 3 deletions config/agent_gaia-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
o3_hint: true
hint_generation: true
hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
o3_final_answer: true
final_answer_extraction: true
final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
Expand Down
8 changes: 5 additions & 3 deletions config/agent_gaia-validation-text-only.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
o3_hint: true
hint_generation: true
hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
o3_final_answer: true
final_answer_extraction: true
final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
o3_hint: true
hint_generation: true
hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
o3_final_answer: true
final_answer_extraction: true
final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
Expand Down
73 changes: 73 additions & 0 deletions config/agent_gaia-validation_mirothinker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# Agent configuration for the gaia-validation benchmark in which both the main
# agent and the `agent-worker` sub-agent use the MiroThinkerSGLangClient provider.
defaults:
  - benchmark: gaia-validation
  - override hydra/job_logging: none
  - _self_  # Allow defining variables at the top of this file


main_agent:
  prompt_class: MainAgentPrompt_GAIA
  llm:
    provider_class: "MiroThinkerSGLangClient"
    # NOTE(review): "MODEL_NAME" looks like a placeholder to be replaced with
    # the name of the served model — confirm before running.
    model_name: "MODEL_NAME"
    async_client: true
    temperature: 0.3
    top_p: 1.0
    min_p: 0.0
    top_k: -1
    max_tokens: 4096
    # Both values are read from the environment; the text after the comma is
    # the fallback used when the variable is unset.
    oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
    oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
    keep_tool_result: -1
    oai_tool_thinking: false

  tool_config:
    - tool-reasoning

  max_turns: 50  # Maximum number of turns for main agent execution
  max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn

  input_process:
    hint_generation: false
    hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"

  output_process:
    final_answer_extraction: true
    final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

  openai_api_key: "${oc.env:OPENAI_API_KEY,???}"  # used for hint generation and final answer extraction
  add_message_id: true
  keep_tool_result: -1
  chinese_context: "${oc.env:CHINESE_CONTEXT,false}"


sub_agents:
  agent-worker:
    prompt_class: SubAgentWorkerPrompt
    llm:
      provider_class: "MiroThinkerSGLangClient"
      # NOTE(review): same placeholder as the main agent's model_name — confirm.
      model_name: "MODEL_NAME"
      async_client: true
      temperature: 0.3
      top_p: 1.0
      min_p: 0.0
      top_k: -1
      max_tokens: 4096
      oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
      oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
      keep_tool_result: -1
      oai_tool_thinking: false

    tool_config:
      - tool-searching
      - tool-image-video
      - tool-reading
      - tool-code
      - tool-audio

    max_turns: 50  # Maximum number of turns for sub-agent execution
    max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn


# Can define some top-level or default parameters here
output_dir: logs/
data_dir: "${oc.env:DATA_DIR,data}"  # Points to where data is stored
8 changes: 5 additions & 3 deletions config/agent_mirothinker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
o3_hint: false
hint_generation: false
hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
o3_final_answer: false
final_answer_extraction: false
final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
Expand Down
8 changes: 5 additions & 3 deletions config/agent_quickstart_1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
o3_hint: false
hint_generation: false
hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
o3_final_answer: false
final_answer_extraction: false
final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
Expand Down
8 changes: 5 additions & 3 deletions config/agent_xbench-ds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
o3_hint: true
hint_generation: true
hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
o3_final_answer: true
final_answer_extraction: true
final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "true"
Expand Down
3 changes: 3 additions & 0 deletions docs/mkdocs/docs/all_about_agents.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ Welcome to our comprehensive resource collection for AI agents. This page curate
- **Terminal-Bench**: the benchmark for testing AI agents in real terminal environments
- [:material-github: GitHub](https://github.com/laude-institute/terminal-bench)

- **Gaia2 and ARE**: Empowering the Community to Evaluate Agents
- [:material-file-document: Blog Post](https://huggingface.co/blog/gaia2)

---

!!! info "Documentation Info"
Expand Down
2 changes: 1 addition & 1 deletion docs/mkdocs/docs/finsearchcomp.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ E2B_API_KEY="xxx"
OAI_MIROTHINKER_API_KEY="xxx"
OAI_MIROTHINKER_BASE_URL="http://localhost:61005/v1"

# Used for o3 hints and final answer extraction
# Used for hint generation and final answer extraction
OPENAI_API_KEY="xxx"
OPENAI_BASE_URL="https://api.openai.com/v1"

Expand Down
2 changes: 1 addition & 1 deletion docs/mkdocs/docs/futurex.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ ANTHROPIC_API_KEY="xxx"
# Used for Gemini vision
GEMINI_API_KEY="xxx"

# Use for llm judge, reasoning, o3 hints, etc.
# Used for LLM judge, reasoning, hint generation, etc.
OPENAI_API_KEY="xxx"
OPENAI_BASE_URL="https://api.openai.com/v1"
```
Expand Down
2 changes: 1 addition & 1 deletion docs/mkdocs/docs/gaia_test.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
ANTHROPIC_API_KEY="your-anthropic-api-key"
GEMINI_API_KEY="your-gemini-api-key"

# LLM judge, reasoning, and O3 hints
# LLM judge, reasoning, and hint generation
OPENAI_API_KEY="your-openai-api-key"
OPENAI_BASE_URL="https://api.openai.com/v1"
```
Expand Down
93 changes: 93 additions & 0 deletions docs/mkdocs/docs/gaia_validation_claude37sonnet.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# GAIA Validation - Claude 3.7 Sonnet

MiroFlow demonstrates state-of-the-art performance on the GAIA validation benchmark using Claude 3.7 Sonnet models, showcasing exceptional capabilities in complex reasoning tasks that require multi-step problem solving, information synthesis, and tool usage.

!!! info "Prerequisites"
    Before proceeding, please review the [GAIA Validation Prerequisites](gaia_validation_prerequisites.md) document, which covers common setup requirements, dataset preparation, and API key configuration.

---

## Performance Comparison

!!! success "State-of-the-Art Performance with Claude 3.7 Sonnet"
    MiroFlow achieves **state-of-the-art (SOTA) performance** among open-source agent frameworks on the GAIA validation set using Claude 3.7 Sonnet.

<div align="center" markdown="1">
![GAIA Validation Performance](../assets/gaia_score.png){ width="100%" }
</div>

!!! abstract "Key Performance Metrics"
    - **Pass@3**: **81.8%**
    - **Majority Vote**: **82.4%**
    - **Pass@1 (best@3)**: **74.5%**
    - **Pass@1 (avg@3)**: **72.2%**

!!! info "Reproducibility Guarantee"
    Unlike other frameworks with unclear evaluation methods, MiroFlow's results are **fully reproducible**. Note that Hugging Face access was disabled during inference to prevent direct answer retrieval.

---

## Running the Evaluation

### Step 1: Dataset Preparation

Follow the [dataset preparation instructions](gaia_validation_prerequisites.md#dataset-preparation) in the prerequisites document.

### Step 2: API Keys Configuration

Configure the following API keys in your `.env` file:

```env title="Claude 3.7 Sonnet .env Configuration"
# Primary LLM provider (Claude-3.7-Sonnet via OpenRouter)
OPENROUTER_API_KEY="your-openrouter-api-key"
OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"

# Search and web scraping capabilities
SERPER_API_KEY="your-serper-api-key"
JINA_API_KEY="your-jina-api-key"

# Code execution environment
E2B_API_KEY="your-e2b-api-key"

# Vision understanding capabilities
ANTHROPIC_API_KEY="your-anthropic-api-key"
GEMINI_API_KEY="your-gemini-api-key"

# LLM judge, reasoning, and hint generation
OPENAI_API_KEY="your-openai-api-key"
OPENAI_BASE_URL="https://api.openai.com/v1"
```

### Step 3: Run the Evaluation

Execute the evaluation using the Claude 3.7 Sonnet configuration:

```bash title="Run GAIA Validation with Claude 3.7 Sonnet"
uv run main.py common-benchmark \
--config_file_name=agent_gaia-validation_claude37sonnet \
output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")"
```

### Step 4: Monitor Progress

Follow the [progress monitoring instructions](gaia_validation_prerequisites.md#progress-monitoring-and-resume) in the prerequisites document.

---

## Execution Traces

!!! info "Complete Execution Traces"
    We have released our complete execution traces for the `gaia-validation` dataset using Claude 3.7 Sonnet on Hugging Face. This comprehensive collection includes a full run of 165 tasks with an overall accuracy of 73.94% and detailed reasoning traces.

    You can download them using the following command:

    ```bash title="Download Execution Traces"
    wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/gaia_validation_miroflow_trace_public_20250825.zip
    unzip gaia_validation_miroflow_trace_public_20250825.zip
    # Unzip passcode: pf4*
    ```

---

!!! info "Documentation Info"
    **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI
Loading
Loading