8 changes: 5 additions & 3 deletions config/agent_finsearchcomp.yaml
@@ -27,11 +27,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
-o3_hint: true
+hint_generation: true
+hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
-o3_final_answer: true
+final_answer_extraction: true
+final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

-openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
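
For orientation, a minimal sketch of the environment variables that the renamed keys resolve via `${oc.env:...}` interpolation. The variable names are taken from the diff above; the values are placeholders, and the project docs elsewhere in this PR typically set them through a `.env` file rather than shell exports:

```bash
# Placeholder values only; substitute real endpoints and keys.
export HINT_LLM_BASE_URL="https://api.openai.com/v1"          # read by hint_llm_base_url
export FINAL_ANSWER_LLM_BASE_URL="https://api.openai.com/v1"  # read by final_answer_llm_base_url
export OPENAI_API_KEY="sk-..."                                # hint generation and final answer extraction
export CHINESE_CONTEXT="false"                                # read by chinese_context
```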
8 changes: 5 additions & 3 deletions config/agent_gaia-test.yaml
@@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
-o3_hint: true
+hint_generation: true
+hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
-o3_final_answer: true
+final_answer_extraction: true
+final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

-openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
8 changes: 5 additions & 3 deletions config/agent_gaia-validation-text-only.yaml
@@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
-o3_hint: true
+hint_generation: true
+hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
-o3_final_answer: true
+final_answer_extraction: true
+final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

-openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
@@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
-o3_hint: true
+hint_generation: true
+hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
-o3_final_answer: true
+final_answer_extraction: true
+final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

-openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
73 changes: 73 additions & 0 deletions config/agent_gaia-validation_mirothinker.yaml
@@ -0,0 +1,73 @@
defaults:
- benchmark: gaia-validation
- override hydra/job_logging: none
- _self_ # Allow defining variables at the top of this file


main_agent:
prompt_class: MainAgentPrompt_GAIA
llm:
provider_class: "MiroThinkerSGLangClient"
model_name: "MODEL_NAME"
async_client: true
temperature: 0.3
top_p: 1.0
min_p: 0.0
top_k: -1
max_tokens: 4096
oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
keep_tool_result: -1
oai_tool_thinking: false

tool_config:
- tool-reasoning

max_turns: 50 # Maximum number of turns for main agent execution
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
hint_generation: false
hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"

output_process:
final_answer_extraction: true
final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"


sub_agents:
agent-worker:
prompt_class: SubAgentWorkerPrompt
llm:
provider_class: "MiroThinkerSGLangClient"
model_name: "MODEL_NAME"
async_client: true
temperature: 0.3
top_p: 1.0
min_p: 0.0
top_k: -1
max_tokens: 4096
oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
keep_tool_result: -1
oai_tool_thinking: false

tool_config:
- tool-searching
- tool-image-video
- tool-reading
- tool-code
- tool-audio

max_turns: 50 # Maximum number of turns for sub-agent execution
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn


# Can define some top-level or default parameters here
output_dir: logs/
data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored
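
As a usage sketch, the new config could presumably be launched the same way as the other benchmark configs in this PR; the `--config_file_name` value below is an assumption derived from the file name added here:

```bash
# Assumed invocation, mirroring the pattern shown in the Claude 3.7 Sonnet docs below.
uv run main.py common-benchmark \
    --config_file_name=agent_gaia-validation_mirothinker \
    output_dir="logs/gaia-validation-mirothinker/$(date +"%Y%m%d_%H%M")"
```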
8 changes: 5 additions & 3 deletions config/agent_mirothinker.yaml
@@ -26,11 +26,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
-o3_hint: false
+hint_generation: false
+hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
-o3_final_answer: false
+final_answer_extraction: false
+final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

-openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
8 changes: 5 additions & 3 deletions config/agent_quickstart_1.yaml
@@ -28,11 +28,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
-o3_hint: false
+hint_generation: false
+hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
-o3_final_answer: false
+final_answer_extraction: false
+final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

-openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
8 changes: 5 additions & 3 deletions config/agent_xbench-ds.yaml
@@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
-o3_hint: true
+hint_generation: true
+hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
-o3_final_answer: true
+final_answer_extraction: true
+final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

-openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "true"
3 changes: 3 additions & 0 deletions docs/mkdocs/docs/all_about_agents.md
@@ -103,6 +103,9 @@ Welcome to our comprehensive resource collection for AI agents. This page curate
- **Terminal-Bench**: the benchmark for testing AI agents in real terminal environments
- [:material-github: GitHub](https://github.com/laude-institute/terminal-bench)

- **Gaia2 and ARE**: Empowering the Community to Evaluate Agents
- [:material-file-document: Blog Post](https://huggingface.co/blog/gaia2)

---

!!! info "Documentation Info"
2 changes: 1 addition & 1 deletion docs/mkdocs/docs/finsearchcomp.md
@@ -63,7 +63,7 @@ E2B_API_KEY="xxx"
OAI_MIROTHINKER_API_KEY="xxx"
OAI_MIROTHINKER_BASE_URL="http://localhost:61005/v1"

-# Used for o3 hints and final answer extraction
+# Used for hint generation and final answer extraction
OPENAI_API_KEY="xxx"
OPENAI_BASE_URL="https://api.openai.com/v1"

2 changes: 1 addition & 1 deletion docs/mkdocs/docs/futurex.md
@@ -64,7 +64,7 @@ ANTHROPIC_API_KEY="xxx"
# Used for Gemini vision
GEMINI_API_KEY="xxx"

-# Use for llm judge, reasoning, o3 hints, etc.
+# Use for llm judge, reasoning, hint generation, etc.
OPENAI_API_KEY="xxx"
OPENAI_BASE_URL="https://api.openai.com/v1"
```
2 changes: 1 addition & 1 deletion docs/mkdocs/docs/gaia_test.md
@@ -41,7 +41,7 @@ OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
ANTHROPIC_API_KEY="your-anthropic-api-key"
GEMINI_API_KEY="your-gemini-api-key"

-# LLM judge, reasoning, and O3 hints
+# LLM judge, reasoning, and hint generation
OPENAI_API_KEY="your-openai-api-key"
OPENAI_BASE_URL="https://api.openai.com/v1"
```
93 changes: 93 additions & 0 deletions docs/mkdocs/docs/gaia_validation_claude37sonnet.md
@@ -0,0 +1,93 @@
# GAIA Validation - Claude 3.7 Sonnet

MiroFlow demonstrates state-of-the-art performance on the GAIA validation benchmark using Claude 3.7 Sonnet models, showcasing exceptional capabilities in complex reasoning tasks that require multi-step problem solving, information synthesis, and tool usage.

!!! info "Prerequisites"
Before proceeding, please review the [GAIA Validation Prerequisites](gaia_validation_prerequisites.md) document, which covers common setup requirements, dataset preparation, and API key configuration.

---

## Performance Comparison

!!! success "State-of-the-Art Performance with Claude 3.7 Sonnet"
MiroFlow achieves **state-of-the-art (SOTA) performance** among open-source agent frameworks on the GAIA validation set using Claude 3.7 Sonnet.

<div align="center" markdown="1">
![GAIA Validation Performance](../assets/gaia_score.png){ width="100%" }
</div>

!!! abstract "Key Performance Metrics"
- **Pass@3**: **81.8%**
- **Majority Vote**: **82.4%**
- **Pass@1 (best@3)**: **74.5%**
- **Pass@1 (avg@3)**: **72.2%**

!!! info "Reproducibility Guarantee"
Unlike frameworks whose evaluation methods are unclear, MiroFlow produces **fully reproducible** results. Note that Hugging Face access was disabled during inference to prevent direct answer retrieval.

---

## Running the Evaluation

### Step 1: Dataset Preparation

Follow the [dataset preparation instructions](gaia_validation_prerequisites.md#dataset-preparation) in the prerequisites document.

### Step 2: API Keys Configuration

Configure the following API keys in your `.env` file:

```env title="Claude 3.7 Sonnet .env Configuration"
# Primary LLM provider (Claude-3.7-Sonnet via OpenRouter)
OPENROUTER_API_KEY="your-openrouter-api-key"
OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"

# Search and web scraping capabilities
SERPER_API_KEY="your-serper-api-key"
JINA_API_KEY="your-jina-api-key"

# Code execution environment
E2B_API_KEY="your-e2b-api-key"

# Vision understanding capabilities
ANTHROPIC_API_KEY="your-anthropic-api-key"
GEMINI_API_KEY="your-gemini-api-key"

# LLM judge, reasoning, and hint generation
OPENAI_API_KEY="your-openai-api-key"
OPENAI_BASE_URL="https://api.openai.com/v1"
```

### Step 3: Run the Evaluation

Execute the evaluation using the Claude 3.7 Sonnet configuration:

```bash title="Run GAIA Validation with Claude 3.7 Sonnet"
uv run main.py common-benchmark \
--config_file_name=agent_gaia-validation_claude37sonnet \
output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")"
```

### Step 4: Monitor Progress

Follow the [progress monitoring instructions](gaia_validation_prerequisites.md#progress-monitoring-and-resume) in the prerequisites document.

---

## Execution Traces

!!! info "Complete Execution Traces"
We have released our complete execution traces for the `gaia-validation` dataset using Claude 3.7 Sonnet on Hugging Face. This comprehensive collection includes a full run of 165 tasks with an overall accuracy of 73.94% and detailed reasoning traces.

You can download them using the following command:

```bash title="Download Execution Traces"
wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/gaia_validation_miroflow_trace_public_20250825.zip
unzip gaia_validation_miroflow_trace_public_20250825.zip
# Unzip passcode: pf4*
```

---

!!! info "Documentation Info"
**Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI