diff --git a/config/agent_finsearchcomp.yaml b/config/agent_finsearchcomp.yaml
index 836588f..16225e6 100644
--- a/config/agent_finsearchcomp.yaml
+++ b/config/agent_finsearchcomp.yaml
@@ -27,11 +27,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
input_process:
- o3_hint: true
+ hint_generation: true
+ hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
- o3_final_answer: true
+ final_answer_extraction: true
+ final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"
- openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+ openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
diff --git a/config/agent_gaia-test.yaml b/config/agent_gaia-test.yaml
index 1d4a788..3bdf562 100644
--- a/config/agent_gaia-test.yaml
+++ b/config/agent_gaia-test.yaml
@@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
input_process:
- o3_hint: true
+ hint_generation: true
+ hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
- o3_final_answer: true
+ final_answer_extraction: true
+ final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"
- openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+ openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
diff --git a/config/agent_gaia-validation-text-only.yaml b/config/agent_gaia-validation-text-only.yaml
index ed9bc2f..2eea55d 100644
--- a/config/agent_gaia-validation-text-only.yaml
+++ b/config/agent_gaia-validation-text-only.yaml
@@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
input_process:
- o3_hint: true
+ hint_generation: true
+ hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
- o3_final_answer: true
+ final_answer_extraction: true
+ final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"
- openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+ openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
diff --git a/config/agent_gaia-validation.yaml b/config/agent_gaia-validation_claude37sonnet.yaml
similarity index 85%
rename from config/agent_gaia-validation.yaml
rename to config/agent_gaia-validation_claude37sonnet.yaml
index c68c88d..274bca6 100644
--- a/config/agent_gaia-validation.yaml
+++ b/config/agent_gaia-validation_claude37sonnet.yaml
@@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
input_process:
- o3_hint: true
+ hint_generation: true
+ hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
- o3_final_answer: true
+ final_answer_extraction: true
+ final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"
- openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+ openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
diff --git a/config/agent_gaia-validation_mirothinker.yaml b/config/agent_gaia-validation_mirothinker.yaml
new file mode 100644
index 0000000..24f59df
--- /dev/null
+++ b/config/agent_gaia-validation_mirothinker.yaml
@@ -0,0 +1,73 @@
+defaults:
+ - benchmark: gaia-validation
+ - override hydra/job_logging: none
+ - _self_ # Allow defining variables at the top of this file
+
+
+main_agent:
+ prompt_class: MainAgentPrompt_GAIA
+ llm:
+ provider_class: "MiroThinkerSGLangClient"
+ model_name: "MODEL_NAME"
+ async_client: true
+ temperature: 0.3
+ top_p: 1.0
+ min_p: 0.0
+ top_k: -1
+ max_tokens: 4096
+ oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
+ oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
+ keep_tool_result: -1
+ oai_tool_thinking: false
+
+ tool_config:
+ - tool-reasoning
+
+ max_turns: 50 # Maximum number of turns for main agent execution
+ max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
+
+ input_process:
+ hint_generation: false
+ hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
+
+ output_process:
+ final_answer_extraction: true
+ final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"
+
+ openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
+ add_message_id: true
+ keep_tool_result: -1
+ chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
+
+
+sub_agents:
+ agent-worker:
+ prompt_class: SubAgentWorkerPrompt
+ llm:
+ provider_class: "MiroThinkerSGLangClient"
+ model_name: "MODEL_NAME"
+ async_client: true
+ temperature: 0.3
+ top_p: 1.0
+ min_p: 0.0
+ top_k: -1
+ max_tokens: 4096
+ oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
+ oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
+ keep_tool_result: -1
+ oai_tool_thinking: false
+
+ tool_config:
+ - tool-searching
+ - tool-image-video
+ - tool-reading
+ - tool-code
+ - tool-audio
+
+    max_turns: 50 # Maximum number of turns for sub-agent execution
+ max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
+
+
+# Can define some top-level or default parameters here
+output_dir: logs/
+data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored
\ No newline at end of file
diff --git a/config/agent_mirothinker.yaml b/config/agent_mirothinker.yaml
index 3215578..709eeed 100644
--- a/config/agent_mirothinker.yaml
+++ b/config/agent_mirothinker.yaml
@@ -26,11 +26,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
input_process:
- o3_hint: false
+ hint_generation: false
+ hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
- o3_final_answer: false
+ final_answer_extraction: false
+ final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"
- openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+ openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
diff --git a/config/agent_quickstart_1.yaml b/config/agent_quickstart_1.yaml
index 9b0ec60..076daf1 100644
--- a/config/agent_quickstart_1.yaml
+++ b/config/agent_quickstart_1.yaml
@@ -28,11 +28,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
input_process:
- o3_hint: false
+ hint_generation: false
+ hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
- o3_final_answer: false
+ final_answer_extraction: false
+ final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"
- openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+ openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
diff --git a/config/agent_xbench-ds.yaml b/config/agent_xbench-ds.yaml
index 13b2ddc..6b5213f 100644
--- a/config/agent_xbench-ds.yaml
+++ b/config/agent_xbench-ds.yaml
@@ -29,11 +29,13 @@ main_agent:
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
input_process:
- o3_hint: true
+ hint_generation: true
+ hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
output_process:
- o3_final_answer: true
+ final_answer_extraction: true
+ final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"
- openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
+ openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "true"
diff --git a/docs/mkdocs/docs/all_about_agents.md b/docs/mkdocs/docs/all_about_agents.md
index bd4f7e7..f8282c2 100644
--- a/docs/mkdocs/docs/all_about_agents.md
+++ b/docs/mkdocs/docs/all_about_agents.md
@@ -103,6 +103,9 @@ Welcome to our comprehensive resource collection for AI agents. This page curate
- **Terminal-Bench**: the benchmark for testing AI agents in real terminal environments
- [:material-github: GitHub](https://github.com/laude-institute/terminal-bench)
+- **Gaia2 and ARE**: Empowering the Community to Evaluate Agents
+ - [:material-file-document: Blog Post](https://huggingface.co/blog/gaia2)
+
---
!!! info "Documentation Info"
diff --git a/docs/mkdocs/docs/finsearchcomp.md b/docs/mkdocs/docs/finsearchcomp.md
index 19eb6f3..925909e 100644
--- a/docs/mkdocs/docs/finsearchcomp.md
+++ b/docs/mkdocs/docs/finsearchcomp.md
@@ -63,7 +63,7 @@ E2B_API_KEY="xxx"
OAI_MIROTHINKER_API_KEY="xxx"
OAI_MIROTHINKER_BASE_URL="http://localhost:61005/v1"
-# Used for o3 hints and final answer extraction
+# Used for hint generation and final answer extraction
OPENAI_API_KEY="xxx"
OPENAI_BASE_URL="https://api.openai.com/v1"
diff --git a/docs/mkdocs/docs/futurex.md b/docs/mkdocs/docs/futurex.md
index db20f4d..080ec6f 100644
--- a/docs/mkdocs/docs/futurex.md
+++ b/docs/mkdocs/docs/futurex.md
@@ -64,7 +64,7 @@ ANTHROPIC_API_KEY="xxx"
# Used for Gemini vision
GEMINI_API_KEY="xxx"
-# Use for llm judge, reasoning, o3 hints, etc.
+# Used for LLM judge, reasoning, hint generation, etc.
OPENAI_API_KEY="xxx"
OPENAI_BASE_URL="https://api.openai.com/v1"
```
diff --git a/docs/mkdocs/docs/gaia_test.md b/docs/mkdocs/docs/gaia_test.md
index 1d53841..ae5d593 100644
--- a/docs/mkdocs/docs/gaia_test.md
+++ b/docs/mkdocs/docs/gaia_test.md
@@ -41,7 +41,7 @@ OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
ANTHROPIC_API_KEY="your-anthropic-api-key"
GEMINI_API_KEY="your-gemini-api-key"
-# LLM judge, reasoning, and O3 hints
+# LLM judge, reasoning, and hint generation
OPENAI_API_KEY="your-openai-api-key"
OPENAI_BASE_URL="https://api.openai.com/v1"
```
diff --git a/docs/mkdocs/docs/gaia_validation_claude37sonnet.md b/docs/mkdocs/docs/gaia_validation_claude37sonnet.md
new file mode 100644
index 0000000..c1db1a8
--- /dev/null
+++ b/docs/mkdocs/docs/gaia_validation_claude37sonnet.md
@@ -0,0 +1,93 @@
+# GAIA Validation - Claude 3.7 Sonnet
+
+MiroFlow demonstrates state-of-the-art performance on the GAIA validation benchmark using Claude 3.7 Sonnet models, showcasing exceptional capabilities in complex reasoning tasks that require multi-step problem solving, information synthesis, and tool usage.
+
+!!! info "Prerequisites"
+ Before proceeding, please review the [GAIA Validation Prerequisites](gaia_validation_prerequisites.md) document, which covers common setup requirements, dataset preparation, and API key configuration.
+
+---
+
+## Performance Comparison
+
+!!! success "State-of-the-Art Performance with Claude 3.7 Sonnet"
+ MiroFlow achieves **state-of-the-art (SOTA) performance** among open-source agent frameworks on the GAIA validation set using Claude 3.7 Sonnet.
+
+
+
+
+
+!!! abstract "Key Performance Metrics"
+ - **Pass@3**: **81.8%**
+ - **Majority Vote**: **82.4%**
+ - **Pass@1 (best@3)**: **74.5%**
+ - **Pass@1 (avg@3)**: **72.2%**
+
+!!! info "Reproducibility Guarantee"
+ Unlike other frameworks with unclear evaluation methods, MiroFlow's results are **fully reproducible**. Note that Hugging Face access was disabled during inference to prevent direct answer retrieval.
+
+---
+
+## Running the Evaluation
+
+### Step 1: Dataset Preparation
+
+Follow the [dataset preparation instructions](gaia_validation_prerequisites.md#dataset-preparation) in the prerequisites document.
+
+### Step 2: API Keys Configuration
+
+Configure the following API keys in your `.env` file:
+
+```env title="Claude 3.7 Sonnet .env Configuration"
+# Primary LLM provider (Claude-3.7-Sonnet via OpenRouter)
+OPENROUTER_API_KEY="your-openrouter-api-key"
+OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
+
+# Search and web scraping capabilities
+SERPER_API_KEY="your-serper-api-key"
+JINA_API_KEY="your-jina-api-key"
+
+# Code execution environment
+E2B_API_KEY="your-e2b-api-key"
+
+# Vision understanding capabilities
+ANTHROPIC_API_KEY="your-anthropic-api-key"
+GEMINI_API_KEY="your-gemini-api-key"
+
+# LLM judge, reasoning, and hint generation
+OPENAI_API_KEY="your-openai-api-key"
+OPENAI_BASE_URL="https://api.openai.com/v1"
+```
+
+### Step 3: Run the Evaluation
+
+Execute the evaluation using the Claude 3.7 Sonnet configuration:
+
+```bash title="Run GAIA Validation with Claude 3.7 Sonnet"
+uv run main.py common-benchmark \
+ --config_file_name=agent_gaia-validation_claude37sonnet \
+ output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")"
+```
+
+### Step 4: Monitor Progress
+
+Follow the [progress monitoring instructions](gaia_validation_prerequisites.md#progress-monitoring-and-resume) in the prerequisites document.
+
+---
+
+## Execution Traces
+
+!!! info "Complete Execution Traces"
+ We have released our complete execution traces for the `gaia-validation` dataset using Claude 3.7 Sonnet on Hugging Face. This comprehensive collection includes a full run of 165 tasks with an overall accuracy of 73.94% and detailed reasoning traces.
+
+You can download them using the following command:
+
+```bash title="Download Execution Traces"
+wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/gaia_validation_miroflow_trace_public_20250825.zip
+unzip gaia_validation_miroflow_trace_public_20250825.zip
+# Unzip passcode: pf4*
+```
+
+---
+
+!!! info "Documentation Info"
+ **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI
\ No newline at end of file
diff --git a/docs/mkdocs/docs/gaia_validation_mirothinker.md b/docs/mkdocs/docs/gaia_validation_mirothinker.md
new file mode 100644
index 0000000..959cdc4
--- /dev/null
+++ b/docs/mkdocs/docs/gaia_validation_mirothinker.md
@@ -0,0 +1,73 @@
+# GAIA Validation - MiroThinker
+
+MiroFlow demonstrates state-of-the-art performance on the GAIA validation benchmark using MiroThinker models, showcasing exceptional capabilities in complex reasoning tasks that require multi-step problem solving, information synthesis, and tool usage.
+
+!!! info "Prerequisites"
+ Before proceeding, please review the [GAIA Validation Prerequisites](gaia_validation_prerequisites.md) document, which covers common setup requirements, dataset preparation, and API key configuration.
+
+---
+
+## Running the Evaluation
+
+### Step 1: Dataset Preparation
+
+Follow the [dataset preparation instructions](gaia_validation_prerequisites.md#dataset-preparation) in the prerequisites document.
+
+### Step 2: API Keys Configuration
+
+Configure the following API keys in your `.env` file:
+
+```env title="MiroThinker .env Configuration"
+# MiroThinker model access
+OAI_MIROTHINKER_API_KEY="your-mirothinker-api-key"
+OAI_MIROTHINKER_BASE_URL="http://localhost:61005/v1"
+
+# Search and web scraping capabilities
+SERPER_API_KEY="your-serper-api-key"
+JINA_API_KEY="your-jina-api-key"
+
+# Code execution environment
+E2B_API_KEY="your-e2b-api-key"
+
+# Vision understanding capabilities
+ANTHROPIC_API_KEY="your-anthropic-api-key"
+GEMINI_API_KEY="your-gemini-api-key"
+
+# LLM judge, reasoning, and hint generation
+OPENAI_API_KEY="your-openai-api-key"
+OPENAI_BASE_URL="https://api.openai.com/v1"
+
+# Hint generation and final answer extraction with the MiroThinker model
+HINT_LLM_BASE_URL="http://localhost:61005/v1"
+FINAL_ANSWER_LLM_BASE_URL="http://localhost:61005/v1"
+
+```
+
+### Step 3: Run the Evaluation
+
+Execute the evaluation using the MiroThinker configuration:
+
+```bash title="Run GAIA Validation with MiroThinker"
+uv run main.py common-benchmark \
+ --config_file_name=agent_gaia-validation_mirothinker \
+ output_dir="logs/gaia-validation-mirothinker/$(date +"%Y%m%d_%H%M")"
+```
+
+### Step 4: Monitor Progress
+
+Follow the [progress monitoring instructions](gaia_validation_prerequisites.md#progress-monitoring-and-resume) in the prerequisites document.
+
+---
+
+## Multiple Runs
+
+Due to performance variance in MiroThinker models, it's recommended to run multiple evaluations for more reliable results.
+
+```bash title="Run Multiple MiroThinker Evaluations"
+bash ./scripts/run_evaluate_multiple_runs_mirothinker_gaia-validation.sh
+```
+
+This script runs 3 evaluations in parallel and calculates average scores. You can modify `NUM_RUNS` in the script to change the number of runs.
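+
+For example, to launch five runs instead of three, edit the configuration block at the top of the script; the values below are copied from the script, with only `NUM_RUNS` changed for illustration:
+
+```bash title="Adjust the Number of Runs"
+# scripts/run_evaluate_multiple_runs_mirothinker_gaia-validation.sh
+NUM_RUNS=5                                      # number of evaluation runs launched in parallel
+AGENT_SET="agent_gaia-validation_mirothinker"   # agent config used for every run
+MAX_CONCURRENT=15                               # task concurrency within each run
+```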
+
+---
+
+!!! info "Documentation Info"
+ **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI
\ No newline at end of file
diff --git a/docs/mkdocs/docs/gaia_validation_prerequisites.md b/docs/mkdocs/docs/gaia_validation_prerequisites.md
new file mode 100644
index 0000000..09f1396
--- /dev/null
+++ b/docs/mkdocs/docs/gaia_validation_prerequisites.md
@@ -0,0 +1,88 @@
+# GAIA Validation Prerequisites
+
+This document covers the common setup requirements and prerequisites for running GAIA validation benchmarks with MiroFlow, regardless of the specific model configuration used.
+
+## About the GAIA Dataset
+
+!!! info "What is GAIA?"
+ GAIA (General AI Assistant) is a comprehensive benchmark designed to evaluate AI agents' ability to perform complex reasoning tasks that require multiple skills including web browsing, file manipulation, data analysis, and multi-step problem solving.
+
+More details: [GAIA: a benchmark for General AI Assistants](https://arxiv.org/abs/2311.12983)
+
+---
+
+## Dataset Preparation
+
+### Step 1: Prepare the GAIA Validation Dataset
+
+Choose one of the following methods to obtain the GAIA validation dataset:
+
+**Method 1: Direct Download (Recommended)**
+
+!!! tip "No Authentication Required"
+ This method does not require HuggingFace tokens or access permissions.
+
+```bash title="Manual Dataset Download"
+cd data
+wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/gaia-val.zip
+unzip gaia-val.zip
+# Unzip passcode: pf4*
+```
+
+**Method 2: Using the prepare-benchmark command**
+
+!!! warning "Prerequisites Required"
+ This method requires HuggingFace dataset access and token configuration.
+
+First, you need to request access and configure your environment:
+
+1. **Request Dataset Access**: Visit [https://huggingface.co/datasets/gaia-benchmark/GAIA](https://huggingface.co/datasets/gaia-benchmark/GAIA) and request access
+2. **Configure Environment**:
+ ```bash
+ cp .env.template .env
+ ```
+ Edit the `.env` file:
+ ```env
+ HF_TOKEN="your-actual-huggingface-token-here"
+ DATA_DIR="data/"
+ ```
+
+!!! tip "Getting Your Hugging Face Token"
+ 1. Go to [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
+ 2. Create a new token with at least "Read" permissions
+ 3. Add your token to the `.env` file
+
+Then download the dataset:
+
+```bash title="Download via Script"
+uv run main.py prepare-benchmark get gaia-val
+```
+
+---
+
+## Progress Monitoring and Resume
+
+### Progress Tracking
+
+You can monitor the evaluation progress in real-time:
+
+```bash title="Check Progress"
+uv run utils/progress_check/check_gaia_progress.py $PATH_TO_LOG
+```
+
+Replace `$PATH_TO_LOG` with your actual output directory path.
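+
+For example, if a run writes its logs to `logs/gaia-validation/20250922_1430` (an illustrative path that matches the resume example below), the check would be:
+
+```bash title="Example Progress Check"
+uv run utils/progress_check/check_gaia_progress.py logs/gaia-validation/20250922_1430
+```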
+
+### Resume Capability
+
+If the evaluation is interrupted, you can resume from where it left off by specifying the same output directory:
+
+```bash title="Resume Interrupted Evaluation"
+uv run main.py common-benchmark \
+ --config_file_name=YOUR_CONFIG_FILE \
+ output_dir="logs/gaia-validation/20250922_1430"
+```
+
+---
+
+!!! info "Documentation Info"
+ **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI
diff --git a/docs/mkdocs/docs/gaia_validation_text_only.md b/docs/mkdocs/docs/gaia_validation_text_only.md
index fd5b2cb..d19f2a0 100644
--- a/docs/mkdocs/docs/gaia_validation_text_only.md
+++ b/docs/mkdocs/docs/gaia_validation_text_only.md
@@ -51,7 +51,7 @@ OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
ANTHROPIC_API_KEY="your-anthropic-api-key"
GEMINI_API_KEY="your-gemini-api-key"
-# LLM judge, reasoning, and O3 hints
+# LLM judge, reasoning, and hint generation
OPENAI_API_KEY="your-openai-api-key"
OPENAI_BASE_URL="https://api.openai.com/v1"
```
diff --git a/docs/mkdocs/docs/openai-gpt.md b/docs/mkdocs/docs/openai-gpt.md
index 32a8664..c363691 100644
--- a/docs/mkdocs/docs/openai-gpt.md
+++ b/docs/mkdocs/docs/openai-gpt.md
@@ -1,6 +1,6 @@
# OpenAI GPT Models
-OpenAI's latest models including GPT-4o and O3 reasoning models with strong coding, vision, and reasoning capabilities.
+OpenAI's latest models, including GPT-4o and advanced reasoning models, with strong coding, vision, and reasoning capabilities.
## Client Used
@@ -19,7 +19,7 @@ export OPENAI_BASE_URL="https://api.openai.com/v1" # optional
main_agent:
llm:
provider_class: "GPTOpenAIClient"
- model_name: "gpt-4o" # or o3, etc.
+ model_name: "gpt-4o" # or gpt-4o-mini, etc.
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
openai_base_url: "${oc.env:OPENAI_BASE_URL,https://api.openai.com/v1}"
```
diff --git a/docs/mkdocs/docs/gaia_validation.md b/docs/mkdocs/docs/prerequisite.md
similarity index 67%
rename from docs/mkdocs/docs/gaia_validation.md
rename to docs/mkdocs/docs/prerequisite.md
index a95c6af..ba8d5f9 100644
--- a/docs/mkdocs/docs/gaia_validation.md
+++ b/docs/mkdocs/docs/prerequisite.md
@@ -1,14 +1,14 @@
-# GAIA Validation
+# GAIA Validation Prerequisites
-MiroFlow demonstrates state-of-the-art performance on the GAIA validation benchmark, showcasing exceptional capabilities in complex reasoning tasks that require multi-step problem solving, information synthesis, and tool usage.
-
-More details: [GAIA: a benchmark for General AI Assistants](https://arxiv.org/abs/2311.12983)
+This document covers the common setup requirements and prerequisites for running GAIA validation benchmarks with MiroFlow, regardless of the specific model configuration used.
## About the GAIA Dataset
!!! info "What is GAIA?"
GAIA (General AI Assistant) is a comprehensive benchmark designed to evaluate AI agents' ability to perform complex reasoning tasks that require multiple skills including web browsing, file manipulation, data analysis, and multi-step problem solving.
+More details: [GAIA: a benchmark for General AI Assistants](https://arxiv.org/abs/2311.12983)
+
---
## Performance Comparison
@@ -31,10 +31,7 @@ More details: [GAIA: a benchmark for General AI Assistants](https://arxiv.org/ab
---
-## Setup and Evaluation Guide
-
-!!! note "Complete Reproduction Instructions"
- This section provides comprehensive step-by-step instructions to reproduce our GAIA validation benchmark results. All results are fully reproducible using our open-source framework.
+## Dataset Preparation
### Step 1: Prepare the GAIA Validation Dataset
@@ -81,12 +78,15 @@ Then download the dataset:
uv run main.py prepare-benchmark get gaia-val
```
-### Step 2: Configure API Keys
+---
+
+## Common API Keys Configuration
-!!! warning "Required API Configuration"
- Set up the required API keys for model access and tool functionality. Update the `.env` file to include the following keys:
+### Required API Keys
-```env title=".env Configuration"
+The following API keys are required for all GAIA validation runs, regardless of the model configuration:
+
+```env title="Common .env Configuration"
# Search and web scraping capabilities
SERPER_API_KEY="your-serper-api-key"
JINA_API_KEY="your-jina-api-key"
@@ -94,37 +94,31 @@ JINA_API_KEY="your-jina-api-key"
# Code execution environment
E2B_API_KEY="your-e2b-api-key"
-# Primary LLM provider (Claude-3.7-Sonnet via OpenRouter)
-OPENROUTER_API_KEY="your-openrouter-api-key"
-OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
-
# Vision understanding capabilities
ANTHROPIC_API_KEY="your-anthropic-api-key"
GEMINI_API_KEY="your-gemini-api-key"
-# LLM judge, reasoning, and O3 hints
+# LLM judge, reasoning, and hint generation
OPENAI_API_KEY="your-openai-api-key"
OPENAI_BASE_URL="https://api.openai.com/v1"
```
-!!! tip "Why OpenRouter?"
- We use Claude-3.7-Sonnet through the OpenRouter backend as the primary LLM provider because OpenRouter offers better response rates and improved reliability compared to direct API access.
+### API Key Descriptions
-### Step 3: Run the Evaluation
+- **SERPER_API_KEY**: Required for web search functionality
+- **JINA_API_KEY**: Required for web scraping and content extraction
+- **E2B_API_KEY**: Required for secure code execution environment
+- **ANTHROPIC_API_KEY**: Required for vision understanding capabilities
+- **GEMINI_API_KEY**: Required for additional vision processing
+- **OPENAI_API_KEY**: Required for hint generation and final answer extraction
-Execute the evaluation using the following command:
-
-```bash title="Run GAIA Validation"
-uv run main.py common-benchmark \
- --config_file_name=agent_gaia-validation \
- output_dir="logs/gaia-validation/$(date +"%Y%m%d_%H%M")"
-```
+---
+## Progress Monitoring and Resume
-### Step 4: Monitor Progress and Resume
+### Progress Tracking
-!!! tip "Progress Tracking"
- You can monitor the evaluation progress in real-time:
+You can monitor the evaluation progress in real-time:
```bash title="Check Progress"
uv run utils/progress_check/check_gaia_progress.py $PATH_TO_LOG
@@ -132,12 +126,13 @@ uv run utils/progress_check/check_gaia_progress.py $PATH_TO_LOG
Replace `$PATH_TO_LOG` with your actual output directory path.
-!!! note "Resume Capability"
- If the evaluation is interrupted, you can resume from where it left off by specifying the same output directory:
+### Resume Capability
+
+If the evaluation is interrupted, you can resume from where it left off by specifying the same output directory:
```bash title="Resume Interrupted Evaluation"
uv run main.py common-benchmark \
- --config_file_name=agent_gaia-validation \
+ --config_file_name=YOUR_CONFIG_FILE \
output_dir="logs/gaia-validation/20250922_1430"
```
@@ -146,7 +141,7 @@ uv run main.py common-benchmark \
## Execution Traces
!!! info "Complete Execution Traces"
- We have released our complete execution traces for the `gaia-validation` dataset on Hugging Face. This comprehensive collection includes a full run of 165 tasks with an overall accuracy of 73.94%.
+ We have released our complete execution traces for the `gaia-validation` dataset on Hugging Face. This comprehensive collection includes a full run of 165 tasks with detailed reasoning traces.
You can download them using the following command:
@@ -159,4 +154,4 @@ unzip gaia_validation_miroflow_trace_public_20250825.zip
---
!!! info "Documentation Info"
- **Last Updated:** September 2025 · **Doc Contributor:** Team @ MiroMind AI
\ No newline at end of file
+ **Last Updated:** September 2025 · **Doc Contributor:** Team @ MiroMind AI
diff --git a/docs/mkdocs/docs/xbench_ds.md b/docs/mkdocs/docs/xbench_ds.md
index d5f5c21..2ab9dc0 100644
--- a/docs/mkdocs/docs/xbench_ds.md
+++ b/docs/mkdocs/docs/xbench_ds.md
@@ -43,7 +43,7 @@ OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
ANTHROPIC_API_KEY="your-anthropic-api-key"
GEMINI_API_KEY="your-gemini-api-key"
-# LLM as judge, reasoning, and O3 hints
+# LLM as judge, reasoning, and hint generation
OPENAI_API_KEY="your-openai-api-key"
OPENAI_BASE_URL="https://api.openai.com/v1"
```
diff --git a/docs/mkdocs/docs/yaml_config.md b/docs/mkdocs/docs/yaml_config.md
index aad6d39..1ce8917 100644
--- a/docs/mkdocs/docs/yaml_config.md
+++ b/docs/mkdocs/docs/yaml_config.md
@@ -111,9 +111,9 @@ main_agent:
- tool-reasoning
input_process:
- o3_hint: true # Use O3 for task hints
+ hint_generation: true # Use LLM for task hint generation
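+    hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" # Base URL of the OpenAI-compatible endpoint used for hint generation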
output_process:
- o3_final_answer: true # Use O3 for answer extraction
+ final_answer_extraction: true # Use LLM for answer extraction
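+    final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" # Base URL of the OpenAI-compatible endpoint used for answer extraction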
sub_agents:
agent-worker:
diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml
index 1a48883..376bf4b 100644
--- a/docs/mkdocs/mkdocs.yml
+++ b/docs/mkdocs/mkdocs.yml
@@ -41,7 +41,7 @@ nav:
- News & Updates: index.md
- License: license.md
- - Overview:
+ - Quick Start:
- Quickstart: quickstart.md
- Core Concepts: core_concepts.md
- YAML Configuration: yaml_config.md
@@ -49,7 +49,10 @@ nav:
- Evaluation:
- Overview: evaluation_overview.md
- Benchmarks:
- - GAIA-Validation: gaia_validation.md
+ - GAIA-Validation:
+ - Prerequisites: gaia_validation_prerequisites.md
+ - Claude-3.7-Sonnet: gaia_validation_claude37sonnet.md
+ - MiroThinker: gaia_validation_mirothinker.md
- GAIA-Validation-Text-Only: gaia_validation_text_only.md
- GAIA-Test: gaia_test.md
- FutureX: futurex.md
diff --git a/scripts/run_evaluate_multiple_runs_mirothinker_gaia-validation.sh b/scripts/run_evaluate_multiple_runs_mirothinker_gaia-validation.sh
new file mode 100644
index 0000000..6037c87
--- /dev/null
+++ b/scripts/run_evaluate_multiple_runs_mirothinker_gaia-validation.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+# SPDX-FileCopyrightText: 2025 MiromindAI
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# Configuration parameters
+NUM_RUNS=3
+AGENT_SET="agent_gaia-validation_mirothinker"
+MAX_CONCURRENT=15
+
+# Set results directory with timestamp (BENCHMARK_NAME is optional and defaults to gaia-validation)
+TIMESTAMP=$(date +%Y%m%d_%H%M)
+RESULTS_DIR=${RESULTS_DIR:-"logs/${BENCHMARK_NAME:-gaia-validation}/${AGENT_SET}_${TIMESTAMP}"}
+
+echo "Starting $NUM_RUNS runs of the evaluation..."
+echo "Results will be saved in: $RESULTS_DIR"
+
+# Create results directory
+mkdir -p "$RESULTS_DIR"
+
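+# Each run is launched in a background subshell; the "wait" below blocks until every run finishes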
+for i in $(seq 1 $NUM_RUNS); do
+ echo "=========================================="
+ echo "Launching experiment $i/$NUM_RUNS"
+ echo "=========================================="
+
+ RUN_ID="run_$i"
+
+ (
+ uv run main.py common-benchmark \
+ --config_file_name=$AGENT_SET \
+ benchmark.execution.max_concurrent=$MAX_CONCURRENT \
+ output_dir="$RESULTS_DIR/$RUN_ID" \
+ hydra.run.dir=${RESULTS_DIR}/$RUN_ID \
+ > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1
+
+ if [ $? -eq 0 ]; then
+ echo "Run $i completed successfully"
+ RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
+ if [ -f "$RESULT_FILE" ]; then
+ echo "Results saved to $RESULT_FILE"
+ else
+ echo "Warning: Result file not found for run $i"
+ fi
+ else
+ echo "Run $i failed!"
+ fi
+ ) &
+
+ sleep 2
+done
+
+echo "All $NUM_RUNS runs have been launched in parallel"
+echo "Waiting for all runs to complete..."
+
+wait
+
+echo "=========================================="
+echo "All $NUM_RUNS runs completed!"
+echo "=========================================="
+
+echo "Calculating average scores..."
+uv run main.py avg-score "$RESULTS_DIR"
+
+echo "=========================================="
+echo "Multiple runs evaluation completed!"
+echo "Check results in: $RESULTS_DIR"
+echo "Check individual run logs: $RESULTS_DIR/run_*_output.log"
+echo "=========================================="
+
diff --git a/src/core/orchestrator.py b/src/core/orchestrator.py
index 70eb67a..7afa94f 100644
--- a/src/core/orchestrator.py
+++ b/src/core/orchestrator.py
@@ -23,9 +23,9 @@
from src.utils.io_utils import OutputFormatter, process_input
from src.utils.tool_utils import expose_sub_agents_as_tools
from src.utils.summary_utils import (
- o3_extract_hints,
- o3_extract_gaia_final_answer,
- o3_extract_browsecomp_zh_final_answer,
+ extract_hints,
+ extract_gaia_final_answer,
+ extract_browsecomp_zh_final_answer,
)
LOGGER_LEVEL = os.getenv("LOGGER_LEVEL", "INFO")
@@ -726,32 +726,35 @@ async def run_main_agent(
initial_user_content[0]["text"] + task_guidence
)
- o3_notes = "" # Initialize o3_notes
- if self.cfg.main_agent.input_process.o3_hint:
- # Execute O3 hints extraction
+ hint_notes = "" # Initialize hint_notes
+ if self.cfg.main_agent.input_process.hint_generation:
+ # Execute hint generation
try:
- o3_hints = await o3_extract_hints(
+ hint_content = await extract_hints(
task_description,
self.cfg.main_agent.openai_api_key,
self.chinese_context,
self.add_message_id,
+ self.cfg.main_agent.input_process.get(
+ "hint_llm_base_url", "https://api.openai.com/v1"
+ ),
)
- o3_notes = (
+ hint_notes = (
"\n\nBefore you begin, please review the following preliminary notes highlighting subtle or easily misunderstood points in the question, which might help you avoid common pitfalls during your analysis (for reference only; these may not be exhaustive):\n\n"
- + o3_hints
+ + hint_content
)
# Update initial user content
original_text = initial_user_content[0]["text"]
- initial_user_content[0]["text"] = original_text + o3_notes
+ initial_user_content[0]["text"] = original_text + hint_notes
except Exception as e:
- logger.error(f"O3 hints extraction failed after retries: {str(e)}")
+ logger.error(f"Hint generation failed after retries: {str(e)}")
self.task_log.log_step(
- step_name="o3_hint",
- message=f"[ERROR] O3 hint generation failed: {str(e)}",
+ step_name="hint_generation",
+ message=f"[ERROR] Hint generation failed: {str(e)}",
status="failed",
)
- o3_notes = "" # Continue execution but without O3 hints
+ hint_notes = "" # Continue execution but without hints
logger.info("Initial user input content: %s", initial_user_content)
message_history = [{"role": "user", "content": initial_user_content}]
@@ -993,71 +996,75 @@ async def run_main_agent(
"final_answer_content", f"Final answer content: {final_answer_text}"
)
- # Use O3 model to extract final answer
- o3_extracted_answer = ""
- if self.cfg.main_agent.output_process.o3_final_answer:
- # Execute O3 final answer extraction
+ # Use LLM to extract final answer
+ extracted_answer = ""
+ if self.cfg.main_agent.output_process.final_answer_extraction:
+ # Execute final answer extraction
try:
# For browsecomp-zh, we use another Chinese prompt to extract the final answer
if "browsecomp-zh" in self.cfg.benchmark.name:
- o3_extracted_answer = (
- await o3_extract_browsecomp_zh_final_answer(
- task_description,
- final_answer_text,
- self.cfg.main_agent.openai_api_key,
- )
+ extracted_answer = await extract_browsecomp_zh_final_answer(
+ task_description,
+ final_answer_text,
+ self.cfg.main_agent.openai_api_key,
+ self.cfg.main_agent.output_process.get(
+ "final_answer_llm_base_url", "https://api.openai.com/v1"
+ ),
)
- # Disguise O3 extracted answer as assistant returned result and add to message history
- assistant_o3_message = {
+ # Disguise LLM extracted answer as assistant returned result and add to message history
+ assistant_extracted_message = {
"role": "assistant",
"content": [
{
"type": "text",
- "text": f"O3 extracted final answer:\n{o3_extracted_answer}",
+ "text": f"LLM extracted final answer:\n{extracted_answer}",
}
],
}
- message_history.append(assistant_o3_message)
+ message_history.append(assistant_extracted_message)
- # o3 answer as final result
- final_answer_text = o3_extracted_answer
+ # LLM answer as final result
+ final_answer_text = extracted_answer
else:
- o3_extracted_answer = await o3_extract_gaia_final_answer(
+ extracted_answer = await extract_gaia_final_answer(
task_description,
final_answer_text,
self.cfg.main_agent.openai_api_key,
self.chinese_context,
+ self.cfg.main_agent.output_process.get(
+ "final_answer_llm_base_url", "https://api.openai.com/v1"
+ ),
)
- # Disguise O3 extracted answer as assistant returned result and add to message history
- assistant_o3_message = {
+ # Disguise LLM extracted answer as assistant returned result and add to message history
+ assistant_extracted_message = {
"role": "assistant",
"content": [
{
"type": "text",
- "text": f"O3 extracted final answer:\n{o3_extracted_answer}",
+ "text": f"LLM extracted final answer:\n{extracted_answer}",
}
],
}
- message_history.append(assistant_o3_message)
+ message_history.append(assistant_extracted_message)
- # Concatenate original summary and o3 answer as final result
- final_answer_text = f"{final_answer_text}\n\nO3 Extracted Answer:\n{o3_extracted_answer}"
+ # Concatenate original summary and LLM answer as final result
+ final_answer_text = f"{final_answer_text}\n\nLLM Extracted Answer:\n{extracted_answer}"
except Exception as e:
logger.error(
- f"O3 final answer extraction failed after retries: {str(e)}"
+ f"Final answer extraction failed after retries: {str(e)}"
)
self.task_log.log_step(
- step_name="o3_final_answer",
- message=f"[ERROR] O3 final answer extraction failed: {str(e)}",
+ step_name="final_answer_extraction",
+ message=f"[ERROR] Final answer extraction failed: {str(e)}",
status="failed",
)
# Continue using original final_answer_text
else:
- # to process when o3_final_answer is false
+            # no extra processing when final_answer_extraction is false
# leave it here to be more clear
final_answer_text = final_answer_text
@@ -1069,7 +1076,7 @@ async def run_main_agent(
logger.debug(f"LLM Final Answer: {final_answer_text}")
- # Save final message history (including O3 processing results)
+ # Save final message history (including LLM processing results)
self.task_log.main_agent_message_history = {
"system_prompt": system_prompt,
"message_history": message_history,
diff --git a/src/core/pipeline.py b/src/core/pipeline.py
index 664ae47..772c4de 100644
--- a/src/core/pipeline.py
+++ b/src/core/pipeline.py
@@ -63,9 +63,9 @@ async def execute_task_pipeline(
task_file_name=task_file_name,
ground_truth=ground_truth,
input={
- "task_description": task_description,
+ "task_description": task_description,
"task_file_name": task_file_name,
- "metadata": metadata or {}
+ "metadata": metadata or {},
},
)
diff --git a/src/llm/providers/claude_openrouter_client.py b/src/llm/providers/claude_openrouter_client.py
index fd44125..b95fd03 100644
--- a/src/llm/providers/claude_openrouter_client.py
+++ b/src/llm/providers/claude_openrouter_client.py
@@ -191,8 +191,10 @@ async def _create_message(
or "exceeds the maximum length" in error_str
or "exceeds the maximum allowed length" in error_str
or "Input tokens exceed the configured limit" in error_str
- or "Requested token count exceeds the model's maximum context length" in error_str
- or "BadRequestError" in error_str and "context length" in error_str
+ or "Requested token count exceeds the model's maximum context length"
+ in error_str
+ or "BadRequestError" in error_str
+ and "context length" in error_str
):
logger.debug(f"OpenRouter LLM Context limit exceeded: {error_str}")
raise ContextLimitError(f"Context limit exceeded: {error_str}")
diff --git a/src/llm/providers/mirothinker_sglang_client.py b/src/llm/providers/mirothinker_sglang_client.py
index 6008f8c..a4dcca0 100644
--- a/src/llm/providers/mirothinker_sglang_client.py
+++ b/src/llm/providers/mirothinker_sglang_client.py
@@ -159,8 +159,10 @@ async def _create_message(
or "exceeds the maximum length" in error_str
or "exceeds the maximum allowed length" in error_str
or "Input tokens exceed the configured limit" in error_str
- or "Requested token count exceeds the model's maximum context length" in error_str
- or "BadRequestError" in error_str and "context length" in error_str
+ or "Requested token count exceeds the model's maximum context length"
+ in error_str
+ or "BadRequestError" in error_str
+ and "context length" in error_str
):
logger.debug(f"MiroThinker LLM Context limit exceeded: {error_str}")
raise ContextLimitError(f"Context limit exceeded: {error_str}")
diff --git a/src/tool/mcp_servers/reading_mcp_server.py b/src/tool/mcp_servers/reading_mcp_server.py
index c0f6ec0..c9d4f64 100644
--- a/src/tool/mcp_servers/reading_mcp_server.py
+++ b/src/tool/mcp_servers/reading_mcp_server.py
@@ -19,6 +19,7 @@
SERPER_API_KEY = os.environ.get("SERPER_API_KEY", "")
JINA_API_KEY = os.environ.get("JINA_API_KEY", "")
+
@mcp.tool()
async def read_file(uri: str) -> str:
"""Read various types of resources (Doc, PPT, PDF, Excel, CSV, ZIP file etc.)
@@ -65,7 +66,13 @@ async def read_file(uri: str) -> str:
if retry_count > 3:
# Try scrape_website tool as fallback
try:
- scrape_result = await smart_request(uri, env={"SERPER_API_KEY": SERPER_API_KEY, "JINA_API_KEY": JINA_API_KEY})
+ scrape_result = await smart_request(
+ uri,
+ env={
+ "SERPER_API_KEY": SERPER_API_KEY,
+ "JINA_API_KEY": JINA_API_KEY,
+ },
+ )
return f"[INFO]: Download failed, automatically tried `scrape_website` tool instead.\n\n{scrape_result}"
except Exception as scrape_error:
return f"[ERROR]: Failed to download {uri}: {e}. Also failed to scrape with `scrape_website` tool: {scrape_error}"
diff --git a/src/utils/summary_utils.py b/src/utils/summary_utils.py
index 8c7514a..29c7640 100644
--- a/src/utils/summary_utils.py
+++ b/src/utils/summary_utils.py
@@ -14,12 +14,22 @@ def _generate_message_id() -> str:
return f"msg_{uuid.uuid4().hex[:8]}"
-@retry(wait=wait_exponential(multiplier=15), stop=stop_after_attempt(5))
-async def o3_extract_hints(
- question: str, api_key: str, chinese_context: bool, add_message_id: bool
+@retry(
+ wait=wait_exponential(multiplier=15),
+ stop=stop_after_attempt(5),
+    before_sleep=lambda retry_state: print(
+ f"Retry attempt {retry_state.attempt_number} for extract_hints"
+ ),
+)
+async def extract_hints(
+ question: str,
+ api_key: str,
+ chinese_context: bool,
+ add_message_id: bool,
+ base_url: str = "https://api.openai.com/v1",
) -> str:
- """Use O3 model to extract task hints"""
- client = AsyncOpenAI(api_key=api_key, timeout=600)
+ """Use LLM to extract task hints"""
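+    # base_url selects the OpenAI-compatible endpoint used for hint generation (defaults to the public OpenAI API)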
+ client = AsyncOpenAI(api_key=api_key, timeout=600, base_url=base_url)
instruction = """Carefully analyze the given task description (question) without attempting to solve it directly. Your role is to identify potential challenges and areas that require special attention during the solving process, and provide practical guidance for someone who will solve this task by actively gathering and analyzing information from the web.
@@ -70,18 +80,29 @@ async def o3_extract_hints(
messages=[{"role": "user", "content": content}],
reasoning_effort="high",
)
+
result = response.choices[0].message.content
# Check if result is empty, raise exception to trigger retry if empty
if not result or not result.strip():
- raise ValueError("O3 hints extraction returned empty result")
+ raise ValueError("Hint extraction returned empty result")
return result
-@retry(wait=wait_exponential(multiplier=15), stop=stop_after_attempt(5))
-async def get_gaia_answer_type(task_description: str, api_key: str) -> str:
- client = AsyncOpenAI(api_key=api_key, timeout=600)
+@retry(
+ wait=wait_exponential(multiplier=15),
+ stop=stop_after_attempt(5),
+    before_sleep=lambda retry_state: print(
+ f"Retry attempt {retry_state.attempt_number} for get_gaia_answer_type"
+ ),
+)
+async def get_gaia_answer_type(
+ task_description: str, api_key: str, base_url: str = "https://api.openai.com/v1"
+) -> str:
+
+ client = AsyncOpenAI(api_key=api_key, timeout=600, base_url=base_url)
+
instruction = f"""Input:
`{task_description}`
@@ -112,14 +133,24 @@ async def get_gaia_answer_type(task_description: str, api_key: str) -> str:
return answer_type.strip()
-@retry(wait=wait_exponential(multiplier=15), stop=stop_after_attempt(5))
-async def o3_extract_gaia_final_answer(
- task_description_detail: str, summary: str, api_key: str, chinese_context: bool
+@retry(
+ wait=wait_exponential(multiplier=15),
+ stop=stop_after_attempt(5),
+    before_sleep=lambda retry_state: print(
+ f"Retry attempt {retry_state.attempt_number} for extract_gaia_final_answer"
+ ),
+)
+async def extract_gaia_final_answer(
+ task_description_detail: str,
+ summary: str,
+ api_key: str,
+ chinese_context: bool,
+ base_url: str = "https://api.openai.com/v1",
) -> str:
- """Use O3 model to extract final answer from summary"""
- answer_type = await get_gaia_answer_type(task_description_detail, api_key)
+ """Use LLM to extract final answer from summary"""
+ answer_type = await get_gaia_answer_type(task_description_detail, api_key, base_url)
- client = AsyncOpenAI(api_key=api_key, timeout=600)
+ client = AsyncOpenAI(api_key=api_key, timeout=600, base_url=base_url)
# Add Chinese-specific instructions and output format if enabled
chinese_supplement = ""
@@ -427,25 +458,24 @@ async def o3_extract_gaia_final_answer(
answer_type if answer_type in ["number", "time"] else "string"
)
- print("O3 Extract Final Answer Prompt:")
+ print("Extract Final Answer Prompt:")
print(full_prompt)
message_id = _generate_message_id()
response = await client.chat.completions.create(
model="o3",
messages=[{"role": "user", "content": f"[{message_id}] {full_prompt}"}],
- reasoning_effort="medium",
)
result = response.choices[0].message.content
# Check if result is empty, raise exception to trigger retry if empty
if not result or not result.strip():
- raise ValueError("O3 final answer extraction returned empty result")
+ raise ValueError("Final answer extraction returned empty result")
# Verify boxed answer exists
boxed_match = re.search(r"\\boxed{([^}]*)}", result)
if not boxed_match:
- raise ValueError("O3 final answer extraction returned empty answer")
+ raise ValueError("Final answer extraction returned empty answer")
print("response:", result)
@@ -454,12 +484,21 @@ async def o3_extract_gaia_final_answer(
return result
-@retry(wait=wait_exponential(multiplier=15), stop=stop_after_attempt(5))
-async def o3_extract_browsecomp_zh_final_answer(
- task_description_detail: str, summary: str, api_key: str
+@retry(
+ wait=wait_exponential(multiplier=15),
+ stop=stop_after_attempt(5),
+    before_sleep=lambda retry_state: print(
+ f"Retry attempt {retry_state.attempt_number} for extract_browsecomp_zh_final_answer"
+ ),
+)
+async def extract_browsecomp_zh_final_answer(
+ task_description_detail: str,
+ summary: str,
+ api_key: str,
+ base_url: str = "https://api.openai.com/v1",
) -> str:
- """Use O3 model to extract final answer from summary"""
- client = AsyncOpenAI(api_key=api_key, timeout=600)
+ """Use LLM to extract final answer from summary"""
+ client = AsyncOpenAI(api_key=api_key, timeout=600, base_url=base_url)
chinese_supplement = """
@@ -567,7 +606,7 @@ async def o3_extract_browsecomp_zh_final_answer(
+ common_confidence_section
)
- print("O3 Extract Final Answer Prompt:")
+ print("Extract Final Answer Prompt:")
print(full_prompt)
message_id = _generate_message_id()
@@ -580,12 +619,12 @@ async def o3_extract_browsecomp_zh_final_answer(
# Check if result is empty, raise exception to trigger retry if empty
if not result or not result.strip():
- raise ValueError("O3 final answer extraction returned empty result")
+ raise ValueError("Final answer extraction returned empty result")
# Verify boxed answer exists
boxed_match = re.search(r"\\boxed{([^}]*)}", result)
if not boxed_match:
- raise ValueError("O3 final answer extraction returned empty answer")
+ raise ValueError("Final answer extraction returned empty answer")
print("response:", result)
diff --git a/utils/eval_answer_from_log.py b/utils/eval_answer_from_log.py
index 838da35..fc239b1 100644
--- a/utils/eval_answer_from_log.py
+++ b/utils/eval_answer_from_log.py
@@ -34,7 +34,7 @@ async def main(input_dir: str, benchmark_name: str):
ground_truth = data.get("ground_truth", "")
predicted_answer = data.get("final_boxed_answer", "")
metadata = data.get("input", {}).get("metadata", {})
-
+
# If already has judge result, skip
# if "judge_result" in data and data["judge_result"] in ("CORRECT", "INCORRECT"):
# print(f"Log {log_file} already has judge result: {data['judge_result']}")
diff --git a/utils/eval_utils.py b/utils/eval_utils.py
index a200394..de7e799 100644
--- a/utils/eval_utils.py
+++ b/utils/eval_utils.py
@@ -154,7 +154,7 @@ async def verify_answer_llm_xbench(
openai_client: AsyncOpenAI, question: str, target: str, predicted_answer: str
) -> str:
"""
- Use XBench-style LLM judge (o3) to verify if the predicted answer is correct.
+ Use XBench-style LLM judge to verify if the predicted answer is correct.
Uses structured output format similar to verify_answer_llm_hle.
Args:
@@ -376,17 +376,17 @@ def is_float(element: Any) -> bool:
@retry(wait=wait_exponential(multiplier=5), stop=stop_after_attempt(5))
async def verify_answer_llm_finsearchcomp(
- openai_client: AsyncOpenAI,
- question: str,
- target: str,
+ openai_client: AsyncOpenAI,
+ question: str,
+ target: str,
predicted_answer: str,
judge_prompt_template: str,
judge_system_prompt: str,
- metadata: dict = None
+ metadata: dict = None,
) -> str:
"""
Use FinSearchComp-style LLM judge with dynamic prompts to verify if the predicted answer is correct.
-
+
Args:
openai_client: OpenAI client for LLM calls
question: The question being answered
@@ -395,64 +395,64 @@ async def verify_answer_llm_finsearchcomp(
judge_prompt_template: The judge prompt template from metadata
judge_system_prompt: The judge system prompt from metadata
metadata: Additional metadata containing response_reference and ground_truth_finance
-
+
Returns:
String indicating the evaluation result: "CORRECT", "INCORRECT", or "NOT_ATTEMPTED"
"""
# Get the appropriate ground truth based on the prompt template
response_reference = metadata.get("response_reference", "") if metadata else ""
ground_truth_finance = metadata.get("ground_truth_finance", "") if metadata else ""
-
+
# Format the judge prompt template with the actual values
formatted_prompt = judge_prompt_template.format(
prompt=question,
response_reference=response_reference,
ground_truth=ground_truth_finance,
- response=predicted_answer
+ response=predicted_answer,
)
-
+
# Create messages with system prompt and user prompt
messages = [
{"role": "system", "content": judge_system_prompt},
- {"role": "user", "content": formatted_prompt}
+ {"role": "user", "content": formatted_prompt},
]
-
+
try:
# NOTE: no explicit LLM model is specified here, so we use gpt-4o-mini for consistency
response = await openai_client.chat.completions.create(
model="gpt-4o-mini",
messages=messages,
max_completion_tokens=2048,
- temperature=0.0 # Deterministic evaluation
+ temperature=0.0, # Deterministic evaluation
)
-
+
content = response.choices[0].message.content
-
+
# Print FinSearchComp judge reasoning
print(f"FinSearchComp LLM Judge Response: {content}")
-
+
# Parse the response to determine if it's correct
# Look for common patterns in the response
content_lower = content.lower()
-
+
# Check for JSON format responses
if "answer_score" in content_lower:
if '"answer_score": 1' in content or '"answer_score":1' in content:
return "CORRECT"
elif '"answer_score": 0' in content or '"answer_score":0' in content:
return "INCORRECT"
-
+
# Check for score format responses
if "score" in content_lower:
if '"score": 1' in content or '"score":1' in content:
return "CORRECT"
elif '"score": 0' in content or '"score":0' in content:
return "INCORRECT"
-
+
# If we can't parse the response, return NOT_ATTEMPTED
print(f"Warning: Could not parse FinSearchComp judge response: {content}")
return "NOT_ATTEMPTED"
-
+
except Exception as e:
print(f"FinSearchComp LLM evaluation failed: {e}")
return "NOT_ATTEMPTED"
@@ -475,13 +475,18 @@ async def verify_answer_for_datasets(
if "finsearchcomp" in benchmark_name and metadata:
judge_prompt_template = metadata.get("judge_prompt_template", "")
judge_system_prompt = metadata.get("judge_system_prompt", "")
-
+
if judge_prompt_template and judge_system_prompt:
return await verify_answer_llm_finsearchcomp(
- openai_client, question, target, predicted_answer,
- judge_prompt_template, judge_system_prompt, metadata
+ openai_client,
+ question,
+ target,
+ predicted_answer,
+ judge_prompt_template,
+ judge_system_prompt,
+ metadata,
)
-
+
# for all questions, do gaia scorer first, if not return CORRECT, then do others
gaia_scorer_answer = await verify_answer_gaia(target, predicted_answer)
diff --git a/utils/prepare_benchmark/gen_finsearchcomp.py b/utils/prepare_benchmark/gen_finsearchcomp.py
index f56d675..139389b 100644
--- a/utils/prepare_benchmark/gen_finsearchcomp.py
+++ b/utils/prepare_benchmark/gen_finsearchcomp.py
@@ -8,18 +8,19 @@
from utils.prepare_benchmark.common import Task
+
def gen_finsearchcomp(hf_token: str) -> Generator[Task, None, None]:
"""
Generate FinSearchComp dataset tasks in MiroFlow format
-
+
Args:
hf_token: Hugging Face token for dataset access
-
+
Yields:
Task: Standardized task objects
"""
dataset = load_dataset("ByteSeedXpert/FinSearchComp")
-
+
for split_name, split_data in dataset.items():
for idx, sample in enumerate(split_data):
# Extract task information
@@ -27,7 +28,7 @@ def gen_finsearchcomp(hf_token: str) -> Generator[Task, None, None]:
task_question = sample.get("prompt", "")
response_reference = sample.get("response_reference", "")
ground_truth_finance = sample.get("ground_truth", "")
-
+
# Create metadata dictionary with all original fields
metadata: MutableMapping = {
"source": "ByteSeedXpert/FinSearchComp",
@@ -37,12 +38,17 @@ def gen_finsearchcomp(hf_token: str) -> Generator[Task, None, None]:
"response_reference": response_reference,
"ground_truth_finance": ground_truth_finance,
}
-
+
# Add all other fields from sample to metadata (including judge prompts)
for key, value in sample.items():
- if key not in ["prompt_id", "prompt", "response_reference", "ground_truth"]:
+ if key not in [
+ "prompt_id",
+ "prompt",
+ "response_reference",
+ "ground_truth",
+ ]:
metadata[key] = value
-
+
# Determine the primary ground truth for evaluation
# Priority: response_reference > ground_truth_finance
if response_reference:
@@ -51,16 +57,15 @@ def gen_finsearchcomp(hf_token: str) -> Generator[Task, None, None]:
ground_truth_task = ground_truth_finance
else:
ground_truth_task = "" # Fallback to empty string
-
+
# Create standardized Task object
task = Task(
task_id=task_id,
task_question=task_question,
ground_truth=ground_truth_task,
- file_path=None, # No file attachments
+ file_path=None, # No file attachments
metadata=metadata,
)
-
+
yield task
return
-
\ No newline at end of file
diff --git a/utils/prepare_benchmark/main.py b/utils/prepare_benchmark/main.py
index 12db9cf..4675982 100644
--- a/utils/prepare_benchmark/main.py
+++ b/utils/prepare_benchmark/main.py
@@ -121,6 +121,7 @@ def gen():
return gen
case "finsearchcomp":
+
def gen():
for x in gen_finsearchcomp(env.hf_token):
yield x
diff --git a/utils/progress_check/check_finsearchcomp_progress.py b/utils/progress_check/check_finsearchcomp_progress.py
index 2e45787..52035fe 100755
--- a/utils/progress_check/check_finsearchcomp_progress.py
+++ b/utils/progress_check/check_finsearchcomp_progress.py
@@ -27,14 +27,14 @@
def extract_task_type(task_id: str) -> str:
"""
Extract task type (T1, T2, T3) from task_id.
-
+
Args:
task_id: Task ID string like "(T1)Time_Sensitive_Data_Fetching_006"
-
+
Returns:
Task type string ("T1", "T2", "T3", or "Unknown")
"""
- match = re.match(r'^\(T(\d+)\)', task_id)
+ match = re.match(r"^\(T(\d+)\)", task_id)
if match:
return f"T{match.group(1)}"
return "Unknown"
@@ -43,16 +43,16 @@ def extract_task_type(task_id: str) -> str:
def extract_region_from_label(label: str) -> str:
"""
Extract region from the label field.
-
+
Args:
label: Label string like "Complex_Historical_Investigation(Global)" or "Financial_Analysis(Greater_China)"
-
+
Returns:
Region string ("Global", "Greater China", or "Unknown")
"""
if not label:
return "Unknown"
-
+
if "(Global)" in label:
return "Global"
elif "(Greater China)" in label:
@@ -90,18 +90,18 @@ def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]:
"T1": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
"T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
"T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
- "Unknown": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}
+ "Unknown": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
},
"regional_breakdown": {
"Global": {
"T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
- "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}
+ "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
},
"Greater China": {
"T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
- "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}
- }
- }
+ "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0},
+ },
+ },
}
completed_correct_files = []
@@ -121,14 +121,14 @@ def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]:
task_type = extract_task_type(task_id)
status = data.get("status", "").lower()
judge_result = data.get("judge_result", "").upper()
-
+
# Extract region from label
label = data.get("input", {}).get("metadata", {}).get("label", "")
region = extract_region_from_label(label)
# Update task type breakdown
results["task_type_breakdown"][task_type]["total"] += 1
-
+
# Update regional breakdown for T2 and T3 tasks
if task_type in ["T2", "T3"] and region in results["regional_breakdown"]:
results["regional_breakdown"][region][task_type]["total"] += 1
@@ -136,9 +136,12 @@ def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]:
if status == "completed":
results["completed_status"] += 1
results["task_type_breakdown"][task_type]["completed"] += 1
-
+
# Update regional breakdown for completed T2 and T3 tasks
- if task_type in ["T2", "T3"] and region in results["regional_breakdown"]:
+ if (
+ task_type in ["T2", "T3"]
+ and region in results["regional_breakdown"]
+ ):
results["regional_breakdown"][region][task_type]["completed"] += 1
# For T1 tasks, exclude from correctness evaluation but count as completed
@@ -152,15 +155,25 @@ def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]:
results["completed_and_correct"] += 1
results["task_type_breakdown"][task_type]["correct"] += 1
# Update regional breakdown for correct T2 and T3 tasks
- if task_type in ["T2", "T3"] and region in results["regional_breakdown"]:
- results["regional_breakdown"][region][task_type]["correct"] += 1
+ if (
+ task_type in ["T2", "T3"]
+ and region in results["regional_breakdown"]
+ ):
+ results["regional_breakdown"][region][task_type][
+ "correct"
+ ] += 1
completed_correct_files.append(json_file.name)
else:
results["completed_and_incorrect"] += 1
results["task_type_breakdown"][task_type]["incorrect"] += 1
# Update regional breakdown for incorrect T2 and T3 tasks
- if task_type in ["T2", "T3"] and region in results["regional_breakdown"]:
- results["regional_breakdown"][region][task_type]["incorrect"] += 1
+ if (
+ task_type in ["T2", "T3"]
+ and region in results["regional_breakdown"]
+ ):
+ results["regional_breakdown"][region][task_type][
+ "incorrect"
+ ] += 1
completed_incorrect_files.append((json_file.name, judge_result))
else:
results["other_status"] += 1
@@ -208,60 +221,88 @@ def display_results(
# Calculate accuracy excluding T1 tasks
t2_t3_completed = (
- results["task_type_breakdown"]["T2"]["completed"] +
- results["task_type_breakdown"]["T3"]["completed"]
+ results["task_type_breakdown"]["T2"]["completed"]
+ + results["task_type_breakdown"]["T3"]["completed"]
)
t2_t3_correct = (
- results["task_type_breakdown"]["T2"]["correct"] +
- results["task_type_breakdown"]["T3"]["correct"]
+ results["task_type_breakdown"]["T2"]["correct"]
+ + results["task_type_breakdown"]["T3"]["correct"]
)
-
+
if t2_t3_completed > 0:
accuracy = t2_t3_correct / t2_t3_completed * 100
print(f"\nAccuracy rate (T2+T3 correct/completed): {accuracy:.1f}%")
- print(f" (T1 tasks excluded due to outdated ground truth)")
+ print(" (T1 tasks excluded due to outdated ground truth)")
# Task type breakdown
print("\n" + "-" * 70)
print("TASK TYPE BREAKDOWN")
print("-" * 70)
-
+
for task_type in ["T1", "T2", "T3", "Unknown"]:
breakdown = results["task_type_breakdown"][task_type]
if breakdown["total"] > 0:
completion_rate = breakdown["completed"] / breakdown["total"] * 100
if task_type == "T1":
print(f"{task_type} (Time-Sensitive Data Fetching):")
- print(f" Total: {breakdown['total']:3d}, Completed: {breakdown['completed']:3d} ({completion_rate:.1f}%)")
- print(f" Note: Excluded from correctness evaluation (outdated ground truth)")
+ print(
+ f" Total: {breakdown['total']:3d}, Completed: {breakdown['completed']:3d} ({completion_rate:.1f}%)"
+ )
+ print(
+ " Note: Excluded from correctness evaluation (outdated ground truth)"
+ )
else:
- accuracy_rate = breakdown["correct"] / breakdown["completed"] * 100 if breakdown["completed"] > 0 else 0
- print(f"{task_type} ({'Simple Historical Lookup' if task_type == 'T2' else 'Complex Historical Investigation'}):")
- print(f" Total: {breakdown['total']:3d}, Completed: {breakdown['completed']:3d} ({completion_rate:.1f}%)")
- print(f" Correct: {breakdown['correct']:3d}, Incorrect: {breakdown['incorrect']:3d}")
+ accuracy_rate = (
+ breakdown["correct"] / breakdown["completed"] * 100
+ if breakdown["completed"] > 0
+ else 0
+ )
+ print(
+ f"{task_type} ({'Simple Historical Lookup' if task_type == 'T2' else 'Complex Historical Investigation'}):"
+ )
+ print(
+ f" Total: {breakdown['total']:3d}, Completed: {breakdown['completed']:3d} ({completion_rate:.1f}%)"
+ )
+ print(
+ f" Correct: {breakdown['correct']:3d}, Incorrect: {breakdown['incorrect']:3d}"
+ )
print(f" Accuracy: {accuracy_rate:.1f}%")
# Regional breakdown for T2 and T3
print("\n" + "-" * 70)
print("REGIONAL BREAKDOWN (T2 & T3 TASKS)")
print("-" * 70)
-
+
for region in ["Global", "Greater China"]:
print(f"\n{region} Region:")
for task_type in ["T2", "T3"]:
breakdown = results["regional_breakdown"][region][task_type]
if breakdown["total"] > 0:
completion_rate = breakdown["completed"] / breakdown["total"] * 100
- accuracy_rate = breakdown["correct"] / breakdown["completed"] * 100 if breakdown["completed"] > 0 else 0
- task_name = "Simple Historical Lookup" if task_type == "T2" else "Complex Historical Investigation"
+ accuracy_rate = (
+ breakdown["correct"] / breakdown["completed"] * 100
+ if breakdown["completed"] > 0
+ else 0
+ )
+ task_name = (
+ "Simple Historical Lookup"
+ if task_type == "T2"
+ else "Complex Historical Investigation"
+ )
print(f" {task_type} ({task_name}):")
- print(f" Total: {breakdown['total']:3d}, Completed: {breakdown['completed']:3d} ({completion_rate:.1f}%)")
- print(f" Correct: {breakdown['correct']:3d}, Incorrect: {breakdown['incorrect']:3d}")
+ print(
+ f" Total: {breakdown['total']:3d}, Completed: {breakdown['completed']:3d} ({completion_rate:.1f}%)"
+ )
+ print(
+ f" Correct: {breakdown['correct']:3d}, Incorrect: {breakdown['incorrect']:3d}"
+ )
print(f" Accuracy: {accuracy_rate:.1f}%")
print("\n" + "-" * 70)
print(f"SUMMARY: {completed} tasks completed, {correct} T2+T3 tasks correct")
- print(f" (T1 tasks: {results['task_type_breakdown']['T1']['completed']} completed, excluded from evaluation)")
+ print(
+ f" (T1 tasks: {results['task_type_breakdown']['T1']['completed']} completed, excluded from evaluation)"
+ )
print("-" * 70)
# Show some example files for verification
@@ -298,15 +339,17 @@ def main():
try:
print(f"Analyzing FinSearchComp benchmark results in: {log_folder}")
- results, correct_files, incorrect_files, error_files = analyze_finsearchcomp_results(
- log_folder
+ results, correct_files, incorrect_files, error_files = (
+ analyze_finsearchcomp_results(log_folder)
)
display_results(results, correct_files, incorrect_files, error_files)
except Exception as e:
print(f"Error: {e}")
print(f"\nUsage: python {sys.argv[0]} [LOG_FOLDER_PATH]")
- print(f"Example: python {sys.argv[0]} logs/finsearchcomp/agent_finsearchcomp_20250924_1555")
+ print(
+ f"Example: python {sys.argv[0]} logs/finsearchcomp/agent_finsearchcomp_20250924_1555"
+ )
return 1
return 0
diff --git a/utils/progress_check/check_xbench_progress.py b/utils/progress_check/check_xbench_progress.py
index 7122454..7637512 100644
--- a/utils/progress_check/check_xbench_progress.py
+++ b/utils/progress_check/check_xbench_progress.py
@@ -234,7 +234,9 @@ def main():
except Exception as e:
print(f"Error: {e}")
print(f"\nUsage: python {sys.argv[0]} [LOG_FOLDER_PATH]")
- print(f"Example: python {sys.argv[0]} logs/xbench-ds/claude03_claude_dual/run_1")
+ print(
+ f"Example: python {sys.argv[0]} logs/xbench-ds/claude03_claude_dual/run_1"
+ )
return 1
return 0
diff --git a/utils/util_llm_parallel_thinking.py b/utils/util_llm_parallel_thinking.py
index 0914437..c7113e6 100644
--- a/utils/util_llm_parallel_thinking.py
+++ b/utils/util_llm_parallel_thinking.py
@@ -64,7 +64,7 @@ def process_message_history(main_agent_message_history: Dict[str, Any]) -> str:
# Process the last message content
final_content = message_history[-1]["content"][0]["text"]
final_content = final_content.replace(
- "O3 extracted final answer:", "## Final Answer Reasoning\n"
+ "LLM extracted final answer:", "## Final Answer Reasoning\n"
)
# Concatenate the two parts
@@ -373,7 +373,11 @@ def create_parallel_thinking_xbench_prompt(
async def process_single_task(
- benchmark_name: str, task_id: str, data: List[Dict[str, Any]], n_runs: int, semaphore: asyncio.Semaphore
+ benchmark_name: str,
+ task_id: str,
+ data: List[Dict[str, Any]],
+ n_runs: int,
+ semaphore: asyncio.Semaphore,
) -> Tuple[str, Dict[str, Any], Any]:
"""Process a single task and return its result."""
# Choose prompt function based on benchmark
@@ -565,7 +569,9 @@ def save_results(
async def main(
- benchmark_name: str, results_dir: str, max_concurrent_requests: int = MAX_CONCURRENT_REQUESTS
+ benchmark_name: str,
+ results_dir: str,
+ max_concurrent_requests: int = MAX_CONCURRENT_REQUESTS,
) -> None:
"""Main function to analyze results and select best solutions."""
if not os.path.exists(results_dir):
@@ -585,7 +591,9 @@ async def main(
n_runs = len([d for d in run_dirs if os.path.isdir(d)])
# Process all tasks
- task_results = await process_tasks(benchmark_name, task_score_dict, n_runs, max_concurrent_requests)
+ task_results = await process_tasks(
+ benchmark_name, task_score_dict, n_runs, max_concurrent_requests
+ )
# Save results
save_results(results_dir, task_results, n_runs)
@@ -593,14 +601,18 @@ async def main(
if __name__ == "__main__":
args = ArgumentParser()
- args.add_argument("--benchmark", type=str, default="gaia", choices=["gaia", "xbench-ds"])
+ args.add_argument(
+ "--benchmark", type=str, default="gaia", choices=["gaia", "xbench-ds"]
+ )
args.add_argument("--results_dirs", type=str, default=[])
args.add_argument("--max_concurrent_requests", type=int, default=25)
args = args.parse_args()
benchmark_name = args.benchmark
max_concurrent_requests = args.max_concurrent_requests
- results_dirs = list(args.results_dirs.split(",")) # Use single or multiple directory mode based on whether results_dirs is defined above
+ results_dirs = list(
+ args.results_dirs.split(",")
+    )  # Single- or multiple-directory mode is selected based on whether results_dirs is defined above
if results_dirs:
# Multiple directories mode
diff --git a/utils/util_llm_simple_voting.py b/utils/util_llm_simple_voting.py
index aefa377..795864d 100644
--- a/utils/util_llm_simple_voting.py
+++ b/utils/util_llm_simple_voting.py
@@ -56,7 +56,7 @@ def process_message_history(main_agent_message_history: Dict[str, Any]) -> str:
# Process the last message content
final_content = message_history[-1]["content"][0]["text"]
final_content = final_content.replace(
- "O3 extracted final answer:", "## Final Answer Reasoning\n"
+ "LLM extracted final answer:", "## Final Answer Reasoning\n"
)
# Concatenate the two parts