2 changes: 2 additions & 0 deletions common_benchmark.py
@@ -210,6 +210,7 @@ async def run_single_task(self, task: BenchmarkTask) -> BenchmarkResult:
sub_agent_tool_managers=self.sub_agent_tool_managers,
output_formatter=self.output_formatter,
ground_truth=task.ground_truth,
+ metadata=task.metadata,
log_path=self.output_dir
/ f"task_{task.task_id}_attempt_{attempt}.json",
)
@@ -242,6 +243,7 @@ async def run_single_task(self, task: BenchmarkTask) -> BenchmarkResult:
question=task.task_question,
target=task.ground_truth,
predicted_answer=attempt_result["model_boxed_answer"],
+ metadata=task.metadata,
)
attempt_result["judge_result"] = evaluation_result
attempt_result["is_correct"] = evaluation_result == "CORRECT"
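The two added `metadata=task.metadata` arguments thread per-task metadata (for FinSearchComp, the T1/T2/T3 category) from the task record through to the judge call. As a hedged sketch of why this matters, not the repository's actual implementation, a metadata-aware judge wrapper might look like this (`judge_answer` and the `task_type` key are hypothetical names):

```python title="Sketch: metadata-aware judging (hypothetical)"
# Hypothetical wrapper: skip correctness grading for T1 (time-sensitive) tasks,
# whose ground truth goes stale, while grading T2/T3 normally.
async def evaluate_with_metadata(judge_answer, question: str, target: str,
                                 predicted_answer: str,
                                 metadata: dict | None) -> str:
    if (metadata or {}).get("task_type") == "T1":
        return "NOT_EVALUATED"  # completion is still tracked by the runner
    return await judge_answer(question=question, target=target,
                              predicted_answer=predicted_answer)
```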
69 changes: 69 additions & 0 deletions config/agent_finsearchcomp.yaml
@@ -0,0 +1,69 @@
defaults:
- benchmark: finsearchcomp
- override hydra/job_logging: none
- _self_ # Allow defining variables at the top of this file


main_agent:
prompt_class: MainAgentPrompt_GAIA
llm:
provider_class: "MiroThinkerSGLangClient"
model_name: "MODEL_NAME"
async_client: true
temperature: 0.6
top_p: 0.95
min_p: 0.0
top_k: -1
max_tokens: 8192
oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
keep_tool_result: -1
oai_tool_thinking: false

tool_config:
- tool-reasoning

max_turns: 20 # Maximum number of turns for main agent execution
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

input_process:
o3_hint: true
output_process:
o3_final_answer: true

openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for o3 hints and final answer extraction
add_message_id: true
keep_tool_result: -1
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"


sub_agents:
agent-worker:
prompt_class: SubAgentWorkerPrompt
llm:
provider_class: "MiroThinkerSGLangClient"
model_name: "MODEL_NAME"
async_client: true
temperature: 0.6
top_p: 0.95
min_p: 0.0
top_k: -1
max_tokens: 8192
oai_mirothinker_api_key: "${oc.env:OAI_MIROTHINKER_API_KEY,dummy_key}"
oai_mirothinker_base_url: "${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
keep_tool_result: -1
oai_tool_thinking: false

tool_config:
- tool-searching
- tool-image-video
- tool-reading
- tool-code
- tool-audio

max_turns: 20 # Maximum number of turns for sub-agent execution
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn

# Top-level / shared default parameters
output_dir: logs/
data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored
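Both new configs rely on OmegaConf's `oc.env` resolver: `${oc.env:VAR,default}` reads an environment variable when the value is accessed and falls back to the default after the comma. A quick illustration of that behavior (assuming OmegaConf 2.1+; not part of this PR):

```python title="Sketch: oc.env resolution"
import os
from omegaconf import OmegaConf

# The value after the comma is used when the environment variable is unset.
cfg = OmegaConf.create(
    "base_url: ${oc.env:OAI_MIROTHINKER_BASE_URL,http://localhost:61005/v1}"
)
print(cfg.base_url)  # -> http://localhost:61005/v1 if the variable is unset

os.environ["OAI_MIROTHINKER_BASE_URL"] = "http://myhost:8000/v1"
print(cfg.base_url)  # -> http://myhost:8000/v1 (resolved on each access)
```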
19 changes: 19 additions & 0 deletions config/benchmark/finsearchcomp.yaml
@@ -0,0 +1,19 @@
# config/benchmark/finsearchcomp.yaml
defaults:
- default
- _self_

name: "finsearchcomp"

data:
data_dir: "${data_dir}/finsearchcomp" # Path to finsearchcomp dataset
metadata_file: "standardized_data.jsonl" # Metadata filename
whitelist: [] # Optional: List of specific task_ids to run

execution:
max_tasks: null # null = no limit, or specify a number
max_concurrent: 5 # Number of parallel tasks
pass_at_k: 1 # Number of attempts per task

# OpenAI API key for evaluation (required for finsearchcomp since it has ground truth)
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
178 changes: 178 additions & 0 deletions docs/mkdocs/docs/finsearchcomp.md
@@ -0,0 +1,178 @@
# FinSearchComp

MiroFlow's evaluation on the FinSearchComp benchmark measures its ability to search for, retrieve, and analyze financial information, from time-sensitive data fetching to complex historical research.

More details: [FinSearchComp Dataset](https://huggingface.co/datasets/ByteSeedXpert/FinSearchComp)

---

## Dataset Overview

!!! info "FinSearchComp Dataset"
The FinSearchComp dataset consists of financial search and analysis tasks that require comprehensive research capabilities including:

- Financial data retrieval and analysis
- Market research and company analysis
- Investment decision support
- Financial news and report interpretation
- Time-sensitive financial information gathering

!!! abstract "Key Dataset Characteristics"

- **Total Tasks**: 635 (across T1, T2, T3 categories)
- **Task Types**:
- **T1**: Time-Sensitive Data Fetching
- **T2**: Financial Analysis and Research
- **T3**: Complex Historical Investigation
- **Answer Format**: Detailed financial analysis and research reports
- **Ground Truth**: Available for T2 and T3 tasks; T1 answers are time-sensitive and go stale over time
- **Evaluation**: Judge-based evaluation with correctness assessment

---

## Quick Start Guide

!!! note "Quick Start Instructions"
This section provides step-by-step instructions to run the FinSearchComp benchmark and prepare submission results. **Note**: This is a quick start guide for running the benchmark, not for reproducing exact submitted results.

### Step 1: Prepare the FinSearchComp Dataset

!!! tip "Dataset Setup"
Use the integrated prepare-benchmark command to download and process the dataset:

```bash title="Download FinSearchComp Dataset"
uv run main.py prepare-benchmark get finsearchcomp
```

This will create the standardized dataset at `data/finsearchcomp/standardized_data.jsonl`.
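A quick way to verify the download (a convenience snippet, not part of the repo) is to count the standardized records, which should match the 635 tasks noted in the dataset overview:

```python title="Sanity-check the dataset"
import json

# Count tasks in the standardized dataset produced by prepare-benchmark.
with open("data/finsearchcomp/standardized_data.jsonl") as f:
    tasks = [json.loads(line) for line in f if line.strip()]
print(len(tasks))  # expected: 635 across the T1/T2/T3 categories
```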

### Step 2: Configure API Keys

!!! warning "API Key Configuration"
Set up the required API keys for model access and tool functionality. Update the `.env` file to include the following keys:

```env title=".env Configuration"
# For searching and web scraping
SERPER_API_KEY="xxx"
JINA_API_KEY="xxx"

# For Linux sandbox (code execution environment)
E2B_API_KEY="xxx"

# We use MiroThinker model for financial analysis
OAI_MIROTHINKER_API_KEY="xxx"
OAI_MIROTHINKER_BASE_URL="http://localhost:61005/v1"

# Used for o3 hints and final answer extraction
OPENAI_API_KEY="xxx"
OPENAI_BASE_URL="https://api.openai.com/v1"

# Used for Claude vision understanding
ANTHROPIC_API_KEY="xxx"

# Used for Gemini vision
GEMINI_API_KEY="xxx"
```
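Before launching a long run, it can help to confirm the keys are actually visible to the process. A small preflight sketch (not part of the repo; trim the list to the tools you actually enable):

```python title="Preflight: check API keys"
import os

# Key names from the .env block above.
required = [
    "SERPER_API_KEY", "JINA_API_KEY", "E2B_API_KEY",
    "OAI_MIROTHINKER_API_KEY", "OPENAI_API_KEY",
    "ANTHROPIC_API_KEY", "GEMINI_API_KEY",
]
missing = [k for k in required if not os.environ.get(k)]
if missing:
    raise SystemExit(f"Missing API keys: {', '.join(missing)}")
print("All required API keys are set.")
```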

### Step 3: Run the Evaluation

!!! example "Evaluation Execution"
Execute the following command to run evaluation on the FinSearchComp dataset:

```bash title="Run FinSearchComp Evaluation"
uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")"
```

!!! tip "Progress Monitoring and Resume"
To check the progress while running:

```bash title="Check Progress"
uv run utils/progress_check/check_finsearchcomp_progress.py $PATH_TO_LOG
```

If you need to resume an interrupted evaluation, specify the same output directory to continue from where you left off.

```bash title="Resume Evaluation, e.g."
uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir=${PATH_TO_LOG}
```
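Resume works because each finished attempt leaves a `task_<task_id>_attempt_<n>.json` trace in the output directory. A hypothetical sketch of how such detection can work (the helper below is illustrative, not the runner's actual code):

```python title="Sketch: detecting completed tasks"
from pathlib import Path

def completed_task_ids(output_dir: str) -> set[str]:
    """Infer finished tasks from files named task_<id>_attempt_<n>.json."""
    done = set()
    for p in Path(output_dir).glob("task_*_attempt_*.json"):
        done.add(p.stem.removeprefix("task_").rsplit("_attempt_", 1)[0])
    return done

# Tasks in this set can be skipped when the evaluation is re-launched;
# the path below is a placeholder for your own output directory.
print(completed_task_ids("logs/finsearchcomp/20250101_0000"))
```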

### Step 4: Extract Results

!!! example "Result Extraction"
After evaluation completion, the results are automatically generated in the output directory:

- `benchmark_results.jsonl`: Detailed results for each task
- `benchmark_results_pass_at_1_accuracy.txt`: Summary accuracy statistics
- `task_*_attempt_1.json`: Individual task execution traces
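For analysis beyond the generated accuracy file, `benchmark_results.jsonl` can be post-processed directly. A hedged sketch, assuming each record exposes the `judge_result` and `is_correct` fields set in `common_benchmark.py`:

```python title="Sketch: recompute pass@1"
import json

results_path = "benchmark_results.jsonl"  # inside your run's output_dir

graded = []
with open(results_path) as f:
    for line in f:
        rec = json.loads(line)
        if rec.get("judge_result") is not None:  # skip ungraded (e.g. T1) tasks
            graded.append(rec)

correct = sum(1 for rec in graded if rec.get("is_correct"))
print(f"pass@1: {correct}/{len(graded)} = {correct / max(len(graded), 1):.3f}")
```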

---

## Evaluation Notes

!!! warning "Task Type Considerations"
The FinSearchComp dataset includes different task types with varying evaluation criteria:

- **T1 Tasks**: Time-Sensitive Data Fetching tasks are excluded from correctness evaluation due to outdated ground truth, but completion is still tracked
- **T2 Tasks**: Financial Analysis tasks are evaluated for correctness and quality
- **T3 Tasks**: Complex Historical Investigation tasks are also evaluated for correctness and require comprehensive, multi-step research and analysis

!!! info "Output Analysis"
The evaluation generates detailed execution traces showing:

- Research process for each financial task
- Information gathering from multiple sources
- Financial calculations and analysis
- Comprehensive reports with insights and recommendations

### Directory Structure

After running evaluations, you'll find the following structure:

```
logs/finsearchcomp/agent_finsearchcomp_YYYYMMDD_HHMM/
├── benchmark_results.jsonl # Task results summary
├── benchmark_results_pass_at_1_accuracy.txt # Accuracy statistics
├── task_(T1)Time_Sensitive_Data_Fetching_*.json # T1 task traces
├── task_(T2)Financial_Analysis_*.json # T2 task traces
├── task_(T3)Complex_Historical_Investigation_*.json # T3 task traces
└── output.log # Execution log
```

### Task Categories Breakdown

The progress checker provides detailed statistics:

- **Total Tasks**: Complete count across all categories
- **Completed Tasks**: Successfully finished tasks
- **Correct Tasks**: Tasks with judge_result "CORRECT" (T2 and T3 only)
- **Category Breakdown**: Separate counts for T1, T2, and T3 tasks
- **Accuracy Metrics**: Pass@1 accuracy for evaluable tasks
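The same per-category accounting can be reproduced from the results file. A sketch assuming task IDs begin with their category tag, as the trace file names above suggest:

```python title="Sketch: per-category breakdown"
import json
from collections import Counter

counts, correct = Counter(), Counter()
with open("benchmark_results.jsonl") as f:  # inside your run's output_dir
    for line in f:
        rec = json.loads(line)
        cat = rec.get("task_id", "")[:4]  # assumed prefix: "(T1)", "(T2)", "(T3)"
        counts[cat] += 1
        if rec.get("is_correct"):
            correct[cat] += 1

for cat in sorted(counts):
    print(f"{cat}: {correct[cat]}/{counts[cat]} correct")
```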

---

## Usage Examples

### Single Run Evaluation
```bash title="Basic Evaluation"
uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")"
```

### Limited Task Testing
```bash title="Test with Limited Tasks"
uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp benchmark.execution.max_tasks=5 output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")"
```

### Custom Agent Configuration
```bash title="Different Agent Setup"
uv run main.py common-benchmark --config_file_name=agent_gaia-validation benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")"
```

### Multiple Runs for Reliability
```bash title="Multiple Runs"
NUM_RUNS=5 ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh
```
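Once the script finishes, each run's summary can be collected for a side-by-side view (a convenience sketch based on the directory layout the script creates):

```python title="Collect per-run accuracy summaries"
from pathlib import Path

# The script writes logs/finsearchcomp/<agent_set>_<timestamp>/run_<i>/.
results_dir = Path("logs/finsearchcomp")
pattern = "*/run_*/benchmark_results_pass_at_1_accuracy.txt"
for txt in sorted(results_dir.glob(pattern)):
    print(f"== {txt.parent.name} ==")
    print(txt.read_text().strip(), "\n")
```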

---

!!! info "Documentation Info"
**Last Updated:** September 2025 · **Doc Contributor:** Team @ MiroMind AI
1 change: 1 addition & 0 deletions docs/mkdocs/mkdocs.yml
@@ -54,6 +54,7 @@ nav:
- GAIA-Test: gaia_test.md
- FutureX: futurex.md
- xBench-DeepSearch: xbench_ds.md
+ - FinSearchComp: finsearchcomp.md
- Download Datasets: download_datasets.md
- Add New Benchmarks: contribute_benchmarks.md

104 changes: 104 additions & 0 deletions scripts/run_evaluate_multiple_runs_finsearchcomp.sh
@@ -0,0 +1,104 @@
#!/bin/bash

# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

# Multiple runs FinSearchComp evaluation script
# Based on the working command: uv run main.py common-benchmark --config_file_name=agent_finsearchcomp benchmark=finsearchcomp output_dir=logs/finsearchcomp/$(date +"%Y%m%d_%H%M")

# Configuration parameters
NUM_RUNS=${NUM_RUNS:-3}
MAX_TASKS=${MAX_TASKS:-1}
MAX_CONCURRENT=${MAX_CONCURRENT:-5}
BENCHMARK_NAME="finsearchcomp"
AGENT_SET=${AGENT_SET:-"agent_finsearchcomp"}

# Set results directory with timestamp
TIMESTAMP=$(date +%Y%m%d_%H%M)
RESULTS_DIR="logs/${BENCHMARK_NAME}/${AGENT_SET}_${TIMESTAMP}"

export LOGGER_LEVEL="INFO"

echo "🚀 Starting $NUM_RUNS runs of FinSearchComp evaluation..."
echo "📊 Using max_tasks: $MAX_TASKS (set MAX_TASKS=null for full dataset)"
echo "📊 Using max_concurrent: $MAX_CONCURRENT"
echo "📁 Results will be saved in: $RESULTS_DIR"

# Create results directory
mkdir -p "$RESULTS_DIR"

# Launch all parallel tasks
for i in $(seq 1 $NUM_RUNS); do
echo "=========================================="
echo "🚀 Launching experiment $i/$NUM_RUNS"
echo "📝 Output log: $RESULTS_DIR/run_${i}_output.log"
echo "=========================================="

# Set specific identifier for this run
RUN_ID="run_$i"

# Run experiment (background execution)
(
echo "Starting run $i at $(date)"
uv run main.py common-benchmark \
--config_file_name=$AGENT_SET \
benchmark=$BENCHMARK_NAME \
benchmark.execution.max_tasks=$MAX_TASKS \
benchmark.execution.max_concurrent=$MAX_CONCURRENT \
benchmark.execution.pass_at_k=1 \
output_dir=${RESULTS_DIR}/$RUN_ID \
hydra.run.dir=${RESULTS_DIR}/$RUN_ID \
> "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1

# Check if run was successful
if [ $? -eq 0 ]; then
echo "✅ Run $i completed successfully at $(date)"
RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1)
if [ -f "$RESULT_FILE" ]; then
echo "📊 Results saved to $RESULT_FILE"
else
echo "⚠️ Warning: Result file not found for run $i"
fi
else
echo "❌ Run $i failed at $(date)!"
fi
) &

# Small delay between launches
sleep 2
done

echo "🎯 All $NUM_RUNS runs have been launched in parallel"
echo "⏳ Waiting for all runs to complete..."

# Wait for all background tasks to complete
wait

echo "=========================================="
echo "🎉 All $NUM_RUNS runs completed!"
echo "=========================================="

# Show progress summary
echo "=========================================="
echo "📊 Progress Summary:"
echo "=========================================="

echo "=========================================="
echo "🎯 Multiple runs FinSearchComp evaluation completed!"
echo "📁 Check results in: $RESULTS_DIR"
echo "📝 Check individual run logs: $RESULTS_DIR/run_*_output.log"
echo "=========================================="
echo ""
echo "💡 Usage examples:"
echo " # Default: 3 runs with full dataset"
echo " ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh"
echo ""
echo " # Custom parameters"
echo " NUM_RUNS=5 MAX_TASKS=10 MAX_CONCURRENT=3 ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh"
echo ""
echo " # Different agent configuration"
echo " AGENT_SET=agent_gaia-validation ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh"
echo ""
echo " # Limited tasks for testing"
echo " MAX_TASKS=5 ./scripts/run_evaluate_multiple_runs_finsearchcomp.sh"