add hle

Yue Deng · Yue Deng · commit 9a9c2e509bf5 · 2025-10-14T15:37:52.000+08:00
diff --git a/config/agent_hle_claude37sonnet.yaml b/config/agent_hle_claude37sonnet.yaml
@@ -0,0 +1,78 @@
+defaults:
+  - benchmark: hle  # Selects the hle option of the benchmark config group (config/benchmark/hle.yaml)
+  - override hydra/job_logging: none  # Replaces Hydra's default job_logging choice with "none"
+  - _self_  # Listed last so keys defined in this file are merged after the entries above
+
+
+main_agent:  # Main agent: Claude 3.7 Sonnet via OpenRouter, with only the reasoning tool enabled
+  prompt_class: MainAgentPrompt_GAIA  # NOTE(review): GAIA-named prompt class used for the HLE benchmark — confirm intended
+  llm: 
+    provider_class: "ClaudeOpenRouterClient"
+    model_name: "anthropic/claude-3.7-sonnet"
+    async_client: true
+    temperature: 0.3
+    top_p: 0.95
+    min_p: 0.0
+    top_k: -1
+    max_tokens: 32000
+    openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"  # From env var OPENROUTER_API_KEY; "???" default is presumably OmegaConf's missing-value marker — confirm
+    openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"  # Env override; defaults to the public OpenRouter endpoint
+    openrouter_provider: "anthropic"
+    disable_cache_control: false
+    keep_tool_result: -1  # NOTE(review): the same key/value also appears at main_agent level below — confirm both are read
+    oai_tool_thinking: false
+  
+  tool_config:
+    - tool-reasoning
+
+  max_turns: 50  # Maximum number of turns for main agent execution
+  max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn
+  
+  input_process:
+    hint_generation: true  # Uses openai_api_key below
+    hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"  # Env override HINT_LLM_BASE_URL; defaults to the OpenAI endpoint
+  output_process:
+    final_answer_extraction: true  # Uses openai_api_key below
+    final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"  # Env override FINAL_ANSWER_LLM_BASE_URL; defaults to the OpenAI endpoint
+
+  openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
+  add_message_id: true
+  keep_tool_result: -1
+  chinese_context: "${oc.env:CHINESE_CONTEXT,false}"  # Env override CHINESE_CONTEXT; default text is "false"
+
+
+sub_agents:  # Sub-agent definitions (one entry: agent-worker)
+  agent-worker:  # Same LLM settings as main_agent, but with the search/image-video/reading/code/audio tools
+    prompt_class: SubAgentWorkerPrompt
+    llm: 
+      provider_class: "ClaudeOpenRouterClient"
+      model_name: "anthropic/claude-3.7-sonnet"
+      async_client: true
+      temperature: 0.3
+      top_p: 0.95
+      min_p: 0.0
+      top_k: -1
+      max_tokens: 32000
+      openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"  # From env var OPENROUTER_API_KEY; "???" default is presumably OmegaConf's missing-value marker — confirm
+      openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"  # Env override; defaults to the public OpenRouter endpoint
+      openrouter_provider: "anthropic"
+      disable_cache_control: false
+      keep_tool_result: -1
+      oai_tool_thinking: false
+    
+    tool_config:
+      - tool-searching
+      - tool-image-video
+      - tool-reading
+      - tool-code
+      - tool-audio
+
+    max_turns: 50  # Maximum number of turns for this sub-agent's execution
+    max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn
+
+
+# Can define some top-level or default parameters here
+output_dir: logs/  # Default output directory; can be overridden on the command line (output_dir=...)
+data_dir: "${oc.env:DATA_DIR,data}"  # Points to where data is stored (env override DATA_DIR, default "data")
+
+
diff --git a/config/benchmark/hle.yaml b/config/benchmark/hle.yaml
@@ -0,0 +1,20 @@
+# config/benchmark/hle.yaml
+defaults:
+  - default  # presumably config/benchmark/default.yaml (not shown here) — confirm
+  - _self_  # Keys in this file are merged after those from default
+
+name: "hle"  # Benchmark identifier
+
+data:
+  data_dir: "${data_dir}/hle"  # Path to hle dataset; ${data_dir} is an interpolation (the agent config defines a top-level data_dir)
+  metadata_file: "standardized_data.jsonl"  # Metadata filename
+  whitelist: []  # Optional: List of specific task_ids to run
+
+execution:
+  max_tasks: null      # null = no limit, or specify a number
+  max_concurrent: 10    # Number of parallel tasks
+  pass_at_k: 1         # Number of attempts per task
+
+# OpenAI API key for evaluation (required for hle since it has ground truth)
+openai_api_key: "${oc.env:OPENAI_API_KEY,???}"  # From env var OPENAI_API_KEY; "???" default is presumably OmegaConf's missing-value marker — confirm
+
diff --git a/docs/mkdocs/docs/hle.md b/docs/mkdocs/docs/hle.md
@@ -0,0 +1,99 @@
+# HLE
+
+MiroFlow's evaluation on the HLE (Humanity's Last Exam) benchmark demonstrates its capabilities on multimodal reasoning and question-answering tasks that require human-level understanding across vision and language.
+
+More details: [HLE Dataset on HuggingFace](https://huggingface.co/datasets/cais/hle)
+
+---
+
+## Dataset Overview
+
+!!! info "HLE Dataset"
+    The HLE dataset consists of challenging multimodal tasks that test AI systems' ability to perform human-level reasoning with both visual and textual information.
+
+!!! abstract "Key Dataset Characteristics"
+
+    - **Tasks**: The test split of the HuggingFace `cais/hle` dataset
+    - **Task Type**: Multimodal question answering and reasoning
+    - **Modalities**: Text + Images
+    - **Ground Truth**: Available for evaluation
+
+---
+
+## Quick Start Guide
+
+### Step 1: Prepare the HLE Dataset
+
+```bash title="Download HLE Dataset"
+uv run main.py prepare-benchmark get hle
+```
+
+This will download the dataset and save images to `data/hle/images/`.
+
+### Step 2: Configure API Keys
+
+```env title=".env Configuration"
+# For searching and web scraping
+SERPER_API_KEY="xxx"
+JINA_API_KEY="xxx"
+
+# For Linux sandbox (code execution environment)
+E2B_API_KEY="xxx"
+
+# Claude-3.7-Sonnet via OpenRouter
+OPENROUTER_API_KEY="xxx"
+OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"
+
+# Vision understanding
+ANTHROPIC_API_KEY="xxx"
+GEMINI_API_KEY="xxx"
+
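+# Hint generation and final answer extraction (the config reads HINT_LLM_BASE_URL / FINAL_ANSWER_LLM_BASE_URL for the endpoints, defaulting to https://api.openai.com/v1)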
+OPENAI_API_KEY="xxx"
+OPENAI_BASE_URL="https://api.openai.com/v1"
+```
+
+### Step 3: Run the Evaluation
+
+```bash title="Run HLE Evaluation"
+uv run main.py common-benchmark --config_file_name=agent_hle_claude37sonnet benchmark=hle output_dir="logs/hle/$(date +"%Y%m%d_%H%M")"
+```
+
+!!! tip "Resume Interrupted Evaluation"
+    Specify the same output directory to continue from where you left off:
+    
+    ```bash
+    uv run main.py common-benchmark --config_file_name=agent_hle_claude37sonnet benchmark=hle output_dir="logs/hle/20251014_1504"
+    ```
+
+### Step 4: Review Results
+
+```bash title="Check Results"
+# View accuracy summary
+cat logs/hle/*/benchmark_results_pass_at_1_accuracy.txt
+
+# View detailed results
+cat logs/hle/*/benchmark_results.jsonl
+```
+
+---
+
+## Usage Examples
+
+### Test with Limited Tasks
+
+```bash
+uv run main.py common-benchmark --config_file_name=agent_hle_claude37sonnet benchmark=hle benchmark.execution.max_tasks=10 output_dir="logs/hle/$(date +"%Y%m%d_%H%M")"
+```
+
+### Adjust Concurrency
+
+```bash
+uv run main.py common-benchmark --config_file_name=agent_hle_claude37sonnet benchmark=hle benchmark.execution.max_concurrent=5 output_dir="logs/hle/$(date +"%Y%m%d_%H%M")"
+```
+
+---
+
+!!! info "Documentation Info"
+    **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI
+
diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml
@@ -64,6 +64,7 @@ nav:
     - FutureX: futurex.md
     - xBench-DeepSearch: xbench_ds.md
     - FinSearchComp: finsearchcomp.md
+    - HLE: hle.md
 
     # - Benchmarks: 
     #   - GAIA-Validation-Text-Only: gaia_validation_text_only.md