Commit 10d9ab6

ntudy and Yue Deng authored

feat(benchmark): add hle-text-only (#81)

* add hle-text-only
* add doc

Co-authored-by: Yue Deng <[email protected]>

1 parent c7de7f8 · commit 10d9ab6

File tree: 7 files changed, +231 −0 lines changed
Lines changed: 78 additions & 0 deletions

```yaml
defaults:
  - benchmark: hle-text-only
  - override hydra/job_logging: none
  - _self_  # Allow defining variables at the top of this file


main_agent:
  prompt_class: MainAgentPrompt_GAIA
  llm:
    provider_class: "ClaudeOpenRouterClient"
    model_name: "anthropic/claude-3.7-sonnet"
    async_client: true
    temperature: 0.3
    top_p: 0.95
    min_p: 0.0
    top_k: -1
    max_tokens: 32000
    openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
    openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
    openrouter_provider: "anthropic"
    disable_cache_control: false
    keep_tool_result: -1
    oai_tool_thinking: false

  tool_config:
    - tool-reasoning

  max_turns: 50  # Maximum number of turns for main agent execution
  max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn

  input_process:
    hint_generation: true
    hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
  output_process:
    final_answer_extraction: true
    final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"

  openai_api_key: "${oc.env:OPENAI_API_KEY,???}"  # used for hint generation and final answer extraction
  add_message_id: true
  keep_tool_result: -1
  chinese_context: "${oc.env:CHINESE_CONTEXT,false}"


sub_agents:
  agent-worker:
    prompt_class: SubAgentWorkerPrompt
    llm:
      provider_class: "ClaudeOpenRouterClient"
      model_name: "anthropic/claude-3.7-sonnet"
      async_client: true
      temperature: 0.3
      top_p: 0.95
      min_p: 0.0
      top_k: -1
      max_tokens: 32000
      openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
      openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
      openrouter_provider: "anthropic"
      disable_cache_control: false
      keep_tool_result: -1
      oai_tool_thinking: false

    tool_config:
      - tool-searching
      - tool-image-video
      - tool-reading
      - tool-code
      - tool-audio

    max_turns: 50  # Maximum number of turns for sub-agent execution
    max_tool_calls_per_turn: 10  # Maximum number of tool calls per turn


# Can define some top-level or default parameters here
output_dir: logs/
data_dir: "${oc.env:DATA_DIR,data}"  # Points to where data is stored
```
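The `${oc.env:VAR,default}` expressions in this config are OmegaConf environment-variable interpolations that Hydra resolves at load time. As a rough illustration of the lookup semantics only — not the real resolver, and note that OmegaConf treats a `???` default as a mandatory value rather than a literal — a minimal sketch:

```python
import os
import re

# Illustrative re-implementation of the "${oc.env:NAME,default}" lookups used
# in the config above. The real resolution is done by OmegaConf/Hydra; in
# OmegaConf a default of "???" marks the value as mandatory, whereas this toy
# version would just return it literally.
_ENV_PATTERN = re.compile(r"\$\{oc\.env:([A-Z_][A-Z0-9_]*),([^}]*)\}")

def resolve_env(value: str) -> str:
    """Replace each ${oc.env:NAME,default} with os.environ[NAME] or the default."""
    def _sub(m: re.Match) -> str:
        name, default = m.group(1), m.group(2)
        return os.environ.get(name, default)
    return _ENV_PATTERN.sub(_sub, value)

# Ensure the demo variable is unset so the fallback after the comma is used.
os.environ.pop("EXAMPLE_UNSET_VAR", None)
print(resolve_env("${oc.env:EXAMPLE_UNSET_VAR,https://openrouter.ai/api/v1}"))
# → https://openrouter.ai/api/v1
```

Setting the environment variable before loading the config overrides the fallback, which is how the `.env` values in the doc below take effect.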
Lines changed: 20 additions & 0 deletions

```yaml
# config/benchmark/hle-text-only.yaml
defaults:
  - default
  - _self_

name: "hle-text-only"

data:
  data_dir: "${data_dir}/hle-text-only"  # Path to hle-text-only dataset
  metadata_file: "standardized_data.jsonl"  # Metadata filename
  whitelist: []  # Optional: List of specific task_ids to run

execution:
  max_tasks: null  # null = no limit, or specify a number
  max_concurrent: 10  # Number of parallel tasks
  pass_at_k: 1  # Number of attempts per task

# OpenAI API key for evaluation (required for hle-text-only since it has ground truth)
openai_api_key: "${oc.env:OPENAI_API_KEY,???}"
```
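The `pass_at_k` setting above controls how many attempts each task gets. As a hedged sketch of how a pass@k accuracy could be aggregated from a results file — the field names `task_id` and `correct` are illustrative assumptions, not MiroFlow's actual schema:

```python
import json
from collections import defaultdict

# Hypothetical sketch: aggregating pass@k accuracy from a JSONL results file.
# Each line is one attempt; a task passes at k if any of its first k attempts
# is judged correct.
def pass_at_k_accuracy(lines: list[str], k: int = 1) -> float:
    attempts = defaultdict(list)
    for line in lines:
        record = json.loads(line)
        attempts[record["task_id"]].append(bool(record["correct"]))
    passed = sum(1 for tries in attempts.values() if any(tries[:k]))
    return passed / len(attempts) if attempts else 0.0

rows = [
    '{"task_id": "t1", "correct": true}',
    '{"task_id": "t2", "correct": false}',
]
print(pass_at_k_accuracy(rows, k=1))  # 0.5
```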

docs/mkdocs/docs/hle-text-only.md

Lines changed: 92 additions & 0 deletions

# HLE (Text Only)

MiroFlow's evaluation on the HLE-text-only benchmark demonstrates its capabilities on text-only reasoning and question-answering tasks that require human-level understanding.

More details: [HLE text-only Dataset on HuggingFace](https://huggingface.co/datasets/macabdul9/hle_text_only)

---

## Dataset Overview

!!! info "HLE Dataset (text only)"
    The dataset is a text-only subset of HLE (Humanity's Last Exam).

---

## Quick Start Guide

### Step 1: Prepare the HLE (Text Only) Dataset

```bash title="Download HLE (Text Only) Dataset"
uv run main.py prepare-benchmark get hle-text-only
```

This downloads the dataset to `data/hle-text-only/`.

### Step 2: Configure API Keys

```env title=".env Configuration"
# For searching and web scraping
SERPER_API_KEY="xxx"
JINA_API_KEY="xxx"

# For Linux sandbox (code execution environment)
E2B_API_KEY="xxx"

# Claude-3.7-Sonnet via OpenRouter
OPENROUTER_API_KEY="xxx"
OPENROUTER_BASE_URL="https://openrouter.ai/api/v1"

# Vision understanding
ANTHROPIC_API_KEY="xxx"
GEMINI_API_KEY="xxx"

# Hint generation and final answer extraction
OPENAI_API_KEY="xxx"
OPENAI_BASE_URL="https://api.openai.com/v1"
```

### Step 3: Run the Evaluation

```bash title="Run HLE Evaluation"
uv run main.py common-benchmark --config_file_name=agent_hle-text-only_claude37sonnet output_dir="logs/hle-text-only/$(date +"%Y%m%d_%H%M")"
```

!!! tip "Resume an Interrupted Evaluation"
    Specify the same output directory to continue from where you left off:

    ```bash
    uv run main.py common-benchmark --config_file_name=agent_hle-text-only_claude37sonnet output_dir="logs/hle-text-only/20251014_1504"
    ```

### Step 4: Review Results

```bash title="Check Results"
# View accuracy summary
cat logs/hle-text-only/*/benchmark_results_pass_at_1_accuracy.txt

# View detailed results
cat logs/hle-text-only/*/benchmark_results.jsonl
```

---

## Usage Examples

### Test with Limited Tasks

```bash
uv run main.py common-benchmark --config_file_name=agent_hle-text-only_claude37sonnet benchmark.execution.max_tasks=10 output_dir="logs/hle-text-only/$(date +"%Y%m%d_%H%M")"
```

### Adjust Concurrency

```bash
uv run main.py common-benchmark --config_file_name=agent_hle-text-only_claude37sonnet benchmark.execution.max_concurrent=5 output_dir="logs/hle-text-only/$(date +"%Y%m%d_%H%M")"
```

---

!!! info "Documentation Info"
    **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI
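The resume behaviour described in Step 3 amounts to skipping tasks whose results already exist in the output directory. A minimal sketch of that idea, assuming a hypothetical one-JSON-file-per-task layout (the real per-task file layout may differ):

```python
import tempfile
from pathlib import Path

# Hypothetical sketch of resume-by-output-dir: when the same output_dir is
# reused, tasks with an existing result file are skipped. The "<task_id>.json"
# naming is an assumption for illustration.
def tasks_to_run(all_task_ids: list[str], output_dir: str) -> list[str]:
    done = {p.stem for p in Path(output_dir).glob("*.json")}
    return [t for t in all_task_ids if t not in done]

with tempfile.TemporaryDirectory() as d:
    Path(d, "t1.json").write_text("{}")     # pretend t1 already finished
    print(tasks_to_run(["t1", "t2"], d))    # ['t2']
```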

docs/mkdocs/mkdocs.yml

Lines changed: 1 addition & 0 deletions

```diff
@@ -65,6 +65,7 @@ nav:
     - xBench-DeepSearch: xbench_ds.md
     - FinSearchComp: finsearchcomp.md
     - HLE: hle.md
+    - HLE(text only): hle_text_only.md

 # - Benchmarks:
 #   - GAIA-Validation-Text-Only: gaia_validation_text_only.md
```

scripts/run_prepare_benchmark.sh

Lines changed: 1 addition & 0 deletions

```diff
@@ -20,6 +20,7 @@ uv run main.py prepare-benchmark get webwalkerqa
 uv run main.py prepare-benchmark get browsecomp-test
 uv run main.py prepare-benchmark get browsecomp-zh-test
 uv run main.py prepare-benchmark get hle
+uv run main.py prepare-benchmark get hle-text-only
 uv run main.py prepare-benchmark get xbench-ds
 uv run main.py prepare-benchmark get futurex
 uv run main.py prepare-benchmark get finsearchcomp
```
Lines changed: 30 additions & 0 deletions

```python
# SPDX-FileCopyrightText: 2025 MiromindAI
#
# SPDX-License-Identifier: Apache-2.0

from typing import Generator, MutableMapping

from datasets import load_dataset

from utils.prepare_benchmark.common import Task


def gen_hle_text_only(hf_token: str) -> Generator[Task, None, None]:
    dataset = load_dataset("macabdul9/hle_text_only", split="test", token=hf_token)
    for x in dataset:
        metadata: MutableMapping = x  # type: ignore
        task_id = metadata.pop("id")
        question = metadata.pop("question")
        gt = metadata.pop("answer")
        metadata.pop("image_preview")
        metadata.pop("rationale_image")
        task = Task(
            task_id=task_id,
            task_question=question,
            ground_truth=gt,
            file_path=None,
            metadata=metadata,
        )
        yield task
    return
```
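The generator above pops the known columns off each dataset row so that whatever remains becomes the task metadata. A self-contained sketch of that pattern with a stand-in `Task` class — the real one lives in `utils.prepare_benchmark.common` and may differ:

```python
from dataclasses import dataclass, field
from typing import Generator, Optional

# Minimal stand-in for utils.prepare_benchmark.common.Task, assumed to hold
# the fields passed in the commit above; the real class may differ.
@dataclass
class Task:
    task_id: str
    task_question: str
    ground_truth: str
    file_path: Optional[str] = None
    metadata: dict = field(default_factory=dict)

# Pop the known columns; everything left over becomes metadata. A toy row
# stands in for the HuggingFace dataset so no download is needed.
def gen_from_rows(rows: list[dict]) -> Generator[Task, None, None]:
    for x in rows:
        row = dict(x)  # copy so the original row is untouched
        yield Task(
            task_id=row.pop("id"),
            task_question=row.pop("question"),
            ground_truth=row.pop("answer"),
            metadata=row,
        )

tasks = list(gen_from_rows([{"id": "1", "question": "q", "answer": "a", "category": "math"}]))
print(tasks[0].metadata)  # {'category': 'math'}
```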

utils/prepare_benchmark/main.py

Lines changed: 9 additions & 0 deletions

```diff
@@ -16,6 +16,7 @@
 from utils.prepare_benchmark.gen_gaia import gen_gaia_validation
 from utils.prepare_benchmark.gen_gaia_text_only import gen_gaia_text_only
 from utils.prepare_benchmark.gen_hle import gen_hle_test
+from utils.prepare_benchmark.gen_hle_text_only import gen_hle_text_only
 from utils.prepare_benchmark.gen_webwalkerqa import gen_webwalkerqa
 from utils.prepare_benchmark.gen_xbench_ds import gen_xbench_ds
 from utils.prepare_benchmark.gen_futurex import gen_futurex
@@ -32,6 +33,7 @@ class _Env:
         "browsecomp-test",
         "browsecomp-zh-test",
         "hle",
+        "hle-text-only",
         "xbench-ds",
         "futurex",
         "finsearchcomp",
@@ -105,6 +107,13 @@ def gen():
             for x in gen_hle_test(env.hf_token, env.data_dir):
                 yield x

+            return gen
+        case "hle-text-only":
+
+            def gen():
+                for x in gen_hle_text_only(env.hf_token):
+                    yield x
+
             return gen
         case "xbench-ds":
```
