From c8462cdf0c2367f1eccc38b1a43b2528b77729e5 Mon Sep 17 00:00:00 2001 From: Yue Deng Date: Tue, 14 Oct 2025 16:45:42 +0800 Subject: [PATCH 1/2] add hle-text-only --- .../agent_hle-text-only_claude37sonnet.yaml | 78 ++++++++++++++++ config/benchmark/hle-text-only.yaml | 20 ++++ docs/mkdocs/docs/hle-text-only.md | 92 +++++++++++++++++++ scripts/run_prepare_benchmark.sh | 1 + utils/prepare_benchmark/gen_hle_text_only.py | 30 ++++++ utils/prepare_benchmark/main.py | 9 ++ 6 files changed, 230 insertions(+) create mode 100644 config/agent_hle-text-only_claude37sonnet.yaml create mode 100644 config/benchmark/hle-text-only.yaml create mode 100644 docs/mkdocs/docs/hle-text-only.md create mode 100644 utils/prepare_benchmark/gen_hle_text_only.py diff --git a/config/agent_hle-text-only_claude37sonnet.yaml b/config/agent_hle-text-only_claude37sonnet.yaml new file mode 100644 index 0000000..075c721 --- /dev/null +++ b/config/agent_hle-text-only_claude37sonnet.yaml @@ -0,0 +1,78 @@ +defaults: + - benchmark: hle-text-only + - override hydra/job_logging: none + - _self_ # Allow defining variables at the top of this file + + +main_agent: + prompt_class: MainAgentPrompt_GAIA + llm: + provider_class: "ClaudeOpenRouterClient" + model_name: "anthropic/claude-3.7-sonnet" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 + openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" + openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" + openrouter_provider: "anthropic" + disable_cache_control: false + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-reasoning + + max_turns: 50 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + input_process: + hint_generation: true + hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}" + output_process: + final_answer_extraction: true + final_answer_llm_base_url: 
"${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}" + + openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction + add_message_id: true + keep_tool_result: -1 + chinese_context: "${oc.env:CHINESE_CONTEXT,false}" + + +sub_agents: + agent-worker: + prompt_class: SubAgentWorkerPrompt + llm: + provider_class: "ClaudeOpenRouterClient" + model_name: "anthropic/claude-3.7-sonnet" + async_client: true + temperature: 0.3 + top_p: 0.95 + min_p: 0.0 + top_k: -1 + max_tokens: 32000 + openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}" + openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}" + openrouter_provider: "anthropic" + disable_cache_control: false + keep_tool_result: -1 + oai_tool_thinking: false + + tool_config: + - tool-searching + - tool-image-video + - tool-reading + - tool-code + - tool-audio + + max_turns: 50 # Maximum number of turns for main agent execution + max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn + + +# Can define some top-level or default parameters here +output_dir: logs/ +data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored + + diff --git a/config/benchmark/hle-text-only.yaml b/config/benchmark/hle-text-only.yaml new file mode 100644 index 0000000..870e73c --- /dev/null +++ b/config/benchmark/hle-text-only.yaml @@ -0,0 +1,20 @@ +# config/benchmark/hle-text-only.yaml +defaults: + - default + - _self_ + +name: "hle-text-only" + +data: + data_dir: "${data_dir}/hle-text-only" # Path to hle-text-only dataset + metadata_file: "standardized_data.jsonl" # Metadata filename + whitelist: [] # Optional: List of specific task_ids to run + +execution: + max_tasks: null # null = no limit, or specify a number + max_concurrent: 10 # Number of parallel tasks + pass_at_k: 1 # Number of attempts per task + +# OpenAI API key for evaluation (required for hle-text-only since it has ground truth) +openai_api_key: "${oc.env:OPENAI_API_KEY,???}" + diff 
--git a/docs/mkdocs/docs/hle-text-only.md b/docs/mkdocs/docs/hle-text-only.md new file mode 100644 index 0000000..ab4881b --- /dev/null +++ b/docs/mkdocs/docs/hle-text-only.md @@ -0,0 +1,92 @@ +# HLE (Text Only) + +MiroFlow's evaluation on the HLE-text-only benchmark demonstrates capabilities in text-based reasoning and question answering tasks that require human-level understanding. + +More details: [HLE text only Dataset on HuggingFace](https://huggingface.co/datasets/macabdul9/hle_text_only) + +--- + +## Dataset Overview + +!!! info "HLE Dataset (text only)" + The dataset is a text-only subset of HLE. + +--- + +## Quick Start Guide + +### Step 1: Prepare the HLE(text only) Dataset + +```bash title="Download HLE(text only) Dataset" +uv run main.py prepare-benchmark get hle-text-only +``` + +This will download the dataset to `data/hle-text-only/`. + +### Step 2: Configure API Keys + +```env title=".env Configuration" +# For searching and web scraping +SERPER_API_KEY="xxx" +JINA_API_KEY="xxx" + +# For Linux sandbox (code execution environment) +E2B_API_KEY="xxx" + +# Claude-3.7-Sonnet via OpenRouter +OPENROUTER_API_KEY="xxx" +OPENROUTER_BASE_URL="https://openrouter.ai/api/v1" + +# Vision understanding +ANTHROPIC_API_KEY="xxx" +GEMINI_API_KEY="xxx" + +# Hint generation and final answer extraction +OPENAI_API_KEY="xxx" +OPENAI_BASE_URL="https://api.openai.com/v1" +``` + +### Step 3: Run the Evaluation + +```bash title="Run HLE Evaluation" +uv run main.py common-benchmark --config_file_name=agent_hle-text-only_claude37sonnet output_dir="logs/hle-text-only/$(date +"%Y%m%d_%H%M")" +``` + +!!! 
tip "Resume Interrupted Evaluation" + Specify the same output directory to continue from where you left off: + + ```bash + uv run main.py common-benchmark --config_file_name=agent_hle-text-only_claude37sonnet output_dir="logs/hle-text-only/20251014_1504" + ``` + +### Step 4: Review Results + +```bash title="Check Results" +# View accuracy summary +cat logs/hle-text-only/*/benchmark_results_pass_at_1_accuracy.txt + +# View detailed results +cat logs/hle-text-only/*/benchmark_results.jsonl +``` + +--- + +## Usage Examples + +### Test with Limited Tasks + +```bash +uv run main.py common-benchmark --config_file_name=agent_hle-text-only_claude37sonnet benchmark.execution.max_tasks=10 output_dir="logs/hle-text-only/$(date +"%Y%m%d_%H%M")" +``` + +### Adjust Concurrency + +```bash +uv run main.py common-benchmark --config_file_name=agent_hle-text-only_claude37sonnet benchmark.execution.max_concurrent=5 output_dir="logs/hle-text-only/$(date +"%Y%m%d_%H%M")" +``` + +--- + +!!! info "Documentation Info" + **Last Updated:** October 2025 · **Doc Contributor:** Team @ MiroMind AI + diff --git a/scripts/run_prepare_benchmark.sh b/scripts/run_prepare_benchmark.sh index 837b2e4..0411b01 100644 --- a/scripts/run_prepare_benchmark.sh +++ b/scripts/run_prepare_benchmark.sh @@ -20,6 +20,7 @@ uv run main.py prepare-benchmark get webwalkerqa uv run main.py prepare-benchmark get browsecomp-test uv run main.py prepare-benchmark get browsecomp-zh-test uv run main.py prepare-benchmark get hle +uv run main.py prepare-benchmark get hle-text-only uv run main.py prepare-benchmark get xbench-ds uv run main.py prepare-benchmark get futurex uv run main.py prepare-benchmark get finsearchcomp \ No newline at end of file diff --git a/utils/prepare_benchmark/gen_hle_text_only.py b/utils/prepare_benchmark/gen_hle_text_only.py new file mode 100644 index 0000000..bc43634 --- /dev/null +++ b/utils/prepare_benchmark/gen_hle_text_only.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2025 MiromindAI +# +# 
SPDX-License-Identifier: Apache-2.0 + +from typing import Generator, MutableMapping + +from datasets import load_dataset + +from utils.prepare_benchmark.common import Task + +
+def gen_hle_text_only(hf_token: str) -> Generator[Task, None, None]:
+    """Yield one Task per row of the "macabdul9/hle_text_only" test split (loaded using hf_token)."""
+    dataset = load_dataset("macabdul9/hle_text_only", split="test", token=hf_token)
+    for x in dataset:
+        metadata: MutableMapping = x  # type: ignore
+        task_id = metadata.pop("id")
+        question = metadata.pop("question")
+        gt = metadata.pop("answer")
+        # Discard the image columns instead of keeping them in metadata; tolerate their absence.
+        metadata.pop("image_preview", None)
+        metadata.pop("rationale_image", None)
+        task = Task(
+            task_id=task_id,
+            task_question=question,
+            ground_truth=gt,
+            file_path=None,
+            metadata=metadata,
+        )
+        yield task
diff --git a/utils/prepare_benchmark/main.py b/utils/prepare_benchmark/main.py index 4675982..1af6b9f 100644 --- a/utils/prepare_benchmark/main.py +++ b/utils/prepare_benchmark/main.py @@ -16,6 +16,7 @@ from utils.prepare_benchmark.gen_gaia import gen_gaia_validation from utils.prepare_benchmark.gen_gaia_text_only import gen_gaia_text_only from utils.prepare_benchmark.gen_hle import gen_hle_test +from utils.prepare_benchmark.gen_hle_text_only import gen_hle_text_only from utils.prepare_benchmark.gen_webwalkerqa import gen_webwalkerqa from utils.prepare_benchmark.gen_xbench_ds import gen_xbench_ds from utils.prepare_benchmark.gen_futurex import gen_futurex @@ -32,6 +33,7 @@ class _Env: "browsecomp-test", "browsecomp-zh-test", "hle", + "hle-text-only", "xbench-ds", "futurex", "finsearchcomp", @@ -105,6 +107,13 @@ def gen(): for x in gen_hle_test(env.hf_token, env.data_dir): yield x + return gen + case "hle-text-only": + + def gen(): + for x in gen_hle_text_only(env.hf_token): + yield x + return gen case "xbench-ds": From c71b845e2bb0856953c541f6f326573c7a851eac Mon Sep 17 00:00:00 2001 From: Yue Deng Date: Tue, 14 Oct 2025 16:47:03 +0800 Subject: [PATCH 2/2] add doc --- docs/mkdocs/mkdocs.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/mkdocs/mkdocs.yml 
b/docs/mkdocs/mkdocs.yml index ea928fa..b51c8b7 100644 --- a/docs/mkdocs/mkdocs.yml +++ b/docs/mkdocs/mkdocs.yml @@ -65,6 +65,7 @@ nav: - xBench-DeepSearch: xbench_ds.md - FinSearchComp: finsearchcomp.md - HLE: hle.md + - HLE(text only): hle-text-only.md # - Benchmarks: # - GAIA-Validation-Text-Only: gaia_validation_text_only.md