Skip to content

Commit 9ccab07

Browse files
authored
docs(quickstart): update docs for quick start (#73)
* update quick start and add tool searching serper, config logging for single task
* add single agent
* pass lint
1 parent 08fd95a commit 9ccab07

13 files changed (+191 −81 lines changed)

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ cp .env.template .env
8888
# Edit .env and add your OPENROUTER_API_KEY
8989

9090
# 3. Run your first agent
91-
uv run main.py trace --config_file_name=agent_quickstart_1 --task="What is the first country listed in the XLSX file that have names starting with Co?" --task_file_name="data/FSI-2023-DOWNLOAD.xlsx"
91+
uv run main.py trace --config_file_name=agent_quickstart_reading --task="What is the first country listed in the XLSX file that have names starting with Co?" --task_file_name="data/FSI-2023-DOWNLOAD.xlsx"
9292
```
9393

9494
🎉 **Expected Output:** Your agent should return **\boxed{Congo Democratic Republic}** 😊

README_ja.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ cp .env.template .env
8686
# .env を編集して OPENROUTER_API_KEY を追加
8787

8888
# 3. 最初のエージェントを実行
89-
uv run main.py trace --config_file_name=agent_quickstart_1 --task="What is the first country listed in the XLSX file that have names starting with Co?" --task_file_name="data/FSI-2023-DOWNLOAD.xlsx"
89+
uv run main.py trace --config_file_name=agent_quickstart_reading --task="What is the first country listed in the XLSX file that have names starting with Co?" --task_file_name="data/FSI-2023-DOWNLOAD.xlsx"
9090
```
9191

9292
🎉 **想定出力**: エージェントは **\boxed{Congo Democratic Republic}** を返すはずです 😊

README_zh.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ cp .env.template .env
8686
# 编辑 .env 并添加您的 OPENROUTER_API_KEY
8787

8888
# 3. 运行您的第一个智能体
89-
uv run main.py trace --config_file_name=agent_quickstart_1 --task="What is the first country listed in the XLSX file that have names starting with Co?" --task_file_name="data/FSI-2023-DOWNLOAD.xlsx"
89+
uv run main.py trace --config_file_name=agent_quickstart_reading --task="What is the first country listed in the XLSX file that have names starting with Co?" --task_file_name="data/FSI-2023-DOWNLOAD.xlsx"
9090
```
9191

9292
🎉 **预期输出**: 您的智能体应该返回 **\boxed{Congo Democratic Republic}** 😊

config/agent_quickstart_1.yaml renamed to config/agent_quickstart_reading.yaml

Lines changed: 3 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ main_agent:
2222
keep_tool_result: -1
2323
oai_tool_thinking: false
2424

25-
tool_config: []
25+
tool_config:
26+
- tool-reading
2627

2728
max_turns: -1 # Maximum number of turns for main agent execution
2829
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
@@ -40,30 +41,7 @@ main_agent:
4041
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
4142

4243

43-
sub_agents:
44-
agent-worker:
45-
prompt_class: SubAgentWorkerPrompt
46-
llm:
47-
provider_class: "ClaudeOpenRouterClient"
48-
model_name: "anthropic/claude-3.7-sonnet"
49-
async_client: true
50-
temperature: 0.3
51-
top_p: 0.95
52-
min_p: 0.0
53-
top_k: -1
54-
max_tokens: 32000
55-
openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
56-
openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
57-
openrouter_provider: "anthropic"
58-
disable_cache_control: false
59-
keep_tool_result: -1
60-
oai_tool_thinking: false
61-
62-
tool_config:
63-
- tool-reading
64-
65-
max_turns: -1 # Maximum number of turns for main agent execution
66-
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
44+
sub_agents: null
6745

6846

6947
# Can define some top-level or default parameters here
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
defaults:
2+
- benchmark: gaia-validation
3+
- override hydra/job_logging: none
4+
- _self_ # Allow defining variables at the top of this file
5+
6+
7+
main_agent:
8+
prompt_class: MainAgentPromptBoxedAnswer
9+
llm:
10+
provider_class: "ClaudeOpenRouterClient"
11+
model_name: "anthropic/claude-3.7-sonnet"
12+
async_client: true
13+
temperature: 0.3
14+
top_p: 0.95
15+
min_p: 0.0
16+
top_k: -1
17+
max_tokens: 32000
18+
openrouter_api_key: "${oc.env:OPENROUTER_API_KEY,???}"
19+
openrouter_base_url: "${oc.env:OPENROUTER_BASE_URL,https://openrouter.ai/api/v1}"
20+
openrouter_provider: "anthropic"
21+
disable_cache_control: false
22+
keep_tool_result: -1
23+
oai_tool_thinking: false
24+
25+
tool_config:
26+
- tool-searching-serper
27+
28+
max_turns: -1 # Maximum number of turns for main agent execution
29+
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
30+
31+
input_process:
32+
hint_generation: false
33+
hint_llm_base_url: "${oc.env:HINT_LLM_BASE_URL,https://api.openai.com/v1}"
34+
output_process:
35+
final_answer_extraction: false
36+
final_answer_llm_base_url: "${oc.env:FINAL_ANSWER_LLM_BASE_URL,https://api.openai.com/v1}"
37+
38+
openai_api_key: "${oc.env:OPENAI_API_KEY,???}" # used for hint generation and final answer extraction
39+
add_message_id: true
40+
keep_tool_result: -1
41+
chinese_context: "${oc.env:CHINESE_CONTEXT,false}"
42+
43+
44+
sub_agents: null
45+
46+
47+
# Can define some top-level or default parameters here
48+
output_dir: logs/
49+
data_dir: "${oc.env:DATA_DIR,data}" # Points to where data is stored
50+

config/agent_quickstart_single.yaml renamed to config/agent_quickstart_single_agent.yaml

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,8 @@ main_agent:
2323
oai_tool_thinking: false
2424

2525
tool_config:
26-
- tool-reasoning-os
27-
- tool-searching
28-
- tool-image-video-os
2926
- tool-reading
30-
- tool-code
31-
- tool-audio-os
27+
- tool-searching
3228

3329
max_turns: -1 # Maximum number of turns for main agent execution
3430
max_tool_calls_per_turn: 10 # Maximum number of tool calls per turn
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
name: "tool-serper-search"
1+
name: "tool-searching-serper"
22
tool_command: "npx"
33
args:
44
- "-y"
55
- "serper-search-scrape-mcp-server"
66
env:
77
# Search API key - this value will be loaded from the .env file at runtime
8-
SERPER_API_KEY: "${oc.env:SERPER_API_KEY}"
8+
SERPER_API_KEY: "${oc.env:SERPER_API_KEY}"

docs/mkdocs/docs/contribute_benchmarks.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ Start with a small subset to verify everything works correctly:
148148

149149
```bash title="Test Benchmark Integration"
150150
uv run main.py common-benchmark \
151-
--config_file_name=agent_quickstart_1 \
151+
--config_file_name=agent_quickstart_reading \
152152
benchmark=your-benchmark \
153153
benchmark.execution.max_tasks=3 \
154154
output_dir="logs/test-your-benchmark/$(date +"%Y%m%d_%H%M")"
@@ -160,7 +160,7 @@ Once testing passes, run the complete benchmark:
160160

161161
```bash title="Run Full Benchmark"
162162
uv run main.py common-benchmark \
163-
--config_file_name=agent_quickstart_1 \
163+
--config_file_name=agent_quickstart_reading \
164164
benchmark=your-benchmark \
165165
output_dir="logs/your-benchmark/$(date +"%Y%m%d_%H%M")"
166166
```

docs/mkdocs/docs/futurex.md

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,10 @@ OPENAI_BASE_URL="https://api.openai.com/v1"
7272
### Step 3: Run the Evaluation
7373

7474
!!! example "Evaluation Execution"
75-
Execute the following command to run evaluation on the Futurex-Online dataset. This uses the basic `agent_quickstart_1` configuration for quick start purposes.
75+
Execute the following command to run evaluation on the Futurex-Online dataset. This uses the basic `agent_quickstart_reading` configuration for quick start purposes.
7676

7777
```bash title="Run Futurex-Online Evaluation"
78-
uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir="logs/futurex/$(date +"%Y%m%d_%H%M")"
78+
uv run main.py common-benchmark --config_file_name=agent_quickstart_reading benchmark=futurex output_dir="logs/futurex/$(date +"%Y%m%d_%H%M")"
7979
```
8080

8181
!!! tip "Progress Monitoring and Resume"
@@ -88,7 +88,7 @@ uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=
8888
If you need to resume an interrupted evaluation, specify the same output directory to continue from where you left off.
8989

9090
```bash title="Resume Evaluation, e.g."
91-
uv run main.py common-benchmark --config_file_name=agent_quickstart_1 benchmark=futurex output_dir="logs/futurex/20250918_1010"
91+
uv run main.py common-benchmark --config_file_name=agent_quickstart_reading benchmark=futurex output_dir="logs/futurex/20250918_1010"
9292
```
9393

9494
### Step 4: Extract Results
@@ -184,13 +184,13 @@ Check the generated files for voting analysis:
184184

185185
```bash title="Check Voting Results"
186186
# View submission file with voting results
187-
cat logs/futurex/agent_quickstart_1_*/futurex_submission.jsonl
187+
cat logs/futurex/agent_quickstart_reading_*/futurex_submission.jsonl
188188

189189
# Check individual run results
190-
ls logs/futurex/agent_quickstart_1_*/run_*/
190+
ls logs/futurex/agent_quickstart_reading_*/run_*/
191191

192192
# Check progress and voting statistics
193-
uv run python utils/progress_check/check_futurex_progress.py logs/futurex/agent_quickstart_1_*
193+
uv run python utils/progress_check/check_futurex_progress.py logs/futurex/agent_quickstart_reading_*
194194
```
195195

196196
### Manual Voting Aggregation
@@ -199,13 +199,13 @@ You can also manually run the voting aggregation:
199199

200200
```bash title="Manual Voting Aggregation"
201201
# Aggregate multiple runs with majority voting
202-
uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_1_* --aggregate
202+
uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_reading_* --aggregate
203203

204204
# Force single run mode (if needed)
205-
uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_1_*/run_1 --single
205+
uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_reading_*/run_1 --single
206206

207207
# Specify custom output file
208-
uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_1_* -o my_voted_predictions.jsonl
208+
uv run python utils/extract_futurex_results.py logs/futurex/agent_quickstart_reading_* -o my_voted_predictions.jsonl
209209
```
210210

211211
### Voting Output Format
@@ -249,7 +249,7 @@ For example, `"vote_counts": {"No": 2}` means 2 out of 2 runs predicted "No", in
249249
After running multiple evaluations, you'll find the following structure:
250250

251251
```
252-
logs/futurex/agent_quickstart_1_YYYYMMDD_HHMM/
252+
logs/futurex/agent_quickstart_reading_YYYYMMDD_HHMM/
253253
├── futurex_submission.jsonl # Final voted predictions
254254
├── run_1/ # First run results
255255
│ ├── benchmark_results.jsonl # Individual task results

0 commit comments

Comments (0)