Commit f9633d4

improve readme and scripts.
1 parent a11bae2 commit f9633d4

3 files changed: +11 -6 lines changed

README.md

Lines changed: 6 additions & 2 deletions
@@ -178,6 +178,10 @@ If you wish to use a different LLM as the primary agent model, you will need to
 
 ```bash
 ## copy environment variable template and prepare yours in .env file
+cd miroflow/apps/prepare-benchmark
+cp .env.template .env
+vim .env
+cd miroflow/apps/run-agent
 cp .env.template .env
 vim .env
 ```
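Note: with these additions, both apps need their own populated .env. A quick sanity check after editing (a minimal sketch; it assumes the working directory is the parent of the miroflow checkout, as the README's `cd miroflow/...` commands imply):

```bash
# Hypothetical check: confirm both apps have a non-empty .env file.
# Assumes the current directory is the parent of the miroflow checkout.
for app in prepare-benchmark run-agent; do
  if [ -s "miroflow/apps/$app/.env" ]; then
    echo "ok: miroflow/apps/$app/.env"
  else
    echo "missing or empty: miroflow/apps/$app/.env" >&2
  fi
done
```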
@@ -195,7 +199,7 @@ vim .env
 Command example:
 ```shell
 E2B_ACCESS_TOKEN=${your-token}
-e2b template build -c "/root/.jupyter/start-up.sh" -t <team_id> -n all_pip_apt_pkg` -d ./e2b.Dockerfile
+e2b template build -c "/root/.jupyter/start-up.sh" -t <team_id> -n "all_pip_apt_pkg" -d ./e2b.Dockerfile
 ```
 
 For additional information, please see the [E2B Docker documentation](https://e2b.dev/docs/sandbox-template).
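Note: the old line had a stray trailing backtick after `all_pip_apt_pkg`, which a shell would treat as the start of a command substitution; the fix drops it and quotes the template name instead. An illustrative invocation with placeholder values (the token and team ID below are hypothetical):

```bash
# Placeholder credentials; substitute your own E2B token and team ID.
export E2B_ACCESS_TOKEN="e2b_xxxxxxxxxxxx"
e2b template build \
  -c "/root/.jupyter/start-up.sh" \
  -t "your-team-id" \
  -n "all_pip_apt_pkg" \
  -d ./e2b.Dockerfile
```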
@@ -218,7 +222,7 @@ Run prebuilt agent on the benchmark data:
 ```bash
 ## download data
 cd miroflow/apps/prepare-benchmark
-uv run python main.py get gaia-val
+uv run main.py get gaia-val
 ## run the code
 cd miroflow/apps/run-agent
 uv run main.py common-benchmark benchmark=gaia-validation
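Note: the README and the evaluation script below now share the same `uv run main.py` entry point. A single run with an explicit output directory, mirroring the `output_dir` override introduced in the script further down, might look like this (the log path is only illustrative, not a project default):

```bash
# Sketch of a single run; output_dir mirrors the script change below,
# and ../../logs/gaia-single is an illustrative path.
cd miroflow/apps/run-agent
uv run main.py common-benchmark \
  benchmark=gaia-validation \
  output_dir=../../logs/gaia-single
```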

apps/run-agent/.env.template

Lines changed: 2 additions & 1 deletion
@@ -23,4 +23,5 @@ NEWAPI_API_KEY=""
 NEWAPI_BASE_URL=""
 
 # use HTTPS proxy
-HTTPS_PROXY=""
+HTTPS_PROXY=""
+DATA_DIR="../../data"
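Note: the new DATA_DIR default of "../../data" resolves relative to apps/run-agent, presumably pointing at the directory where prepare-benchmark stores downloaded benchmarks, though this diff does not show how the value is consumed. A quick way to see where the default lands (a sketch, assuming the shipped value is unchanged):

```bash
# Resolve the default DATA_DIR relative to the run-agent app directory.
# Assumes the .env still carries the shipped "../../data" value.
cd miroflow/apps/run-agent
realpath "../../data"   # expected to print <...>/miroflow/data
```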

apps/run-agent/scripts/claude-sonnet-3.7/run_evaluate_multiple_runs_gaia-validation.sh

Lines changed: 3 additions & 3 deletions
@@ -25,7 +25,7 @@ for i in $(seq 1 $NUM_RUNS); do
 RUN_ID="run_$i"
 
 (
-uv run python benchmarks/common_benchmark.py \
+uv run main.py common-benchmark \
 benchmark=$BENCHMARK_NAME \
 llm=claude_openrouter \
 llm.provider=$LLM_PROVIDER \
@@ -35,7 +35,7 @@ for i in $(seq 1 $NUM_RUNS); do
 benchmark.execution.max_concurrent=5 \
 benchmark.execution.pass_at_k=1 \
 agent=$AGENT_SET \
-hydra.run.dir=${RESULTS_DIR}/$RUN_ID \
+output_dir=${RESULTS_DIR}/$RUN_ID \
 > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1
 
 if [ $? -eq 0 ]; then
@@ -64,7 +64,7 @@ echo "All $NUM_RUNS runs completed!"
 echo "=========================================="
 
 echo "Calculating average scores..."
-uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"
+uv run main.py avg-score "$RESULTS_DIR"
 
 echo "=========================================="
 echo "Multiple runs evaluation completed!"
