Commit f9633d4

improve readme and scripts.
1 parent a11bae2 commit f9633d4

3 files changed: +11 -6 lines changed

README.md

Lines changed: 6 additions & 2 deletions
@@ -178,6 +178,10 @@ If you wish to use a different LLM as the primary agent model, you will need to
 
 ```bash
 ## copy environment variable template and prepare yours in .env file
+cd miroflow/apps/prepare-benchmark
+cp .env.template .env
+vim .env
+cd miroflow/apps/run-agent
 cp .env.template .env
 vim .env
 ```
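Note: with these additions, both apps need their own populated .env. A quick sanity check after editing (a minimal sketch; it assumes the working directory is the parent of the miroflow checkout, as the README's `cd miroflow/...` commands imply):

```bash
# Hypothetical check: confirm both apps have a non-empty .env file.
# Assumes the current directory is the parent of the miroflow checkout.
for app in prepare-benchmark run-agent; do
  if [ -s "miroflow/apps/$app/.env" ]; then
    echo "ok: miroflow/apps/$app/.env"
  else
    echo "missing or empty: miroflow/apps/$app/.env" >&2
  fi
done
```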
@@ -195,7 +199,7 @@ vim .env
 Command example:
 ```shell
 E2B_ACCESS_TOKEN=${your-token}
-e2b template build -c "/root/.jupyter/start-up.sh" -t <team_id> -n all_pip_apt_pkg` -d ./e2b.Dockerfile
+e2b template build -c "/root/.jupyter/start-up.sh" -t <team_id> -n "all_pip_apt_pkg" -d ./e2b.Dockerfile
 ```
 
 For additional information, please see the [E2B Docker documentation](https://e2b.dev/docs/sandbox-template).
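Note: the old line had a stray trailing backtick after `all_pip_apt_pkg`, which a shell would treat as the start of a command substitution; the fix drops it and quotes the template name instead. An illustrative invocation with placeholder values (the token and team ID below are hypothetical):

```bash
# Placeholder credentials; substitute your own E2B token and team ID.
export E2B_ACCESS_TOKEN="e2b_xxxxxxxxxxxx"
e2b template build \
  -c "/root/.jupyter/start-up.sh" \
  -t "your-team-id" \
  -n "all_pip_apt_pkg" \
  -d ./e2b.Dockerfile
```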
@@ -218,7 +222,7 @@ Run prebuilt agent on the benchmark data:
 ```bash
 ## download data
 cd miroflow/apps/prepare-benchmark
-uv run python main.py get gaia-val
+uv run main.py get gaia-val
 ## run the code
 cd miroflow/apps/run-agent
 uv run main.py common-benchmark benchmark=gaia-validation
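Note: the README and the evaluation script below now share the same `uv run main.py` entry point. A single run with an explicit output directory, mirroring the `output_dir` override introduced in the script further down, might look like this (the log path is only illustrative, not a project default):

```bash
# Sketch of a single run; output_dir mirrors the script change below,
# and ../../logs/gaia-single is an illustrative path.
cd miroflow/apps/run-agent
uv run main.py common-benchmark \
  benchmark=gaia-validation \
  output_dir=../../logs/gaia-single
```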

apps/run-agent/.env.template

Lines changed: 2 additions & 1 deletion
@@ -23,4 +23,5 @@ NEWAPI_API_KEY=""
 NEWAPI_BASE_URL=""
 
 # use HTTPS proxy
-HTTPS_PROXY=""
+HTTPS_PROXY=""
+DATA_DIR="../../data"
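Note: the new DATA_DIR default of "../../data" resolves relative to apps/run-agent, presumably pointing at the directory where prepare-benchmark stores downloaded benchmarks, though this diff does not show how the value is consumed. A quick way to see where the default lands (a sketch, assuming the shipped value is unchanged):

```bash
# Resolve the default DATA_DIR relative to the run-agent app directory.
# Assumes the .env still carries the shipped "../../data" value.
cd miroflow/apps/run-agent
realpath "../../data"   # expected to print <...>/miroflow/data
```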

apps/run-agent/scripts/claude-sonnet-3.7/run_evaluate_multiple_runs_gaia-validation.sh

Lines changed: 3 additions & 3 deletions
@@ -25,7 +25,7 @@ for i in $(seq 1 $NUM_RUNS); do
 RUN_ID="run_$i"
 
 (
-uv run python benchmarks/common_benchmark.py \
+uv run main.py common-benchmark \
 benchmark=$BENCHMARK_NAME \
 llm=claude_openrouter \
 llm.provider=$LLM_PROVIDER \
@@ -35,7 +35,7 @@ for i in $(seq 1 $NUM_RUNS); do
 benchmark.execution.max_concurrent=5 \
 benchmark.execution.pass_at_k=1 \
 agent=$AGENT_SET \
-hydra.run.dir=${RESULTS_DIR}/$RUN_ID \
+output_dir=${RESULTS_DIR}/$RUN_ID \
 > "$RESULTS_DIR/${RUN_ID}_output.log" 2>&1
 
 if [ $? -eq 0 ]; then
@@ -64,7 +64,7 @@ echo "All $NUM_RUNS runs completed!"
 echo "=========================================="
 
 echo "Calculating average scores..."
-uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR"
+uv run main.py avg-score "$RESULTS_DIR"
 
 echo "=========================================="
 echo "Multiple runs evaluation completed!"
