4 changes: 4 additions & 0 deletions .gitignore

@@ -3,6 +3,10 @@ __pycache__/
 *.py[codz]
 *$py.class
 
+# Data
+data/*
+!data/README.md
+
 # C extensions
 *.so
 
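A note on the new rules: `data/*` ignores the directory's contents while `!data/README.md` re-includes the README. The negation only works because the rule ignores the contents rather than the directory itself; a bare `data/` pattern would make the `!` line ineffective, since git does not descend into ignored directories. As a quick sanity check (assuming a hypothetical untracked `data/sample.csv`), `git check-ignore -v` reports which rule matches:

```
$ git check-ignore -v data/sample.csv data/README.md
.gitignore:7:data/*	data/sample.csv
```

Only the ignored path is reported, so `data/README.md` stays trackable.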
44 changes: 31 additions & 13 deletions apps/run-agent/common_benchmark.py

@@ -5,6 +5,7 @@
 import asyncio
 import json
 import os
+import signal
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from enum import StrEnum
@@ -197,7 +198,7 @@ async def run_single_task(self, task: BenchmarkTask) -> BenchmarkResult:
                     output_formatter=self.output_formatter,
                     ground_truth=task.ground_truth,
                     log_path=self.output_dir
-                    / f"{task.task_id}_attempt_{attempt}",
+                    / f"{task.task_id}_attempt_{attempt}.json",
                 )
 
                 attempt_result["model_response"] = response if response else ""
@@ -544,11 +545,11 @@ def prepare_task_description(
         path = Path(task.file_path)
         # check if task.file_path is a relative path
         if path.is_absolute():
-            return task.task_question, str(path.resolve())
+            return task.task_question, str(path)
 
         # Build complete file path: data directory + relative path
         full_file_path = Path(self.data_dir) / path
-        return task.task_question, str(full_file_path.resolve())
+        return task.task_question, str(full_file_path)
 
 
 async def entrypoint(cfg: DictConfig) -> float:
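The two changed return statements drop `Path.resolve()`. For context, `resolve()` anchors a relative path to the host's current working directory and expands symlinks, so the returned string stops being portable, for example when it is later consumed in an environment with a different filesystem layout. A minimal illustration, independent of this repo (the paths in the comments assume the script runs from a hypothetical `/home/me/repo`):

```python
# Sketch of the behavioral difference made by dropping Path.resolve().
from pathlib import Path

p = Path("data/2023/task.pdf")
print(str(p))            # data/2023/task.pdf  (kept exactly as given)
print(str(p.resolve()))  # /home/me/repo/data/2023/task.pdf  (host-specific)
```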
@@ -631,14 +632,31 @@ def filter_func(x: BenchmarkTask) -> bool:
     return accuracy
 
 
-def main(*args):
+def signal_handler(signum, frame):
+    """Force exit signal handler"""
+    print(f"\n⚠️ Received interrupt signal {signum}, forcing immediate exit...")
+    print("Program will terminate all operations immediately")
+    os._exit(1)  # Force immediate exit
+
+
+@hydra.main(version_base=None, config_path=config_path(), config_name=config_name())
+def main(cfg: DictConfig) -> None:
+    """Main entry point using Hydra decorator - automatically creates .hydra directory"""
+    # Register signal handlers for immediate response to Ctrl+C
+    signal.signal(signal.SIGINT, signal_handler)
+    signal.signal(signal.SIGTERM, signal_handler)
+
     dotenv.load_dotenv()
-    with hydra.initialize_config_dir(config_dir=config_path(), version_base=None):
-        cfg = hydra.compose(config_name=config_name(), overrides=list(args))
-        _ = bootstrap_logger()
-        # Default to disable tracing, and don't set key
-        set_tracing_disabled(True)
-        set_tracing_export_api_key("fake-key")
-        # Suppress trace provider warnings
-        bootstrap_silent_trace_provider()
-        asyncio.run(entrypoint(cfg))
+    _ = bootstrap_logger()
+    # Default to disable tracing, and don't set key
+    set_tracing_disabled(True)
+    set_tracing_export_api_key("fake-key")
+    # Suppress trace provider warnings
+    bootstrap_silent_trace_provider()
+
+    print("✅ Signal handler registered, press Ctrl+C to exit immediately")
+    asyncio.run(entrypoint(cfg))
 
 
 if __name__ == "__main__":
     main()
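The new `signal_handler` uses `os._exit(1)` rather than `sys.exit()` because `sys.exit` raises `SystemExit`, which a broad `except` clause inside a running task can swallow, whereas `os._exit` terminates the process unconditionally (skipping `atexit` hooks and pending `finally` blocks). A self-contained sketch of the same pattern, with no miroflow dependencies:

```python
# Standalone sketch of the Ctrl+C pattern added above: a SIGINT/SIGTERM
# handler that calls os._exit so the process dies even while asyncio
# tasks are still running.
import asyncio
import os
import signal


def force_exit(signum, frame):
    # os._exit bypasses cleanup handlers and cannot be caught as an
    # exception, unlike sys.exit (which raises SystemExit).
    print(f"received signal {signum}, exiting now")
    os._exit(1)


async def busy() -> None:
    while True:
        await asyncio.sleep(1)


if __name__ == "__main__":
    signal.signal(signal.SIGINT, force_exit)
    signal.signal(signal.SIGTERM, force_exit)
    asyncio.run(busy())  # Ctrl+C now terminates immediately
```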
45 changes: 34 additions & 11 deletions apps/run-agent/main.py

@@ -7,7 +7,6 @@
 
 import calculate_average_score
 import calculate_score_from_log
-import common_benchmark
 import eval_answer_from_log
 import trace_single_task
 
@@ -22,13 +21,37 @@ def print_config(*args):
 
 if __name__ == "__main__":
     install(suppress=[fire, hydra], show_locals=True)
-    fire.Fire(
-        {
-            "print-config": print_config,
-            "trace": trace_single_task.main,
-            "common-benchmark": common_benchmark.main,
-            "eval-answer": eval_answer_from_log.main,
-            "avg-score": calculate_average_score.main,
-            "score-from-log": calculate_score_from_log.main,
-        }
-    )
+    import sys
+
+    if len(sys.argv) < 2:
+        print("Available commands:")
+        print("  print-config     - Print configuration")
+        print("  trace            - Run single task trace")
+        print("  common-benchmark - Run benchmark evaluation")
+        print("  eval-answer      - Evaluate answers from log")
+        print("  avg-score        - Calculate average score")
+        print("  score-from-log   - Calculate score from log")
+        print("\nExample: python main.py common-benchmark")
+        sys.exit(1)
+
+    command = sys.argv[1]
+    args = sys.argv[2:]
+
+    if command == "print-config":
+        print_config(*args)
+    elif command == "trace":
+        trace_single_task.main(*args)
+    elif command == "common-benchmark":
+        # For common-benchmark, call it directly - it will use @hydra.main
+        import subprocess
+
+        subprocess.run(["python", "common_benchmark.py"] + args)
+    elif command == "eval-answer":
+        eval_answer_from_log.main(*args)
+    elif command == "avg-score":
+        calculate_average_score.main(*args)
+    elif command == "score-from-log":
+        calculate_score_from_log.main(*args)
+    else:
+        print(f"Unknown command: {command}")
+        sys.exit(1)
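The `common-benchmark` branch shells out rather than calling the function in-process because `@hydra.main` wants to own `sys.argv` for its override parsing. One caveat of the diff as written: `subprocess.run(["python", "common_benchmark.py"] + args)` assumes the working directory is `apps/run-agent` and that `python` on `PATH` is the right interpreter. A hedged, more robust variant (not part of the PR):

```python
# Sketch only: resolve the script relative to main.py and reuse the
# current interpreter, so the dispatch also works from other working
# directories and inside virtualenvs.
import subprocess
import sys
from pathlib import Path


def run_common_benchmark(args: list[str]) -> int:
    script = Path(__file__).parent / "common_benchmark.py"
    return subprocess.run([sys.executable, str(script), *args]).returncode
```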
@@ -13,7 +13,7 @@
 
 # DEFAULT TEMPLATE ID
 # see README.md on how to build this
-DEFAULT_TEMPLATE_ID = "1av7fdjfvcparqo8efq6"
+DEFAULT_TEMPLATE_ID = "all_pip_apt_pkg"
 
 # DEFAULT CONFS
 DEFAULT_TIMEOUT = 1200  # seconds
2 changes: 1 addition & 1 deletion libs/miroflow/src/miroflow/logging/task_tracer.py

@@ -122,7 +122,7 @@ def save(self):
             if not self.log_path.exists():
                 self.log_path.parent.mkdir(exist_ok=True, parents=True)
             with open(self.log_path, mode="w") as dest:
-                dest.write(self.model_dump_json())
+                dest.write(self.model_dump_json(indent=2))
         except Exception as e:
             logger.error(e, stack_info=True, exc_info=True)
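For context, `model_dump_json(indent=2)` is pydantic v2's pretty-printing knob, analogous to `json.dumps(..., indent=2)`; the one-line change makes the saved task logs human-readable at the cost of slightly larger files. A quick illustration:

```python
# Tiny demo of the indent argument on pydantic v2 models.
from pydantic import BaseModel


class Step(BaseModel):
    name: str
    ok: bool


print(Step(name="load", ok=True).model_dump_json(indent=2))
# {
#   "name": "load",
#   "ok": true
# }
```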
8 changes: 6 additions & 2 deletions libs/miroflow/src/miroflow/prebuilt/config/config.yaml

@@ -6,7 +6,7 @@ defaults:
   - pricing: _default
   # disable hydra logging
   # see https://github.com/facebookresearch/hydra/issues/2902#issuecomment-2147121325
-  # - override hydra/hydra_logging: disabled
+  - override hydra/hydra_logging: disabled
   - override hydra/job_logging: none
   - _self_ # Allow defining variables at the top of this file
@@ -34,4 +34,8 @@ env:
 
 # Can define some top-level or default parameters here
 project_name: "miroflow"
-output_dir: logs/${benchmark.name}/${llm.provider}_${llm.model_name}
+output_dir: logs/${benchmark.name}/${llm.provider}_${llm.model_name}
+
+hydra:
+  run:
+    dir: ${output_dir}
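With `hydra.run.dir` pointed at `${output_dir}`, Hydra writes its run artifacts (including the `.hydra/` config snapshot) into the same per-benchmark log directory instead of the default `outputs/<date>/<time>`. A small sketch of how the interpolation resolves, using OmegaConf directly with made-up benchmark and llm values:

```python
# Sketch with hypothetical values; only the interpolation mechanics
# mirror config.yaml above.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "benchmark": {"name": "gaia"},
        "llm": {"provider": "anthropic", "model_name": "claude"},
        "output_dir": "logs/${benchmark.name}/${llm.provider}_${llm.model_name}",
        "hydra": {"run": {"dir": "${output_dir}"}},
    }
)
print(cfg.hydra.run.dir)  # -> logs/gaia/anthropic_claude
```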