diff --git a/.gitignore b/.gitignore index ab50ae2..d03d3a7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,10 @@ __pycache__/ *.py[codz] *$py.class +# Data +data/* +!data/README.md + # C extensions *.so diff --git a/apps/run-agent/common_benchmark.py b/apps/run-agent/common_benchmark.py index c46aef5..b414d22 100644 --- a/apps/run-agent/common_benchmark.py +++ b/apps/run-agent/common_benchmark.py @@ -5,6 +5,7 @@ import asyncio import json import os +import signal from abc import ABC, abstractmethod from dataclasses import dataclass, field from enum import StrEnum @@ -197,7 +198,7 @@ async def run_single_task(self, task: BenchmarkTask) -> BenchmarkResult: output_formatter=self.output_formatter, ground_truth=task.ground_truth, log_path=self.output_dir - / f"{task.task_id}_attempt_{attempt}", + / f"{task.task_id}_attempt_{attempt}.json", ) attempt_result["model_response"] = response if response else "" @@ -544,11 +545,11 @@ def prepare_task_description( path = Path(task.file_path) # check if task.file_path is a relative path if path.is_absolute(): - return task.task_question, str(path.resolve()) + return task.task_question, str(path) # Build complete file path: data directory + relative path full_file_path = Path(self.data_dir) / path - return task.task_question, str(full_file_path.resolve()) + return task.task_question, str(full_file_path) async def entrypoint(cfg: DictConfig) -> float: @@ -631,14 +632,31 @@ def filter_func(x: BenchmarkTask) -> bool: return accuracy -def main(*args): +def signal_handler(signum, frame): + """Force exit signal handler""" + print(f"\n⚠️ Received interrupt signal {signum}, forcing immediate exit...") + print("Program will terminate all operations immediately") + os._exit(1) # Force immediate exit + + +@hydra.main(version_base=None, config_path=config_path(), config_name=config_name()) +def main(cfg: DictConfig) -> None: + """Main entry point using Hydra decorator - automatically creates .hydra directory""" + # Register signal handlers for immediate response to Ctrl+C + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + dotenv.load_dotenv() - with hydra.initialize_config_dir(config_dir=config_path(), version_base=None): - cfg = hydra.compose(config_name=config_name(), overrides=list(args)) - _ = bootstrap_logger() - # Default to disable tracing, and don't set key - set_tracing_disabled(True) - set_tracing_export_api_key("fake-key") - # Suppress trace provider warnings - bootstrap_silent_trace_provider() - asyncio.run(entrypoint(cfg)) + _ = bootstrap_logger() + # Default to disable tracing, and don't set key + set_tracing_disabled(True) + set_tracing_export_api_key("fake-key") + # Suppress trace provider warnings + bootstrap_silent_trace_provider() + + print("✅ Signal handler registered, press Ctrl+C to exit immediately") + asyncio.run(entrypoint(cfg)) + + +if __name__ == "__main__": + main() diff --git a/apps/run-agent/main.py b/apps/run-agent/main.py index ac9e3b6..4a24944 100644 --- a/apps/run-agent/main.py +++ b/apps/run-agent/main.py @@ -7,7 +7,6 @@ import calculate_average_score import calculate_score_from_log -import common_benchmark import eval_answer_from_log import trace_single_task @@ -22,13 +21,37 @@ def print_config(*args): if __name__ == "__main__": install(suppress=[fire, hydra], show_locals=True) - fire.Fire( - { - "print-config": print_config, - "trace": trace_single_task.main, - "common-benchmark": common_benchmark.main, - "eval-answer": eval_answer_from_log.main, - "avg-score": calculate_average_score.main, - "score-from-log": calculate_score_from_log.main, - } - ) + import sys + + if len(sys.argv) < 2: + print("Available commands:") + print(" print-config - Print configuration") + print(" trace - Run single task trace") + print(" common-benchmark - Run benchmark evaluation") + print(" eval-answer - Evaluate answers from log") + print(" avg-score - Calculate average score") + print(" score-from-log - Calculate score from log") + print("\nExample: python main.py common-benchmark") + sys.exit(1) + + command = sys.argv[1] + args = sys.argv[2:] + + if command == "print-config": + print_config(*args) + elif command == "trace": + trace_single_task.main(*args) + elif command == "common-benchmark": + # For common-benchmark, call it directly - it will use @hydra.main + import subprocess + + subprocess.run(["python", "common_benchmark.py"] + args) + elif command == "eval-answer": + eval_answer_from_log.main(*args) + elif command == "avg-score": + calculate_average_score.main(*args) + elif command == "score-from-log": + calculate_score_from_log.main(*args) + else: + print(f"Unknown command: {command}") + sys.exit(1) diff --git a/libs/miroflow-tool/src/miroflow/tool/mcp_servers/python_server.py b/libs/miroflow-tool/src/miroflow/tool/mcp_servers/python_server.py index 5ea5aa7..41ad8cc 100755 --- a/libs/miroflow-tool/src/miroflow/tool/mcp_servers/python_server.py +++ b/libs/miroflow-tool/src/miroflow/tool/mcp_servers/python_server.py @@ -13,7 +13,7 @@ # DEFAULT TEMPLATE ID # see README.md on how to build this -DEFAULT_TEMPLATE_ID = "1av7fdjfvcparqo8efq6" +DEFAULT_TEMPLATE_ID = "all_pip_apt_pkg" # DEFAULT CONFS DEFAULT_TIMEOUT = 1200 # seconds diff --git a/libs/miroflow/src/miroflow/logging/task_tracer.py b/libs/miroflow/src/miroflow/logging/task_tracer.py index 449603c..40ffa35 100644 --- a/libs/miroflow/src/miroflow/logging/task_tracer.py +++ b/libs/miroflow/src/miroflow/logging/task_tracer.py @@ -122,7 +122,7 @@ def save(self): if not self.log_path.exists(): self.log_path.parent.mkdir(exist_ok=True, parents=True) with open(self.log_path, mode="w") as dest: - dest.write(self.model_dump_json()) + dest.write(self.model_dump_json(indent=2)) except Exception as e: logger.error(e, stack_info=True, exc_info=True) diff --git a/libs/miroflow/src/miroflow/prebuilt/config/config.yaml b/libs/miroflow/src/miroflow/prebuilt/config/config.yaml index f9ffbb6..0f3c385 100644 --- a/libs/miroflow/src/miroflow/prebuilt/config/config.yaml +++ b/libs/miroflow/src/miroflow/prebuilt/config/config.yaml @@ -6,7 +6,7 @@ defaults: - pricing: _default # disable hydra logging # see https://github.com/facebookresearch/hydra/issues/2902#issuecomment-2147121325 - # - override hydra/hydra_logging: disabled + - override hydra/hydra_logging: disabled - override hydra/job_logging: none - _self_ # Allow defining variables at the top of this file @@ -34,4 +34,8 @@ env: # Can define some top-level or default parameters here project_name: "miroflow" -output_dir: logs/${benchmark.name}/${llm.provider}_${llm.model_name} \ No newline at end of file +output_dir: logs/${benchmark.name}/${llm.provider}_${llm.model_name} + +hydra: + run: + dir: ${output_dir}