diff --git a/examples/circle_packing_with_artifacts/README.md b/examples/circle_packing_with_artifacts/README.md index 673f854e1..1693bbf0e 100644 --- a/examples/circle_packing_with_artifacts/README.md +++ b/examples/circle_packing_with_artifacts/README.md @@ -318,6 +318,45 @@ Target ratio: 0.9997314619131079 (99.97% of AlphaEvolve's result) This demonstrates that OpenEvolve can successfully reproduce the results from the AlphaEvolve paper on this mathematical optimization problem. +## Fast Convergence with Dual-Model Configuration + +Using a dual-model configuration with weighted sampling, OpenEvolve achieves near-optimal results in remarkably few iterations: + +![Evolution Plot](evolution_plot.png) + +### Configuration + +The `config.yaml` uses two Gemini models with different weights: +- `google/gemini-2.5-flash-lite` (weight: 0.8) - Fast, cost-effective for exploration +- `google/gemini-2.5-flash` (weight: 0.2) - Higher capability for breakthroughs + +```yaml +llm: + models: + - name: "google/gemini-2.5-flash-lite" + weight: 0.8 + - name: "google/gemini-2.5-flash" + weight: 0.2 +``` + +### Rapid Convergence + +The plot shows the evolution of sum_radii across program versions: + +- **Version 0**: Starts at ~0.96 (basic initial program) +- **Version 6**: First major improvement to ~2.09 +- **Version 21**: Reaches 2.63 (99.8% of target) +- **Final**: Achieves 2.6304 sum of radii + +**Key insight**: OpenEvolve discovers the mathematical optimization approach (using `scipy.optimize.minimize` with SLSQP) by version 21, achieving 99.8% of the AlphaEvolve target in just ~40 program evaluations. The dual-model approach allows rapid exploration with the lighter model while leveraging the more capable model for breakthrough discoveries. + +### Why It Works + +1. **Artifacts provide rich feedback**: Failed programs return detailed error information (boundary violations, overlaps), helping the LLM quickly correct mistakes +2. **MAP-Elites diversity**: The feature dimensions (`radius_variance`, `spatial_spread`) maintain diverse solutions in the population +3. **Island-based evolution**: 4 islands evolve independently, preventing premature convergence +4. **Efficient model weighting**: 80% lightweight model for broad exploration, 20% capable model for sophisticated solutions + ## Key Observations The evolution process demonstrated several interesting patterns: diff --git a/examples/circle_packing_with_artifacts/config.yaml b/examples/circle_packing_with_artifacts/config.yaml new file mode 100644 index 000000000..5b694cac1 --- /dev/null +++ b/examples/circle_packing_with_artifacts/config.yaml @@ -0,0 +1,90 @@ +# Circle Packing Benchmark Configuration with Thompson Sampling +# Based on config_benchmark.yaml but uses two models with Thompson sampling + +max_iterations: 500 +checkpoint_interval: 10 +log_level: "INFO" +random_seed: 42 + +# Full rewrite mode (best for constructor-based problems) +diff_based_evolution: false +max_code_length: 50000 + +# LLM Configuration - Two models +llm: + api_base: "https://openrouter.ai/api/v1" + models: + - name: "google/gemini-2.5-flash-lite" + weight: 0.8 + - name: "google/gemini-2.5-flash" + weight: 0.2 + + temperature: 0.4 + top_p: 0.95 + max_tokens: 16000 + timeout: 180 + retries: 3 + +# Prompt Configuration +prompt: + system_message: | + You are an expert mathematician specializing in circle packing problems and computational geometry. + Your task is to improve a constructor function that places 26 circles in a unit square to maximize the sum of their radii. 
+ + Target: AlphaEvolve achieved sum of radii = 2.635 for n=26. + + Key insights: + - This is a constrained optimization problem with many local minima + - Local optimization methods may get stuck - consider approaches that explore the solution space more broadly + - Multiple starting points or perturbation strategies can help find better solutions + - Good initial placements matter: hexagonal patterns, corner utilization, edge circles + - The problem has 78 degrees of freedom (26 centers + 26 radii) + + Think about how to formulate this mathematically and what optimization strategies might help escape local minima. + + num_top_programs: 3 + num_diverse_programs: 2 + + # Artifacts enabled for debugging/visualization data + include_artifacts: true + max_artifact_bytes: 20480 # 20KB + +# Database Configuration +database: + population_size: 100 + archive_size: 50 + num_islands: 4 # Optimal island count + + # Selection parameters + elite_selection_ratio: 0.1 + exploration_ratio: 0.4 # Higher exploration to discover optimization approaches + exploitation_ratio: 0.5 # Balance with exploitation + + # Feature dimensions for MAP-Elites (diversity-focused metrics) + # - radius_variance: separates uniform vs varied circle sizes (0-1 normalized) + # - spatial_spread: separates clustered vs distributed centers (0-1 normalized) + feature_dimensions: ["radius_variance", "spatial_spread"] + feature_bins: 10 + + # Migration parameters - faster sharing of breakthroughs + migration_interval: 10 # Share discoveries sooner + migration_rate: 0.15 # Migrate more programs + +# Evaluator Configuration +evaluator: + timeout: 600 # Allow complex optimization programs to complete + max_retries: 3 + cascade_evaluation: true + cascade_thresholds: [0.5, 0.8] + parallel_evaluations: 4 + use_llm_feedback: false + enable_artifacts: true + +# Novelty Detection - prevent duplicate evaluations +novelty: + enabled: true + embedding_backend: "local" + embedding_model: "all-MiniLM-L6-v2" + similarity_threshold: 0.95 + max_regeneration_attempts: 3 + temperature_increment: 0.15 diff --git a/examples/circle_packing_with_artifacts/evaluator.py b/examples/circle_packing_with_artifacts/evaluator.py index ea3202546..3072dfd08 100644 --- a/examples/circle_packing_with_artifacts/evaluator.py +++ b/examples/circle_packing_with_artifacts/evaluator.py @@ -237,6 +237,8 @@ def evaluate(program_path): "validity": 0.0, "eval_time": float(eval_time), "combined_score": 0.0, + "radius_variance": 0.0, + "spatial_spread": 0.0, }, artifacts={ "stderr": shape_error, @@ -250,6 +252,20 @@ def evaluate(program_path): # Calculate sum sum_radii = np.sum(radii) if valid else 0.0 + # Calculate feature metrics for MAP-Elites diversity + # radius_variance: normalized variance of radii (0-1) + # Max theoretical variance for radii in [0, 0.5] is ~0.0625 + radius_variance = float(np.var(radii) / 0.0625) if valid else 0.0 + radius_variance = min(1.0, max(0.0, radius_variance)) # Clamp to [0, 1] + + # spatial_spread: how spread out centers are (0-1) + # Based on std of distances from centroid, normalized by max possible (0.5 * sqrt(2)) + centroid = np.mean(centers, axis=0) + distances_from_centroid = np.sqrt(np.sum((centers - centroid) ** 2, axis=1)) + max_spread = 0.5 * np.sqrt(2) # Max distance from center to corner + spatial_spread = float(np.std(distances_from_centroid) / max_spread) if valid else 0.0 + spatial_spread = min(1.0, max(0.0, spatial_spread)) # Clamp to [0, 1] + # Make sure reported_sum matches the calculated sum sum_mismatch = abs(sum_radii - 
reported_sum) > 1e-6 if sum_mismatch: @@ -306,6 +322,8 @@ def evaluate(program_path): "validity": float(validity), "eval_time": float(eval_time), "combined_score": float(combined_score), + "radius_variance": radius_variance, + "spatial_spread": spatial_spread, }, artifacts=artifacts, ) @@ -320,6 +338,8 @@ def evaluate(program_path): "validity": 0.0, "eval_time": 600.0, # Timeout duration "combined_score": 0.0, + "radius_variance": 0.0, + "spatial_spread": 0.0, }, artifacts={ "stderr": error_msg, @@ -339,6 +359,8 @@ def evaluate(program_path): "validity": 0.0, "eval_time": 0.0, "combined_score": 0.0, + "radius_variance": 0.0, + "spatial_spread": 0.0, }, artifacts={ "stderr": error_msg, @@ -374,7 +396,7 @@ def evaluate_stage1(program_path): shape_error = f"Invalid shapes: centers={centers.shape}, radii={radii.shape}" print(shape_error) return EvaluationResult( - metrics={"validity": 0.0, "combined_score": 0.0}, + metrics={"validity": 0.0, "combined_score": 0.0, "radius_variance": 0.0, "spatial_spread": 0.0}, artifacts={ "stderr": shape_error, "failure_stage": "stage1_shape_validation", @@ -389,6 +411,14 @@ def evaluate_stage1(program_path): # Calculate sum actual_sum = np.sum(radii) if valid else 0.0 + # Calculate feature metrics for MAP-Elites diversity + radius_variance = float(np.var(radii) / 0.0625) if valid else 0.0 + radius_variance = min(1.0, max(0.0, radius_variance)) + centroid = np.mean(centers, axis=0) + distances_from_centroid = np.sqrt(np.sum((centers - centroid) ** 2, axis=1)) + spatial_spread = float(np.std(distances_from_centroid) / (0.5 * np.sqrt(2))) if valid else 0.0 + spatial_spread = min(1.0, max(0.0, spatial_spread)) + # Target from paper target = 2.635 @@ -424,6 +454,8 @@ def evaluate_stage1(program_path): "sum_radii": float(actual_sum), "target_ratio": float(actual_sum / target if valid else 0.0), "combined_score": float(combined_score), + "radius_variance": radius_variance, + "spatial_spread": spatial_spread, }, artifacts=artifacts, ) @@ -432,7 +464,7 @@ def evaluate_stage1(program_path): error_msg = f"Stage 1 evaluation timed out: {e}" print(error_msg) return EvaluationResult( - metrics={"validity": 0.0, "combined_score": 0.0}, + metrics={"validity": 0.0, "combined_score": 0.0, "radius_variance": 0.0, "spatial_spread": 0.0}, artifacts={ "stderr": error_msg, "failure_stage": "stage1_timeout", @@ -445,7 +477,7 @@ def evaluate_stage1(program_path): print(error_msg) print(traceback.format_exc()) return EvaluationResult( - metrics={"validity": 0.0, "combined_score": 0.0}, + metrics={"validity": 0.0, "combined_score": 0.0, "radius_variance": 0.0, "spatial_spread": 0.0}, artifacts={ "stderr": error_msg, "traceback": traceback.format_exc(), @@ -459,7 +491,7 @@ def evaluate_stage1(program_path): print(error_msg) print(traceback.format_exc()) return EvaluationResult( - metrics={"validity": 0.0, "combined_score": 0.0}, + metrics={"validity": 0.0, "combined_score": 0.0, "radius_variance": 0.0, "spatial_spread": 0.0}, artifacts={ "stderr": error_msg, "traceback": traceback.format_exc(), diff --git a/examples/circle_packing_with_artifacts/evolution_plot.png b/examples/circle_packing_with_artifacts/evolution_plot.png new file mode 100644 index 000000000..01706deec Binary files /dev/null and b/examples/circle_packing_with_artifacts/evolution_plot.png differ diff --git a/examples/k_module_problem/README.md b/examples/k_module_problem/README.md new file mode 100644 index 000000000..897d58c38 --- /dev/null +++ b/examples/k_module_problem/README.md @@ -0,0 +1,271 @@ +# K-Module 
Problem: Evolution vs Iterative Refinement + +This example demonstrates a fundamental limitation of iterative refinement approaches and shows how evolutionary search with population-based exploration can solve problems that defeat single-trajectory optimization. + +## The Problem + +The K-Module Problem is a pipeline configuration task where you must find the correct combination of 4 independent modules: + +| Module | Options | +|--------|---------| +| **loader** | csv_reader, json_reader, xml_reader, parquet_reader, sql_reader | +| **preprocess** | normalize, standardize, minmax, scale, none | +| **algorithm** | quicksort, mergesort, heapsort, bubblesort, insertion | +| **formatter** | json, xml, csv, yaml, protobuf | + +**Search space**: 5⁴ = 625 possible combinations + +**The Challenge**: The evaluator only tells you *how many* modules are correct (0-4), not *which ones*. This creates a deceptive fitness landscape with no gradient information. + +## Why Iterative Refinement Fails + +Consider this scenario: + +``` +Initial: [json_reader, standardize, mergesort, xml] → Score: 0/4 +Refine 1: [csv_reader, standardize, mergesort, xml] → Score: 1/4 ✓ +Refine 2: [csv_reader, normalize, mergesort, xml] → Score: 2/4 ✓✓ +Refine 3: [csv_reader, normalize, heapsort, xml] → Score: 1/4 ✗ (went backwards!) +Refine 4: [csv_reader, normalize, mergesort, json] → Score: 2/4 (no progress) +``` + +**The Problem**: When the model changes `mergesort` to `heapsort`, it has no way to know this was wrong because: +- The score decreased, but was that because of the algorithm change? +- Or because `normalize` wasn't actually correct? +- The model can't tell which modules are contributing to the score + +This leads to **random walk behavior** requiring O(625) evaluations on average. + +## Why Evolution Succeeds + +Evolution maintains a **population** that explores different regions simultaneously: + +``` +Generation 1: + Individual A: [csv_reader, scale, quicksort, xml] → 2/4 (loader, algorithm correct) + Individual B: [json_reader, normalize, bubble, json] → 2/4 (preprocess, formatter correct) + Individual C: [xml_reader, minmax, mergesort, csv] → 0/4 + +Generation 2 (crossover): + Child(A,B): [csv_reader, normalize, quicksort, json] → 4/4 SUCCESS! +``` + +**Key insight**: Evolution discovers correct modules in different individuals and **crossover combines them**. This is the "Building Block Hypothesis" - complex solutions are assembled from simpler discovered components. + +## Theoretical Analysis + +| Method | Expected Evaluations | Why | +|--------|---------------------|-----| +| **Random Search** | ~312 (50% of space) | Pure luck | +| **Pass@100 (LLM)** | ~100 calls, ~15% success | Independent samples, no learning | +| **Iterative Refinement** | ~312+ | No gradient, random walk | +| **Evolution (pop=20)** | ~40-60 | Parallel exploration + crossover | + +The gap widens exponentially with more modules: +- K=5 modules: Iterative ~1,562, Evolution ~70 +- K=6 modules: Iterative ~7,812, Evolution ~90 + +### Note on Pass@k with Closed Models + +The pass@k metric (probability of finding solution in k independent attempts) is commonly used to evaluate LLM capabilities. However: + +- **Open models** (local): Can generate k responses in parallel with `n=k` parameter +- **Closed models** (API): Most don't support `n>1`, requiring k separate API calls + +For this comparison, we include a **random baseline** that simulates pass@k without an LLM. This establishes the "no learning" baseline. 
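+For reference, the theoretical curve used in the table below can be computed directly. This is a minimal sketch mirroring `theoretical_pass_at_k()` in `run_random_baseline.py`:
+
+```python
+# Probability that uniform random sampling over the 5^4 = 625 configurations
+# hits the single correct one within k attempts.
+def theoretical_pass_at_k(k: int, search_space: int = 625) -> float:
+    return 1 - ((search_space - 1) / search_space) ** k
+
+for k in (1, 10, 20, 50, 100):
+    print(f"pass@{k} = {theoretical_pass_at_k(k):.1%}")
+# pass@1 = 0.2%, pass@10 = 1.6%, pass@20 = 3.2%, pass@50 = 7.7%, pass@100 = 14.8%
+```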
+ +### Random Baseline Results (100 trials, 100 samples each) + +| Metric | Value | +|--------|-------| +| **Success rate (pass@100)** | 16% (16/100 trials found solution) | +| **Avg samples to solution** | 43.3 (when found) | +| **Min samples** | 5 (lucky guess) | +| **Max samples** | 91 | + +**Pass@k breakdown:** + +| k | Empirical | Theoretical | +|---|-----------|-------------| +| 1 | 0% | 0.2% | +| 10 | 1% | 1.6% | +| 20 | 4% | 3.2% | +| 50 | 9% | 7.7% | +| 100 | 16% | 14.8% | + +The empirical results closely match the theoretical prediction `pass@k ≈ 1 - (624/625)^k`. + +Any method that beats this baseline is demonstrating actual optimization, not just random sampling. + +## Running the Experiment + +### Prerequisites + +1. **OpenEvolve** (this repo): + ```bash + pip install -e . + ``` + +2. **API Key** (both methods use the same model via OpenRouter for fair comparison): + ```bash + export OPENROUTER_API_KEY=your_key + ``` + +Both OpenEvolve and the iterative agent use `google/gemini-2.5-flash-lite` via OpenRouter API for a fair comparison. + +### Run OpenEvolve + +```bash +cd examples/k_module_problem +chmod +x run_openevolve.sh +./run_openevolve.sh 50 +``` + +Or directly: +```bash +openevolve-run initial_program.py evaluator.py --config config.yaml --iterations 50 +``` + +### Run Iterative Agent + +```bash +python iterative_agent.py --iterations 100 +``` + +Or run multiple trials to get statistics: +```bash +python run_iterative_trials.py --trials 3 --iterations 100 +``` + +### Run Random Baseline + +Establish the "no learning" baseline (no LLM needed): + +```bash +python run_random_baseline.py --samples 100 --trials 100 +``` + +This runs 100 independent trials of random search, each with up to 100 samples, and calculates empirical pass@k metrics. + +### Compare Results + +```bash +python compare_results.py +``` + +This generates: +- `comparison_plot.png`: Visual comparison of convergence +- Summary statistics printed to console + +## Experimental Results + +### Iterative Refinement Results (3 trials, 100 iterations max) + +| Trial | Iterations | Result | Best Score | +|-------|------------|--------|------------| +| 1 | 100 | FAILED | 75% (3/4) | +| 2 | 100 | FAILED | 75% (3/4) | +| 3 | 13 | SUCCESS | 100% (4/4) | + +**Summary:** +- **Success rate**: 33% (1/3 trials found solution) +- **When successful**: 13 iterations +- **Failure mode**: Gets stuck at 75% - keeps trying `standardize` instead of `normalize` + +**Key observation**: The iterative agent repeatedly finds configurations with 3/4 correct modules (`csv_reader`, `quicksort`, `json`) but cannot identify that `preprocess` is the wrong module. It keeps cycling through variations without escaping this local optimum. + +### OpenEvolve (Evolutionary) Results + +| Trial | Iterations | Result | Best Score | Notes | +|-------|------------|--------|------------|-------| +| 1 | 21 | SUCCESS | 100% (4/4) | Solution found through population diversity | + +**Summary:** +- **Success rate**: 100% (1/1 trial found solution) +- **Solution found at**: Iteration 21 +- **Key observation**: OpenEvolve's population-based approach explores multiple configurations in parallel. By iteration 9, the population already had diverse configurations, and by iteration 21, the correct combination was discovered. + +**Progression:** +- Iteration 3: 25% (1/4) - Initial exploration +- Iteration 9: 50% (2/4) - Multiple 50% configs in population +- Iteration 21: 100% (4/4) - csv_reader, normalize, quicksort, json - PERFECT! 
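+Whichever method is used, the final step is the hard one: from any 3/4 configuration, only one of the 16 possible single-module changes reaches 4/4, while 12 of them lower the score, so count-only feedback makes most honest experiments look like regressions. A small check against the evaluator confirms this (a sketch run from this example's directory; `VALID_OPTIONS` and `CORRECT_CONFIG` are defined in `evaluator.py`, and the stuck configuration is the one the iterative agent kept returning to):
+
+```python
+from evaluator import VALID_OPTIONS, CORRECT_CONFIG
+
+# The 3/4 configuration the iterative agent plateaued on (standardize is the wrong module)
+stuck = {"loader": "csv_reader", "preprocess": "standardize",
+         "algorithm": "quicksort", "formatter": "json"}
+
+def score(cfg):
+    return sum(cfg[m] == CORRECT_CONFIG[m] for m in CORRECT_CONFIG)
+
+# Enumerate every single-module change and tally the resulting scores
+outcomes = {}
+for module, options in VALID_OPTIONS.items():
+    for option in options:
+        if option == stuck[module]:
+            continue  # skip the configuration we already have
+        neighbor = dict(stuck, **{module: option})
+        outcomes[score(neighbor)] = outcomes.get(score(neighbor), 0) + 1
+
+for s in sorted(outcomes):
+    print(f"{outcomes[s]} single-module changes score {s}/4")
+# 12 drop to 2/4, 3 stay at 3/4, and only 1 reaches 4/4
+```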
+ +**Key advantage**: OpenEvolve's prompt encourages systematic exploration ("try DIFFERENT options for EACH module") rather than following potentially misleading hints. Combined with higher temperature (0.9), larger population (25), and more frequent migration, this leads to faster discovery. + +### Comparison Summary + +| Method | Success Rate | Evaluations to Solution | Key Limitation | +|--------|-------------|------------------------|----------------| +| **Random Baseline** | 16% | 43.3 avg (when found) | No learning | +| **Iterative Refinement** | 33% | 13 (when found) | Gets stuck at 75%, can't escape local optima | +| **OpenEvolve** | 100% | 21 | Population diversity + systematic exploration | + +## Why This Matters + +This example illustrates when you should prefer evolutionary approaches: + +1. **Combinatorial Configuration**: When solutions are combinations of independent choices +2. **Deceptive Fitness**: When partial solutions don't clearly indicate which components are correct +3. **No Gradient**: When small changes don't reliably improve or degrade solutions +4. **Building Block Problems**: When good solutions are assembled from discovered components + +Real-world examples: +- Hyperparameter tuning (learning rate + batch size + architecture) +- Feature selection (which features to include) +- API composition (which services to combine) +- Configuration optimization (compiler flags, system settings) + +## Files + +| File | Description | +|------|-------------| +| `initial_program.py` | Starting configuration (0/4 correct) | +| `evaluator.py` | Scores configurations (0-4 correct modules) | +| `config.yaml` | OpenEvolve configuration | +| `iterative_agent.py` | Iterative refinement agent using OpenRouter API | +| `run_iterative_trials.py` | Run multiple trials of iterative agent | +| `run_random_baseline.py` | Random search baseline with pass@k analysis | +| `compare_results.py` | Analysis and visualization | + +## Configuration Details + +The OpenEvolve config uses settings optimized for this combinatorial problem: + +```yaml +# High temperature for diverse exploration +temperature: 0.9 + +# Very high exploration ratio +exploration_ratio: 0.6 +exploitation_ratio: 0.25 + +# Multiple islands for parallel search with frequent migration +num_islands: 5 +migration_interval: 3 +migration_rate: 0.3 + +# Larger population for more diversity +population_size: 25 +``` + +**Key config improvements over default:** +- Higher temperature (0.9 vs 0.7) - more exploration of different options +- Prompt emphasizes systematic exploration, not following hints +- More islands (5) with faster migration (interval=3) - combines building blocks faster +- Larger population (25) - maintains more diverse configurations + +## References + +- **Building Block Hypothesis**: Holland, J.H. (1975). *Adaptation in Natural and Artificial Systems* +- **Schema Theorem**: Explains how evolution propagates good partial solutions +- **No Free Lunch**: Wolpert & Macready (1997) - Evolution excels on problems with structure + +## Conclusion + +This example demonstrates that **iterative refinement is not sufficient** for problems with independent, combinatorial components. Evolutionary search with population-based exploration and crossover can solve these problems orders of magnitude faster by: + +1. Exploring multiple regions of the search space simultaneously +2. Discovering correct "building blocks" in different individuals +3. 
Combining discoveries through crossover to assemble complete solutions + +When your optimization problem has this structure, consider evolutionary approaches like OpenEvolve over single-trajectory iterative refinement. diff --git a/examples/k_module_problem/compare_results.py b/examples/k_module_problem/compare_results.py new file mode 100644 index 000000000..1a59b2d47 --- /dev/null +++ b/examples/k_module_problem/compare_results.py @@ -0,0 +1,286 @@ +#!/usr/bin/env python3 +""" +Compare results from OpenEvolve and Iterative Agent on K-Module Problem. + +This script analyzes the outputs from both approaches and generates +comparison plots showing: +1. Convergence speed (iterations to solution) +2. Best score achieved over iterations +3. Total LLM calls made + +Usage: + python compare_results.py [--openevolve-dir DIR] [--iterative-dir DIR] +""" + +import argparse +import json +import os +from collections import defaultdict +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + + +def load_openevolve_results(output_dir: str) -> dict: + """Load results from OpenEvolve checkpoint.""" + results = { + "iterations": [], + "scores": [], + "best_scores": [], + "solution_found_at": None, + } + + # Find the latest checkpoint + checkpoint_dir = Path(output_dir) / "checkpoints" + if not checkpoint_dir.exists(): + print(f"Warning: No checkpoints found in {output_dir}") + return results + + checkpoints = sorted(checkpoint_dir.glob("checkpoint_*")) + if not checkpoints: + return results + + latest_checkpoint = checkpoints[-1] + programs_dir = latest_checkpoint / "programs" + + if not programs_dir.exists(): + return results + + # Load all program results + programs = [] + for prog_file in programs_dir.glob("*.json"): + with open(prog_file) as f: + data = json.load(f) + if "iteration_found" in data and "metrics" in data: + programs.append({ + "iteration": data["iteration_found"], + "score": data["metrics"].get("combined_score", 0), + "correct_modules": data["metrics"].get("correct_modules", 0), + "timestamp": data.get("timestamp", 0), + }) + + # Sort by timestamp + programs.sort(key=lambda x: x["timestamp"]) + + # Build iteration-by-iteration results + best_so_far = 0 + for i, prog in enumerate(programs): + results["iterations"].append(i) + results["scores"].append(prog["score"]) + best_so_far = max(best_so_far, prog["score"]) + results["best_scores"].append(best_so_far) + + # Check if solution found (score == 1.0 means 4/4 correct) + if prog["score"] >= 1.0 and results["solution_found_at"] is None: + results["solution_found_at"] = i + + return results + + +def load_iterative_results(output_dir: str) -> dict: + """Load results from iterative agent output.""" + results = { + "iterations": [], + "scores": [], + "best_scores": [], + "solution_found_at": None, + } + + output_path = Path(output_dir) + if not output_path.exists(): + print(f"Warning: No output found in {output_dir}") + return results + + # Look for metrics files (the iterative agent saves metrics per iteration) + metrics_files = sorted(output_path.glob("**/metrics*.json")) + + if not metrics_files: + # Try loading from a single results file + results_file = output_path / "results.json" + if results_file.exists(): + with open(results_file) as f: + data = json.load(f) + if "iterations" in data: + return data + + best_so_far = 0 + for i, mf in enumerate(metrics_files): + with open(mf) as f: + data = json.load(f) + score = data.get("combined_score", data.get("score", 0)) + results["iterations"].append(i) + 
results["scores"].append(score) + best_so_far = max(best_so_far, score) + results["best_scores"].append(best_so_far) + + if score >= 1.0 and results["solution_found_at"] is None: + results["solution_found_at"] = i + + return results + + +def plot_comparison(openevolve_results: dict, iterative_results: dict, output_file: str = None): + """Generate comparison plot.""" + fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + + # Plot 1: Score progression + ax1 = axes[0] + + if openevolve_results["iterations"]: + ax1.plot( + openevolve_results["iterations"], + openevolve_results["scores"], + 'g-s', alpha=0.5, markersize=4, label='OpenEvolve (each program)' + ) + ax1.plot( + openevolve_results["iterations"], + openevolve_results["best_scores"], + 'g--', linewidth=2, label='OpenEvolve (best so far)' + ) + + if iterative_results["iterations"]: + ax1.plot( + iterative_results["iterations"], + iterative_results["scores"], + 'b-o', alpha=0.5, markersize=4, label='Iterative Agent (each iteration)' + ) + ax1.plot( + iterative_results["iterations"], + iterative_results["best_scores"], + 'b--', linewidth=2, label='Iterative Agent (best so far)' + ) + + ax1.axhline(y=1.0, color='r', linestyle=':', linewidth=2, label='Solution (4/4 correct)') + ax1.set_xlabel('Program Version / Iteration', fontsize=12) + ax1.set_ylabel('Score (fraction of correct modules)', fontsize=12) + ax1.set_title('K-Module Problem: Convergence Comparison', fontsize=14) + ax1.legend(loc='lower right') + ax1.grid(True, alpha=0.3) + ax1.set_ylim(-0.05, 1.1) + + # Plot 2: Summary statistics + ax2 = axes[1] + + categories = ['Programs/Iterations\nto Solution', 'Final Best Score'] + openevolve_values = [ + openevolve_results["solution_found_at"] if openevolve_results["solution_found_at"] else len(openevolve_results["iterations"]), + max(openevolve_results["best_scores"]) if openevolve_results["best_scores"] else 0 + ] + iterative_values = [ + iterative_results["solution_found_at"] if iterative_results["solution_found_at"] else len(iterative_results["iterations"]), + max(iterative_results["best_scores"]) if iterative_results["best_scores"] else 0 + ] + + x = np.arange(len(categories)) + width = 0.35 + + bars1 = ax2.bar(x - width/2, openevolve_values, width, label='OpenEvolve', color='green', alpha=0.7) + bars2 = ax2.bar(x + width/2, iterative_values, width, label='Iterative Agent', color='blue', alpha=0.7) + + ax2.set_ylabel('Value', fontsize=12) + ax2.set_title('Summary Comparison', fontsize=14) + ax2.set_xticks(x) + ax2.set_xticklabels(categories) + ax2.legend() + + # Add value labels on bars + for bar in bars1: + height = bar.get_height() + ax2.annotate(f'{height:.2f}', + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 3), + textcoords="offset points", + ha='center', va='bottom', fontsize=10) + + for bar in bars2: + height = bar.get_height() + ax2.annotate(f'{height:.2f}', + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 3), + textcoords="offset points", + ha='center', va='bottom', fontsize=10) + + plt.tight_layout() + + if output_file: + plt.savefig(output_file, dpi=150) + print(f"Comparison plot saved to: {output_file}") + else: + plt.show() + + +def print_summary(openevolve_results: dict, iterative_results: dict): + """Print summary comparison.""" + print("\n" + "=" * 60) + print("K-MODULE PROBLEM: COMPARISON SUMMARY") + print("=" * 60) + + print("\n### OpenEvolve (Evolutionary Search)") + print(f" Total programs evaluated: {len(openevolve_results['iterations'])}") + if openevolve_results['solution_found_at'] is 
not None: + print(f" Solution found at program: #{openevolve_results['solution_found_at']}") + else: + print(f" Solution NOT found") + if openevolve_results['best_scores']: + print(f" Final best score: {max(openevolve_results['best_scores']):.4f}") + + print("\n### Iterative Agent (Iterative Refinement)") + print(f" Total iterations: {len(iterative_results['iterations'])}") + if iterative_results['solution_found_at'] is not None: + print(f" Solution found at iteration: #{iterative_results['solution_found_at']}") + else: + print(f" Solution NOT found") + if iterative_results['best_scores']: + print(f" Final best score: {max(iterative_results['best_scores']):.4f}") + + print("\n### Analysis") + if openevolve_results['solution_found_at'] and iterative_results['solution_found_at']: + speedup = iterative_results['solution_found_at'] / openevolve_results['solution_found_at'] + print(f" OpenEvolve found solution {speedup:.1f}x faster") + elif openevolve_results['solution_found_at'] and not iterative_results['solution_found_at']: + print(f" OpenEvolve found solution, Iterative did not") + elif iterative_results['solution_found_at'] and not openevolve_results['solution_found_at']: + print(f" Iterative found solution, OpenEvolve did not") + + print("\n" + "=" * 60) + + +def main(): + parser = argparse.ArgumentParser(description="Compare K-Module problem results") + parser.add_argument( + "--openevolve-dir", + default="openevolve_output", + help="OpenEvolve output directory" + ) + parser.add_argument( + "--iterative-dir", + default="iterative_output", + help="Iterative agent output directory" + ) + parser.add_argument( + "--output", + default="comparison_plot.png", + help="Output plot filename" + ) + args = parser.parse_args() + + # Load results + print("Loading OpenEvolve results...") + openevolve_results = load_openevolve_results(args.openevolve_dir) + + print("Loading Iterative Agent results...") + iterative_results = load_iterative_results(args.iterative_dir) + + # Print summary + print_summary(openevolve_results, iterative_results) + + # Generate plot + if openevolve_results["iterations"] or iterative_results["iterations"]: + plot_comparison(openevolve_results, iterative_results, args.output) + else: + print("No results to plot. Run both approaches first.") + + +if __name__ == "__main__": + main() diff --git a/examples/k_module_problem/config.yaml b/examples/k_module_problem/config.yaml new file mode 100644 index 000000000..0eb244652 --- /dev/null +++ b/examples/k_module_problem/config.yaml @@ -0,0 +1,87 @@ +# K-Module Problem Configuration for OpenEvolve +# Demonstrates evolutionary search vs iterative refinement +# +# This configuration uses the same model (gemini-2.5-flash) as the +# iterative agent for fair comparison. + +max_iterations: 50 +checkpoint_interval: 10 +log_level: "INFO" +random_seed: 123 + +# Full rewrite mode - the problem is about finding the right configuration +diff_based_evolution: false +max_code_length: 10000 + +# LLM Configuration - using lightweight model for cost efficiency +llm: + api_base: "https://openrouter.ai/api/v1" + models: + - name: "google/gemini-2.5-flash-lite" + weight: 1.0 + + temperature: 0.9 # Higher temperature for more exploration + top_p: 0.98 + max_tokens: 4096 + timeout: 60 + retries: 3 + +# Prompt Configuration +prompt: + system_message: | + You are optimizing a data processing pipeline configuration through EXPLORATION. 
+ + The pipeline has 4 independent modules, each with 5 possible options: + - loader: ['csv_reader', 'json_reader', 'xml_reader', 'parquet_reader', 'sql_reader'] + - preprocess: ['normalize', 'standardize', 'minmax', 'scale', 'none'] + - algorithm: ['quicksort', 'mergesort', 'heapsort', 'bubblesort', 'insertion'] + - formatter: ['json', 'xml', 'csv', 'yaml', 'protobuf'] + + CRITICAL: The score tells you how many modules are correct (0-4), but NOT which ones. + This means when you have 3/4 correct, ANY of the 4 modules could be wrong! + + STRATEGY FOR SUCCESS: + 1. When stuck at a score, try DIFFERENT options for EACH module systematically + 2. Don't assume any module is definitely correct - even ones that seem obvious + 3. Combine successful elements from different high-scoring configurations + 4. If multiple configs have the same score, they may have DIFFERENT correct modules + + Your goal: Find the configuration with 4/4 modules correct. + + num_top_programs: 5 # More examples to learn from + num_diverse_programs: 3 # More diversity in examples + include_artifacts: true + max_artifact_bytes: 10240 + +# Database Configuration - KEY FOR EVOLUTIONARY CROSSOVER +database: + population_size: 25 # Larger population for more diversity + archive_size: 15 + num_islands: 5 # More islands for parallel exploration + + # Selection parameters - maximize exploration for combinatorial problem + elite_selection_ratio: 0.15 + exploration_ratio: 0.6 # Very high exploration + exploitation_ratio: 0.25 + + # Feature dimensions - use built-in features + feature_dimensions: ["complexity", "diversity"] + feature_bins: 5 + + # Frequent migration helps combine good building blocks across islands + migration_interval: 3 # More frequent migration + migration_rate: 0.3 # Higher migration rate + +# Evaluator Configuration +evaluator: + timeout: 30 + max_retries: 2 + cascade_evaluation: false # Simple evaluation, no cascade needed + parallel_evaluations: 4 + use_llm_feedback: false + enable_artifacts: true + +# Early stopping - stop when we find the solution +early_stopping_patience: 30 # Reduced - expect faster convergence +convergence_threshold: 0.001 +early_stopping_metric: "combined_score" diff --git a/examples/k_module_problem/evaluator.py b/examples/k_module_problem/evaluator.py new file mode 100644 index 000000000..6d60e34d0 --- /dev/null +++ b/examples/k_module_problem/evaluator.py @@ -0,0 +1,244 @@ +""" +Evaluator for K-Module Pipeline Configuration Problem + +This evaluator scores pipeline configurations based on how many modules +match the target configuration. The key property is that there's NO +gradient information - you only know the count of correct modules, +not WHICH ones are correct. + +This creates a challenging landscape for iterative refinement but +allows evolutionary crossover to combine good "building blocks" +from different individuals. 
+""" + +import sys +import time +import traceback +import importlib.util + +# The correct solution (hidden from the optimizer) +# This represents the "optimal" pipeline configuration discovered through +# extensive testing/domain expertise +CORRECT_CONFIG = { + 'loader': 'csv_reader', + 'preprocess': 'normalize', + 'algorithm': 'quicksort', + 'formatter': 'json', +} + +# Valid options for each module +VALID_OPTIONS = { + 'loader': ['csv_reader', 'json_reader', 'xml_reader', 'parquet_reader', 'sql_reader'], + 'preprocess': ['normalize', 'standardize', 'minmax', 'scale', 'none'], + 'algorithm': ['quicksort', 'mergesort', 'heapsort', 'bubblesort', 'insertion'], + 'formatter': ['json', 'xml', 'csv', 'yaml', 'protobuf'], +} + +NUM_MODULES = len(CORRECT_CONFIG) + + +def evaluate(program_path: str) -> dict: + """ + Evaluate a pipeline configuration program. + + Args: + program_path: Path to the Python file containing configure_pipeline() + + Returns: + dict with 'metrics' and optionally 'artifacts' + """ + start_time = time.time() + + try: + # Load and execute the program + spec = importlib.util.spec_from_file_location("program", program_path) + module = importlib.util.module_from_spec(spec) + sys.modules["program"] = module + spec.loader.exec_module(module) + + # Get the configuration + if hasattr(module, 'run_pipeline'): + config = module.run_pipeline() + elif hasattr(module, 'configure_pipeline'): + config = module.configure_pipeline() + else: + return _error_result("Program must define run_pipeline() or configure_pipeline()") + + # Validate the configuration + validation_errors = validate_config(config) + if validation_errors: + return _validation_error_result(validation_errors) + + # Score the configuration + correct_count, module_results = score_config(config) + + # Calculate metrics + accuracy = correct_count / NUM_MODULES + + # The combined score rewards finding more correct modules + # but gives NO information about which modules are correct + combined_score = accuracy + + eval_time = time.time() - start_time + + # Build artifacts - provide feedback that helps evolution + # but doesn't reveal which specific modules are wrong + artifacts = build_artifacts(config, correct_count, module_results, eval_time) + + # Return metrics at top level for OpenEvolve compatibility + return { + "correct_modules": correct_count, + "total_modules": NUM_MODULES, + "accuracy": accuracy, + "combined_score": combined_score, + "eval_time": eval_time, + "artifacts": artifacts, + } + + except Exception as e: + return _exception_result(e) + + +def validate_config(config: dict) -> list: + """Validate that the configuration has valid values.""" + errors = [] + + if not isinstance(config, dict): + errors.append(f"Configuration must be a dict, got {type(config).__name__}") + return errors + + # Check all required modules are present + for module_name in CORRECT_CONFIG.keys(): + if module_name not in config: + errors.append(f"Missing required module: '{module_name}'") + elif config[module_name] not in VALID_OPTIONS[module_name]: + errors.append( + f"Invalid value for '{module_name}': '{config[module_name]}'. " + f"Valid options: {VALID_OPTIONS[module_name]}" + ) + + return errors + + +def score_config(config: dict) -> tuple: + """ + Score the configuration against the target. 
+ + Returns: + tuple: (correct_count, module_results dict) + """ + correct_count = 0 + module_results = {} + + for module_name, correct_value in CORRECT_CONFIG.items(): + is_correct = config.get(module_name) == correct_value + if is_correct: + correct_count += 1 + module_results[module_name] = is_correct + + return correct_count, module_results + + +def build_artifacts(config: dict, correct_count: int, module_results: dict, eval_time: float) -> dict: + """ + Build artifacts that provide useful feedback without revealing + exactly which modules are correct. + """ + artifacts = {} + + # Configuration summary + artifacts["configuration"] = str(config) + + # Score feedback - tells you how many are correct, but not which ones + if correct_count == NUM_MODULES: + artifacts["status"] = "PERFECT! All modules correctly configured!" + artifacts["suggestion"] = "Optimal configuration found." + elif correct_count >= NUM_MODULES - 1: + artifacts["status"] = f"Very close! {correct_count}/{NUM_MODULES} modules correct." + artifacts["suggestion"] = "One module may need adjustment. Try variations." + elif correct_count >= NUM_MODULES // 2: + artifacts["status"] = f"Good progress: {correct_count}/{NUM_MODULES} modules correct." + artifacts["suggestion"] = "Some modules are correct. Explore different combinations." + else: + artifacts["status"] = f"Needs improvement: {correct_count}/{NUM_MODULES} modules correct." + artifacts["suggestion"] = "Try different options for each module. Consider the problem domain." + + # Hints about the problem structure (not the solution) + artifacts["problem_hints"] = ( + "Each module choice is independent. " + "The optimal loader processes the most common data format. " + "The optimal preprocessing creates unit variance. " + "The optimal algorithm has O(n log n) average case. " + "The optimal formatter is widely used for APIs." 
+ ) + + artifacts["search_space"] = f"{5**NUM_MODULES} possible combinations" + artifacts["eval_time"] = f"{eval_time:.3f}s" + + return artifacts + + +def _error_result(message: str) -> dict: + """Return an error result.""" + return { + "metrics": { + "correct_modules": 0, + "total_modules": NUM_MODULES, + "accuracy": 0.0, + "combined_score": 0.0, + }, + "artifacts": { + "error": message, + "status": "ERROR", + }, + } + + +def _validation_error_result(errors: list) -> dict: + """Return a validation error result.""" + return { + "metrics": { + "correct_modules": 0, + "total_modules": NUM_MODULES, + "accuracy": 0.0, + "combined_score": 0.0, + }, + "artifacts": { + "validation_errors": "\n".join(errors), + "status": "VALIDATION_ERROR", + "suggestion": "Fix the configuration to use valid module options.", + }, + } + + +def _exception_result(e: Exception) -> dict: + """Return an exception result.""" + return { + "metrics": { + "correct_modules": 0, + "total_modules": NUM_MODULES, + "accuracy": 0.0, + "combined_score": 0.0, + }, + "artifacts": { + "exception": str(e), + "traceback": traceback.format_exc(), + "status": "EXCEPTION", + }, + } + + +# For standalone testing +if __name__ == "__main__": + if len(sys.argv) > 1: + result = evaluate(sys.argv[1]) + print(f"Metrics: {result['metrics']}") + print(f"Artifacts: {result.get('artifacts', {})}") + else: + # Test with the initial program + import os + script_dir = os.path.dirname(os.path.abspath(__file__)) + initial_program = os.path.join(script_dir, "initial_program.py") + result = evaluate(initial_program) + print(f"Metrics: {result['metrics']}") + print(f"Artifacts: {result.get('artifacts', {})}") diff --git a/examples/k_module_problem/initial_program.py b/examples/k_module_problem/initial_program.py new file mode 100644 index 000000000..3f779393b --- /dev/null +++ b/examples/k_module_problem/initial_program.py @@ -0,0 +1,57 @@ +# EVOLVE-BLOCK-START +""" +K-Module Pipeline Configuration Problem + +This problem demonstrates a scenario where iterative refinement struggles +but evolutionary search with crossover excels. + +The task is to find the correct configuration for a 4-component data +processing pipeline. Each module has 5 possible options, creating a +search space of 5^4 = 625 possible combinations. + +The key challenge: there's no gradient information. Getting 3/4 modules +correct gives the same partial feedback as 1/4 - you don't know WHICH +modules are correct. +""" + + +def configure_pipeline(): + """ + Configure a data processing pipeline with 4 independent modules. + + Each module choice is independent - changing one doesn't affect + what's optimal for others. This creates a "needle in haystack" + problem for iterative refinement but is solvable efficiently + by evolutionary crossover. 
+ + Returns: + dict: Configuration with keys 'loader', 'preprocess', 'algorithm', 'formatter' + """ + # Available options for each module: + # loader: ['csv_reader', 'json_reader', 'xml_reader', 'parquet_reader', 'sql_reader'] + # preprocess: ['normalize', 'standardize', 'minmax', 'scale', 'none'] + # algorithm: ['quicksort', 'mergesort', 'heapsort', 'bubblesort', 'insertion'] + # formatter: ['json', 'xml', 'csv', 'yaml', 'protobuf'] + + # Initial guess - likely not optimal + config = { + 'loader': 'json_reader', # Try different loaders + 'preprocess': 'standardize', # Try different preprocessing + 'algorithm': 'mergesort', # Try different algorithms + 'formatter': 'xml', # Try different formatters + } + + return config + + +# EVOLVE-BLOCK-END + + +def run_pipeline(): + """Run the pipeline configuration (entry point for evaluator).""" + return configure_pipeline() + + +if __name__ == "__main__": + config = run_pipeline() + print(f"Pipeline configuration: {config}") diff --git a/examples/k_module_problem/iterative_agent.py b/examples/k_module_problem/iterative_agent.py new file mode 100644 index 000000000..d53fdb72a --- /dev/null +++ b/examples/k_module_problem/iterative_agent.py @@ -0,0 +1,315 @@ +#!/usr/bin/env python3 +""" +Iterative Refinement Agent for K-Module Problem + +This implements a simple iterative refinement approach that: +1. Reads the current program +2. Asks the LLM to improve it based on evaluation feedback +3. Evaluates the new program +4. Repeats until solution found or max iterations reached + +Uses OpenRouter API (OpenAI-compatible) with the same model as OpenEvolve +for fair comparison. +""" + +import argparse +import json +import os +import re +import sys +import time +from pathlib import Path + +import yaml +from openai import OpenAI + +# Add parent to path for imports +sys.path.insert(0, str(Path(__file__).parent)) +from evaluator import evaluate, VALID_OPTIONS, NUM_MODULES + + +def load_config(config_path: str = "config.yaml") -> dict: + """Load configuration from YAML file.""" + with open(config_path) as f: + return yaml.safe_load(f) + + +def extract_code_block(response: str) -> str: + """Extract Python code from LLM response.""" + # Try to find code block with ```python + pattern = r"```python\s*(.*?)\s*```" + matches = re.findall(pattern, response, re.DOTALL) + if matches: + return matches[-1].strip() # Return last code block + + # Try to find code block with just ``` + pattern = r"```\s*(.*?)\s*```" + matches = re.findall(pattern, response, re.DOTALL) + if matches: + return matches[-1].strip() + + # Return the whole response if no code block found + return response.strip() + + +def read_program(program_path: str) -> str: + """Read program from file.""" + with open(program_path) as f: + return f.read() + + +def write_program(program_path: str, code: str) -> None: + """Write program to file.""" + with open(program_path, "w") as f: + f.write(code) + + +def create_improvement_prompt( + current_code: str, + metrics: dict, + artifacts: dict, + iteration: int, + history: list +) -> str: + """Create prompt asking LLM to improve the program.""" + + history_str = "" + if history: + history_str = "\n## Previous Attempts\n" + for h in history[-5:]: # Last 5 attempts + history_str += f"\nIteration {h['iteration']}:\n" + history_str += f"- Score: {h['metrics'].get('correct_modules', 0)}/{NUM_MODULES} modules correct\n" + history_str += f"- Configuration tried: {h['artifacts'].get('configuration', 'N/A')}\n" + + prompt = f"""You are optimizing a data processing pipeline 
configuration. + +## Problem +Find the correct configuration for a 4-component pipeline. Each module has 5 options: +- loader: {VALID_OPTIONS['loader']} +- preprocess: {VALID_OPTIONS['preprocess']} +- algorithm: {VALID_OPTIONS['algorithm']} +- formatter: {VALID_OPTIONS['formatter']} + +## Hints +- The optimal loader processes the most common data format +- The optimal preprocessing creates unit variance +- The optimal algorithm has O(n log n) average case +- The optimal formatter is widely used for APIs + +## Current Iteration: {iteration} + +## Current Code +```python +{current_code} +``` + +## Last Evaluation Result +- Correct modules: {metrics.get('correct_modules', 0)}/{NUM_MODULES} +- Score: {metrics.get('combined_score', 0):.2%} +- Status: {artifacts.get('status', 'N/A')} +- Suggestion: {artifacts.get('suggestion', 'N/A')} +{history_str} + +## Your Task +Modify the configure_pipeline() function to try a DIFFERENT configuration. +Think about what each hint suggests and try to find the optimal combination. + +IMPORTANT: +- Return ONLY the complete Python code with EVOLVE-BLOCK markers +- Try a different combination than previous attempts +- The code must be valid Python that can be executed + +Return the improved code: +""" + return prompt + + +SYSTEM_PROMPT = """You are an expert programmer optimizing code through iterative refinement. +Your task is to improve Python programs based on evaluation feedback. + +When given a program and its evaluation results: +1. Analyze what the current configuration is doing +2. Think about what the hints suggest +3. Propose a new configuration that might score better +4. Return the complete modified code + +Always return valid Python code within ```python``` code blocks. +Only modify the configuration values, keep the code structure intact.""" + + +def run_iterative_refinement( + initial_program: str, + evaluator_path: str, + config: dict, + max_iterations: int = 100, + output_dir: str = "iterative_output" +) -> dict: + """ + Run iterative refinement loop. 
+ + Returns: + dict with results including iterations, scores, solution_found_at + """ + # Setup output directory + output_path = Path(output_dir) + output_path.mkdir(exist_ok=True) + + # Setup OpenAI client with OpenRouter + llm_config = config.get("llm", {}) + api_base = llm_config.get("api_base", "https://openrouter.ai/api/v1") + api_key = os.environ.get("OPENROUTER_API_KEY") or os.environ.get("OPENAI_API_KEY") + + if not api_key: + raise ValueError("OPENROUTER_API_KEY or OPENAI_API_KEY must be set") + + client = OpenAI(base_url=api_base, api_key=api_key) + + # Get model from config + models = llm_config.get("models", []) + model_name = models[0].get("name", "google/gemini-2.5-flash-lite") if models else "google/gemini-2.5-flash-lite" + temperature = llm_config.get("temperature", 0.7) + max_tokens = llm_config.get("max_tokens", 4096) + + print(f"Using model: {model_name}") + print(f"API base: {api_base}") + print(f"Max iterations: {max_iterations}") + print() + + # Initialize + current_program_path = output_path / "current_program.py" + + # Copy initial program + initial_code = read_program(initial_program) + write_program(str(current_program_path), initial_code) + + results = { + "iterations": [], + "scores": [], + "best_scores": [], + "solution_found_at": None, + "history": [], + "model": model_name, + "api_base": api_base, + } + + best_score = 0 + history = [] + + for iteration in range(max_iterations): + print(f"\n{'='*50}") + print(f"Iteration {iteration + 1}/{max_iterations}") + print('='*50) + + # Read current program + current_code = read_program(str(current_program_path)) + + # Evaluate current program + eval_result = evaluate(str(current_program_path)) + metrics = eval_result.get("metrics", {}) + artifacts = eval_result.get("artifacts", {}) + + score = metrics.get("combined_score", 0) + correct = metrics.get("correct_modules", 0) + + print(f"Score: {correct}/{NUM_MODULES} modules correct ({score:.2%})") + print(f"Config: {artifacts.get('configuration', 'N/A')}") + + # Record results + results["iterations"].append(iteration) + results["scores"].append(score) + best_score = max(best_score, score) + results["best_scores"].append(best_score) + + history.append({ + "iteration": iteration, + "metrics": metrics, + "artifacts": artifacts, + }) + + # Check if solution found + if score >= 1.0: + print(f"\n*** SOLUTION FOUND at iteration {iteration + 1}! 
***") + results["solution_found_at"] = iteration + break + + # Generate improvement + prompt = create_improvement_prompt( + current_code, metrics, artifacts, iteration + 1, history + ) + + try: + response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": prompt} + ], + temperature=temperature, + max_tokens=max_tokens, + ) + + response_text = response.choices[0].message.content + new_code = extract_code_block(response_text) + + # Validate the new code has the required structure + if "configure_pipeline" in new_code and "EVOLVE-BLOCK" in new_code: + write_program(str(current_program_path), new_code) + print("Generated new configuration") + else: + print("Warning: Invalid code generated, keeping current") + + except Exception as e: + print(f"Error generating improvement: {e}") + continue + + # Small delay to avoid rate limiting + time.sleep(0.5) + + # Save final results + results["history"] = history + results["final_best_score"] = best_score + results["total_iterations"] = len(results["iterations"]) + + with open(output_path / "results.json", "w") as f: + json.dump(results, f, indent=2) + + print(f"\n{'='*50}") + print("ITERATIVE REFINEMENT COMPLETE") + print('='*50) + print(f"Total iterations: {len(results['iterations'])}") + print(f"Best score: {best_score:.2%}") + if results["solution_found_at"] is not None: + print(f"Solution found at iteration: {results['solution_found_at'] + 1}") + else: + print("Solution NOT found") + print(f"Results saved to: {output_path}") + + return results + + +def main(): + parser = argparse.ArgumentParser(description="Iterative refinement agent for K-Module problem") + parser.add_argument("--initial-program", default="initial_program.py", help="Initial program path") + parser.add_argument("--evaluator", default="evaluator.py", help="Evaluator path") + parser.add_argument("--config", default="config.yaml", help="Config file path") + parser.add_argument("--iterations", type=int, default=100, help="Max iterations") + parser.add_argument("--output", default="iterative_output", help="Output directory") + args = parser.parse_args() + + # Load config + config = load_config(args.config) + + # Run iterative refinement + results = run_iterative_refinement( + initial_program=args.initial_program, + evaluator_path=args.evaluator, + config=config, + max_iterations=args.iterations, + output_dir=args.output, + ) + + return 0 if results["solution_found_at"] is not None else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/k_module_problem/run_iterative.sh b/examples/k_module_problem/run_iterative.sh new file mode 100755 index 000000000..a07f51673 --- /dev/null +++ b/examples/k_module_problem/run_iterative.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# Run Iterative Agent on the K-Module Problem +# +# Usage: ./run_iterative.sh [iterations] +# +# Prerequisites: +# 1. Clone the agentic-code-optimization repo: +# git clone https://github.com/ratulm/agentic-code-optimization.git +# +# 2. 
Set environment variables: +# export MODEL_PROVIDER=gemini +# export GOOGLE_API_KEY=your_key +# +# OR for OpenRouter: +# export MODEL_PROVIDER=openai +# export OPENAI_API_KEY=your_openrouter_key +# export OPENAI_API_BASE=https://openrouter.ai/api/v1 + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ITERATIONS=${1:-50} +AGENT_REPO="${AGENT_REPO:-$HOME/agentic-code-optimization}" + +echo "==============================================" +echo "Running Iterative Agent on K-Module Problem" +echo "==============================================" +echo "Iterations: $ITERATIONS" +echo "Agent repo: $AGENT_REPO" +echo "" + +# Check if agent repo exists +if [ ! -d "$AGENT_REPO" ]; then + echo "Error: agentic-code-optimization repo not found at $AGENT_REPO" + echo "" + echo "Please clone it first:" + echo " git clone https://github.com/ratulm/agentic-code-optimization.git $AGENT_REPO" + echo "" + echo "Or set AGENT_REPO environment variable to the correct path" + exit 1 +fi + +# Check for API key +if [ -z "$GOOGLE_API_KEY" ] && [ -z "$OPENAI_API_KEY" ]; then + echo "Warning: No API key found" + echo "Set GOOGLE_API_KEY for Gemini or OPENAI_API_KEY for OpenRouter" +fi + +# Create output directory +OUTPUT_DIR="$SCRIPT_DIR/iterative_output" +mkdir -p "$OUTPUT_DIR" + +# Copy files to agent repo examples (required by the agent) +EXAMPLE_DIR="$AGENT_REPO/examples/k_module" +mkdir -p "$EXAMPLE_DIR" +cp "$SCRIPT_DIR/initial_program.py" "$EXAMPLE_DIR/" +cp "$SCRIPT_DIR/evaluator.py" "$EXAMPLE_DIR/" + +echo "Running iterative agent..." +cd "$AGENT_REPO" + +# Set model provider if not set +export MODEL_PROVIDER="${MODEL_PROVIDER:-gemini}" + +# Run the agent +python code_optimization.py \ + --initial-program "$EXAMPLE_DIR/initial_program.py" \ + --evaluator "$EXAMPLE_DIR/evaluator.py" \ + --iterations "$ITERATIONS" + +# Copy results back +if [ -d "$AGENT_REPO/outputs" ]; then + cp -r "$AGENT_REPO/outputs"/* "$OUTPUT_DIR/" 2>/dev/null || true +fi + +echo "" +echo "==============================================" +echo "Iterative agent run complete!" 
+echo "Results saved to: $OUTPUT_DIR" +echo "==============================================" diff --git a/examples/k_module_problem/run_iterative_trials.py b/examples/k_module_problem/run_iterative_trials.py new file mode 100644 index 000000000..0c9b80171 --- /dev/null +++ b/examples/k_module_problem/run_iterative_trials.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +"""Run multiple trials of iterative refinement to get statistics.""" + +import json +import os +import shutil +import sys +from pathlib import Path + +# Run from the example directory +os.chdir(Path(__file__).parent) + +from iterative_agent import run_iterative_refinement, load_config + + +def run_trials(num_trials: int = 10, max_iterations: int = 100): + """Run multiple trials and collect statistics.""" + config = load_config("config.yaml") + + results = [] + solutions_found = [] + + for trial in range(num_trials): + print(f"\n{'#'*60}") + print(f"# TRIAL {trial + 1}/{num_trials}") + print('#'*60) + + # Clean output directory for each trial + output_dir = f"iterative_output_trial_{trial}" + if os.path.exists(output_dir): + shutil.rmtree(output_dir) + + # Run trial + result = run_iterative_refinement( + initial_program="initial_program.py", + evaluator_path="evaluator.py", + config=config, + max_iterations=max_iterations, + output_dir=output_dir, + ) + + results.append({ + "trial": trial, + "solution_found_at": result["solution_found_at"], + "final_best_score": result["final_best_score"], + "total_iterations": result["total_iterations"], + }) + + if result["solution_found_at"] is not None: + solutions_found.append(result["solution_found_at"]) + + # Calculate statistics + success_rate = len(solutions_found) / num_trials + avg_iterations = sum(solutions_found) / len(solutions_found) if solutions_found else float('inf') + min_iterations = min(solutions_found) if solutions_found else None + max_iterations_found = max(solutions_found) if solutions_found else None + + print(f"\n{'='*60}") + print("ITERATIVE REFINEMENT TRIAL RESULTS") + print('='*60) + print(f"Trials: {num_trials}") + print(f"Max iterations per trial: {max_iterations}") + print(f"Success rate: {success_rate:.1%} ({len(solutions_found)}/{num_trials})") + if solutions_found: + print(f"Avg iterations to solution: {avg_iterations:.1f}") + print(f"Min iterations: {min_iterations}") + print(f"Max iterations: {max_iterations_found}") + print('='*60) + + # Save summary + summary = { + "config": { + "num_trials": num_trials, + "max_iterations": max_iterations, + }, + "summary": { + "success_rate": success_rate, + "avg_iterations_to_solution": avg_iterations if solutions_found else None, + "min_iterations": min_iterations, + "max_iterations": max_iterations_found, + "solutions_found": len(solutions_found), + }, + "trials": results, + } + + with open("iterative_trials_results.json", "w") as f: + json.dump(summary, f, indent=2) + + print(f"\nResults saved to: iterative_trials_results.json") + + return summary + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--trials", type=int, default=10, help="Number of trials") + parser.add_argument("--iterations", type=int, default=100, help="Max iterations per trial") + args = parser.parse_args() + + run_trials(num_trials=args.trials, max_iterations=args.iterations) diff --git a/examples/k_module_problem/run_openevolve.sh b/examples/k_module_problem/run_openevolve.sh new file mode 100755 index 000000000..cf30dfa06 --- /dev/null +++ b/examples/k_module_problem/run_openevolve.sh @@ 
-0,0 +1,36 @@ +#!/bin/bash +# Run OpenEvolve on the K-Module Problem +# +# Usage: ./run_openevolve.sh [iterations] +# +# Make sure OPENROUTER_API_KEY is set in your environment + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ITERATIONS=${1:-50} + +echo "==============================================" +echo "Running OpenEvolve on K-Module Problem" +echo "==============================================" +echo "Iterations: $ITERATIONS" +echo "Config: config.yaml" +echo "" + +# Check for API key +if [ -z "$OPENROUTER_API_KEY" ]; then + echo "Warning: OPENROUTER_API_KEY not set" + echo "Set it with: export OPENROUTER_API_KEY=your_key" +fi + +# Run OpenEvolve +cd "$SCRIPT_DIR" +openevolve-run initial_program.py evaluator.py \ + --config config.yaml \ + --iterations "$ITERATIONS" + +echo "" +echo "==============================================" +echo "OpenEvolve run complete!" +echo "Results saved to: openevolve_output/" +echo "==============================================" diff --git a/examples/k_module_problem/run_random_baseline.py b/examples/k_module_problem/run_random_baseline.py new file mode 100644 index 000000000..b94f5ca76 --- /dev/null +++ b/examples/k_module_problem/run_random_baseline.py @@ -0,0 +1,224 @@ +#!/usr/bin/env python3 +""" +Random Baseline for K-Module Problem + +This script establishes a baseline by randomly sampling configurations. +It simulates what you'd get with pass@N (N independent random attempts) +without any learning or optimization. + +This is useful because: +1. It establishes the "no learning" baseline +2. For closed models that don't support n>1 responses, we can't do true pass@k +3. Shows the expected performance of random search + +Usage: + python run_random_baseline.py [--samples 100] [--trials 10] +""" + +import argparse +import json +import random +import time +from pathlib import Path + +# Import the evaluator +from evaluator import VALID_OPTIONS, CORRECT_CONFIG, NUM_MODULES + + +def generate_random_config() -> dict: + """Generate a random pipeline configuration.""" + return { + module: random.choice(options) + for module, options in VALID_OPTIONS.items() + } + + +def score_config(config: dict) -> int: + """Score a configuration (number of correct modules).""" + return sum( + 1 for module, value in config.items() + if CORRECT_CONFIG.get(module) == value + ) + + +def run_random_search(max_samples: int) -> dict: + """ + Run random search until solution found or max_samples reached. 
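+    Samples are drawn independently (with replacement), so duplicate configurations
+    are possible; the loop always runs the full max_samples draws and records the
+    index of the first success, if any.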
+
+    Returns:
+        dict with results
+    """
+    results = {
+        "samples": [],
+        "scores": [],
+        "best_scores": [],
+        "solution_found_at": None,
+        "configs_tried": [],
+    }
+
+    best_so_far = 0
+
+    for i in range(max_samples):
+        config = generate_random_config()
+        score = score_config(config)
+
+        results["samples"].append(i)
+        results["scores"].append(score / NUM_MODULES)
+        best_so_far = max(best_so_far, score)
+        results["best_scores"].append(best_so_far / NUM_MODULES)
+        results["configs_tried"].append(config)
+
+        if score == NUM_MODULES and results["solution_found_at"] is None:
+            results["solution_found_at"] = i
+
+    return results
+
+
+def run_multiple_trials(num_trials: int, max_samples: int) -> list:
+    """Run multiple independent trials of random search."""
+    trial_results = []
+
+    for trial in range(num_trials):
+        random.seed(trial * 1000 + int(time.time()))  # Different seed per trial
+        result = run_random_search(max_samples)
+        trial_results.append({
+            "trial": trial,
+            "solution_found_at": result["solution_found_at"],
+            "final_best_score": result["best_scores"][-1] if result["best_scores"] else 0,
+            "scores": result["scores"],
+            "best_scores": result["best_scores"],
+        })
+
+    return trial_results
+
+
+def calculate_pass_at_k(trial_results: list, k_values: list) -> dict:
+    """
+    Calculate pass@k metrics.
+
+    pass@k = probability of finding the solution within k samples.
+
+    For uniform random search (with replacement) over 625 configurations:
+    - pass@1 = 1/625 = 0.16%
+    - pass@100 ≈ 1 - (624/625)^100 ≈ 14.8%
+    - pass@312 ≈ 39.3%; roughly 433 samples are needed for a 50% chance
+    """
+    pass_at_k = {}
+
+    for k in k_values:
+        successes = sum(
+            1 for r in trial_results
+            if r["solution_found_at"] is not None and r["solution_found_at"] < k
+        )
+        pass_at_k[k] = successes / len(trial_results)
+
+    return pass_at_k
+
+
+def theoretical_pass_at_k(k: int, search_space: int = 625) -> float:
+    """Calculate theoretical pass@k for uniform random search."""
+    # Probability of NOT finding the solution in k independent tries
+    prob_fail = ((search_space - 1) / search_space) ** k
+    return 1 - prob_fail
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Random baseline for K-Module problem")
+    parser.add_argument("--samples", type=int, default=100, help="Max samples per trial")
+    parser.add_argument("--trials", type=int, default=100, help="Number of independent trials")
+    parser.add_argument("--output", default="random_baseline_output", help="Output directory")
+    args = parser.parse_args()
+
+    print("=" * 60)
+    print("K-MODULE PROBLEM: RANDOM BASELINE")
+    print("=" * 60)
+    print(f"Search space: {5**NUM_MODULES} configurations")
+    print(f"Running {args.trials} trials with up to {args.samples} samples each")
+    print()
+
+    # Run trials
+    print("Running random search trials...")
+    trial_results = run_multiple_trials(args.trials, args.samples)
+
+    # Calculate statistics
+    solutions_found = [r for r in trial_results if r["solution_found_at"] is not None]
+    success_rate = len(solutions_found) / len(trial_results)
+
+    if solutions_found:
+        avg_samples_to_solution = sum(r["solution_found_at"] for r in solutions_found) / len(solutions_found)
+        min_samples = min(r["solution_found_at"] for r in solutions_found)
+        max_samples = max(r["solution_found_at"] for r in solutions_found)
+    else:
+        avg_samples_to_solution = float('inf')
+        min_samples = max_samples = None
+
+    # Calculate pass@k
+    k_values = [1, 10, 20, 50, 100, 200, 312]
+    k_values = [k for k in k_values if k <= args.samples]
+    empirical_pass_at_k = calculate_pass_at_k(trial_results, k_values)
+
+    # Print results
+    print("\n### Results")
Results") + print(f" Success rate: {success_rate:.1%} ({len(solutions_found)}/{len(trial_results)} trials)") + if solutions_found: + print(f" Avg samples to solution: {avg_samples_to_solution:.1f}") + print(f" Min samples: {min_samples}") + print(f" Max samples: {max_samples}") + else: + print(f" No solutions found in {args.samples} samples") + + print("\n### Pass@k Comparison (Empirical vs Theoretical)") + print(" k | Empirical | Theoretical") + print(" ------|-----------|------------") + for k in k_values: + emp = empirical_pass_at_k.get(k, 0) + theo = theoretical_pass_at_k(k) + print(f" {k:5d} | {emp:8.1%} | {theo:8.1%}") + + # Save results + output_dir = Path(args.output) + output_dir.mkdir(exist_ok=True) + + results = { + "config": { + "samples_per_trial": args.samples, + "num_trials": args.trials, + "search_space": 5 ** NUM_MODULES, + }, + "summary": { + "success_rate": success_rate, + "avg_samples_to_solution": avg_samples_to_solution if solutions_found else None, + "min_samples": min_samples, + "max_samples": max_samples, + }, + "pass_at_k": { + "empirical": empirical_pass_at_k, + "theoretical": {k: theoretical_pass_at_k(k) for k in k_values}, + }, + "trials": trial_results, + } + + with open(output_dir / "random_baseline_results.json", "w") as f: + json.dump(results, f, indent=2) + + print(f"\nResults saved to: {output_dir}/random_baseline_results.json") + + # Key insight + print("\n### Key Insight") + print(" Random search requires ~312 samples (50% of search space) on average.") + print(" This is the baseline that any optimization method should beat.") + print(" ") + print(" For LLM-based methods:") + print(" - pass@k with closed models requires k separate API calls") + print(" - Each call is independent (no learning across calls)") + print(" - This is equivalent to random search if prompts don't help") + print(" ") + print(" OpenEvolve should find solutions in <<312 evaluations by:") + print(" - Learning from population diversity") + print(" - Combining good 'building blocks' via crossover") + + print("\n" + "=" * 60) + + +if __name__ == "__main__": + main() diff --git a/openevolve/_version.py b/openevolve/_version.py index 5441cefd0..7fc3f179d 100644 --- a/openevolve/_version.py +++ b/openevolve/_version.py @@ -1,3 +1,3 @@ """Version information for openevolve package.""" -__version__ = "0.2.24" +__version__ = "0.2.25"