diff --git a/docs/evals.md b/docs/evals.md index 95efebd..7fa7c30 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -35,6 +35,8 @@ The evals tool is included with the TypeScript package. No additional dependenci | `--azure-api-version` | ❌ | Azure OpenAI API version (default: 2025-01-01-preview) | | `--models` | ❌ | Models for benchmark mode (benchmark only) | | `--latency-iterations` | ❌ | Latency test samples (default: 25) (benchmark only) | +| `--max-parallel-models` | ❌ | Maximum concurrent models in benchmark mode (default: CPU count) (benchmark only) | +| `--benchmark-chunk-size` | ❌ | Sample chunk size per model for memory-efficient benchmarking (benchmark only) | ## Configuration @@ -154,6 +156,8 @@ npm run eval -- --config-path config.json --dataset-path data.jsonl --base-url h - **Multi-stage evaluation**: pre_flight, input, output stages - **Automatic stage detection**: Evaluates all stages found in configuration - **Batch processing**: Configurable parallel processing +- **Parallel benchmarking**: Run multiple models concurrently with CPU-aware defaults +- **Memory-efficient chunking**: Process large datasets in smaller chunks during benchmarking - **Benchmark mode**: Model performance comparison with ROC AUC, precision at recall thresholds - **Latency testing**: End-to-end guardrail performance measurement - **Visualization**: Automatic chart and graph generation diff --git a/src/__tests__/unit/evals/guardrail-evals.test.ts b/src/__tests__/unit/evals/guardrail-evals.test.ts new file mode 100644 index 0000000..cf78c56 --- /dev/null +++ b/src/__tests__/unit/evals/guardrail-evals.test.ts @@ -0,0 +1,94 @@ +/** + * Unit tests for guardrail evaluation utilities. + */ + +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { GuardrailEval } from '../../../evals/guardrail-evals'; +import type { Sample } from '../../../evals/core/types'; +import * as os from 'os'; + +vi.mock('os', () => { + return { + default: { + cpus: vi.fn(), + }, + cpus: vi.fn(), + }; +}); + +function buildSamples(count: number): Sample[] { + /**Build synthetic samples for chunking tests. + * + * @param count - Number of synthetic samples to build. + * @returns List of Sample instances configured for evaluation. 
+ */ + return Array.from({ length: count }, (_, idx) => ({ + id: `sample-${idx}`, + data: `payload-${idx}`, + expectedTriggers: { g: Boolean(idx % 2) }, + })); +} + +describe('GuardrailEval._determineParallelModelLimit', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('should use cpu_count when explicit parallelism is not provided', () => { + vi.mocked(os.cpus).mockReturnValue(Array(4).fill({}) as os.CpuInfo[]); + + expect(GuardrailEval._determineParallelModelLimit(10, null)).toBe(4); + expect(GuardrailEval._determineParallelModelLimit(2, null)).toBe(2); + }); + + it('should honor user-provided parallelism constraints', () => { + expect(GuardrailEval._determineParallelModelLimit(5, 3)).toBe(3); + expect(() => GuardrailEval._determineParallelModelLimit(5, 0)).toThrow('maxParallelModels must be positive'); + }); + + it('should throw error for invalid model count', () => { + expect(() => GuardrailEval._determineParallelModelLimit(0, null)).toThrow('modelCount must be positive'); + }); +}); + +describe('GuardrailEval._chunkSamples', () => { + it('should return the original sample list when no chunk size is provided', () => { + const samples = buildSamples(3); + const chunks = Array.from(GuardrailEval._chunkSamples(samples, null)); + expect(chunks.length).toBe(1); + expect(chunks[0]).toBe(samples); + }); + + it('should split samples into evenly sized chunks', () => { + const samples = buildSamples(5); + const chunks = Array.from(GuardrailEval._chunkSamples(samples, 2)); + expect(chunks.map((chunk) => chunk.length)).toEqual([2, 2, 1]); + expect(chunks[0][0].id).toBe('sample-0'); + expect(chunks[1][0].id).toBe('sample-2'); + expect(chunks[2][0].id).toBe('sample-4'); + }); + + it('should reject invalid chunk sizes', () => { + const samples = buildSamples(2); + expect(() => Array.from(GuardrailEval._chunkSamples(samples, 0))).toThrow('chunkSize must be positive when provided'); + }); + + it('should return single chunk when chunk size is larger than samples', () => { + const samples = buildSamples(3); + const chunks = Array.from(GuardrailEval._chunkSamples(samples, 10)); + expect(chunks.length).toBe(1); + expect(chunks[0]).toBe(samples); + }); + + it('should handle empty samples', () => { + const samples: Sample[] = []; + const chunks = Array.from(GuardrailEval._chunkSamples(samples, 2)); + expect(chunks.length).toBe(1); + expect(chunks[0]).toEqual([]); + }); +}); + diff --git a/src/cli.ts b/src/cli.ts index cedaa22..af3b958 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -36,6 +36,16 @@ interface CliArgs { batchSize?: number; outputDir?: string; multiTurn?: boolean; + maxParallelModels?: number | null; + benchmarkChunkSize?: number | null; + mode?: 'evaluate' | 'benchmark'; + stages?: string[]; + models?: string[]; + latencyIterations?: number; + apiKey?: string | null; + baseUrl?: string | null; + azureEndpoint?: string | null; + azureApiVersion?: string; help?: boolean; } @@ -72,6 +82,52 @@ function parseArgs(argv: string[]): CliArgs { args.outputDir = argv[++i]; } else if (arg === '--multi-turn') { args.multiTurn = true; + } else if (arg === '--max-parallel-models') { + const value = parseInt(argv[++i], 10); + if (isNaN(value) || value <= 0) { + console.error(`❌ Error: max-parallel-models must be positive, got: ${argv[i]}`); + process.exit(1); + } + args.maxParallelModels = value; + } else if (arg === '--benchmark-chunk-size') { + const value = parseInt(argv[++i], 10); + if (isNaN(value) || value <= 0) { + console.error(`❌ Error: 
benchmark-chunk-size must be positive, got: ${argv[i]}`); + process.exit(1); + } + args.benchmarkChunkSize = value; + } else if (arg === '--mode') { + const mode = argv[++i]; + if (mode !== 'evaluate' && mode !== 'benchmark') { + console.error(`❌ Error: Invalid mode: ${mode}. Must be 'evaluate' or 'benchmark'`); + process.exit(1); + } + args.mode = mode as 'evaluate' | 'benchmark'; + } else if (arg === '--stages') { + args.stages = []; + while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) { + args.stages.push(argv[++i]); + } + } else if (arg === '--models') { + args.models = []; + while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) { + args.models.push(argv[++i]); + } + } else if (arg === '--latency-iterations') { + const value = parseInt(argv[++i], 10); + if (isNaN(value) || value <= 0) { + console.error(`❌ Error: latency-iterations must be positive, got: ${argv[i]}`); + process.exit(1); + } + args.latencyIterations = value; + } else if (arg === '--api-key') { + args.apiKey = argv[++i]; + } else if (arg === '--base-url') { + args.baseUrl = argv[++i]; + } else if (arg === '--azure-endpoint') { + args.azureEndpoint = argv[++i]; + } else if (arg === '--azure-api-version') { + args.azureApiVersion = argv[++i]; } else if (!args.configFile && !arg.startsWith('-')) { args.configFile = arg; } @@ -119,6 +175,12 @@ function showHelp(): void { console.log( ' --dataset-path Path to evaluation dataset (required)' ); + console.log( + ' --mode Evaluation mode: "evaluate" or "benchmark" (default: evaluate)' + ); + console.log( + ' --stages ... Pipeline stages to evaluate: pre_flight, input, output' + ); console.log( ' --batch-size Number of samples to process in parallel (default: 32)' ); @@ -128,6 +190,32 @@ function showHelp(): void { console.log( ' --multi-turn Evaluate conversation-aware guardrails turn-by-turn (default: single-pass)' ); + console.log('Benchmark Options:'); + console.log( + ' --models ... 
Models to test in benchmark mode (default: gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini)' + ); + console.log( + ' --latency-iterations Number of iterations for latency testing (default: 25)' + ); + console.log( + ' --max-parallel-models Maximum number of models to benchmark concurrently (default: min(models, cpu_count))' + ); + console.log( + ' --benchmark-chunk-size Optional number of samples per chunk when benchmarking to limit long-running runs' + ); + console.log('API Configuration:'); + console.log( + ' --api-key API key for OpenAI, Azure OpenAI, or OpenAI-compatible API' + ); + console.log( + ' --base-url Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1)' + ); + console.log( + ' --azure-endpoint Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com)' + ); + console.log( + ' --azure-api-version Azure OpenAI API version (default: 2025-01-01-preview)' + ); console.log(''); console.log('Examples:'); console.log(' guardrails validate config.json'); @@ -136,6 +224,12 @@ function showHelp(): void { console.log( ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --batch-size 16 --output-dir my-results' ); + console.log( + ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --models gpt-5 gpt-5-mini' + ); + console.log( + ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --azure-endpoint https://your-resource.openai.azure.com --api-key your-key' + ); console.log(' guardrails validate-dataset dataset.jsonl'); } @@ -154,13 +248,61 @@ async function handleEvalCommand(args: CliArgs): Promise { process.exit(1); } + if (args.maxParallelModels !== undefined && args.maxParallelModels !== null && args.maxParallelModels <= 0) { + console.error(`❌ Error: max-parallel-models must be positive, got: ${args.maxParallelModels}`); + process.exit(1); + } + + if (args.benchmarkChunkSize !== undefined && args.benchmarkChunkSize !== null && args.benchmarkChunkSize <= 0) { + console.error(`❌ Error: benchmark-chunk-size must be positive, got: ${args.benchmarkChunkSize}`); + process.exit(1); + } + + if (args.latencyIterations !== undefined && args.latencyIterations <= 0) { + console.error(`❌ Error: latency-iterations must be positive, got: ${args.latencyIterations}`); + process.exit(1); + } + + if (args.stages) { + const validStages = new Set(['pre_flight', 'input', 'output']); + const invalidStages = args.stages.filter((s) => !validStages.has(s)); + if (invalidStages.length > 0) { + console.error(`❌ Error: Invalid stages: ${invalidStages.join(', ')}. Valid stages are: ${Array.from(validStages).join(', ')}`); + process.exit(1); + } + } + + if (args.mode === 'benchmark' && args.stages && args.stages.length > 1) { + console.warn('⚠️ Warning: Benchmark mode only uses the first specified stage. Additional stages will be ignored.'); + } + + if (args.azureEndpoint && args.baseUrl) { + console.error('❌ Error: Cannot specify both --azure-endpoint and --base-url. 
Choose one provider.'); + process.exit(1); + } + + if (args.azureEndpoint && !args.apiKey) { + console.error('❌ Error: --api-key is required when using --azure-endpoint'); + process.exit(1); + } + try { await runEvaluationCLI({ configPath: args.configPath, datasetPath: args.datasetPath, + stages: args.stages || null, batchSize: args.batchSize || 32, outputDir: args.outputDir || 'results', + apiKey: args.apiKey || null, + baseUrl: args.baseUrl || null, + azureEndpoint: args.azureEndpoint || null, + azureApiVersion: args.azureApiVersion || '2025-01-01-preview', + mode: args.mode || 'evaluate', + models: args.models || null, + latencyIterations: args.latencyIterations, multiTurn: args.multiTurn, + maxParallelModels: args.maxParallelModels, + benchmarkChunkSize: args.benchmarkChunkSize, }); console.log('Evaluation completed successfully!'); diff --git a/src/evals/core/async-engine.ts b/src/evals/core/async-engine.ts index f39b94e..8a3a5e9 100644 --- a/src/evals/core/async-engine.ts +++ b/src/evals/core/async-engine.ts @@ -46,9 +46,9 @@ export class AsyncRunEngine implements RunEngine { } const results: SampleResult[] = []; - let processed = 0; + const totalSamples = samples.length; - console.log(`${desc}: ${samples.length} samples, batch size: ${batchSize}`); + console.log(`${desc}: ${totalSamples} samples, batch size: ${batchSize}`); for (let i = 0; i < samples.length; i += batchSize) { const batch = samples.slice(i, i + batchSize); @@ -56,8 +56,7 @@ export class AsyncRunEngine implements RunEngine { batch.map((sample) => this.evaluateSample(context, sample)) ); results.push(...batchResults); - processed += batch.length; - console.log(`Processed ${processed}/${samples.length} samples`); + console.log(`Processed ${results.length}/${totalSamples} samples`); } return results; diff --git a/src/evals/core/benchmark-calculator.ts b/src/evals/core/benchmark-calculator.ts new file mode 100644 index 0000000..3362698 --- /dev/null +++ b/src/evals/core/benchmark-calculator.ts @@ -0,0 +1,295 @@ +/** + * Advanced metrics calculator for guardrail benchmarking. + * + * This module implements advanced evaluation metrics for benchmarking guardrail performance + * across different models. + */ + +import { SampleResult } from './types'; + +/** + * Calculates advanced benchmarking metrics for guardrail evaluation. + */ +export class BenchmarkMetricsCalculator { + /** + * Calculate advanced metrics for a specific guardrail. 
+ * + * @param results - List of evaluation results + * @param guardrailName - Name of the guardrail to analyze + * @param guardrailConfig - Guardrail configuration to check for confidence thresholds + * @returns Dictionary containing advanced metrics, or empty dict if not applicable + */ + calculateAdvancedMetrics( + results: SampleResult[], + guardrailName: string, + guardrailConfig?: Record | null + ): Record { + if (!guardrailConfig || !('confidence_threshold' in guardrailConfig)) { + return {}; + } + + if (results.length === 0) { + throw new Error('Cannot calculate metrics for empty results list'); + } + + const { yTrue, yScores } = this.extractLabelsAndScores(results, guardrailName); + + if (yTrue.length === 0) { + throw new Error(`No valid data found for guardrail '${guardrailName}'`); + } + + return this.calculateMetrics(yTrue, yScores); + } + + private extractLabelsAndScores( + results: SampleResult[], + guardrailName: string + ): { yTrue: number[]; yScores: number[] } { + const yTrue: number[] = []; + const yScores: number[] = []; + + for (const result of results) { + if (!(guardrailName in result.expectedTriggers)) { + console.warn( + `Guardrail '${guardrailName}' not found in expectedTriggers for sample ${result.id}` + ); + continue; + } + + const expected = result.expectedTriggers[guardrailName]; + yTrue.push(expected ? 1 : 0); + + // Get confidence score from details, fallback to binary + const confidence = this.getConfidenceScore(result, guardrailName); + yScores.push(confidence); + } + + return { yTrue, yScores }; + } + + private getConfidenceScore(result: SampleResult, guardrailName: string): number { + if (guardrailName in result.details) { + const guardrailDetails = result.details[guardrailName]; + if ( + typeof guardrailDetails === 'object' && + guardrailDetails !== null && + 'confidence' in guardrailDetails + ) { + const conf = guardrailDetails.confidence; + if (typeof conf === 'number') { + return conf; + } + } + } + + // Fallback to binary: 1.0 if triggered, 0.0 if not + const actual = result.triggered[guardrailName] || false; + return actual ? 
1.0 : 0.0; + } + + private calculateMetrics(yTrue: number[], yScores: number[]): Record { + const metrics: Record = {}; + + // Calculate ROC AUC + try { + metrics.roc_auc = this.calculateRocAuc(yTrue, yScores); + } catch (error) { + console.warn(`Could not calculate ROC AUC: ${error}`); + metrics.roc_auc = NaN; + } + + // Calculate precision at different recall thresholds + try { + const { precision, recall } = this.precisionRecallCurve(yTrue, yScores); + metrics.prec_at_r80 = this.precisionAtRecall(precision, recall, 0.8); + metrics.prec_at_r90 = this.precisionAtRecall(precision, recall, 0.9); + metrics.prec_at_r95 = this.precisionAtRecall(precision, recall, 0.95); + } catch (error) { + console.warn(`Could not calculate precision at recall thresholds: ${error}`); + metrics.prec_at_r80 = NaN; + metrics.prec_at_r90 = NaN; + metrics.prec_at_r95 = NaN; + } + + // Calculate recall at FPR = 0.01 + try { + const { fpr, tpr } = this.rocCurve(yTrue, yScores); + metrics.recall_at_fpr01 = this.recallAtFpr(fpr, tpr, 0.01); + } catch (error) { + console.warn(`Could not calculate recall at FPR=0.01: ${error}`); + metrics.recall_at_fpr01 = NaN; + } + + return metrics; + } + + private calculateRocAuc(yTrue: number[], yScores: number[]): number { + // Sort by score descending + const combined = yTrue.map((label, i) => ({ label, score: yScores[i] })); + combined.sort((a, b) => b.score - a.score); + + const totalPositives = yTrue.filter((y) => y === 1).length; + const totalNegatives = yTrue.length - totalPositives; + + if (totalPositives === 0 || totalNegatives === 0) { + throw new Error('Need both positive and negative samples to calculate ROC AUC'); + } + + let auc = 0; + let tp = 0; + let fp = 0; + let prevTpr = 0; + let prevFpr = 0; + + for (const item of combined) { + if (item.label === 1) { + tp += 1; + } else { + fp += 1; + } + + const tpr = tp / totalPositives; + const fpr = fp / totalNegatives; + + // Trapezoidal rule + auc += (fpr - prevFpr) * (tpr + prevTpr) / 2; + + prevTpr = tpr; + prevFpr = fpr; + } + + return auc; + } + + private precisionRecallCurve(yTrue: number[], yScores: number[]): { + precision: number[]; + recall: number[]; + } { + // Sort by score descending + const combined = yTrue.map((label, i) => ({ label, score: yScores[i] })); + combined.sort((a, b) => b.score - a.score); + + const totalPositives = yTrue.filter((y) => y === 1).length; + if (totalPositives === 0) { + return { precision: [1], recall: [0] }; + } + + const precision: number[] = []; + const recall: number[] = []; + + let tp = 0; + let fp = 0; + + // Add initial point (recall=0, precision=1) + precision.push(1); + recall.push(0); + + for (const item of combined) { + if (item.label === 1) { + tp += 1; + } else { + fp += 1; + } + + const prec = tp + fp > 0 ? 
tp / (tp + fp) : 1; + const rec = tp / totalPositives; + + precision.push(prec); + recall.push(rec); + } + + return { precision, recall }; + } + + private rocCurve(yTrue: number[], yScores: number[]): { fpr: number[]; tpr: number[] } { + // Sort by score descending + const combined = yTrue.map((label, i) => ({ label, score: yScores[i] })); + combined.sort((a, b) => b.score - a.score); + + const totalPositives = yTrue.filter((y) => y === 1).length; + const totalNegatives = yTrue.length - totalPositives; + + const fpr: number[] = [0]; + const tpr: number[] = [0]; + + let tp = 0; + let fp = 0; + + for (const item of combined) { + if (item.label === 1) { + tp += 1; + } else { + fp += 1; + } + + tpr.push(tp / totalPositives); + fpr.push(fp / totalNegatives); + } + + return { fpr, tpr }; + } + + private precisionAtRecall(precision: number[], recall: number[], targetRecall: number): number { + let bestPrecision = 0; + + for (let i = 0; i < recall.length; i += 1) { + if (recall[i] >= targetRecall) { + bestPrecision = Math.max(bestPrecision, precision[i]); + } + } + + return bestPrecision; + } + + private recallAtFpr(fpr: number[], tpr: number[], targetFpr: number): number { + let bestRecall = 0; + + for (let i = 0; i < fpr.length; i += 1) { + if (fpr[i] <= targetFpr) { + bestRecall = Math.max(bestRecall, tpr[i]); + } + } + + return bestRecall; + } + + /** + * Calculate advanced metrics for all guardrails in the results. + * + * @param results - List of evaluation results + * @returns Dictionary mapping guardrail names to their advanced metrics + */ + calculateAllGuardrailMetrics( + results: SampleResult[] + ): Record> { + if (results.length === 0) { + return {}; + } + + const guardrailNames = new Set(); + for (const result of results) { + Object.keys(result.expectedTriggers).forEach((name) => guardrailNames.add(name)); + } + + const metrics: Record> = {}; + + for (const guardrailName of guardrailNames) { + try { + const guardrailMetrics = this.calculateAdvancedMetrics(results, guardrailName); + metrics[guardrailName] = guardrailMetrics; + } catch (error) { + console.error(`Failed to calculate metrics for guardrail '${guardrailName}': ${error}`); + metrics[guardrailName] = { + roc_auc: NaN, + prec_at_r80: NaN, + prec_at_r90: NaN, + prec_at_r95: NaN, + recall_at_fpr01: NaN, + }; + } + } + + return metrics; + } +} + diff --git a/src/evals/core/benchmark-reporter.ts b/src/evals/core/benchmark-reporter.ts new file mode 100644 index 0000000..62d39b1 --- /dev/null +++ b/src/evals/core/benchmark-reporter.ts @@ -0,0 +1,291 @@ +/** + * Benchmark results reporter for guardrail evaluation. + * + * This module handles saving benchmark results in a specialized format with analysis + * folders containing visualizations and detailed metrics. + */ + +import { SampleResult } from './types'; +import * as fs from 'fs/promises'; +import * as path from 'path'; + +/** + * Reports benchmark results with specialized output format. + */ +export class BenchmarkReporter { + private readonly outputDir: string; + + /** + * Initialize the benchmark reporter. + * + * @param outputDir - Base directory for benchmark results + */ + constructor(outputDir: string) { + this.outputDir = outputDir; + } + + /** + * Save benchmark results in organized folder structure. 
+ * + * @param resultsByModel - Dictionary mapping model names to their results + * @param metricsByModel - Dictionary mapping model names to their metrics + * @param latencyResults - Dictionary mapping model names to their latency data + * @param guardrailName - Name of the guardrail being benchmarked + * @param datasetSize - Number of samples in the dataset + * @param latencyIterations - Number of iterations used for latency testing + * @returns Path to the benchmark results directory + */ + async saveBenchmarkResults( + resultsByModel: Record, + metricsByModel: Record>, + latencyResults: Record>, + guardrailName: string, + datasetSize: number, + latencyIterations: number + ): Promise { + const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19).replace('T', '_'); + const benchmarkDir = path.join(this.outputDir, `benchmark_${guardrailName}_${timestamp}`); + + await fs.mkdir(benchmarkDir, { recursive: true }); + + // Create subdirectories + const resultsDir = path.join(benchmarkDir, 'results'); + const graphsDir = path.join(benchmarkDir, 'graphs'); + await fs.mkdir(resultsDir, { recursive: true }); + await fs.mkdir(graphsDir, { recursive: true }); + + try { + // Save per-model results + for (const [modelName, results] of Object.entries(resultsByModel)) { + const modelResultsFile = path.join( + resultsDir, + `eval_results_${guardrailName}_${modelName}.jsonl` + ); + await this.saveResultsJsonl(results, modelResultsFile); + console.info(`Model ${modelName} results saved to ${modelResultsFile}`); + } + + // Save combined data + await this.saveMetricsJson(metricsByModel, path.join(resultsDir, 'performance_metrics.json')); + await this.saveLatencyJson(latencyResults, path.join(resultsDir, 'latency_results.json')); + + // Save summary files + const summaryFile = path.join(benchmarkDir, 'benchmark_summary.txt'); + await this.saveBenchmarkSummary( + summaryFile, + guardrailName, + resultsByModel, + metricsByModel, + latencyResults, + datasetSize, + latencyIterations + ); + + await this.saveSummaryTables(benchmarkDir, metricsByModel, latencyResults); + } catch (error) { + console.error(`Failed to save benchmark results: ${error}`); + throw error; + } + + console.info(`Benchmark results saved to: ${benchmarkDir}`); + return benchmarkDir; + } + + private createPerformanceTable( + metricsByModel: Record> + ): string[][] { + if (Object.keys(metricsByModel).length === 0) { + return []; + } + + const metricKeys = ['precision', 'recall', 'f1Score', 'roc_auc']; + const metricNames = ['Precision', 'Recall', 'F1 Score', 'ROC AUC']; + + const table: string[][] = []; + const header = ['Model', ...metricNames]; + table.push(header); + + for (const [modelName, modelMetrics] of Object.entries(metricsByModel)) { + const row: string[] = [modelName]; + for (const key of metricKeys) { + const value = modelMetrics[key]; + if (value === undefined || isNaN(value)) { + row.push('N/A'); + } else { + row.push(value.toFixed(4)); + } + } + table.push(row); + } + + return table; + } + + private createLatencyTable(latencyResults: Record>): string[][] { + if (Object.keys(latencyResults).length === 0) { + return []; + } + + const table: string[][] = []; + const header = ['Model', 'TTC P50 (ms)', 'TTC P95 (ms)']; + table.push(header); + + for (const [modelName, modelLatency] of Object.entries(latencyResults)) { + const row: string[] = [modelName]; + + if ('ttc' in modelLatency && typeof modelLatency.ttc === 'object' && modelLatency.ttc !== null) { + const ttcData = modelLatency.ttc as Record; + const p50 = 
ttcData.p50; + const p95 = ttcData.p95; + + row.push( + typeof p50 === 'number' && !isNaN(p50) ? p50.toFixed(1) : 'N/A', + typeof p95 === 'number' && !isNaN(p95) ? p95.toFixed(1) : 'N/A' + ); + } else { + row.push('N/A', 'N/A'); + } + + table.push(row); + } + + return table; + } + + private formatTable(table: string[][]): string { + if (table.length === 0) { + return 'No data available'; + } + + // Calculate column widths + const widths: number[] = []; + for (let col = 0; col < table[0].length; col += 1) { + let maxWidth = 0; + for (const row of table) { + if (row[col]) { + maxWidth = Math.max(maxWidth, row[col].length); + } + } + widths.push(maxWidth); + } + + // Format rows + const lines: string[] = []; + for (const row of table) { + const formattedRow = row + .map((cell, i) => (cell || '').padEnd(widths[i] || 0)) + .join(' '); + lines.push(formattedRow); + } + + return lines.join('\n'); + } + + private async saveSummaryTables( + benchmarkDir: string, + metricsByModel: Record>, + latencyResults: Record> + ): Promise { + const outputFile = path.join(benchmarkDir, 'benchmark_summary_tables.txt'); + + try { + const perfTable = this.createPerformanceTable(metricsByModel); + const latencyTable = this.createLatencyTable(latencyResults); + + let content = 'BENCHMARK SUMMARY TABLES\n'; + content += '='.repeat(80) + '\n\n'; + + content += 'PERFORMANCE METRICS\n'; + content += '-'.repeat(80) + '\n'; + content += perfTable.length > 0 ? this.formatTable(perfTable) : 'No data available'; + content += '\n\n'; + + content += 'LATENCY RESULTS (Time to Completion)\n'; + content += '-'.repeat(80) + '\n'; + content += latencyTable.length > 0 ? this.formatTable(latencyTable) : 'No data available'; + content += '\n\n'; + + await fs.writeFile(outputFile, content, 'utf-8'); + console.info(`Summary tables saved to: ${outputFile}`); + } catch (error) { + console.error(`Failed to save summary tables: ${error}`); + } + } + + private async saveResultsJsonl(results: SampleResult[], filepath: string): Promise { + const lines = results.map((result) => + JSON.stringify({ + id: result.id, + expected_triggers: result.expectedTriggers, + triggered: result.triggered, + details: result.details || {}, + }) + ); + await fs.writeFile(filepath, lines.join('\n'), 'utf-8'); + } + + private async saveMetricsJson( + metricsByModel: Record>, + filepath: string + ): Promise { + await fs.writeFile(filepath, JSON.stringify(metricsByModel, null, 2), 'utf-8'); + } + + private async saveLatencyJson( + latencyResults: Record>, + filepath: string + ): Promise { + await fs.writeFile(filepath, JSON.stringify(latencyResults, null, 2), 'utf-8'); + } + + private async saveBenchmarkSummary( + filepath: string, + guardrailName: string, + resultsByModel: Record, + metricsByModel: Record>, + latencyResults: Record>, + datasetSize: number, + latencyIterations: number + ): Promise { + let content = 'Guardrail Benchmark Results\n'; + content += '===========================\n\n'; + content += `Guardrail: ${guardrailName}\n`; + content += `Timestamp: ${new Date().toISOString()}\n`; + content += `Dataset size: ${datasetSize} samples\n`; + content += `Latency iterations: ${latencyIterations}\n\n`; + + content += `Models evaluated: ${Object.keys(resultsByModel).join(', ')}\n\n`; + + content += 'Performance Metrics Summary:\n'; + content += '---------------------------\n'; + for (const [modelName, metrics] of Object.entries(metricsByModel)) { + content += `\n${modelName}:\n`; + for (const [metricName, value] of Object.entries(metrics)) { + if (typeof value 
=== 'number' && !isNaN(value)) { + content += ` ${metricName}: ${value}\n`; + } else { + content += ` ${metricName}: N/A\n`; + } + } + } + + content += '\nLatency Summary:\n'; + content += '----------------\n'; + for (const [modelName, latencyData] of Object.entries(latencyResults)) { + content += `\n${modelName}:\n`; + if ('error' in latencyData) { + content += ` Error: ${latencyData.error}\n`; + } else { + const ttft = latencyData.ttft as Record | undefined; + const ttc = latencyData.ttc as Record | undefined; + if (ttft && ttc) { + content += ` TTFT P50: ${ttft.p50?.toFixed(1) || 'N/A'}ms, P95: ${ttft.p95?.toFixed(1) || 'N/A'}ms\n`; + content += ` TTC P50: ${ttc.p50?.toFixed(1) || 'N/A'}ms, P95: ${ttc.p95?.toFixed(1) || 'N/A'}ms\n`; + } + } + } + + await fs.writeFile(filepath, content, 'utf-8'); + } +} + diff --git a/src/evals/core/index.ts b/src/evals/core/index.ts index 6438799..2de8e5d 100644 --- a/src/evals/core/index.ts +++ b/src/evals/core/index.ts @@ -11,3 +11,7 @@ export * from './validate-dataset'; export * from './async-engine'; export * from './calculator'; export * from './json-reporter'; +export * from './benchmark-calculator'; +export * from './benchmark-reporter'; +export * from './latency-tester'; +export * from './visualizer'; diff --git a/src/evals/core/latency-tester.ts b/src/evals/core/latency-tester.ts new file mode 100644 index 0000000..7134889 --- /dev/null +++ b/src/evals/core/latency-tester.ts @@ -0,0 +1,124 @@ +/** + * Latency testing for guardrail benchmarking. + * + * This module implements end-to-end guardrail latency testing for different models. + */ + +import { Context, Sample } from './types'; +import { AsyncRunEngine } from './async-engine'; +import { instantiateGuardrails, GuardrailBundle } from '../../runtime'; + +/** + * Tests end-to-end guardrail latency for different models. + */ +export class LatencyTester { + private readonly iterations: number; + + /** + * Initialize the latency tester. + * + * @param iterations - Number of samples to time per model + */ + constructor(iterations: number = 20) { + this.iterations = iterations; + } + + /** + * Calculate latency statistics from a list of times. + * + * @param times - List of latency times in seconds + * @returns Dictionary with P50, P95, mean, and std dev (in milliseconds) + */ + calculateLatencyStats(times: number[]): Record { + if (times.length === 0) { + return { p50: NaN, p95: NaN, mean: NaN, std: NaN }; + } + + const timesMs = times.map((t) => t * 1000); // Convert to milliseconds + const sorted = [...timesMs].sort((a, b) => a - b); + + const p50 = this.percentile(sorted, 50); + const p95 = this.percentile(sorted, 95); + const mean = timesMs.reduce((a, b) => a + b, 0) / timesMs.length; + const variance = timesMs.reduce((sum, val) => sum + Math.pow(val - mean, 2), 0) / timesMs.length; + const std = Math.sqrt(variance); + + return { + p50, + p95, + mean, + std, + }; + } + + /** + * Measure end-to-end guardrail latency per sample for a single model. 
+ * + * @param context - Evaluation context with LLM client + * @param stageBundle - Stage bundle configured for the specific model + * @param samples - Full dataset samples + * @param iterations - Number of samples to time (uses first N samples) + * @param desc - Optional progress bar description + * @returns Dictionary with latency statistics and raw times + */ + async testGuardrailLatencyForModel( + context: Context, + stageBundle: GuardrailBundle, + samples: Sample[], + iterations: number, + desc?: string + ): Promise> { + const guardrails = await instantiateGuardrails(stageBundle); + const engine = new AsyncRunEngine(guardrails); + + const num = Math.min(iterations, samples.length); + if (num <= 0) { + return this.emptyLatencyResult(); + } + + const ttcTimes: number[] = []; + const barDesc = desc || 'Latency'; + + console.log(`${barDesc}: ${num} samples`); + + for (let i = 0; i < num; i += 1) { + const sample = samples[i]; + const start = performance.now() / 1000; // Convert to seconds + await engine.run(context, [sample], 1, undefined); + const ttc = performance.now() / 1000 - start; + ttcTimes.push(ttc); + console.log(`${barDesc}: Processed ${i + 1}/${num} samples`); + } + + const ttcStats = this.calculateLatencyStats(ttcTimes); + + return { + ttft: ttcStats, // TTFT same as TTC at guardrail level + ttc: ttcStats, + rawTimes: { ttft: ttcTimes, ttc: ttcTimes }, + iterations: ttcTimes.length, + }; + } + + private emptyLatencyResult(): Record { + const emptyStats = { p50: NaN, p95: NaN, mean: NaN, std: NaN }; + return { + ttft: emptyStats, + ttc: emptyStats, + rawTimes: { ttft: [], ttc: [] }, + iterations: 0, + }; + } + + private percentile(sorted: number[], p: number): number { + if (sorted.length === 0) { + return NaN; + } + const index = (p / 100) * (sorted.length - 1); + const lower = Math.floor(index); + const upper = Math.ceil(index); + const weight = index - lower; + return sorted[lower] * (1 - weight) + sorted[upper] * weight; + } +} + diff --git a/src/evals/core/visualizer.ts b/src/evals/core/visualizer.ts new file mode 100644 index 0000000..a637195 --- /dev/null +++ b/src/evals/core/visualizer.ts @@ -0,0 +1,68 @@ +/** + * Visualization module for guardrail benchmarking. + * + * This module generates charts and graphs for benchmark results. + * Note: Full visualization requires additional plotting libraries. + * This is a stub implementation that matches the Python interface. + */ + +import * as fs from 'fs/promises'; +import * as path from 'path'; + +/** + * Generates visualizations for guardrail benchmark results. + */ +export class BenchmarkVisualizer { + private readonly outputDir: string; + + /** + * Initialize the visualizer. + * + * @param outputDir - Directory to save generated charts + */ + constructor(outputDir: string) { + this.outputDir = outputDir; + } + + /** + * Create all visualizations for a benchmark run. 
+ * + * @param resultsByModel - Dictionary mapping model names to their results + * @param metricsByModel - Dictionary mapping model names to their metrics + * @param latencyResults - Dictionary mapping model names to their latency data + * @param guardrailName - Name of the guardrail being evaluated + * @param _expectedTriggers - Expected trigger values for each sample (reserved for future use) + * @returns List of paths to saved visualization files + */ + async createAllVisualizations( + resultsByModel: Record, + metricsByModel: Record>, + latencyResults: Record>, + guardrailName: string, + _expectedTriggers: Record + ): Promise { + const savedFiles: string[] = []; + + // Ensure output directory exists + await fs.mkdir(this.outputDir, { recursive: true }); + + // Note: Full visualization requires plotting libraries (e.g., plotly, chart.js, etc.) + // For now, we create a placeholder file indicating visualizations would be generated here + try { + const placeholderFile = path.join(this.outputDir, 'visualizations_placeholder.txt'); + await fs.writeFile( + placeholderFile, + `Visualizations would be generated here for guardrail: ${guardrailName}\n` + + `Models: ${Object.keys(resultsByModel).join(', ')}\n` + + `Note: Full visualization requires additional plotting libraries.\n`, + 'utf-8' + ); + savedFiles.push(placeholderFile); + } catch (error) { + console.error(`Failed to create visualization placeholder: ${error}`); + } + + return savedFiles; + } +} + diff --git a/src/evals/guardrail-evals.ts b/src/evals/guardrail-evals.ts index a40e352..9bcb813 100644 --- a/src/evals/guardrail-evals.ts +++ b/src/evals/guardrail-evals.ts @@ -1,17 +1,34 @@ /** - * Guardrail evaluation runner. + * Guardrail evaluation runner and CLI. * - * This class provides the main interface for running guardrail evaluations on datasets. - * It loads guardrail configurations, runs evaluations asynchronously, calculates metrics, and saves results. + * This script provides a command-line interface and class for running guardrail evaluations on datasets. */ -import { Context } from './core/types'; +import { Context, Sample, SampleResult } from './core/types'; import { JsonlDatasetLoader } from './core/jsonl-loader'; import { AsyncRunEngine } from './core/async-engine'; import { GuardrailMetricsCalculator } from './core/calculator'; import { JsonResultsReporter } from './core/json-reporter'; -import { loadConfigBundleFromFile, instantiateGuardrails } from '../runtime'; +import { BenchmarkMetricsCalculator } from './core/benchmark-calculator'; +import { BenchmarkReporter } from './core/benchmark-reporter'; +import { BenchmarkVisualizer } from './core/visualizer'; +import { LatencyTester } from './core/latency-tester'; +import { + instantiateGuardrails, + loadPipelineBundles, + PipelineConfig, + GuardrailBundle, +} from '../runtime'; import { OpenAI } from 'openai'; +import * as os from 'os'; +import * as fs from 'fs/promises'; +import * as path from 'path'; + +// Default models for benchmark mode +const DEFAULT_BENCHMARK_MODELS = ['gpt-5', 'gpt-5-mini', 'gpt-4.1', 'gpt-4.1-mini']; +const DEFAULT_BATCH_SIZE = 32; +const DEFAULT_LATENCY_ITERATIONS = 25; +const VALID_STAGES = new Set(['pre_flight', 'input', 'output']); /** * Class for running guardrail evaluations. 
@@ -19,69 +36,676 @@ import { OpenAI } from 'openai'; export class GuardrailEval { private configPath: string; private datasetPath: string; + private stages: string[] | null; private batchSize: number; private outputDir: string; + private apiKey: string | null; + private baseUrl: string | null; + private azureEndpoint: string | null; + private azureApiVersion: string; + private mode: 'evaluate' | 'benchmark'; + private models: string[]; + private latencyIterations: number; private multiTurn: boolean; + private maxParallelModels: number; + private benchmarkChunkSize: number | null; /** * Initialize the evaluator. * - * @param configPath - Path to the guardrail config file - * @param datasetPath - Path to the evaluation dataset + * @param configPath - Path to pipeline configuration file + * @param datasetPath - Path to evaluation dataset (JSONL) + * @param stages - Specific stages to evaluate (pre_flight, input, output) * @param batchSize - Number of samples to process in parallel * @param outputDir - Directory to save evaluation results + * @param apiKey - API key for OpenAI, Azure OpenAI, or OpenAI-compatible API + * @param baseUrl - Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1) + * @param azureEndpoint - Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com) + * @param azureApiVersion - Azure OpenAI API version (e.g., 2025-01-01-preview) + * @param mode - Evaluation mode ("evaluate" or "benchmark") + * @param models - Models to test in benchmark mode + * @param latencyIterations - Number of iterations for latency testing + * @param multiTurn - Whether to evaluate guardrails on multi-turn conversations + * @param maxParallelModels - Maximum number of models to benchmark concurrently + * @param benchmarkChunkSize - Optional sample chunk size for per-model benchmarking */ constructor( configPath: string, datasetPath: string, - batchSize: number = 32, + stages: string[] | null = null, + batchSize: number = DEFAULT_BATCH_SIZE, outputDir: string = 'results', - multiTurn: boolean = false + apiKey: string | null = null, + baseUrl: string | null = null, + azureEndpoint: string | null = null, + azureApiVersion: string = '2025-01-01-preview', + mode: 'evaluate' | 'benchmark' = 'evaluate', + models: string[] | null = null, + latencyIterations: number = DEFAULT_LATENCY_ITERATIONS, + multiTurn: boolean = false, + maxParallelModels: number | null = null, + benchmarkChunkSize: number | null = null ) { + // Note: File existence validation will happen in run() method + // since constructor cannot be async + if (batchSize <= 0) { + throw new Error(`Batch size must be positive, got: ${batchSize}`); + } + + if (mode !== 'evaluate' && mode !== 'benchmark') { + throw new Error(`Invalid mode: ${mode}. 
Must be 'evaluate' or 'benchmark'`); + } + + if (latencyIterations <= 0) { + throw new Error(`Latency iterations must be positive, got: ${latencyIterations}`); + } + + if (maxParallelModels !== null && maxParallelModels <= 0) { + throw new Error(`max_parallel_models must be positive, got: ${maxParallelModels}`); + } + + if (benchmarkChunkSize !== null && benchmarkChunkSize <= 0) { + throw new Error(`benchmark_chunk_size must be positive, got: ${benchmarkChunkSize}`); + } + this.configPath = configPath; this.datasetPath = datasetPath; + this.stages = stages; this.batchSize = batchSize; this.outputDir = outputDir; + this.apiKey = apiKey; + this.baseUrl = baseUrl; + this.azureEndpoint = azureEndpoint; + this.azureApiVersion = azureApiVersion; + this.mode = mode; + this.models = models || [...DEFAULT_BENCHMARK_MODELS]; + this.latencyIterations = latencyIterations; this.multiTurn = multiTurn; + this.maxParallelModels = GuardrailEval._determineParallelModelLimit( + this.models.length, + maxParallelModels + ); + this.benchmarkChunkSize = benchmarkChunkSize; + } + + private async _validateFilePaths(): Promise { + try { + await fs.access(this.configPath); + } catch { + throw new Error(`Config file not found: ${this.configPath}`); + } + + try { + await fs.access(this.datasetPath); + } catch { + throw new Error(`Dataset file not found: ${this.datasetPath}`); + } } /** - * Run the evaluation pipeline. + * Resolve the number of benchmark tasks that can run concurrently. * - * @param desc - Description for the evaluation process + * @param modelCount - Total number of models scheduled for benchmarking + * @param requestedLimit - Optional user-provided parallelism limit + * @returns Number of concurrent benchmark tasks to run */ - async run(desc: string = 'Evaluating samples'): Promise { - // Load/validate config, instantiate guardrails - const bundle = await loadConfigBundleFromFile(this.configPath); - const guardrails = await instantiateGuardrails(bundle); + static _determineParallelModelLimit(modelCount: number, requestedLimit?: number | null): number { + if (modelCount <= 0) { + throw new Error('modelCount must be positive'); + } + + if (requestedLimit !== null && requestedLimit !== undefined) { + if (requestedLimit <= 0) { + throw new Error('maxParallelModels must be positive'); + } + return Math.min(requestedLimit, modelCount); + } + + const cpuCount = os.cpus().length || 1; + return Math.max(1, Math.min(cpuCount, modelCount)); + } + + /** + * Yield contiguous sample chunks respecting the configured chunk size. + * + * @param samples - Samples to evaluate + * @param chunkSize - Optional maximum chunk size to enforce + * @returns Generator yielding slices of the provided samples + */ + static *_chunkSamples(samples: Sample[], chunkSize?: number | null): Generator { + if (chunkSize !== null && chunkSize !== undefined && chunkSize <= 0) { + throw new Error('chunkSize must be positive when provided'); + } + + if (!samples || samples.length === 0 || chunkSize === null || chunkSize === undefined || chunkSize >= samples.length) { + yield samples; + return; + } + + for (let start = 0; start < samples.length; start += chunkSize) { + yield samples.slice(start, start + chunkSize); + } + } + + /** + * Run the evaluation pipeline for all specified stages. 
+ */ + async run(): Promise { + await this._validateFilePaths(); + try { + if (this.mode === 'benchmark') { + await this._runBenchmark(); + } else { + await this._runEvaluation(); + } + } catch (error) { + console.error(`Evaluation failed: ${error}`); + throw error; + } + } + + private async _runEvaluation(): Promise { + const pipelineBundles = await loadPipelineBundles(this.configPath); + const stagesToEvaluate = this._getValidStages(pipelineBundles); + + if (stagesToEvaluate.length === 0) { + throw new Error('No valid stages found in configuration'); + } + + console.info(`event="evaluation_start" stages="${stagesToEvaluate.join(', ')}" mode="evaluate"`); - // Load and validate dataset const loader = new JsonlDatasetLoader(); const samples = await loader.load(this.datasetPath); + console.info(`Loaded ${samples.length} samples from dataset`); + + const context = this._createContext(); + const calculator = new GuardrailMetricsCalculator(); + const reporter = new JsonResultsReporter(); + + const allResults: Record = {}; + const allMetrics: Record> = {}; + + for (const stage of stagesToEvaluate) { + console.info(`Starting ${stage} stage evaluation`); + + try { + const stageResults = await this._evaluateSingleStage( + stage, + pipelineBundles, + samples, + context, + calculator + ); + + if (stageResults) { + allResults[stage] = stageResults.results; + allMetrics[stage] = stageResults.metrics; + console.info(`Completed ${stage} stage evaluation`); + } else { + console.warn(`Stage '${stage}' evaluation returned no results`); + } + } catch (error) { + console.error(`Failed to evaluate stage '${stage}': ${error}`); + } + } + + if (Object.keys(allResults).length === 0) { + throw new Error('No stages were successfully evaluated'); + } + + // Note: JsonResultsReporter.save_multi_stage would need to be implemented + // For now, save each stage separately + for (const [stage, results] of Object.entries(allResults)) { + const stageMetrics = allMetrics[stage] as ReturnType; + await reporter.save(results, stageMetrics, this.outputDir); + } + + console.info(`Evaluation completed. Results saved to: ${this.outputDir}`); + } + + private async _runBenchmark(): Promise { + console.info(`event="benchmark_start" duration_ms=0 models="${this.models.join(', ')}"`); + console.info( + `event="benchmark_parallel_config" duration_ms=0 parallel_limit=${this.maxParallelModels} chunk_size=${ + this.benchmarkChunkSize || 'dataset' + } batch_size=${this.batchSize}` + ); - // Initialize components - if (!process.env.OPENAI_API_KEY) { + const pipelineBundles = await loadPipelineBundles(this.configPath); + const { stageToTest, guardrailName } = this._getBenchmarkTarget(pipelineBundles); + + // Validate guardrail has model configuration + const stageBundle = (pipelineBundles as Record)[stageToTest]; + if (!this._hasModelConfiguration(stageBundle)) { throw new Error( - 'OPENAI_API_KEY environment variable is required. Please set it with: export OPENAI_API_KEY="your-api-key-here"' + `Guardrail '${guardrailName}' does not have a model configuration. ` + + 'Benchmark mode requires LLM-based guardrails with configurable models.' 
); } + console.info(`event="benchmark_target" duration_ms=0 guardrail="${guardrailName}" stage="${stageToTest}"`); + + const loader = new JsonlDatasetLoader(); + const samples = await loader.load(this.datasetPath); + console.info(`event="benchmark_samples_loaded" duration_ms=0 count=${samples.length}`); + + const context = this._createContext(); + const benchmarkCalculator = new BenchmarkMetricsCalculator(); + const basicCalculator = new GuardrailMetricsCalculator(); + const benchmarkReporter = new BenchmarkReporter(this.outputDir); + + // Run benchmark for all models + const { resultsByModel, metricsByModel } = await this._benchmarkAllModels( + stageToTest, + guardrailName, + samples, + context, + benchmarkCalculator, + basicCalculator, + pipelineBundles + ); + + // Run latency testing + console.info(`event="benchmark_latency_start" duration_ms=0 model_count=${this.models.length}`); + const latencyResults = await this._runLatencyTests(stageToTest, samples, pipelineBundles); + + // Save benchmark results + const benchmarkDir = await benchmarkReporter.saveBenchmarkResults( + resultsByModel, + metricsByModel, + latencyResults, + guardrailName, + samples.length, + this.latencyIterations + ); + + // Create visualizations + console.info(`event="benchmark_visualization_start" duration_ms=0 guardrail="${guardrailName}"`); + const visualizer = new BenchmarkVisualizer(path.join(benchmarkDir, 'graphs')); + const visualizationFiles = await visualizer.createAllVisualizations( + resultsByModel, + metricsByModel, + latencyResults, + guardrailName, + samples[0]?.expectedTriggers || {} + ); + + console.info(`event="benchmark_complete" duration_ms=0 output="${benchmarkDir}"`); + console.info(`event="benchmark_visualization_complete" duration_ms=0 count=${visualizationFiles.length}`); + } + + private _hasModelConfiguration(stageBundle: GuardrailBundle | undefined): boolean { + if (!stageBundle || !stageBundle.guardrails || stageBundle.guardrails.length === 0) { + return false; + } + + const guardrailConfig = stageBundle.guardrails[0]?.config; + if (!guardrailConfig) { + return false; + } + + if (typeof guardrailConfig === 'object' && 'model' in guardrailConfig) { + return true; + } + + return false; + } + + private async _runLatencyTests( + stageToTest: string, + samples: Sample[], + pipelineBundles: PipelineConfig + ): Promise>> { + const latencyResults: Record> = {}; + const latencyTester = new LatencyTester(this.latencyIterations); + + for (const model of this.models) { + const stageBundle = (pipelineBundles as Record)[stageToTest]; + const modelStageBundle = this._createModelSpecificStageBundle(stageBundle, model); + const modelContext = this._createContext(); + latencyResults[model] = await latencyTester.testGuardrailLatencyForModel( + modelContext, + modelStageBundle, + samples, + this.latencyIterations, + `Testing latency: ${model}` + ); + } + + return latencyResults; + } + + private _createContext(): Context { + // Azure OpenAI + if (this.azureEndpoint) { + // Validate API key availability + const apiKey = this.apiKey || process.env.OPENAI_API_KEY; + if (!apiKey) { + throw new Error( + 'API key is required for Azure OpenAI. Please provide --api-key or set OPENAI_API_KEY environment variable.' 
+ ); + } + + const azureKwargs: Record = { + azureEndpoint: this.azureEndpoint, + apiVersion: this.azureApiVersion, + }; + if (this.apiKey) { + azureKwargs.apiKey = this.apiKey; + } + + // Note: Azure OpenAI client creation would need AzureOpenAI import + // For now, fall back to regular OpenAI with base URL const openaiClient = new OpenAI({ - apiKey: process.env.OPENAI_API_KEY, - }); - const context: Context = { guardrailLlm: openaiClient }; + apiKey: apiKey, + baseURL: `https://${this.azureEndpoint.replace(/^https?:\/\//, '')}/openai/deployments`, + }); + console.info(`event="client_created" type="azure" endpoint="${this.azureEndpoint}"`); + return { guardrailLlm: openaiClient }; + } + // OpenAI or OpenAI-compatible API + else { + const openaiKwargs: Record = {}; + if (this.apiKey) { + openaiKwargs.apiKey = this.apiKey; + } else if (process.env.OPENAI_API_KEY) { + openaiKwargs.apiKey = process.env.OPENAI_API_KEY; + } else { + throw new Error( + 'OPENAI_API_KEY environment variable is required. Please set it with: export OPENAI_API_KEY="your-api-key-here"' + ); + } + if (this.baseUrl) { + openaiKwargs.baseURL = this.baseUrl; + console.info(`event="client_created" type="openai_compatible" base_url="${this.baseUrl}"`); + } else { + console.info(`event="client_created" type="openai"`); + } + + const openaiClient = new OpenAI(openaiKwargs); + return { guardrailLlm: openaiClient }; + } + } + + private _isValidStage(pipelineBundles: PipelineConfig, stage: string): boolean { + const bundles = pipelineBundles as Record; + const stageBundle = bundles[stage]; + return stageBundle !== undefined && stageBundle !== null && stageBundle.guardrails && stageBundle.guardrails.length > 0; + } + + /** + * Create a modified copy of a stage bundle with model-specific configuration. 
+ * + * @param stageBundle - Original stage bundle + * @param model - Model name to inject into guardrail configs + * @returns Modified stage bundle with updated model configuration + */ + private _createModelSpecificStageBundle(stageBundle: GuardrailBundle, model: string): GuardrailBundle { + // Deep copy the bundle using structuredClone for better performance + // Fall back to JSON parse/stringify for compatibility + let modifiedBundle: GuardrailBundle; + try { + modifiedBundle = structuredClone(stageBundle); + } catch { + modifiedBundle = JSON.parse(JSON.stringify(stageBundle)); + } + + for (const guardrail of modifiedBundle.guardrails) { + if (guardrail.config && typeof guardrail.config === 'object' && 'model' in guardrail.config) { + guardrail.config.model = model; + } + } + + return modifiedBundle; + } + + private _getValidStages(pipelineBundles: PipelineConfig): string[] { + if (this.stages === null) { + // Auto-detect all valid stages + const availableStages = Array.from(VALID_STAGES).filter((stage) => + this._isValidStage(pipelineBundles, stage) + ); + + if (availableStages.length === 0) { + throw new Error('No valid stages found in configuration'); + } + + console.info(`event="stage_auto_detection" stages="${availableStages.join(', ')}"`); + return availableStages; + } else { + // Validate requested stages + const validRequestedStages: string[] = []; + for (const stage of this.stages) { + if (!VALID_STAGES.has(stage)) { + console.warn(`Invalid stage '${stage}', skipping`); + continue; + } + + if (!this._isValidStage(pipelineBundles, stage)) { + console.warn(`Stage '${stage}' not found or has no guardrails configured, skipping`); + continue; + } + + validRequestedStages.push(stage); + } + + if (validRequestedStages.length === 0) { + throw new Error('No valid stages found in configuration'); + } + + return validRequestedStages; + } + } + + private async _evaluateSingleStage( + stage: string, + pipelineBundles: PipelineConfig, + samples: Sample[], + context: Context, + calculator: GuardrailMetricsCalculator + ): Promise<{ results: SampleResult[]; metrics: Record } | null> { + try { + const stageBundle = (pipelineBundles as Record)[stage]; + const guardrails = await instantiateGuardrails(stageBundle); + + const engine = new AsyncRunEngine(guardrails, this.multiTurn); + + const stageResults = await engine.run(context, samples, this.batchSize, `Evaluating ${stage} stage`); + + const stageMetrics = calculator.calculate(stageResults); + + return { results: stageResults, metrics: stageMetrics }; + } catch (error) { + console.error(`Failed to evaluate stage '${stage}': ${error}`); + return null; + } + } + + private _getBenchmarkTarget(pipelineBundles: PipelineConfig): { stageToTest: string; guardrailName: string } { + let stageToTest: string; + if (this.stages && this.stages.length > 0) { + stageToTest = this.stages[0]; + if (!this._isValidStage(pipelineBundles, stageToTest)) { + throw new Error(`Stage '${stageToTest}' has no guardrails configured`); + } + } else { + // Find first valid stage + stageToTest = Array.from(VALID_STAGES).find((stage) => this._isValidStage(pipelineBundles, stage)) || ''; + if (!stageToTest) { + throw new Error('No valid stage found for benchmarking'); + } + } + + const stageBundle = (pipelineBundles as Record)[stageToTest]; + const guardrailName = stageBundle.guardrails[0]?.name || 'unknown'; + + return { stageToTest, guardrailName }; + } + + private async _benchmarkAllModels( + stageToTest: string, + guardrailName: string, + samples: Sample[], + context: 
Context, + benchmarkCalculator: BenchmarkMetricsCalculator, + basicCalculator: GuardrailMetricsCalculator, + pipelineBundles: PipelineConfig + ): Promise<{ + resultsByModel: Record; + metricsByModel: Record>; + }> { + const stageBundle = (pipelineBundles as Record)[stageToTest]; + + const resultsByModel: Record = {}; + const metricsByModel: Record> = {}; + + // Create semaphore for concurrency control using a proper async queue + const maxActive = this.maxParallelModels; + const semaphore: Array<() => void> = []; + let running = 0; + + const acquire = (): Promise => { + return new Promise((resolve) => { + const tryAcquire = () => { + if (running < maxActive) { + running += 1; + resolve(); + } else { + semaphore.push(tryAcquire); + } + }; + tryAcquire(); + }); + }; + + const release = (): void => { + running -= 1; + if (semaphore.length > 0) { + const next = semaphore.shift(); + if (next) { + next(); + } + } + }; + + const runModelTask = async (index: number, model: string): Promise => { + await acquire(); + + const startTime = performance.now(); + console.info(`event="benchmark_model_start" duration_ms=0 model="${model}" position=${index} total=${this.models.length} active=${running}/${maxActive}`); + + try { + const modifiedStageBundle = this._createModelSpecificStageBundle(stageBundle, model); + + const modelResults = await this._benchmarkSingleModel( + model, + modifiedStageBundle, + samples, + context, + guardrailName, + benchmarkCalculator, + basicCalculator + ); + + const elapsedMs = performance.now() - startTime; + + if (modelResults) { + resultsByModel[model] = modelResults.results; + metricsByModel[model] = modelResults.metrics; + console.info(`event="benchmark_model_complete" duration_ms=${elapsedMs.toFixed(2)} model="${model}" status="success"`); + } else { + resultsByModel[model] = []; + metricsByModel[model] = {}; + console.warn(`event="benchmark_model_empty" duration_ms=${elapsedMs.toFixed(2)} model="${model}" status="no_results"`); + } + } catch (error) { + const elapsedMs = performance.now() - startTime; + resultsByModel[model] = []; + metricsByModel[model] = {}; + console.error(`event="benchmark_model_failure" duration_ms=${elapsedMs.toFixed(2)} model="${model}" error="${error}"`); + } finally { + release(); + } + }; + + // Start all tasks in parallel (they will be throttled by the semaphore) + const tasks = this.models.map((model, idx) => runModelTask(idx + 1, model)); + await Promise.all(tasks); + + // Log summary + const successfulModels = this.models.filter((model) => resultsByModel[model] && resultsByModel[model].length > 0); + const failedModels = this.models.filter((model) => !resultsByModel[model] || resultsByModel[model].length === 0); + + console.info(`event="benchmark_summary" duration_ms=0 successful=${successfulModels.length} failed=${failedModels.length}`); + console.info(`event="benchmark_successful_models" duration_ms=0 models="${successfulModels.join(', ') || 'None'}"`); + if (failedModels.length > 0) { + console.warn(`event="benchmark_failed_models" duration_ms=0 models="${failedModels.join(', ')}"`); + } + console.info(`event="benchmark_total_models" duration_ms=0 total=${this.models.length}`); + + return { resultsByModel, metricsByModel }; + } + + private async _benchmarkSingleModel( + model: string, + stageBundle: GuardrailBundle, + samples: Sample[], + context: Context, + guardrailName: string, + benchmarkCalculator: BenchmarkMetricsCalculator, + basicCalculator: GuardrailMetricsCalculator + ): Promise<{ results: SampleResult[]; metrics: Record } | 
null> { + try { + const guardrails = await instantiateGuardrails(stageBundle); const engine = new AsyncRunEngine(guardrails, this.multiTurn); - const calculator = new GuardrailMetricsCalculator(); - const reporter = new JsonResultsReporter(); + const chunkTotal = this.benchmarkChunkSize && samples.length > 0 + ? Math.max(1, Math.ceil(samples.length / this.benchmarkChunkSize)) + : 1; - // Run evaluations - const results = await engine.run(context, samples, this.batchSize, desc); + const modelResults: SampleResult[] = []; + let chunkIndex = 1; + for (const chunk of GuardrailEval._chunkSamples(samples, this.benchmarkChunkSize)) { + const chunkDesc = + chunkTotal === 1 + ? `Benchmarking ${model}` + : `Benchmarking ${model} (${chunkIndex}/${chunkTotal})`; + const chunkResults = await engine.run(context, chunk, this.batchSize, chunkDesc); + modelResults.push(...chunkResults); + chunkIndex += 1; + } - // Calculate metrics - const metrics = calculator.calculate(results); + const guardrailConfig = stageBundle.guardrails[0]?.config || null; + + const advancedMetrics = benchmarkCalculator.calculateAdvancedMetrics( + modelResults, + guardrailName, + guardrailConfig as Record | null + ); - // Save results - await reporter.save(results, metrics, this.outputDir); + const basicMetrics = basicCalculator.calculate(modelResults); + + let basicMetricsDict: Record = {}; + if (guardrailName in basicMetrics) { + const guardrailMetrics = basicMetrics[guardrailName]; + basicMetricsDict = { + precision: guardrailMetrics.precision, + recall: guardrailMetrics.recall, + f1Score: guardrailMetrics.f1Score, + truePositives: guardrailMetrics.truePositives, + falsePositives: guardrailMetrics.falsePositives, + falseNegatives: guardrailMetrics.falseNegatives, + trueNegatives: guardrailMetrics.trueNegatives, + totalSamples: guardrailMetrics.totalSamples, + }; + } + + const combinedMetrics = { ...basicMetricsDict, ...advancedMetrics }; + + return { results: modelResults, metrics: combinedMetrics }; + } catch (error) { + console.error(`Failed to benchmark model ${model}: ${error}`); + return null; + } } } @@ -93,16 +717,36 @@ export class GuardrailEval { export async function runEvaluationCLI(args: { configPath: string; datasetPath: string; + stages?: string[] | null; batchSize?: number; outputDir?: string; + apiKey?: string | null; + baseUrl?: string | null; + azureEndpoint?: string | null; + azureApiVersion?: string; + mode?: 'evaluate' | 'benchmark'; + models?: string[] | null; + latencyIterations?: number; multiTurn?: boolean; + maxParallelModels?: number | null; + benchmarkChunkSize?: number | null; }): Promise { const evaluator = new GuardrailEval( args.configPath, args.datasetPath, - args.batchSize || 32, + args.stages || null, + args.batchSize || DEFAULT_BATCH_SIZE, args.outputDir || 'results', - Boolean(args.multiTurn) + args.apiKey || null, + args.baseUrl || null, + args.azureEndpoint || null, + args.azureApiVersion || '2025-01-01-preview', + args.mode || 'evaluate', + args.models || null, + args.latencyIterations || DEFAULT_LATENCY_ITERATIONS, + Boolean(args.multiTurn), + args.maxParallelModels || null, + args.benchmarkChunkSize || null ); await evaluator.run();
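
---

For reviewers: below is a minimal, hedged sketch (not part of this diff) showing how the new benchmark options introduced here could be driven programmatically through `runEvaluationCLI`. The option names (`mode`, `models`, `maxParallelModels`, `benchmarkChunkSize`, `latencyIterations`) come from the function signature added in `src/evals/guardrail-evals.ts`; the import path, file names, and concrete values are illustrative placeholders and assume `OPENAI_API_KEY` is set in the environment (or an `apiKey` is passed explicitly), matching the validation in `_createContext`.

```ts
// Sketch: run benchmark mode with the new parallelism/chunking options.
// Import path is an assumption — adjust to wherever guardrail-evals is
// exported from in your build.
import { runEvaluationCLI } from './evals/guardrail-evals';

async function main(): Promise<void> {
  await runEvaluationCLI({
    configPath: 'config.json',        // pipeline configuration (placeholder path)
    datasetPath: 'dataset.jsonl',     // JSONL evaluation dataset (placeholder path)
    mode: 'benchmark',                // compare models instead of a single evaluate run
    models: ['gpt-5', 'gpt-5-mini'],  // subset of the default benchmark models
    maxParallelModels: 2,             // cap concurrent per-model benchmark tasks
    benchmarkChunkSize: 100,          // evaluate samples in chunks of 100 per model
    latencyIterations: 25,            // samples timed for the latency report
    outputDir: 'results',
  });
}

main().catch((error) => {
  console.error(error);
  process.exit(1);
});
```

The equivalent CLI invocation, using flags this change adds to `showHelp()`, would be roughly `guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --models gpt-5 gpt-5-mini --max-parallel-models 2 --benchmark-chunk-size 100`. If `--max-parallel-models` is omitted, the limit defaults to `min(models, cpu_count)` as computed by `_determineParallelModelLimit`; omitting `--benchmark-chunk-size` processes the whole dataset in a single chunk per model.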