Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/evals.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ The evals tool is included with the TypeScript package. No additional dependenci
| `--azure-api-version` | ❌ | Azure OpenAI API version (default: 2025-01-01-preview) |
| `--models` | ❌ | Models for benchmark mode (benchmark only) |
| `--latency-iterations` | ❌ | Latency test samples (default: 25) (benchmark only) |
| `--max-parallel-models` | ❌ | Maximum concurrent models in benchmark mode (default: min(model count, CPU count)) (benchmark only) |
| `--benchmark-chunk-size` | ❌ | Sample chunk size per model for memory-efficient benchmarking (benchmark only) |

## Configuration

Expand Down Expand Up @@ -154,6 +156,8 @@ npm run eval -- --config-path config.json --dataset-path data.jsonl --base-url h
- **Multi-stage evaluation**: pre_flight, input, output stages
- **Automatic stage detection**: Evaluates all stages found in configuration
- **Batch processing**: Configurable parallel processing
- **Parallel benchmarking**: Run multiple models concurrently with CPU-aware defaults
- **Memory-efficient chunking**: Process large datasets in smaller chunks during benchmarking
- **Benchmark mode**: Model performance comparison with ROC AUC, precision at recall thresholds
- **Latency testing**: End-to-end guardrail performance measurement
- **Visualization**: Automatic chart and graph generation
Expand Down
94 changes: 94 additions & 0 deletions src/__tests__/unit/evals/guardrail-evals.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/**
* Unit tests for guardrail evaluation utilities.
*/

import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import { GuardrailEval } from '../../../evals/guardrail-evals';
import type { Sample } from '../../../evals/core/types';
import * as os from 'os';

// Mock the Node "os" module so each test can control the reported CPU count.
// NOTE(review): the factory exposes cpus() on both the default export and as a
// named export — presumably because the module under test may import "os"
// either way (ESM default interop vs. named import); confirm against
// guardrail-evals' import style before simplifying.
vi.mock('os', () => {
  return {
    default: {
      cpus: vi.fn(),
    },
    cpus: vi.fn(),
  };
});

/**
 * Build synthetic samples for chunking tests.
 *
 * Each sample encodes its index in `id` and `data`, and `expectedTriggers`
 * alternates so odd-indexed samples expect guardrail "g" to trigger — giving
 * chunking tests distinguishable, order-sensitive fixtures.
 *
 * @param count - Number of synthetic samples to build.
 * @returns List of Sample instances configured for evaluation.
 */
function buildSamples(count: number): Sample[] {
  return Array.from({ length: count }, (_, idx) => ({
    id: `sample-${idx}`,
    data: `payload-${idx}`,
    expectedTriggers: { g: Boolean(idx % 2) },
  }));
}

// Tests for the static helper that picks how many models run concurrently.
describe('GuardrailEval._determineParallelModelLimit', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  afterEach(() => {
    vi.restoreAllMocks();
  });

  it('should use cpu_count when explicit parallelism is not provided', () => {
    // Pretend the host has exactly four cores.
    const fourCores = new Array(4).fill({}) as os.CpuInfo[];
    vi.mocked(os.cpus).mockReturnValue(fourCores);

    // Limit is capped at the core count, but never exceeds the model count.
    expect(GuardrailEval._determineParallelModelLimit(10, null)).toBe(4);
    expect(GuardrailEval._determineParallelModelLimit(2, null)).toBe(2);
  });

  it('should honor user-provided parallelism constraints', () => {
    // An explicit limit wins over the CPU-derived default.
    expect(GuardrailEval._determineParallelModelLimit(5, 3)).toBe(3);
    // Zero (or negative) explicit limits are rejected.
    expect(() => GuardrailEval._determineParallelModelLimit(5, 0)).toThrow('maxParallelModels must be positive');
  });

  it('should throw error for invalid model count', () => {
    expect(() => GuardrailEval._determineParallelModelLimit(0, null)).toThrow('modelCount must be positive');
  });
});

// Tests for the static generator that splits sample lists into chunks.
describe('GuardrailEval._chunkSamples', () => {
  it('should return the original sample list when no chunk size is provided', () => {
    const input = buildSamples(3);
    // Spread the generator to materialize every yielded chunk.
    const pieces = [...GuardrailEval._chunkSamples(input, null)];
    expect(pieces.length).toBe(1);
    // With no chunk size the very same array object is yielded, not a copy.
    expect(pieces[0]).toBe(input);
  });

  it('should split samples into evenly sized chunks', () => {
    const input = buildSamples(5);
    const pieces = [...GuardrailEval._chunkSamples(input, 2)];
    const sizes = pieces.map((piece) => piece.length);
    expect(sizes).toEqual([2, 2, 1]);
    // Each chunk starts where the previous one left off.
    expect(pieces[0][0].id).toBe('sample-0');
    expect(pieces[1][0].id).toBe('sample-2');
    expect(pieces[2][0].id).toBe('sample-4');
  });

  it('should reject invalid chunk sizes', () => {
    const input = buildSamples(2);
    expect(() => [...GuardrailEval._chunkSamples(input, 0)]).toThrow('chunkSize must be positive when provided');
  });

  it('should return single chunk when chunk size is larger than samples', () => {
    const input = buildSamples(3);
    const pieces = [...GuardrailEval._chunkSamples(input, 10)];
    expect(pieces.length).toBe(1);
    // Oversized chunk size short-circuits to yielding the original array.
    expect(pieces[0]).toBe(input);
  });

  it('should handle empty samples', () => {
    const input: Sample[] = [];
    const pieces = [...GuardrailEval._chunkSamples(input, 2)];
    expect(pieces.length).toBe(1);
    expect(pieces[0]).toEqual([]);
  });
});

142 changes: 142 additions & 0 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ interface CliArgs {
batchSize?: number;
outputDir?: string;
multiTurn?: boolean;
maxParallelModels?: number | null;
benchmarkChunkSize?: number | null;
mode?: 'evaluate' | 'benchmark';
stages?: string[];
models?: string[];
latencyIterations?: number;
apiKey?: string | null;
baseUrl?: string | null;
azureEndpoint?: string | null;
azureApiVersion?: string;
help?: boolean;
}

Expand Down Expand Up @@ -72,6 +82,52 @@ function parseArgs(argv: string[]): CliArgs {
args.outputDir = argv[++i];
} else if (arg === '--multi-turn') {
args.multiTurn = true;
} else if (arg === '--max-parallel-models') {
const value = parseInt(argv[++i], 10);
if (isNaN(value) || value <= 0) {
console.error(`❌ Error: max-parallel-models must be positive, got: ${argv[i]}`);
process.exit(1);
}
args.maxParallelModels = value;
} else if (arg === '--benchmark-chunk-size') {
const value = parseInt(argv[++i], 10);
if (isNaN(value) || value <= 0) {
console.error(`❌ Error: benchmark-chunk-size must be positive, got: ${argv[i]}`);
process.exit(1);
}
args.benchmarkChunkSize = value;
} else if (arg === '--mode') {
const mode = argv[++i];
if (mode !== 'evaluate' && mode !== 'benchmark') {
console.error(`❌ Error: Invalid mode: ${mode}. Must be 'evaluate' or 'benchmark'`);
process.exit(1);
}
args.mode = mode as 'evaluate' | 'benchmark';
} else if (arg === '--stages') {
args.stages = [];
while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
args.stages.push(argv[++i]);
}
} else if (arg === '--models') {
args.models = [];
while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
args.models.push(argv[++i]);
}
} else if (arg === '--latency-iterations') {
const value = parseInt(argv[++i], 10);
if (isNaN(value) || value <= 0) {
console.error(`❌ Error: latency-iterations must be positive, got: ${argv[i]}`);
process.exit(1);
}
args.latencyIterations = value;
} else if (arg === '--api-key') {
args.apiKey = argv[++i];
} else if (arg === '--base-url') {
args.baseUrl = argv[++i];
} else if (arg === '--azure-endpoint') {
args.azureEndpoint = argv[++i];
} else if (arg === '--azure-api-version') {
args.azureApiVersion = argv[++i];
} else if (!args.configFile && !arg.startsWith('-')) {
args.configFile = arg;
}
Expand Down Expand Up @@ -119,6 +175,12 @@ function showHelp(): void {
console.log(
' --dataset-path <path> Path to evaluation dataset (required)'
);
console.log(
' --mode <mode> Evaluation mode: "evaluate" or "benchmark" (default: evaluate)'
);
console.log(
' --stages <stage>... Pipeline stages to evaluate: pre_flight, input, output'
);
console.log(
' --batch-size <number> Number of samples to process in parallel (default: 32)'
);
Expand All @@ -128,6 +190,32 @@ function showHelp(): void {
console.log(
' --multi-turn Evaluate conversation-aware guardrails turn-by-turn (default: single-pass)'
);
console.log('Benchmark Options:');
console.log(
' --models <model>... Models to test in benchmark mode (default: gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini)'
);
console.log(
' --latency-iterations <number> Number of iterations for latency testing (default: 25)'
);
console.log(
' --max-parallel-models <number> Maximum number of models to benchmark concurrently (default: min(models, cpu_count))'
);
console.log(
' --benchmark-chunk-size <number> Optional number of samples per chunk when benchmarking to limit long-running runs'
);
console.log('API Configuration:');
console.log(
' --api-key <key> API key for OpenAI, Azure OpenAI, or OpenAI-compatible API'
);
console.log(
' --base-url <url> Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1)'
);
console.log(
' --azure-endpoint <endpoint> Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com)'
);
console.log(
' --azure-api-version <version> Azure OpenAI API version (default: 2025-01-01-preview)'
);
console.log('');
console.log('Examples:');
console.log(' guardrails validate config.json');
Expand All @@ -136,6 +224,12 @@ function showHelp(): void {
console.log(
' guardrails eval --config-path config.json --dataset-path dataset.jsonl --batch-size 16 --output-dir my-results'
);
console.log(
' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --models gpt-5 gpt-5-mini'
);
console.log(
' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --azure-endpoint https://your-resource.openai.azure.com --api-key your-key'
);
console.log(' guardrails validate-dataset dataset.jsonl');
}

Expand All @@ -154,13 +248,61 @@ async function handleEvalCommand(args: CliArgs): Promise<void> {
process.exit(1);
}

if (args.maxParallelModels !== undefined && args.maxParallelModels !== null && args.maxParallelModels <= 0) {
console.error(`❌ Error: max-parallel-models must be positive, got: ${args.maxParallelModels}`);
process.exit(1);
}

if (args.benchmarkChunkSize !== undefined && args.benchmarkChunkSize !== null && args.benchmarkChunkSize <= 0) {
console.error(`❌ Error: benchmark-chunk-size must be positive, got: ${args.benchmarkChunkSize}`);
process.exit(1);
}

if (args.latencyIterations !== undefined && args.latencyIterations <= 0) {
console.error(`❌ Error: latency-iterations must be positive, got: ${args.latencyIterations}`);
process.exit(1);
}

if (args.stages) {
const validStages = new Set(['pre_flight', 'input', 'output']);
const invalidStages = args.stages.filter((s) => !validStages.has(s));
if (invalidStages.length > 0) {
console.error(`❌ Error: Invalid stages: ${invalidStages.join(', ')}. Valid stages are: ${Array.from(validStages).join(', ')}`);
process.exit(1);
}
}

if (args.mode === 'benchmark' && args.stages && args.stages.length > 1) {
console.warn('⚠️ Warning: Benchmark mode only uses the first specified stage. Additional stages will be ignored.');
}

if (args.azureEndpoint && args.baseUrl) {
console.error('❌ Error: Cannot specify both --azure-endpoint and --base-url. Choose one provider.');
process.exit(1);
}

if (args.azureEndpoint && !args.apiKey) {
console.error('❌ Error: --api-key is required when using --azure-endpoint');
process.exit(1);
}

try {
await runEvaluationCLI({
configPath: args.configPath,
datasetPath: args.datasetPath,
stages: args.stages || null,
batchSize: args.batchSize || 32,
outputDir: args.outputDir || 'results',
apiKey: args.apiKey || null,
baseUrl: args.baseUrl || null,
azureEndpoint: args.azureEndpoint || null,
azureApiVersion: args.azureApiVersion || '2025-01-01-preview',
mode: args.mode || 'evaluate',
models: args.models || null,
latencyIterations: args.latencyIterations,
multiTurn: args.multiTurn,
maxParallelModels: args.maxParallelModels,
benchmarkChunkSize: args.benchmarkChunkSize,
});

console.log('Evaluation completed successfully!');
Expand Down
7 changes: 3 additions & 4 deletions src/evals/core/async-engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,17 @@ export class AsyncRunEngine implements RunEngine {
}

const results: SampleResult[] = [];
let processed = 0;
const totalSamples = samples.length;

console.log(`${desc}: ${samples.length} samples, batch size: ${batchSize}`);
console.log(`${desc}: ${totalSamples} samples, batch size: ${batchSize}`);

for (let i = 0; i < samples.length; i += batchSize) {
const batch = samples.slice(i, i + batchSize);
const batchResults = await Promise.all(
batch.map((sample) => this.evaluateSample(context, sample))
);
results.push(...batchResults);
processed += batch.length;
console.log(`Processed ${processed}/${samples.length} samples`);
console.log(`Processed ${results.length}/${totalSamples} samples`);
}

return results;
Expand Down
Loading