Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/evals.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ The evals tool is included with the TypeScript package. No additional dependenci
| `--azure-api-version` | ❌ | Azure OpenAI API version (default: 2025-01-01-preview) |
| `--models` | ❌ | Models for benchmark mode (benchmark only) |
| `--latency-iterations` | ❌ | Latency test samples (default: 25) (benchmark only) |
| `--max-parallel-models` | ❌ | Maximum concurrent models in benchmark mode (default: min(model count, CPU count)) (benchmark only) |
| `--benchmark-chunk-size` | ❌ | Sample chunk size per model for memory-efficient benchmarking (benchmark only) |

## Configuration

Expand Down Expand Up @@ -154,6 +156,8 @@ npm run eval -- --config-path config.json --dataset-path data.jsonl --base-url h
- **Multi-stage evaluation**: pre_flight, input, output stages
- **Automatic stage detection**: Evaluates all stages found in configuration
- **Batch processing**: Configurable parallel processing
- **Parallel benchmarking**: Run multiple models concurrently with CPU-aware defaults
- **Memory-efficient chunking**: Process large datasets in smaller chunks during benchmarking
- **Benchmark mode**: Model performance comparison with ROC AUC, precision at recall thresholds
- **Latency testing**: End-to-end guardrail performance measurement
- **Visualization**: Automatic chart and graph generation
Expand Down
94 changes: 94 additions & 0 deletions src/__tests__/unit/evals/guardrail-evals.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/**
* Unit tests for guardrail evaluation utilities.
*/

import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import { GuardrailEval } from '../../../evals/guardrail-evals';
import type { Sample } from '../../../evals/core/types';
import * as os from 'os';

// Mock the Node "os" module so each test can control the reported CPU count.
// NOTE(review): the factory exposes cpus() on both the default export and as a
// named export — presumably because the module under test may import "os"
// either way (ESM default interop vs. named import); confirm against
// guardrail-evals' import style before simplifying.
vi.mock('os', () => {
  return {
    default: {
      cpus: vi.fn(),
    },
    cpus: vi.fn(),
  };
});

/**
 * Build synthetic samples for chunking tests.
 *
 * Each sample encodes its index in `id` and `data`, and `expectedTriggers`
 * alternates so odd-indexed samples expect guardrail "g" to trigger — giving
 * chunking tests distinguishable, order-sensitive fixtures.
 *
 * @param count - Number of synthetic samples to build.
 * @returns List of Sample instances configured for evaluation.
 */
function buildSamples(count: number): Sample[] {
  return Array.from({ length: count }, (_, idx) => ({
    id: `sample-${idx}`,
    data: `payload-${idx}`,
    expectedTriggers: { g: Boolean(idx % 2) },
  }));
}

// Tests for the static helper that picks how many models run concurrently.
describe('GuardrailEval._determineParallelModelLimit', () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  afterEach(() => {
    vi.restoreAllMocks();
  });

  it('should use cpu_count when explicit parallelism is not provided', () => {
    // Pretend the host has exactly four cores.
    const fourCores = new Array(4).fill({}) as os.CpuInfo[];
    vi.mocked(os.cpus).mockReturnValue(fourCores);

    // Limit is capped at the core count, but never exceeds the model count.
    expect(GuardrailEval._determineParallelModelLimit(10, null)).toBe(4);
    expect(GuardrailEval._determineParallelModelLimit(2, null)).toBe(2);
  });

  it('should honor user-provided parallelism constraints', () => {
    // An explicit limit wins over the CPU-derived default.
    expect(GuardrailEval._determineParallelModelLimit(5, 3)).toBe(3);
    // Zero (or negative) explicit limits are rejected.
    expect(() => GuardrailEval._determineParallelModelLimit(5, 0)).toThrow('maxParallelModels must be positive');
  });

  it('should throw error for invalid model count', () => {
    expect(() => GuardrailEval._determineParallelModelLimit(0, null)).toThrow('modelCount must be positive');
  });
});

// Tests for the static generator that splits sample lists into chunks.
describe('GuardrailEval._chunkSamples', () => {
  it('should return the original sample list when no chunk size is provided', () => {
    const input = buildSamples(3);
    // Spread the generator to materialize every yielded chunk.
    const pieces = [...GuardrailEval._chunkSamples(input, null)];
    expect(pieces.length).toBe(1);
    // With no chunk size the very same array object is yielded, not a copy.
    expect(pieces[0]).toBe(input);
  });

  it('should split samples into evenly sized chunks', () => {
    const input = buildSamples(5);
    const pieces = [...GuardrailEval._chunkSamples(input, 2)];
    const sizes = pieces.map((piece) => piece.length);
    expect(sizes).toEqual([2, 2, 1]);
    // Each chunk starts where the previous one left off.
    expect(pieces[0][0].id).toBe('sample-0');
    expect(pieces[1][0].id).toBe('sample-2');
    expect(pieces[2][0].id).toBe('sample-4');
  });

  it('should reject invalid chunk sizes', () => {
    const input = buildSamples(2);
    expect(() => [...GuardrailEval._chunkSamples(input, 0)]).toThrow('chunkSize must be positive when provided');
  });

  it('should return single chunk when chunk size is larger than samples', () => {
    const input = buildSamples(3);
    const pieces = [...GuardrailEval._chunkSamples(input, 10)];
    expect(pieces.length).toBe(1);
    // Oversized chunk size short-circuits to yielding the original array.
    expect(pieces[0]).toBe(input);
  });

  it('should handle empty samples', () => {
    const input: Sample[] = [];
    const pieces = [...GuardrailEval._chunkSamples(input, 2)];
    expect(pieces.length).toBe(1);
    expect(pieces[0]).toEqual([]);
  });
});

142 changes: 142 additions & 0 deletions src/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ interface CliArgs {
batchSize?: number;
outputDir?: string;
multiTurn?: boolean;
maxParallelModels?: number | null;
benchmarkChunkSize?: number | null;
mode?: 'evaluate' | 'benchmark';
stages?: string[];
models?: string[];
latencyIterations?: number;
apiKey?: string | null;
baseUrl?: string | null;
azureEndpoint?: string | null;
azureApiVersion?: string;
help?: boolean;
}

Expand Down Expand Up @@ -72,6 +82,52 @@ function parseArgs(argv: string[]): CliArgs {
args.outputDir = argv[++i];
} else if (arg === '--multi-turn') {
args.multiTurn = true;
} else if (arg === '--max-parallel-models') {
const value = parseInt(argv[++i], 10);
if (isNaN(value) || value <= 0) {
console.error(`❌ Error: max-parallel-models must be positive, got: ${argv[i]}`);
process.exit(1);
}
args.maxParallelModels = value;
} else if (arg === '--benchmark-chunk-size') {
const value = parseInt(argv[++i], 10);
if (isNaN(value) || value <= 0) {
console.error(`❌ Error: benchmark-chunk-size must be positive, got: ${argv[i]}`);
process.exit(1);
}
args.benchmarkChunkSize = value;
} else if (arg === '--mode') {
const mode = argv[++i];
if (mode !== 'evaluate' && mode !== 'benchmark') {
console.error(`❌ Error: Invalid mode: ${mode}. Must be 'evaluate' or 'benchmark'`);
process.exit(1);
}
args.mode = mode as 'evaluate' | 'benchmark';
} else if (arg === '--stages') {
args.stages = [];
while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
args.stages.push(argv[++i]);
}
} else if (arg === '--models') {
args.models = [];
while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
args.models.push(argv[++i]);
}
} else if (arg === '--latency-iterations') {
const value = parseInt(argv[++i], 10);
if (isNaN(value) || value <= 0) {
console.error(`❌ Error: latency-iterations must be positive, got: ${argv[i]}`);
process.exit(1);
}
args.latencyIterations = value;
} else if (arg === '--api-key') {
args.apiKey = argv[++i];
} else if (arg === '--base-url') {
args.baseUrl = argv[++i];
} else if (arg === '--azure-endpoint') {
args.azureEndpoint = argv[++i];
} else if (arg === '--azure-api-version') {
args.azureApiVersion = argv[++i];
} else if (!args.configFile && !arg.startsWith('-')) {
args.configFile = arg;
}
Expand Down Expand Up @@ -119,6 +175,12 @@ function showHelp(): void {
console.log(
' --dataset-path <path> Path to evaluation dataset (required)'
);
console.log(
' --mode <mode> Evaluation mode: "evaluate" or "benchmark" (default: evaluate)'
);
console.log(
' --stages <stage>... Pipeline stages to evaluate: pre_flight, input, output'
);
console.log(
' --batch-size <number> Number of samples to process in parallel (default: 32)'
);
Expand All @@ -128,6 +190,32 @@ function showHelp(): void {
console.log(
' --multi-turn Evaluate conversation-aware guardrails turn-by-turn (default: single-pass)'
);
console.log('Benchmark Options:');
console.log(
' --models <model>... Models to test in benchmark mode (default: gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini)'
);
console.log(
' --latency-iterations <number> Number of iterations for latency testing (default: 25)'
);
console.log(
' --max-parallel-models <number> Maximum number of models to benchmark concurrently (default: min(models, cpu_count))'
);
console.log(
' --benchmark-chunk-size <number> Optional number of samples per chunk when benchmarking to limit long-running runs'
);
console.log('API Configuration:');
console.log(
' --api-key <key> API key for OpenAI, Azure OpenAI, or OpenAI-compatible API'
);
console.log(
' --base-url <url> Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1)'
);
console.log(
' --azure-endpoint <endpoint> Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com)'
);
console.log(
' --azure-api-version <version> Azure OpenAI API version (default: 2025-01-01-preview)'
);
console.log('');
console.log('Examples:');
console.log(' guardrails validate config.json');
Expand All @@ -136,6 +224,12 @@ function showHelp(): void {
console.log(
' guardrails eval --config-path config.json --dataset-path dataset.jsonl --batch-size 16 --output-dir my-results'
);
console.log(
' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --models gpt-5 gpt-5-mini'
);
console.log(
' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --azure-endpoint https://your-resource.openai.azure.com --api-key your-key'
);
console.log(' guardrails validate-dataset dataset.jsonl');
}

Expand All @@ -154,13 +248,61 @@ async function handleEvalCommand(args: CliArgs): Promise<void> {
process.exit(1);
}

if (args.maxParallelModels !== undefined && args.maxParallelModels !== null && args.maxParallelModels <= 0) {
console.error(`❌ Error: max-parallel-models must be positive, got: ${args.maxParallelModels}`);
process.exit(1);
}

if (args.benchmarkChunkSize !== undefined && args.benchmarkChunkSize !== null && args.benchmarkChunkSize <= 0) {
console.error(`❌ Error: benchmark-chunk-size must be positive, got: ${args.benchmarkChunkSize}`);
process.exit(1);
}

if (args.latencyIterations !== undefined && args.latencyIterations <= 0) {
console.error(`❌ Error: latency-iterations must be positive, got: ${args.latencyIterations}`);
process.exit(1);
}

if (args.stages) {
const validStages = new Set(['pre_flight', 'input', 'output']);
const invalidStages = args.stages.filter((s) => !validStages.has(s));
if (invalidStages.length > 0) {
console.error(`❌ Error: Invalid stages: ${invalidStages.join(', ')}. Valid stages are: ${Array.from(validStages).join(', ')}`);
process.exit(1);
}
}

if (args.mode === 'benchmark' && args.stages && args.stages.length > 1) {
console.warn('⚠️ Warning: Benchmark mode only uses the first specified stage. Additional stages will be ignored.');
}

if (args.azureEndpoint && args.baseUrl) {
console.error('❌ Error: Cannot specify both --azure-endpoint and --base-url. Choose one provider.');
process.exit(1);
}

if (args.azureEndpoint && !args.apiKey) {
console.error('❌ Error: --api-key is required when using --azure-endpoint');
process.exit(1);
}

try {
await runEvaluationCLI({
configPath: args.configPath,
datasetPath: args.datasetPath,
stages: args.stages || null,
batchSize: args.batchSize || 32,
outputDir: args.outputDir || 'results',
apiKey: args.apiKey || null,
baseUrl: args.baseUrl || null,
azureEndpoint: args.azureEndpoint || null,
azureApiVersion: args.azureApiVersion || '2025-01-01-preview',
mode: args.mode || 'evaluate',
models: args.models || null,
latencyIterations: args.latencyIterations,
multiTurn: args.multiTurn,
maxParallelModels: args.maxParallelModels,
benchmarkChunkSize: args.benchmarkChunkSize,
});

console.log('Evaluation completed successfully!');
Expand Down
7 changes: 3 additions & 4 deletions src/evals/core/async-engine.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,17 @@ export class AsyncRunEngine implements RunEngine {
}

const results: SampleResult[] = [];
let processed = 0;
const totalSamples = samples.length;

console.log(`${desc}: ${samples.length} samples, batch size: ${batchSize}`);
console.log(`${desc}: ${totalSamples} samples, batch size: ${batchSize}`);

for (let i = 0; i < samples.length; i += batchSize) {
const batch = samples.slice(i, i + batchSize);
const batchResults = await Promise.all(
batch.map((sample) => this.evaluateSample(context, sample))
);
results.push(...batchResults);
processed += batch.length;
console.log(`Processed ${processed}/${samples.length} samples`);
console.log(`Processed ${results.length}/${totalSamples} samples`);
}

return results;
Expand Down
Loading