diff --git a/docs/evals.md b/docs/evals.md index 95efebd..7fa7c30 100644 --- a/docs/evals.md +++ b/docs/evals.md @@ -35,6 +35,8 @@ The evals tool is included with the TypeScript package. No additional dependenci | `--azure-api-version` | ❌ | Azure OpenAI API version (default: 2025-01-01-preview) | | `--models` | ❌ | Models for benchmark mode (benchmark only) | | `--latency-iterations` | ❌ | Latency test samples (default: 25) (benchmark only) | +| `--max-parallel-models` | ❌ | Maximum concurrent models in benchmark mode (default: CPU count) (benchmark only) | +| `--benchmark-chunk-size` | ❌ | Sample chunk size per model for memory-efficient benchmarking (benchmark only) | ## Configuration @@ -154,6 +156,8 @@ npm run eval -- --config-path config.json --dataset-path data.jsonl --base-url h - **Multi-stage evaluation**: pre_flight, input, output stages - **Automatic stage detection**: Evaluates all stages found in configuration - **Batch processing**: Configurable parallel processing +- **Parallel benchmarking**: Run multiple models concurrently with CPU-aware defaults +- **Memory-efficient chunking**: Process large datasets in smaller chunks during benchmarking - **Benchmark mode**: Model performance comparison with ROC AUC, precision at recall thresholds - **Latency testing**: End-to-end guardrail performance measurement - **Visualization**: Automatic chart and graph generation diff --git a/src/__tests__/unit/evals/guardrail-evals.test.ts b/src/__tests__/unit/evals/guardrail-evals.test.ts new file mode 100644 index 0000000..cf78c56 --- /dev/null +++ b/src/__tests__/unit/evals/guardrail-evals.test.ts @@ -0,0 +1,94 @@ +/** + * Unit tests for guardrail evaluation utilities. + */ + +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { GuardrailEval } from '../../../evals/guardrail-evals'; +import type { Sample } from '../../../evals/core/types'; +import * as os from 'os'; + +vi.mock('os', () => { + return { + default: { + cpus: vi.fn(), + }, + cpus: vi.fn(), + }; +}); + +function buildSamples(count: number): Sample[] { + /**Build synthetic samples for chunking tests. + * + * @param count - Number of synthetic samples to build. + * @returns List of Sample instances configured for evaluation. 
+ */ + return Array.from({ length: count }, (_, idx) => ({ + id: `sample-${idx}`, + data: `payload-${idx}`, + expectedTriggers: { g: Boolean(idx % 2) }, + })); +} + +describe('GuardrailEval._determineParallelModelLimit', () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('should use cpu_count when explicit parallelism is not provided', () => { + vi.mocked(os.cpus).mockReturnValue(Array(4).fill({}) as os.CpuInfo[]); + + expect(GuardrailEval._determineParallelModelLimit(10, null)).toBe(4); + expect(GuardrailEval._determineParallelModelLimit(2, null)).toBe(2); + }); + + it('should honor user-provided parallelism constraints', () => { + expect(GuardrailEval._determineParallelModelLimit(5, 3)).toBe(3); + expect(() => GuardrailEval._determineParallelModelLimit(5, 0)).toThrow('maxParallelModels must be positive'); + }); + + it('should throw error for invalid model count', () => { + expect(() => GuardrailEval._determineParallelModelLimit(0, null)).toThrow('modelCount must be positive'); + }); +}); + +describe('GuardrailEval._chunkSamples', () => { + it('should return the original sample list when no chunk size is provided', () => { + const samples = buildSamples(3); + const chunks = Array.from(GuardrailEval._chunkSamples(samples, null)); + expect(chunks.length).toBe(1); + expect(chunks[0]).toBe(samples); + }); + + it('should split samples into evenly sized chunks', () => { + const samples = buildSamples(5); + const chunks = Array.from(GuardrailEval._chunkSamples(samples, 2)); + expect(chunks.map((chunk) => chunk.length)).toEqual([2, 2, 1]); + expect(chunks[0][0].id).toBe('sample-0'); + expect(chunks[1][0].id).toBe('sample-2'); + expect(chunks[2][0].id).toBe('sample-4'); + }); + + it('should reject invalid chunk sizes', () => { + const samples = buildSamples(2); + expect(() => Array.from(GuardrailEval._chunkSamples(samples, 0))).toThrow('chunkSize must be positive when provided'); + }); + + it('should return single chunk when chunk size is larger than samples', () => { + const samples = buildSamples(3); + const chunks = Array.from(GuardrailEval._chunkSamples(samples, 10)); + expect(chunks.length).toBe(1); + expect(chunks[0]).toBe(samples); + }); + + it('should handle empty samples', () => { + const samples: Sample[] = []; + const chunks = Array.from(GuardrailEval._chunkSamples(samples, 2)); + expect(chunks.length).toBe(1); + expect(chunks[0]).toEqual([]); + }); +}); + diff --git a/src/cli.ts b/src/cli.ts index cedaa22..af3b958 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -36,6 +36,16 @@ interface CliArgs { batchSize?: number; outputDir?: string; multiTurn?: boolean; + maxParallelModels?: number | null; + benchmarkChunkSize?: number | null; + mode?: 'evaluate' | 'benchmark'; + stages?: string[]; + models?: string[]; + latencyIterations?: number; + apiKey?: string | null; + baseUrl?: string | null; + azureEndpoint?: string | null; + azureApiVersion?: string; help?: boolean; } @@ -72,6 +82,52 @@ function parseArgs(argv: string[]): CliArgs { args.outputDir = argv[++i]; } else if (arg === '--multi-turn') { args.multiTurn = true; + } else if (arg === '--max-parallel-models') { + const value = parseInt(argv[++i], 10); + if (isNaN(value) || value <= 0) { + console.error(`❌ Error: max-parallel-models must be positive, got: ${argv[i]}`); + process.exit(1); + } + args.maxParallelModels = value; + } else if (arg === '--benchmark-chunk-size') { + const value = parseInt(argv[++i], 10); + if (isNaN(value) || value <= 0) { + console.error(`❌ Error: 
benchmark-chunk-size must be positive, got: ${argv[i]}`); + process.exit(1); + } + args.benchmarkChunkSize = value; + } else if (arg === '--mode') { + const mode = argv[++i]; + if (mode !== 'evaluate' && mode !== 'benchmark') { + console.error(`❌ Error: Invalid mode: ${mode}. Must be 'evaluate' or 'benchmark'`); + process.exit(1); + } + args.mode = mode as 'evaluate' | 'benchmark'; + } else if (arg === '--stages') { + args.stages = []; + while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) { + args.stages.push(argv[++i]); + } + } else if (arg === '--models') { + args.models = []; + while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) { + args.models.push(argv[++i]); + } + } else if (arg === '--latency-iterations') { + const value = parseInt(argv[++i], 10); + if (isNaN(value) || value <= 0) { + console.error(`❌ Error: latency-iterations must be positive, got: ${argv[i]}`); + process.exit(1); + } + args.latencyIterations = value; + } else if (arg === '--api-key') { + args.apiKey = argv[++i]; + } else if (arg === '--base-url') { + args.baseUrl = argv[++i]; + } else if (arg === '--azure-endpoint') { + args.azureEndpoint = argv[++i]; + } else if (arg === '--azure-api-version') { + args.azureApiVersion = argv[++i]; } else if (!args.configFile && !arg.startsWith('-')) { args.configFile = arg; } @@ -119,6 +175,12 @@ function showHelp(): void { console.log( ' --dataset-path Path to evaluation dataset (required)' ); + console.log( + ' --mode Evaluation mode: "evaluate" or "benchmark" (default: evaluate)' + ); + console.log( + ' --stages ... Pipeline stages to evaluate: pre_flight, input, output' + ); console.log( ' --batch-size Number of samples to process in parallel (default: 32)' ); @@ -128,6 +190,32 @@ function showHelp(): void { console.log( ' --multi-turn Evaluate conversation-aware guardrails turn-by-turn (default: single-pass)' ); + console.log('Benchmark Options:'); + console.log( + ' --models ... 
Models to test in benchmark mode (default: gpt-5, gpt-5-mini, gpt-4.1, gpt-4.1-mini)' + ); + console.log( + ' --latency-iterations Number of iterations for latency testing (default: 25)' + ); + console.log( + ' --max-parallel-models Maximum number of models to benchmark concurrently (default: min(models, cpu_count))' + ); + console.log( + ' --benchmark-chunk-size Optional number of samples per chunk when benchmarking to limit long-running runs' + ); + console.log('API Configuration:'); + console.log( + ' --api-key API key for OpenAI, Azure OpenAI, or OpenAI-compatible API' + ); + console.log( + ' --base-url Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1)' + ); + console.log( + ' --azure-endpoint Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com)' + ); + console.log( + ' --azure-api-version Azure OpenAI API version (default: 2025-01-01-preview)' + ); console.log(''); console.log('Examples:'); console.log(' guardrails validate config.json'); @@ -136,6 +224,12 @@ function showHelp(): void { console.log( ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --batch-size 16 --output-dir my-results' ); + console.log( + ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --models gpt-5 gpt-5-mini' + ); + console.log( + ' guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --azure-endpoint https://your-resource.openai.azure.com --api-key your-key' + ); console.log(' guardrails validate-dataset dataset.jsonl'); } @@ -154,13 +248,61 @@ async function handleEvalCommand(args: CliArgs): Promise { process.exit(1); } + if (args.maxParallelModels !== undefined && args.maxParallelModels !== null && args.maxParallelModels <= 0) { + console.error(`❌ Error: max-parallel-models must be positive, got: ${args.maxParallelModels}`); + process.exit(1); + } + + if (args.benchmarkChunkSize !== undefined && args.benchmarkChunkSize !== null && args.benchmarkChunkSize <= 0) { + console.error(`❌ Error: benchmark-chunk-size must be positive, got: ${args.benchmarkChunkSize}`); + process.exit(1); + } + + if (args.latencyIterations !== undefined && args.latencyIterations <= 0) { + console.error(`❌ Error: latency-iterations must be positive, got: ${args.latencyIterations}`); + process.exit(1); + } + + if (args.stages) { + const validStages = new Set(['pre_flight', 'input', 'output']); + const invalidStages = args.stages.filter((s) => !validStages.has(s)); + if (invalidStages.length > 0) { + console.error(`❌ Error: Invalid stages: ${invalidStages.join(', ')}. Valid stages are: ${Array.from(validStages).join(', ')}`); + process.exit(1); + } + } + + if (args.mode === 'benchmark' && args.stages && args.stages.length > 1) { + console.warn('⚠️ Warning: Benchmark mode only uses the first specified stage. Additional stages will be ignored.'); + } + + if (args.azureEndpoint && args.baseUrl) { + console.error('❌ Error: Cannot specify both --azure-endpoint and --base-url. 
Choose one provider.'); + process.exit(1); + } + + if (args.azureEndpoint && !args.apiKey) { + console.error('❌ Error: --api-key is required when using --azure-endpoint'); + process.exit(1); + } + try { await runEvaluationCLI({ configPath: args.configPath, datasetPath: args.datasetPath, + stages: args.stages || null, batchSize: args.batchSize || 32, outputDir: args.outputDir || 'results', + apiKey: args.apiKey || null, + baseUrl: args.baseUrl || null, + azureEndpoint: args.azureEndpoint || null, + azureApiVersion: args.azureApiVersion || '2025-01-01-preview', + mode: args.mode || 'evaluate', + models: args.models || null, + latencyIterations: args.latencyIterations, multiTurn: args.multiTurn, + maxParallelModels: args.maxParallelModels, + benchmarkChunkSize: args.benchmarkChunkSize, }); console.log('Evaluation completed successfully!'); diff --git a/src/evals/core/async-engine.ts b/src/evals/core/async-engine.ts index f39b94e..8a3a5e9 100644 --- a/src/evals/core/async-engine.ts +++ b/src/evals/core/async-engine.ts @@ -46,9 +46,9 @@ export class AsyncRunEngine implements RunEngine { } const results: SampleResult[] = []; - let processed = 0; + const totalSamples = samples.length; - console.log(`${desc}: ${samples.length} samples, batch size: ${batchSize}`); + console.log(`${desc}: ${totalSamples} samples, batch size: ${batchSize}`); for (let i = 0; i < samples.length; i += batchSize) { const batch = samples.slice(i, i + batchSize); @@ -56,8 +56,7 @@ export class AsyncRunEngine implements RunEngine { batch.map((sample) => this.evaluateSample(context, sample)) ); results.push(...batchResults); - processed += batch.length; - console.log(`Processed ${processed}/${samples.length} samples`); + console.log(`Processed ${results.length}/${totalSamples} samples`); } return results; diff --git a/src/evals/core/benchmark-calculator.ts b/src/evals/core/benchmark-calculator.ts new file mode 100644 index 0000000..3362698 --- /dev/null +++ b/src/evals/core/benchmark-calculator.ts @@ -0,0 +1,295 @@ +/** + * Advanced metrics calculator for guardrail benchmarking. + * + * This module implements advanced evaluation metrics for benchmarking guardrail performance + * across different models. + */ + +import { SampleResult } from './types'; + +/** + * Calculates advanced benchmarking metrics for guardrail evaluation. + */ +export class BenchmarkMetricsCalculator { + /** + * Calculate advanced metrics for a specific guardrail. 
+ * + * @param results - List of evaluation results + * @param guardrailName - Name of the guardrail to analyze + * @param guardrailConfig - Guardrail configuration to check for confidence thresholds + * @returns Dictionary containing advanced metrics, or empty dict if not applicable + */ + calculateAdvancedMetrics( + results: SampleResult[], + guardrailName: string, + guardrailConfig?: Record | null + ): Record { + if (!guardrailConfig || !('confidence_threshold' in guardrailConfig)) { + return {}; + } + + if (results.length === 0) { + throw new Error('Cannot calculate metrics for empty results list'); + } + + const { yTrue, yScores } = this.extractLabelsAndScores(results, guardrailName); + + if (yTrue.length === 0) { + throw new Error(`No valid data found for guardrail '${guardrailName}'`); + } + + return this.calculateMetrics(yTrue, yScores); + } + + private extractLabelsAndScores( + results: SampleResult[], + guardrailName: string + ): { yTrue: number[]; yScores: number[] } { + const yTrue: number[] = []; + const yScores: number[] = []; + + for (const result of results) { + if (!(guardrailName in result.expectedTriggers)) { + console.warn( + `Guardrail '${guardrailName}' not found in expectedTriggers for sample ${result.id}` + ); + continue; + } + + const expected = result.expectedTriggers[guardrailName]; + yTrue.push(expected ? 1 : 0); + + // Get confidence score from details, fallback to binary + const confidence = this.getConfidenceScore(result, guardrailName); + yScores.push(confidence); + } + + return { yTrue, yScores }; + } + + private getConfidenceScore(result: SampleResult, guardrailName: string): number { + if (guardrailName in result.details) { + const guardrailDetails = result.details[guardrailName]; + if ( + typeof guardrailDetails === 'object' && + guardrailDetails !== null && + 'confidence' in guardrailDetails + ) { + const conf = guardrailDetails.confidence; + if (typeof conf === 'number') { + return conf; + } + } + } + + // Fallback to binary: 1.0 if triggered, 0.0 if not + const actual = result.triggered[guardrailName] || false; + return actual ? 
1.0 : 0.0; + } + + private calculateMetrics(yTrue: number[], yScores: number[]): Record { + const metrics: Record = {}; + + // Calculate ROC AUC + try { + metrics.roc_auc = this.calculateRocAuc(yTrue, yScores); + } catch (error) { + console.warn(`Could not calculate ROC AUC: ${error}`); + metrics.roc_auc = NaN; + } + + // Calculate precision at different recall thresholds + try { + const { precision, recall } = this.precisionRecallCurve(yTrue, yScores); + metrics.prec_at_r80 = this.precisionAtRecall(precision, recall, 0.8); + metrics.prec_at_r90 = this.precisionAtRecall(precision, recall, 0.9); + metrics.prec_at_r95 = this.precisionAtRecall(precision, recall, 0.95); + } catch (error) { + console.warn(`Could not calculate precision at recall thresholds: ${error}`); + metrics.prec_at_r80 = NaN; + metrics.prec_at_r90 = NaN; + metrics.prec_at_r95 = NaN; + } + + // Calculate recall at FPR = 0.01 + try { + const { fpr, tpr } = this.rocCurve(yTrue, yScores); + metrics.recall_at_fpr01 = this.recallAtFpr(fpr, tpr, 0.01); + } catch (error) { + console.warn(`Could not calculate recall at FPR=0.01: ${error}`); + metrics.recall_at_fpr01 = NaN; + } + + return metrics; + } + + private calculateRocAuc(yTrue: number[], yScores: number[]): number { + // Sort by score descending + const combined = yTrue.map((label, i) => ({ label, score: yScores[i] })); + combined.sort((a, b) => b.score - a.score); + + const totalPositives = yTrue.filter((y) => y === 1).length; + const totalNegatives = yTrue.length - totalPositives; + + if (totalPositives === 0 || totalNegatives === 0) { + throw new Error('Need both positive and negative samples to calculate ROC AUC'); + } + + let auc = 0; + let tp = 0; + let fp = 0; + let prevTpr = 0; + let prevFpr = 0; + + for (const item of combined) { + if (item.label === 1) { + tp += 1; + } else { + fp += 1; + } + + const tpr = tp / totalPositives; + const fpr = fp / totalNegatives; + + // Trapezoidal rule + auc += (fpr - prevFpr) * (tpr + prevTpr) / 2; + + prevTpr = tpr; + prevFpr = fpr; + } + + return auc; + } + + private precisionRecallCurve(yTrue: number[], yScores: number[]): { + precision: number[]; + recall: number[]; + } { + // Sort by score descending + const combined = yTrue.map((label, i) => ({ label, score: yScores[i] })); + combined.sort((a, b) => b.score - a.score); + + const totalPositives = yTrue.filter((y) => y === 1).length; + if (totalPositives === 0) { + return { precision: [1], recall: [0] }; + } + + const precision: number[] = []; + const recall: number[] = []; + + let tp = 0; + let fp = 0; + + // Add initial point (recall=0, precision=1) + precision.push(1); + recall.push(0); + + for (const item of combined) { + if (item.label === 1) { + tp += 1; + } else { + fp += 1; + } + + const prec = tp + fp > 0 ? 
tp / (tp + fp) : 1; + const rec = tp / totalPositives; + + precision.push(prec); + recall.push(rec); + } + + return { precision, recall }; + } + + private rocCurve(yTrue: number[], yScores: number[]): { fpr: number[]; tpr: number[] } { + // Sort by score descending + const combined = yTrue.map((label, i) => ({ label, score: yScores[i] })); + combined.sort((a, b) => b.score - a.score); + + const totalPositives = yTrue.filter((y) => y === 1).length; + const totalNegatives = yTrue.length - totalPositives; + + const fpr: number[] = [0]; + const tpr: number[] = [0]; + + let tp = 0; + let fp = 0; + + for (const item of combined) { + if (item.label === 1) { + tp += 1; + } else { + fp += 1; + } + + tpr.push(tp / totalPositives); + fpr.push(fp / totalNegatives); + } + + return { fpr, tpr }; + } + + private precisionAtRecall(precision: number[], recall: number[], targetRecall: number): number { + let bestPrecision = 0; + + for (let i = 0; i < recall.length; i += 1) { + if (recall[i] >= targetRecall) { + bestPrecision = Math.max(bestPrecision, precision[i]); + } + } + + return bestPrecision; + } + + private recallAtFpr(fpr: number[], tpr: number[], targetFpr: number): number { + let bestRecall = 0; + + for (let i = 0; i < fpr.length; i += 1) { + if (fpr[i] <= targetFpr) { + bestRecall = Math.max(bestRecall, tpr[i]); + } + } + + return bestRecall; + } + + /** + * Calculate advanced metrics for all guardrails in the results. + * + * @param results - List of evaluation results + * @returns Dictionary mapping guardrail names to their advanced metrics + */ + calculateAllGuardrailMetrics( + results: SampleResult[] + ): Record> { + if (results.length === 0) { + return {}; + } + + const guardrailNames = new Set(); + for (const result of results) { + Object.keys(result.expectedTriggers).forEach((name) => guardrailNames.add(name)); + } + + const metrics: Record> = {}; + + for (const guardrailName of guardrailNames) { + try { + const guardrailMetrics = this.calculateAdvancedMetrics(results, guardrailName); + metrics[guardrailName] = guardrailMetrics; + } catch (error) { + console.error(`Failed to calculate metrics for guardrail '${guardrailName}': ${error}`); + metrics[guardrailName] = { + roc_auc: NaN, + prec_at_r80: NaN, + prec_at_r90: NaN, + prec_at_r95: NaN, + recall_at_fpr01: NaN, + }; + } + } + + return metrics; + } +} + diff --git a/src/evals/core/benchmark-reporter.ts b/src/evals/core/benchmark-reporter.ts new file mode 100644 index 0000000..62d39b1 --- /dev/null +++ b/src/evals/core/benchmark-reporter.ts @@ -0,0 +1,291 @@ +/** + * Benchmark results reporter for guardrail evaluation. + * + * This module handles saving benchmark results in a specialized format with analysis + * folders containing visualizations and detailed metrics. + */ + +import { SampleResult } from './types'; +import * as fs from 'fs/promises'; +import * as path from 'path'; + +/** + * Reports benchmark results with specialized output format. + */ +export class BenchmarkReporter { + private readonly outputDir: string; + + /** + * Initialize the benchmark reporter. + * + * @param outputDir - Base directory for benchmark results + */ + constructor(outputDir: string) { + this.outputDir = outputDir; + } + + /** + * Save benchmark results in organized folder structure. 
+ * + * @param resultsByModel - Dictionary mapping model names to their results + * @param metricsByModel - Dictionary mapping model names to their metrics + * @param latencyResults - Dictionary mapping model names to their latency data + * @param guardrailName - Name of the guardrail being benchmarked + * @param datasetSize - Number of samples in the dataset + * @param latencyIterations - Number of iterations used for latency testing + * @returns Path to the benchmark results directory + */ + async saveBenchmarkResults( + resultsByModel: Record, + metricsByModel: Record>, + latencyResults: Record>, + guardrailName: string, + datasetSize: number, + latencyIterations: number + ): Promise { + const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19).replace('T', '_'); + const benchmarkDir = path.join(this.outputDir, `benchmark_${guardrailName}_${timestamp}`); + + await fs.mkdir(benchmarkDir, { recursive: true }); + + // Create subdirectories + const resultsDir = path.join(benchmarkDir, 'results'); + const graphsDir = path.join(benchmarkDir, 'graphs'); + await fs.mkdir(resultsDir, { recursive: true }); + await fs.mkdir(graphsDir, { recursive: true }); + + try { + // Save per-model results + for (const [modelName, results] of Object.entries(resultsByModel)) { + const modelResultsFile = path.join( + resultsDir, + `eval_results_${guardrailName}_${modelName}.jsonl` + ); + await this.saveResultsJsonl(results, modelResultsFile); + console.info(`Model ${modelName} results saved to ${modelResultsFile}`); + } + + // Save combined data + await this.saveMetricsJson(metricsByModel, path.join(resultsDir, 'performance_metrics.json')); + await this.saveLatencyJson(latencyResults, path.join(resultsDir, 'latency_results.json')); + + // Save summary files + const summaryFile = path.join(benchmarkDir, 'benchmark_summary.txt'); + await this.saveBenchmarkSummary( + summaryFile, + guardrailName, + resultsByModel, + metricsByModel, + latencyResults, + datasetSize, + latencyIterations + ); + + await this.saveSummaryTables(benchmarkDir, metricsByModel, latencyResults); + } catch (error) { + console.error(`Failed to save benchmark results: ${error}`); + throw error; + } + + console.info(`Benchmark results saved to: ${benchmarkDir}`); + return benchmarkDir; + } + + private createPerformanceTable( + metricsByModel: Record> + ): string[][] { + if (Object.keys(metricsByModel).length === 0) { + return []; + } + + const metricKeys = ['precision', 'recall', 'f1Score', 'roc_auc']; + const metricNames = ['Precision', 'Recall', 'F1 Score', 'ROC AUC']; + + const table: string[][] = []; + const header = ['Model', ...metricNames]; + table.push(header); + + for (const [modelName, modelMetrics] of Object.entries(metricsByModel)) { + const row: string[] = [modelName]; + for (const key of metricKeys) { + const value = modelMetrics[key]; + if (value === undefined || isNaN(value)) { + row.push('N/A'); + } else { + row.push(value.toFixed(4)); + } + } + table.push(row); + } + + return table; + } + + private createLatencyTable(latencyResults: Record>): string[][] { + if (Object.keys(latencyResults).length === 0) { + return []; + } + + const table: string[][] = []; + const header = ['Model', 'TTC P50 (ms)', 'TTC P95 (ms)']; + table.push(header); + + for (const [modelName, modelLatency] of Object.entries(latencyResults)) { + const row: string[] = [modelName]; + + if ('ttc' in modelLatency && typeof modelLatency.ttc === 'object' && modelLatency.ttc !== null) { + const ttcData = modelLatency.ttc as Record; + const p50 = 
ttcData.p50; + const p95 = ttcData.p95; + + row.push( + typeof p50 === 'number' && !isNaN(p50) ? p50.toFixed(1) : 'N/A', + typeof p95 === 'number' && !isNaN(p95) ? p95.toFixed(1) : 'N/A' + ); + } else { + row.push('N/A', 'N/A'); + } + + table.push(row); + } + + return table; + } + + private formatTable(table: string[][]): string { + if (table.length === 0) { + return 'No data available'; + } + + // Calculate column widths + const widths: number[] = []; + for (let col = 0; col < table[0].length; col += 1) { + let maxWidth = 0; + for (const row of table) { + if (row[col]) { + maxWidth = Math.max(maxWidth, row[col].length); + } + } + widths.push(maxWidth); + } + + // Format rows + const lines: string[] = []; + for (const row of table) { + const formattedRow = row + .map((cell, i) => (cell || '').padEnd(widths[i] || 0)) + .join(' '); + lines.push(formattedRow); + } + + return lines.join('\n'); + } + + private async saveSummaryTables( + benchmarkDir: string, + metricsByModel: Record>, + latencyResults: Record> + ): Promise { + const outputFile = path.join(benchmarkDir, 'benchmark_summary_tables.txt'); + + try { + const perfTable = this.createPerformanceTable(metricsByModel); + const latencyTable = this.createLatencyTable(latencyResults); + + let content = 'BENCHMARK SUMMARY TABLES\n'; + content += '='.repeat(80) + '\n\n'; + + content += 'PERFORMANCE METRICS\n'; + content += '-'.repeat(80) + '\n'; + content += perfTable.length > 0 ? this.formatTable(perfTable) : 'No data available'; + content += '\n\n'; + + content += 'LATENCY RESULTS (Time to Completion)\n'; + content += '-'.repeat(80) + '\n'; + content += latencyTable.length > 0 ? this.formatTable(latencyTable) : 'No data available'; + content += '\n\n'; + + await fs.writeFile(outputFile, content, 'utf-8'); + console.info(`Summary tables saved to: ${outputFile}`); + } catch (error) { + console.error(`Failed to save summary tables: ${error}`); + } + } + + private async saveResultsJsonl(results: SampleResult[], filepath: string): Promise { + const lines = results.map((result) => + JSON.stringify({ + id: result.id, + expected_triggers: result.expectedTriggers, + triggered: result.triggered, + details: result.details || {}, + }) + ); + await fs.writeFile(filepath, lines.join('\n'), 'utf-8'); + } + + private async saveMetricsJson( + metricsByModel: Record>, + filepath: string + ): Promise { + await fs.writeFile(filepath, JSON.stringify(metricsByModel, null, 2), 'utf-8'); + } + + private async saveLatencyJson( + latencyResults: Record>, + filepath: string + ): Promise { + await fs.writeFile(filepath, JSON.stringify(latencyResults, null, 2), 'utf-8'); + } + + private async saveBenchmarkSummary( + filepath: string, + guardrailName: string, + resultsByModel: Record, + metricsByModel: Record>, + latencyResults: Record>, + datasetSize: number, + latencyIterations: number + ): Promise { + let content = 'Guardrail Benchmark Results\n'; + content += '===========================\n\n'; + content += `Guardrail: ${guardrailName}\n`; + content += `Timestamp: ${new Date().toISOString()}\n`; + content += `Dataset size: ${datasetSize} samples\n`; + content += `Latency iterations: ${latencyIterations}\n\n`; + + content += `Models evaluated: ${Object.keys(resultsByModel).join(', ')}\n\n`; + + content += 'Performance Metrics Summary:\n'; + content += '---------------------------\n'; + for (const [modelName, metrics] of Object.entries(metricsByModel)) { + content += `\n${modelName}:\n`; + for (const [metricName, value] of Object.entries(metrics)) { + if (typeof value 
=== 'number' && !isNaN(value)) { + content += ` ${metricName}: ${value}\n`; + } else { + content += ` ${metricName}: N/A\n`; + } + } + } + + content += '\nLatency Summary:\n'; + content += '----------------\n'; + for (const [modelName, latencyData] of Object.entries(latencyResults)) { + content += `\n${modelName}:\n`; + if ('error' in latencyData) { + content += ` Error: ${latencyData.error}\n`; + } else { + const ttft = latencyData.ttft as Record | undefined; + const ttc = latencyData.ttc as Record | undefined; + if (ttft && ttc) { + content += ` TTFT P50: ${ttft.p50?.toFixed(1) || 'N/A'}ms, P95: ${ttft.p95?.toFixed(1) || 'N/A'}ms\n`; + content += ` TTC P50: ${ttc.p50?.toFixed(1) || 'N/A'}ms, P95: ${ttc.p95?.toFixed(1) || 'N/A'}ms\n`; + } + } + } + + await fs.writeFile(filepath, content, 'utf-8'); + } +} + diff --git a/src/evals/core/index.ts b/src/evals/core/index.ts index 6438799..2de8e5d 100644 --- a/src/evals/core/index.ts +++ b/src/evals/core/index.ts @@ -11,3 +11,7 @@ export * from './validate-dataset'; export * from './async-engine'; export * from './calculator'; export * from './json-reporter'; +export * from './benchmark-calculator'; +export * from './benchmark-reporter'; +export * from './latency-tester'; +export * from './visualizer'; diff --git a/src/evals/core/latency-tester.ts b/src/evals/core/latency-tester.ts new file mode 100644 index 0000000..7134889 --- /dev/null +++ b/src/evals/core/latency-tester.ts @@ -0,0 +1,124 @@ +/** + * Latency testing for guardrail benchmarking. + * + * This module implements end-to-end guardrail latency testing for different models. + */ + +import { Context, Sample } from './types'; +import { AsyncRunEngine } from './async-engine'; +import { instantiateGuardrails, GuardrailBundle } from '../../runtime'; + +/** + * Tests end-to-end guardrail latency for different models. + */ +export class LatencyTester { + private readonly iterations: number; + + /** + * Initialize the latency tester. + * + * @param iterations - Number of samples to time per model + */ + constructor(iterations: number = 20) { + this.iterations = iterations; + } + + /** + * Calculate latency statistics from a list of times. + * + * @param times - List of latency times in seconds + * @returns Dictionary with P50, P95, mean, and std dev (in milliseconds) + */ + calculateLatencyStats(times: number[]): Record { + if (times.length === 0) { + return { p50: NaN, p95: NaN, mean: NaN, std: NaN }; + } + + const timesMs = times.map((t) => t * 1000); // Convert to milliseconds + const sorted = [...timesMs].sort((a, b) => a - b); + + const p50 = this.percentile(sorted, 50); + const p95 = this.percentile(sorted, 95); + const mean = timesMs.reduce((a, b) => a + b, 0) / timesMs.length; + const variance = timesMs.reduce((sum, val) => sum + Math.pow(val - mean, 2), 0) / timesMs.length; + const std = Math.sqrt(variance); + + return { + p50, + p95, + mean, + std, + }; + } + + /** + * Measure end-to-end guardrail latency per sample for a single model. 
+ * + * @param context - Evaluation context with LLM client + * @param stageBundle - Stage bundle configured for the specific model + * @param samples - Full dataset samples + * @param iterations - Number of samples to time (uses first N samples) + * @param desc - Optional progress bar description + * @returns Dictionary with latency statistics and raw times + */ + async testGuardrailLatencyForModel( + context: Context, + stageBundle: GuardrailBundle, + samples: Sample[], + iterations: number, + desc?: string + ): Promise> { + const guardrails = await instantiateGuardrails(stageBundle); + const engine = new AsyncRunEngine(guardrails); + + const num = Math.min(iterations, samples.length); + if (num <= 0) { + return this.emptyLatencyResult(); + } + + const ttcTimes: number[] = []; + const barDesc = desc || 'Latency'; + + console.log(`${barDesc}: ${num} samples`); + + for (let i = 0; i < num; i += 1) { + const sample = samples[i]; + const start = performance.now() / 1000; // Convert to seconds + await engine.run(context, [sample], 1, undefined); + const ttc = performance.now() / 1000 - start; + ttcTimes.push(ttc); + console.log(`${barDesc}: Processed ${i + 1}/${num} samples`); + } + + const ttcStats = this.calculateLatencyStats(ttcTimes); + + return { + ttft: ttcStats, // TTFT same as TTC at guardrail level + ttc: ttcStats, + rawTimes: { ttft: ttcTimes, ttc: ttcTimes }, + iterations: ttcTimes.length, + }; + } + + private emptyLatencyResult(): Record { + const emptyStats = { p50: NaN, p95: NaN, mean: NaN, std: NaN }; + return { + ttft: emptyStats, + ttc: emptyStats, + rawTimes: { ttft: [], ttc: [] }, + iterations: 0, + }; + } + + private percentile(sorted: number[], p: number): number { + if (sorted.length === 0) { + return NaN; + } + const index = (p / 100) * (sorted.length - 1); + const lower = Math.floor(index); + const upper = Math.ceil(index); + const weight = index - lower; + return sorted[lower] * (1 - weight) + sorted[upper] * weight; + } +} + diff --git a/src/evals/core/visualizer.ts b/src/evals/core/visualizer.ts new file mode 100644 index 0000000..a637195 --- /dev/null +++ b/src/evals/core/visualizer.ts @@ -0,0 +1,68 @@ +/** + * Visualization module for guardrail benchmarking. + * + * This module generates charts and graphs for benchmark results. + * Note: Full visualization requires additional plotting libraries. + * This is a stub implementation that matches the Python interface. + */ + +import * as fs from 'fs/promises'; +import * as path from 'path'; + +/** + * Generates visualizations for guardrail benchmark results. + */ +export class BenchmarkVisualizer { + private readonly outputDir: string; + + /** + * Initialize the visualizer. + * + * @param outputDir - Directory to save generated charts + */ + constructor(outputDir: string) { + this.outputDir = outputDir; + } + + /** + * Create all visualizations for a benchmark run. 
+ * + * @param resultsByModel - Dictionary mapping model names to their results + * @param metricsByModel - Dictionary mapping model names to their metrics + * @param latencyResults - Dictionary mapping model names to their latency data + * @param guardrailName - Name of the guardrail being evaluated + * @param _expectedTriggers - Expected trigger values for each sample (reserved for future use) + * @returns List of paths to saved visualization files + */ + async createAllVisualizations( + resultsByModel: Record, + metricsByModel: Record>, + latencyResults: Record>, + guardrailName: string, + _expectedTriggers: Record + ): Promise { + const savedFiles: string[] = []; + + // Ensure output directory exists + await fs.mkdir(this.outputDir, { recursive: true }); + + // Note: Full visualization requires plotting libraries (e.g., plotly, chart.js, etc.) + // For now, we create a placeholder file indicating visualizations would be generated here + try { + const placeholderFile = path.join(this.outputDir, 'visualizations_placeholder.txt'); + await fs.writeFile( + placeholderFile, + `Visualizations would be generated here for guardrail: ${guardrailName}\n` + + `Models: ${Object.keys(resultsByModel).join(', ')}\n` + + `Note: Full visualization requires additional plotting libraries.\n`, + 'utf-8' + ); + savedFiles.push(placeholderFile); + } catch (error) { + console.error(`Failed to create visualization placeholder: ${error}`); + } + + return savedFiles; + } +} + diff --git a/src/evals/guardrail-evals.ts b/src/evals/guardrail-evals.ts index a40e352..9bcb813 100644 --- a/src/evals/guardrail-evals.ts +++ b/src/evals/guardrail-evals.ts @@ -1,17 +1,34 @@ /** - * Guardrail evaluation runner. + * Guardrail evaluation runner and CLI. * - * This class provides the main interface for running guardrail evaluations on datasets. - * It loads guardrail configurations, runs evaluations asynchronously, calculates metrics, and saves results. + * This script provides a command-line interface and class for running guardrail evaluations on datasets. */ -import { Context } from './core/types'; +import { Context, Sample, SampleResult } from './core/types'; import { JsonlDatasetLoader } from './core/jsonl-loader'; import { AsyncRunEngine } from './core/async-engine'; import { GuardrailMetricsCalculator } from './core/calculator'; import { JsonResultsReporter } from './core/json-reporter'; -import { loadConfigBundleFromFile, instantiateGuardrails } from '../runtime'; +import { BenchmarkMetricsCalculator } from './core/benchmark-calculator'; +import { BenchmarkReporter } from './core/benchmark-reporter'; +import { BenchmarkVisualizer } from './core/visualizer'; +import { LatencyTester } from './core/latency-tester'; +import { + instantiateGuardrails, + loadPipelineBundles, + PipelineConfig, + GuardrailBundle, +} from '../runtime'; import { OpenAI } from 'openai'; +import * as os from 'os'; +import * as fs from 'fs/promises'; +import * as path from 'path'; + +// Default models for benchmark mode +const DEFAULT_BENCHMARK_MODELS = ['gpt-5', 'gpt-5-mini', 'gpt-4.1', 'gpt-4.1-mini']; +const DEFAULT_BATCH_SIZE = 32; +const DEFAULT_LATENCY_ITERATIONS = 25; +const VALID_STAGES = new Set(['pre_flight', 'input', 'output']); /** * Class for running guardrail evaluations. 
@@ -19,69 +36,676 @@ import { OpenAI } from 'openai'; export class GuardrailEval { private configPath: string; private datasetPath: string; + private stages: string[] | null; private batchSize: number; private outputDir: string; + private apiKey: string | null; + private baseUrl: string | null; + private azureEndpoint: string | null; + private azureApiVersion: string; + private mode: 'evaluate' | 'benchmark'; + private models: string[]; + private latencyIterations: number; private multiTurn: boolean; + private maxParallelModels: number; + private benchmarkChunkSize: number | null; /** * Initialize the evaluator. * - * @param configPath - Path to the guardrail config file - * @param datasetPath - Path to the evaluation dataset + * @param configPath - Path to pipeline configuration file + * @param datasetPath - Path to evaluation dataset (JSONL) + * @param stages - Specific stages to evaluate (pre_flight, input, output) * @param batchSize - Number of samples to process in parallel * @param outputDir - Directory to save evaluation results + * @param apiKey - API key for OpenAI, Azure OpenAI, or OpenAI-compatible API + * @param baseUrl - Base URL for OpenAI-compatible API (e.g., http://localhost:11434/v1) + * @param azureEndpoint - Azure OpenAI endpoint (e.g., https://your-resource.openai.azure.com) + * @param azureApiVersion - Azure OpenAI API version (e.g., 2025-01-01-preview) + * @param mode - Evaluation mode ("evaluate" or "benchmark") + * @param models - Models to test in benchmark mode + * @param latencyIterations - Number of iterations for latency testing + * @param multiTurn - Whether to evaluate guardrails on multi-turn conversations + * @param maxParallelModels - Maximum number of models to benchmark concurrently + * @param benchmarkChunkSize - Optional sample chunk size for per-model benchmarking */ constructor( configPath: string, datasetPath: string, - batchSize: number = 32, + stages: string[] | null = null, + batchSize: number = DEFAULT_BATCH_SIZE, outputDir: string = 'results', - multiTurn: boolean = false + apiKey: string | null = null, + baseUrl: string | null = null, + azureEndpoint: string | null = null, + azureApiVersion: string = '2025-01-01-preview', + mode: 'evaluate' | 'benchmark' = 'evaluate', + models: string[] | null = null, + latencyIterations: number = DEFAULT_LATENCY_ITERATIONS, + multiTurn: boolean = false, + maxParallelModels: number | null = null, + benchmarkChunkSize: number | null = null ) { + // Note: File existence validation will happen in run() method + // since constructor cannot be async + if (batchSize <= 0) { + throw new Error(`Batch size must be positive, got: ${batchSize}`); + } + + if (mode !== 'evaluate' && mode !== 'benchmark') { + throw new Error(`Invalid mode: ${mode}. 
Must be 'evaluate' or 'benchmark'`); + } + + if (latencyIterations <= 0) { + throw new Error(`Latency iterations must be positive, got: ${latencyIterations}`); + } + + if (maxParallelModels !== null && maxParallelModels <= 0) { + throw new Error(`max_parallel_models must be positive, got: ${maxParallelModels}`); + } + + if (benchmarkChunkSize !== null && benchmarkChunkSize <= 0) { + throw new Error(`benchmark_chunk_size must be positive, got: ${benchmarkChunkSize}`); + } + this.configPath = configPath; this.datasetPath = datasetPath; + this.stages = stages; this.batchSize = batchSize; this.outputDir = outputDir; + this.apiKey = apiKey; + this.baseUrl = baseUrl; + this.azureEndpoint = azureEndpoint; + this.azureApiVersion = azureApiVersion; + this.mode = mode; + this.models = models || [...DEFAULT_BENCHMARK_MODELS]; + this.latencyIterations = latencyIterations; this.multiTurn = multiTurn; + this.maxParallelModels = GuardrailEval._determineParallelModelLimit( + this.models.length, + maxParallelModels + ); + this.benchmarkChunkSize = benchmarkChunkSize; + } + + private async _validateFilePaths(): Promise { + try { + await fs.access(this.configPath); + } catch { + throw new Error(`Config file not found: ${this.configPath}`); + } + + try { + await fs.access(this.datasetPath); + } catch { + throw new Error(`Dataset file not found: ${this.datasetPath}`); + } } /** - * Run the evaluation pipeline. + * Resolve the number of benchmark tasks that can run concurrently. * - * @param desc - Description for the evaluation process + * @param modelCount - Total number of models scheduled for benchmarking + * @param requestedLimit - Optional user-provided parallelism limit + * @returns Number of concurrent benchmark tasks to run */ - async run(desc: string = 'Evaluating samples'): Promise { - // Load/validate config, instantiate guardrails - const bundle = await loadConfigBundleFromFile(this.configPath); - const guardrails = await instantiateGuardrails(bundle); + static _determineParallelModelLimit(modelCount: number, requestedLimit?: number | null): number { + if (modelCount <= 0) { + throw new Error('modelCount must be positive'); + } + + if (requestedLimit !== null && requestedLimit !== undefined) { + if (requestedLimit <= 0) { + throw new Error('maxParallelModels must be positive'); + } + return Math.min(requestedLimit, modelCount); + } + + const cpuCount = os.cpus().length || 1; + return Math.max(1, Math.min(cpuCount, modelCount)); + } + + /** + * Yield contiguous sample chunks respecting the configured chunk size. + * + * @param samples - Samples to evaluate + * @param chunkSize - Optional maximum chunk size to enforce + * @returns Generator yielding slices of the provided samples + */ + static *_chunkSamples(samples: Sample[], chunkSize?: number | null): Generator { + if (chunkSize !== null && chunkSize !== undefined && chunkSize <= 0) { + throw new Error('chunkSize must be positive when provided'); + } + + if (!samples || samples.length === 0 || chunkSize === null || chunkSize === undefined || chunkSize >= samples.length) { + yield samples; + return; + } + + for (let start = 0; start < samples.length; start += chunkSize) { + yield samples.slice(start, start + chunkSize); + } + } + + /** + * Run the evaluation pipeline for all specified stages. 
+ */ + async run(): Promise { + await this._validateFilePaths(); + try { + if (this.mode === 'benchmark') { + await this._runBenchmark(); + } else { + await this._runEvaluation(); + } + } catch (error) { + console.error(`Evaluation failed: ${error}`); + throw error; + } + } + + private async _runEvaluation(): Promise { + const pipelineBundles = await loadPipelineBundles(this.configPath); + const stagesToEvaluate = this._getValidStages(pipelineBundles); + + if (stagesToEvaluate.length === 0) { + throw new Error('No valid stages found in configuration'); + } + + console.info(`event="evaluation_start" stages="${stagesToEvaluate.join(', ')}" mode="evaluate"`); - // Load and validate dataset const loader = new JsonlDatasetLoader(); const samples = await loader.load(this.datasetPath); + console.info(`Loaded ${samples.length} samples from dataset`); + + const context = this._createContext(); + const calculator = new GuardrailMetricsCalculator(); + const reporter = new JsonResultsReporter(); + + const allResults: Record = {}; + const allMetrics: Record> = {}; + + for (const stage of stagesToEvaluate) { + console.info(`Starting ${stage} stage evaluation`); + + try { + const stageResults = await this._evaluateSingleStage( + stage, + pipelineBundles, + samples, + context, + calculator + ); + + if (stageResults) { + allResults[stage] = stageResults.results; + allMetrics[stage] = stageResults.metrics; + console.info(`Completed ${stage} stage evaluation`); + } else { + console.warn(`Stage '${stage}' evaluation returned no results`); + } + } catch (error) { + console.error(`Failed to evaluate stage '${stage}': ${error}`); + } + } + + if (Object.keys(allResults).length === 0) { + throw new Error('No stages were successfully evaluated'); + } + + // Note: JsonResultsReporter.save_multi_stage would need to be implemented + // For now, save each stage separately + for (const [stage, results] of Object.entries(allResults)) { + const stageMetrics = allMetrics[stage] as ReturnType; + await reporter.save(results, stageMetrics, this.outputDir); + } + + console.info(`Evaluation completed. Results saved to: ${this.outputDir}`); + } + + private async _runBenchmark(): Promise { + console.info(`event="benchmark_start" duration_ms=0 models="${this.models.join(', ')}"`); + console.info( + `event="benchmark_parallel_config" duration_ms=0 parallel_limit=${this.maxParallelModels} chunk_size=${ + this.benchmarkChunkSize || 'dataset' + } batch_size=${this.batchSize}` + ); - // Initialize components - if (!process.env.OPENAI_API_KEY) { + const pipelineBundles = await loadPipelineBundles(this.configPath); + const { stageToTest, guardrailName } = this._getBenchmarkTarget(pipelineBundles); + + // Validate guardrail has model configuration + const stageBundle = (pipelineBundles as Record)[stageToTest]; + if (!this._hasModelConfiguration(stageBundle)) { throw new Error( - 'OPENAI_API_KEY environment variable is required. Please set it with: export OPENAI_API_KEY="your-api-key-here"' + `Guardrail '${guardrailName}' does not have a model configuration. ` + + 'Benchmark mode requires LLM-based guardrails with configurable models.' 
); } + console.info(`event="benchmark_target" duration_ms=0 guardrail="${guardrailName}" stage="${stageToTest}"`); + + const loader = new JsonlDatasetLoader(); + const samples = await loader.load(this.datasetPath); + console.info(`event="benchmark_samples_loaded" duration_ms=0 count=${samples.length}`); + + const context = this._createContext(); + const benchmarkCalculator = new BenchmarkMetricsCalculator(); + const basicCalculator = new GuardrailMetricsCalculator(); + const benchmarkReporter = new BenchmarkReporter(this.outputDir); + + // Run benchmark for all models + const { resultsByModel, metricsByModel } = await this._benchmarkAllModels( + stageToTest, + guardrailName, + samples, + context, + benchmarkCalculator, + basicCalculator, + pipelineBundles + ); + + // Run latency testing + console.info(`event="benchmark_latency_start" duration_ms=0 model_count=${this.models.length}`); + const latencyResults = await this._runLatencyTests(stageToTest, samples, pipelineBundles); + + // Save benchmark results + const benchmarkDir = await benchmarkReporter.saveBenchmarkResults( + resultsByModel, + metricsByModel, + latencyResults, + guardrailName, + samples.length, + this.latencyIterations + ); + + // Create visualizations + console.info(`event="benchmark_visualization_start" duration_ms=0 guardrail="${guardrailName}"`); + const visualizer = new BenchmarkVisualizer(path.join(benchmarkDir, 'graphs')); + const visualizationFiles = await visualizer.createAllVisualizations( + resultsByModel, + metricsByModel, + latencyResults, + guardrailName, + samples[0]?.expectedTriggers || {} + ); + + console.info(`event="benchmark_complete" duration_ms=0 output="${benchmarkDir}"`); + console.info(`event="benchmark_visualization_complete" duration_ms=0 count=${visualizationFiles.length}`); + } + + private _hasModelConfiguration(stageBundle: GuardrailBundle | undefined): boolean { + if (!stageBundle || !stageBundle.guardrails || stageBundle.guardrails.length === 0) { + return false; + } + + const guardrailConfig = stageBundle.guardrails[0]?.config; + if (!guardrailConfig) { + return false; + } + + if (typeof guardrailConfig === 'object' && 'model' in guardrailConfig) { + return true; + } + + return false; + } + + private async _runLatencyTests( + stageToTest: string, + samples: Sample[], + pipelineBundles: PipelineConfig + ): Promise>> { + const latencyResults: Record> = {}; + const latencyTester = new LatencyTester(this.latencyIterations); + + for (const model of this.models) { + const stageBundle = (pipelineBundles as Record)[stageToTest]; + const modelStageBundle = this._createModelSpecificStageBundle(stageBundle, model); + const modelContext = this._createContext(); + latencyResults[model] = await latencyTester.testGuardrailLatencyForModel( + modelContext, + modelStageBundle, + samples, + this.latencyIterations, + `Testing latency: ${model}` + ); + } + + return latencyResults; + } + + private _createContext(): Context { + // Azure OpenAI + if (this.azureEndpoint) { + // Validate API key availability + const apiKey = this.apiKey || process.env.OPENAI_API_KEY; + if (!apiKey) { + throw new Error( + 'API key is required for Azure OpenAI. Please provide --api-key or set OPENAI_API_KEY environment variable.' 
+ ); + } + + const azureKwargs: Record = { + azureEndpoint: this.azureEndpoint, + apiVersion: this.azureApiVersion, + }; + if (this.apiKey) { + azureKwargs.apiKey = this.apiKey; + } + + // Note: Azure OpenAI client creation would need AzureOpenAI import + // For now, fall back to regular OpenAI with base URL const openaiClient = new OpenAI({ - apiKey: process.env.OPENAI_API_KEY, - }); - const context: Context = { guardrailLlm: openaiClient }; + apiKey: apiKey, + baseURL: `https://${this.azureEndpoint.replace(/^https?:\/\//, '')}/openai/deployments`, + }); + console.info(`event="client_created" type="azure" endpoint="${this.azureEndpoint}"`); + return { guardrailLlm: openaiClient }; + } + // OpenAI or OpenAI-compatible API + else { + const openaiKwargs: Record = {}; + if (this.apiKey) { + openaiKwargs.apiKey = this.apiKey; + } else if (process.env.OPENAI_API_KEY) { + openaiKwargs.apiKey = process.env.OPENAI_API_KEY; + } else { + throw new Error( + 'OPENAI_API_KEY environment variable is required. Please set it with: export OPENAI_API_KEY="your-api-key-here"' + ); + } + if (this.baseUrl) { + openaiKwargs.baseURL = this.baseUrl; + console.info(`event="client_created" type="openai_compatible" base_url="${this.baseUrl}"`); + } else { + console.info(`event="client_created" type="openai"`); + } + + const openaiClient = new OpenAI(openaiKwargs); + return { guardrailLlm: openaiClient }; + } + } + + private _isValidStage(pipelineBundles: PipelineConfig, stage: string): boolean { + const bundles = pipelineBundles as Record; + const stageBundle = bundles[stage]; + return stageBundle !== undefined && stageBundle !== null && stageBundle.guardrails && stageBundle.guardrails.length > 0; + } + + /** + * Create a modified copy of a stage bundle with model-specific configuration. 
+ * + * @param stageBundle - Original stage bundle + * @param model - Model name to inject into guardrail configs + * @returns Modified stage bundle with updated model configuration + */ + private _createModelSpecificStageBundle(stageBundle: GuardrailBundle, model: string): GuardrailBundle { + // Deep copy the bundle using structuredClone for better performance + // Fall back to JSON parse/stringify for compatibility + let modifiedBundle: GuardrailBundle; + try { + modifiedBundle = structuredClone(stageBundle); + } catch { + modifiedBundle = JSON.parse(JSON.stringify(stageBundle)); + } + + for (const guardrail of modifiedBundle.guardrails) { + if (guardrail.config && typeof guardrail.config === 'object' && 'model' in guardrail.config) { + guardrail.config.model = model; + } + } + + return modifiedBundle; + } + + private _getValidStages(pipelineBundles: PipelineConfig): string[] { + if (this.stages === null) { + // Auto-detect all valid stages + const availableStages = Array.from(VALID_STAGES).filter((stage) => + this._isValidStage(pipelineBundles, stage) + ); + + if (availableStages.length === 0) { + throw new Error('No valid stages found in configuration'); + } + + console.info(`event="stage_auto_detection" stages="${availableStages.join(', ')}"`); + return availableStages; + } else { + // Validate requested stages + const validRequestedStages: string[] = []; + for (const stage of this.stages) { + if (!VALID_STAGES.has(stage)) { + console.warn(`Invalid stage '${stage}', skipping`); + continue; + } + + if (!this._isValidStage(pipelineBundles, stage)) { + console.warn(`Stage '${stage}' not found or has no guardrails configured, skipping`); + continue; + } + + validRequestedStages.push(stage); + } + + if (validRequestedStages.length === 0) { + throw new Error('No valid stages found in configuration'); + } + + return validRequestedStages; + } + } + + private async _evaluateSingleStage( + stage: string, + pipelineBundles: PipelineConfig, + samples: Sample[], + context: Context, + calculator: GuardrailMetricsCalculator + ): Promise<{ results: SampleResult[]; metrics: Record } | null> { + try { + const stageBundle = (pipelineBundles as Record)[stage]; + const guardrails = await instantiateGuardrails(stageBundle); + + const engine = new AsyncRunEngine(guardrails, this.multiTurn); + + const stageResults = await engine.run(context, samples, this.batchSize, `Evaluating ${stage} stage`); + + const stageMetrics = calculator.calculate(stageResults); + + return { results: stageResults, metrics: stageMetrics }; + } catch (error) { + console.error(`Failed to evaluate stage '${stage}': ${error}`); + return null; + } + } + + private _getBenchmarkTarget(pipelineBundles: PipelineConfig): { stageToTest: string; guardrailName: string } { + let stageToTest: string; + if (this.stages && this.stages.length > 0) { + stageToTest = this.stages[0]; + if (!this._isValidStage(pipelineBundles, stageToTest)) { + throw new Error(`Stage '${stageToTest}' has no guardrails configured`); + } + } else { + // Find first valid stage + stageToTest = Array.from(VALID_STAGES).find((stage) => this._isValidStage(pipelineBundles, stage)) || ''; + if (!stageToTest) { + throw new Error('No valid stage found for benchmarking'); + } + } + + const stageBundle = (pipelineBundles as Record)[stageToTest]; + const guardrailName = stageBundle.guardrails[0]?.name || 'unknown'; + + return { stageToTest, guardrailName }; + } + + private async _benchmarkAllModels( + stageToTest: string, + guardrailName: string, + samples: Sample[], + context: 
Context, + benchmarkCalculator: BenchmarkMetricsCalculator, + basicCalculator: GuardrailMetricsCalculator, + pipelineBundles: PipelineConfig + ): Promise<{ + resultsByModel: Record; + metricsByModel: Record>; + }> { + const stageBundle = (pipelineBundles as Record)[stageToTest]; + + const resultsByModel: Record = {}; + const metricsByModel: Record> = {}; + + // Create semaphore for concurrency control using a proper async queue + const maxActive = this.maxParallelModels; + const semaphore: Array<() => void> = []; + let running = 0; + + const acquire = (): Promise => { + return new Promise((resolve) => { + const tryAcquire = () => { + if (running < maxActive) { + running += 1; + resolve(); + } else { + semaphore.push(tryAcquire); + } + }; + tryAcquire(); + }); + }; + + const release = (): void => { + running -= 1; + if (semaphore.length > 0) { + const next = semaphore.shift(); + if (next) { + next(); + } + } + }; + + const runModelTask = async (index: number, model: string): Promise => { + await acquire(); + + const startTime = performance.now(); + console.info(`event="benchmark_model_start" duration_ms=0 model="${model}" position=${index} total=${this.models.length} active=${running}/${maxActive}`); + + try { + const modifiedStageBundle = this._createModelSpecificStageBundle(stageBundle, model); + + const modelResults = await this._benchmarkSingleModel( + model, + modifiedStageBundle, + samples, + context, + guardrailName, + benchmarkCalculator, + basicCalculator + ); + + const elapsedMs = performance.now() - startTime; + + if (modelResults) { + resultsByModel[model] = modelResults.results; + metricsByModel[model] = modelResults.metrics; + console.info(`event="benchmark_model_complete" duration_ms=${elapsedMs.toFixed(2)} model="${model}" status="success"`); + } else { + resultsByModel[model] = []; + metricsByModel[model] = {}; + console.warn(`event="benchmark_model_empty" duration_ms=${elapsedMs.toFixed(2)} model="${model}" status="no_results"`); + } + } catch (error) { + const elapsedMs = performance.now() - startTime; + resultsByModel[model] = []; + metricsByModel[model] = {}; + console.error(`event="benchmark_model_failure" duration_ms=${elapsedMs.toFixed(2)} model="${model}" error="${error}"`); + } finally { + release(); + } + }; + + // Start all tasks in parallel (they will be throttled by the semaphore) + const tasks = this.models.map((model, idx) => runModelTask(idx + 1, model)); + await Promise.all(tasks); + + // Log summary + const successfulModels = this.models.filter((model) => resultsByModel[model] && resultsByModel[model].length > 0); + const failedModels = this.models.filter((model) => !resultsByModel[model] || resultsByModel[model].length === 0); + + console.info(`event="benchmark_summary" duration_ms=0 successful=${successfulModels.length} failed=${failedModels.length}`); + console.info(`event="benchmark_successful_models" duration_ms=0 models="${successfulModels.join(', ') || 'None'}"`); + if (failedModels.length > 0) { + console.warn(`event="benchmark_failed_models" duration_ms=0 models="${failedModels.join(', ')}"`); + } + console.info(`event="benchmark_total_models" duration_ms=0 total=${this.models.length}`); + + return { resultsByModel, metricsByModel }; + } + + private async _benchmarkSingleModel( + model: string, + stageBundle: GuardrailBundle, + samples: Sample[], + context: Context, + guardrailName: string, + benchmarkCalculator: BenchmarkMetricsCalculator, + basicCalculator: GuardrailMetricsCalculator + ): Promise<{ results: SampleResult[]; metrics: Record } | 
null> { + try { + const guardrails = await instantiateGuardrails(stageBundle); const engine = new AsyncRunEngine(guardrails, this.multiTurn); - const calculator = new GuardrailMetricsCalculator(); - const reporter = new JsonResultsReporter(); + const chunkTotal = this.benchmarkChunkSize && samples.length > 0 + ? Math.max(1, Math.ceil(samples.length / this.benchmarkChunkSize)) + : 1; - // Run evaluations - const results = await engine.run(context, samples, this.batchSize, desc); + const modelResults: SampleResult[] = []; + let chunkIndex = 1; + for (const chunk of GuardrailEval._chunkSamples(samples, this.benchmarkChunkSize)) { + const chunkDesc = + chunkTotal === 1 + ? `Benchmarking ${model}` + : `Benchmarking ${model} (${chunkIndex}/${chunkTotal})`; + const chunkResults = await engine.run(context, chunk, this.batchSize, chunkDesc); + modelResults.push(...chunkResults); + chunkIndex += 1; + } - // Calculate metrics - const metrics = calculator.calculate(results); + const guardrailConfig = stageBundle.guardrails[0]?.config || null; + + const advancedMetrics = benchmarkCalculator.calculateAdvancedMetrics( + modelResults, + guardrailName, + guardrailConfig as Record | null + ); - // Save results - await reporter.save(results, metrics, this.outputDir); + const basicMetrics = basicCalculator.calculate(modelResults); + + let basicMetricsDict: Record = {}; + if (guardrailName in basicMetrics) { + const guardrailMetrics = basicMetrics[guardrailName]; + basicMetricsDict = { + precision: guardrailMetrics.precision, + recall: guardrailMetrics.recall, + f1Score: guardrailMetrics.f1Score, + truePositives: guardrailMetrics.truePositives, + falsePositives: guardrailMetrics.falsePositives, + falseNegatives: guardrailMetrics.falseNegatives, + trueNegatives: guardrailMetrics.trueNegatives, + totalSamples: guardrailMetrics.totalSamples, + }; + } + + const combinedMetrics = { ...basicMetricsDict, ...advancedMetrics }; + + return { results: modelResults, metrics: combinedMetrics }; + } catch (error) { + console.error(`Failed to benchmark model ${model}: ${error}`); + return null; + } } } @@ -93,16 +717,36 @@ export class GuardrailEval { export async function runEvaluationCLI(args: { configPath: string; datasetPath: string; + stages?: string[] | null; batchSize?: number; outputDir?: string; + apiKey?: string | null; + baseUrl?: string | null; + azureEndpoint?: string | null; + azureApiVersion?: string; + mode?: 'evaluate' | 'benchmark'; + models?: string[] | null; + latencyIterations?: number; multiTurn?: boolean; + maxParallelModels?: number | null; + benchmarkChunkSize?: number | null; }): Promise { const evaluator = new GuardrailEval( args.configPath, args.datasetPath, - args.batchSize || 32, + args.stages || null, + args.batchSize || DEFAULT_BATCH_SIZE, args.outputDir || 'results', - Boolean(args.multiTurn) + args.apiKey || null, + args.baseUrl || null, + args.azureEndpoint || null, + args.azureApiVersion || '2025-01-01-preview', + args.mode || 'evaluate', + args.models || null, + args.latencyIterations || DEFAULT_LATENCY_ITERATIONS, + Boolean(args.multiTurn), + args.maxParallelModels || null, + args.benchmarkChunkSize || null ); await evaluator.run();
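
---

For reviewers: below is a minimal, hedged sketch (not part of this diff) showing how the new benchmark options introduced here could be driven programmatically through `runEvaluationCLI`. The option names (`mode`, `models`, `maxParallelModels`, `benchmarkChunkSize`, `latencyIterations`) come from the function signature added in `src/evals/guardrail-evals.ts`; the import path, file names, and concrete values are illustrative placeholders and assume `OPENAI_API_KEY` is set in the environment (or an `apiKey` is passed explicitly), matching the validation in `_createContext`.

```ts
// Sketch: run benchmark mode with the new parallelism/chunking options.
// Import path is an assumption — adjust to wherever guardrail-evals is
// exported from in your build.
import { runEvaluationCLI } from './evals/guardrail-evals';

async function main(): Promise<void> {
  await runEvaluationCLI({
    configPath: 'config.json',        // pipeline configuration (placeholder path)
    datasetPath: 'dataset.jsonl',     // JSONL evaluation dataset (placeholder path)
    mode: 'benchmark',                // compare models instead of a single evaluate run
    models: ['gpt-5', 'gpt-5-mini'],  // subset of the default benchmark models
    maxParallelModels: 2,             // cap concurrent per-model benchmark tasks
    benchmarkChunkSize: 100,          // evaluate samples in chunks of 100 per model
    latencyIterations: 25,            // samples timed for the latency report
    outputDir: 'results',
  });
}

main().catch((error) => {
  console.error(error);
  process.exit(1);
});
```

The equivalent CLI invocation, using flags this change adds to `showHelp()`, would be roughly `guardrails eval --config-path config.json --dataset-path dataset.jsonl --mode benchmark --models gpt-5 gpt-5-mini --max-parallel-models 2 --benchmark-chunk-size 100`. If `--max-parallel-models` is omitted, the limit defaults to `min(models, cpu_count)` as computed by `_determineParallelModelLimit`; omitting `--benchmark-chunk-size` processes the whole dataset in a single chunk per model.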