diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 000000000..3e8d07066
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,73 @@
+name: Performance Benchmarks
+
+# Triggers: pushes and pull requests targeting main, plus manual dispatch.
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+    paths-ignore:
+      - '**/*.md'
+  workflow_dispatch:
+    inputs:
+      # NOTE(review): no step in this workflow reads `inputs.iterations` yet, so
+      # changing it has no effect until it is passed on to the benchmark run.
+      iterations:
+        description: 'Number of benchmark iterations'
+        required: false
+        default: '3'
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  benchmark:
+    name: Performance Benchmarks
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
+        with:
+          node-version: '20'
+          cache: 'npm'
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Build project
+        run: npm run build
+
+      - name: Pre-benchmark cleanup
+        run: sudo ./scripts/ci/cleanup.sh
+
+      # The output is piped through `tee`, so `pipefail` is required for a failing
+      # benchmark run to fail this step. Without it the pipeline reports tee's exit
+      # status and the "Check benchmark results" step below could never trigger.
+      # continue-on-error lets the summary, cleanup and upload steps run first.
+      - name: Run performance benchmarks
+        id: run-benchmarks
+        shell: bash
+        run: |
+          set -o pipefail
+          sudo -E npm run test:benchmark 2>&1 | tee benchmark-output.log
+        continue-on-error: true
+
+      - name: Generate benchmark summary
+        if: always()
+        run: |
+          npx tsx scripts/ci/generate-benchmark-summary.ts benchmark-output.log
+
+      # Re-raise the benchmark failure that continue-on-error suppressed above.
+      - name: Check benchmark results
+        if: steps.run-benchmarks.outcome == 'failure'
+        run: exit 1
+
+      - name: Post-benchmark cleanup
+        if: always()
+        run: sudo ./scripts/ci/cleanup.sh
+
+      - name: Upload benchmark report
+        if: always()
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        with:
+          name: benchmark-report
+          path: |
+            /tmp/awf-benchmark-report.json
+            benchmark-output.log
+          retention-days: 30
diff --git a/package.json b/package.json
index 3856d5df3..67466d0cc 100644
--- a/package.json
+++ b/package.json
@@ -12,6 +12,7 @@
     "test": "jest",
     "test:unit": "jest --config jest.config.js",
     "test:integration": "jest --config tests/setup/jest.integration.config.js",
+    "test:benchmark": "jest --config tests/setup/jest.benchmark.config.js",
     "test:all": "npm run test:unit && npm run test:integration",
     "test:watch": "jest --watch",
     "test:coverage": "jest --coverage",
diff --git a/scripts/ci/generate-benchmark-summary.ts b/scripts/ci/generate-benchmark-summary.ts
new file mode 100644
index 000000000..07cd0fe71
--- /dev/null
+++ b/scripts/ci/generate-benchmark-summary.ts
@@ -0,0 +1,183 @@
+#!/usr/bin/env node
+/**
+ * Generate GitHub Actions job summary from benchmark test output
+ * This script parses benchmark output and creates a markdown summary
+ * with performance metrics and statistics.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+
+/**
+ * Samples collected for one benchmark, grouped by the name found in the log output.
+ */
+interface BenchmarkMetric {
+  /** Benchmark name captured from a `[Benchmark] <name>: Iteration N completed - ...` log line */
+  name: string;
+  /** Metric identifier; parseJestOutput fills this with the same value as `name` */
+  metric: string;
+  /** Unit token following the measured value, taken from the first matching line for this name */
+  unit: string;
+  /** One parsed value per matching log line, in the order the lines appear */
+  values: number[];
+}
+
+/**
+ * Everything parseJestOutput extracts from the captured benchmark log.
+ */
+interface ParsedResults {
+  /** Benchmark samples grouped by benchmark name */
+  metrics: BenchmarkMetric[];
+  /** Count from the "N passed" part of the `Tests:` line (0 when absent) */
+  passed: number;
+  /** Count from the "N failed" part of the `Tests:` line (0 when absent) */
+  failed: number;
+  /** Text captured from the `Time:` line (e.g. "12.345 s"), or 'unknown' when absent */
+  duration: string;
+}
+
+/**
+ * Extract test counts, total run time and benchmark samples from captured Jest output.
+ *
+ * ANSI colour codes and carriage returns are removed before parsing so that coloured
+ * or CRLF-terminated logs are handled the same way as plain ones.
+ *
+ * @param output - Raw text captured from the benchmark run
+ * @returns Pass/fail counts (0 when the `Tests:` line is missing), the duration text
+ *          ('unknown' when the `Time:` line is missing) and one entry per benchmark
+ *          name, in order of first appearance
+ */
+function parseJestOutput(output: string): ParsedResults {
+  // eslint-disable-next-line no-control-regex
+  const cleaned = output.replace(/\u001b\[[0-9;]*m/g, '').replace(/\r/g, '');
+  const lines = cleaned.split('\n');
+
+  let passed = 0;
+  let failed = 0;
+  let duration = 'unknown';
+
+  // Jest summary line, e.g. "Tests:       1 failed, 4 passed, 5 total"
+  const testsLine = lines.find(line => line.startsWith('Tests:'));
+  if (testsLine) {
+    passed = parseInt(testsLine.match(/(\d+) passed/)?.[1] ?? '0', 10);
+    failed = parseInt(testsLine.match(/(\d+) failed/)?.[1] ?? '0', 10);
+  }
+
+  // Jest timing line, e.g. "Time:        12.345 s"; the first matching line wins
+  for (const line of lines) {
+    const timeMatch = line.match(/Time:\s+([\d.]+\s*s)/);
+    if (timeMatch) {
+      duration = timeMatch[1];
+      break;
+    }
+  }
+
+  // Benchmark samples, logged as:
+  //   "[Benchmark] container_startup: Iteration 1 completed - 5234 ms"
+  // A Map keeps first-seen order and avoids a linear search per sample.
+  const metricsByName = new Map<string, BenchmarkMetric>();
+  const benchmarkPattern = /\[Benchmark\] (\w+): Iteration \d+ completed - ([\d.]+) (\w+)/g;
+  let match: RegExpExecArray | null;
+  while ((match = benchmarkPattern.exec(cleaned)) !== null) {
+    const [, name, rawValue, unit] = match;
+    const value = parseFloat(rawValue);
+    if (!Number.isFinite(value)) {
+      // e.g. a lone "." satisfies the pattern but is not a number; skip it
+      continue;
+    }
+
+    const existingMetric = metricsByName.get(name);
+    if (existingMetric) {
+      existingMetric.values.push(value);
+    } else {
+      metricsByName.set(name, { name, metric: name, unit, values: [value] });
+    }
+  }
+
+  return { metrics: Array.from(metricsByName.values()), passed, failed, duration };
+}
+
+/**
+ * Summarise a list of samples.
+ *
+ * @param values - Samples to summarise; the array itself is left untouched
+ * @returns Smallest and largest sample, arithmetic mean and population standard
+ *          deviation (the squared deviations are divided by the sample count).
+ *          Every field is 0 for an empty list.
+ */
+function calculateStats(values: number[]): { min: number; max: number; mean: number; stdDev: number } {
+  const count = values.length;
+  if (count === 0) {
+    return { min: 0, max: 0, mean: 0, stdDev: 0 };
+  }
+
+  // Work on an ascending copy so min/max are simply the two ends
+  const ascending = values.slice().sort((left, right) => left - right);
+
+  let total = 0;
+  for (const sample of ascending) {
+    total += sample;
+  }
+  const mean = total / count;
+
+  let squaredDeviationTotal = 0;
+  for (const sample of ascending) {
+    squaredDeviationTotal += Math.pow(sample - mean, 2);
+  }
+
+  return {
+    min: ascending[0],
+    max: ascending[count - 1],
+    mean,
+    stdDev: Math.sqrt(squaredDeviationTotal / count),
+  };
+}
+
+/**
+ * Build the Markdown job summary for a benchmark run.
+ *
+ * The summary contains the pass/fail line, a statistics table for every benchmark
+ * found in the log, a static description of the tracked metrics and, when
+ * `/tmp/awf-benchmark-report.json` can be read, the environment recorded in it.
+ *
+ * @param output - Raw text captured from the benchmark run
+ * @returns Markdown text ready to be appended to the job summary
+ */
+function generateSummary(output: string): string {
+  const results = parseJestOutput(output);
+
+  // A run where no test counts could be parsed (for example Jest crashed before
+  // printing its summary) must not be presented as a success.
+  let statusEmoji = '✅';
+  if (results.failed > 0) {
+    statusEmoji = '❌';
+  } else if (results.passed === 0) {
+    statusEmoji = '⚠️';
+  }
+
+  let summary = `## ${statusEmoji} Performance Benchmark Results\n\n`;
+  summary += `**Results:** ${results.passed} passed, ${results.failed} failed in ${results.duration}\n\n`;
+
+  if (results.metrics.length > 0) {
+    summary += '### Benchmark Metrics\n\n';
+    summary += '| Metric | Mean | Min | Max | Std Dev | Samples |\n';
+    summary += '|--------|------|-----|-----|---------|--------|\n';
+
+    for (const metric of results.metrics) {
+      const stats = calculateStats(metric.values);
+      summary += `| ${metric.name} | ${stats.mean.toFixed(2)} ${metric.unit} | `;
+      summary += `${stats.min.toFixed(2)} ${metric.unit} | `;
+      summary += `${stats.max.toFixed(2)} ${metric.unit} | `;
+      summary += `${stats.stdDev.toFixed(2)} | `;
+      summary += `${metric.values.length} |\n`;
+    }
+
+    summary += '\n';
+  }
+
+  // Add interpretation section
+  summary += '### Metric Descriptions\n\n';
+  summary += '| Metric | Description |\n';
+  summary += '|--------|-------------|\n';
+  summary += '| startup_time_ms | Time to start containers and execute a simple command |\n';
+  summary += '| request_time_ms | Time to make an HTTP request through the proxy |\n';
+  summary += '| download_time_ms | Time to download a small file through the proxy |\n';
+  summary += '| memory_mb | Combined memory usage of containers |\n';
+  summary += '| reject_time_ms | Time for proxy to reject a blocked domain request |\n';
+  summary += '\n';
+
+  // Try to load the full JSON report if available
+  const reportPath = '/tmp/awf-benchmark-report.json';
+  if (fs.existsSync(reportPath)) {
+    try {
+      const parsed: unknown = JSON.parse(fs.readFileSync(reportPath, 'utf-8'));
+      // The report is external input: accept any object and treat every field as optional
+      const report = (typeof parsed === 'object' && parsed !== null ? parsed : {}) as {
+        environment?: Record<string, unknown> | null;
+        commitSha?: unknown;
+      };
+      const environment = report.environment ?? {};
+      const show = (value: unknown): string => (value ? String(value) : 'unknown');
+
+      // Assemble the section first so a failure never leaves a half-written block
+      let section = '### Environment\n\n';
+      section += `- **OS:** ${show(environment.os)}\n`;
+      section += `- **Node.js:** ${show(environment.nodeVersion)}\n`;
+      section += `- **CPU:** ${show(environment.cpuModel)}\n`;
+      section += `- **CPU Cores:** ${show(environment.cpuCount)}\n`;
+      section += `- **Memory:** ${show(environment.totalMemoryMb)} MB\n`;
+      section += `- **Commit:** \`${show(report.commitSha).substring(0, 7)}\`\n`;
+      section += '\n';
+      summary += section;
+    } catch (error) {
+      // The environment section is optional: report the problem and carry on
+      const reason = error instanceof Error ? error.message : String(error);
+      console.warn(`Warning: could not read benchmark report ${reportPath}: ${reason}`);
+    }
+  }
+
+  return summary;
+}
+
+/**
+ * Command-line entry point: read the benchmark log named by the first argument,
+ * build the summary and append it to the file named by GITHUB_STEP_SUMMARY.
+ * When that variable is not set the summary is printed to stdout instead.
+ * Exits with status 1 when the argument is missing or the log file does not exist.
+ */
+function main() {
+  const [outputFile] = process.argv.slice(2);
+
+  if (outputFile === undefined) {
+    console.error('Usage: generate-benchmark-summary.ts <output-file>');
+    process.exit(1);
+  }
+
+  // Bail out early when there is no benchmark log to read
+  if (!fs.existsSync(outputFile)) {
+    console.error(`Error: Output file not found: ${outputFile}`);
+    process.exit(1);
+  }
+
+  const summary = generateSummary(fs.readFileSync(outputFile, 'utf-8'));
+
+  // Outside GitHub Actions there is no summary file, so print instead
+  const summaryPath = process.env.GITHUB_STEP_SUMMARY;
+  if (!summaryPath) {
+    console.error('Warning: GITHUB_STEP_SUMMARY not set. Running outside GitHub Actions?');
+    console.log('\n--- Benchmark Summary ---');
+    console.log(summary);
+    return;
+  }
+
+  fs.appendFileSync(summaryPath, summary);
+  console.log('Benchmark summary generated successfully');
+}
+
+main();
diff --git a/src/benchmarks/benchmark-runner.test.ts b/src/benchmarks/benchmark-runner.test.ts
new file mode 100644
index 000000000..d7197ba01
--- /dev/null
+++ b/src/benchmarks/benchmark-runner.test.ts
@@ -0,0 +1,316 @@
+/**
+ * Unit tests for benchmark runner utilities
+ */
+
+import { describe, test, expect } from '@jest/globals';
+import {
+  calculateStats,
+  detectRegressions,
+  formatReportAsMarkdown,
+  formatRegressionAsMarkdown,
+} from './benchmark-runner';
+import { BenchmarkReport, BenchmarkStats, RegressionThreshold } from './benchmark-types';
+
+describe('Benchmark Runner', () => {
+  // Summary statistics over plain number arrays
+  describe('calculateStats', () => {
+    test('should calculate correct statistics for simple array', () => {
+      const { min, max, mean, median, samples } = calculateStats([10, 20, 30, 40, 50]);
+
+      expect(min).toBe(10);
+      expect(max).toBe(50);
+      expect(mean).toBe(30);
+      expect(median).toBe(30);
+      expect(samples).toBe(5);
+    });
+
+    test('should calculate median correctly for even-length arrays', () => {
+      // The two middle samples are 20 and 30, so the median is their average
+      const { median } = calculateStats([10, 20, 30, 40]);
+
+      expect(median).toBe(25);
+    });
+
+    test('should handle single value', () => {
+      const { min, max, mean, median, stdDev, samples } = calculateStats([42]);
+
+      expect(min).toBe(42);
+      expect(max).toBe(42);
+      expect(mean).toBe(42);
+      expect(median).toBe(42);
+      expect(stdDev).toBe(0);
+      expect(samples).toBe(1);
+    });
+
+    test('should handle empty array', () => {
+      const noSamples: number[] = [];
+      const { min, max, mean, median, stdDev, samples } = calculateStats(noSamples);
+
+      expect(min).toBe(0);
+      expect(max).toBe(0);
+      expect(mean).toBe(0);
+      expect(median).toBe(0);
+      expect(stdDev).toBe(0);
+      expect(samples).toBe(0);
+    });
+
+    test('should calculate standard deviation correctly', () => {
+      // Mean of the samples is 5.
+      // Squared deviations: 9 + 1 + 1 + 1 + 0 + 0 + 4 + 16 = 32
+      // Population variance = 32 / 8 = 4, so the standard deviation is 2
+      const { mean, stdDev } = calculateStats([2, 4, 4, 4, 5, 5, 7, 9]);
+
+      expect(mean).toBe(5);
+      expect(stdDev).toBe(2);
+    });
+  });
+
+  // Comparison of current statistics against a baseline
+  describe('detectRegressions', () => {
+    // Every fixture in this suite has median === mean and three samples
+    const statsOf = (min: number, max: number, mean: number, stdDev: number): BenchmarkStats => ({
+      min,
+      max,
+      mean,
+      median: mean,
+      stdDev,
+      samples: 3,
+    });
+
+    test('should detect regression when threshold exceeded', () => {
+      const current: Record<string, BenchmarkStats> = {
+        startup_time_ms: statsOf(5500, 6000, 5750, 100),
+      };
+      const baseline: Record<string, BenchmarkStats> = {
+        startup_time_ms: statsOf(4500, 5000, 4750, 100),
+      };
+      const thresholds: RegressionThreshold[] = [
+        { metric: 'startup_time_ms', maxIncreasePercent: 10 },
+      ];
+
+      const { hasRegression, regressions } = detectRegressions(current, baseline, thresholds);
+
+      expect(hasRegression).toBe(true);
+      expect(regressions).toHaveLength(1);
+      expect(regressions[0].metric).toBe('startup_time_ms');
+      // (5750 - 4750) / 4750 * 100 = 21.05%
+      expect(regressions[0].changePercent).toBeCloseTo(21.05, 1);
+    });
+
+    test('should not detect regression when within threshold', () => {
+      const current: Record<string, BenchmarkStats> = {
+        startup_time_ms: statsOf(4900, 5100, 5000, 50),
+      };
+      const baseline: Record<string, BenchmarkStats> = {
+        startup_time_ms: statsOf(4800, 5000, 4900, 50),
+      };
+      const thresholds: RegressionThreshold[] = [
+        { metric: 'startup_time_ms', maxIncreasePercent: 10 },
+      ];
+
+      const { hasRegression, regressions } = detectRegressions(current, baseline, thresholds);
+
+      expect(hasRegression).toBe(false);
+      expect(regressions).toHaveLength(0);
+    });
+
+    test('should detect improvements', () => {
+      const current: Record<string, BenchmarkStats> = {
+        startup_time_ms: statsOf(4000, 4500, 4250, 100),
+      };
+      const baseline: Record<string, BenchmarkStats> = {
+        startup_time_ms: statsOf(5000, 5500, 5250, 100),
+      };
+      const thresholds: RegressionThreshold[] = [
+        { metric: 'startup_time_ms', maxIncreasePercent: 10, minDecreasePercent: 15 },
+      ];
+
+      const { hasRegression, improvements } = detectRegressions(current, baseline, thresholds);
+
+      expect(hasRegression).toBe(false);
+      expect(improvements).toHaveLength(1);
+      expect(improvements[0].metric).toBe('startup_time_ms');
+      // (4250 - 5250) / 5250 * 100 = -19.05%
+      expect(improvements[0].changePercent).toBeCloseTo(-19.05, 1);
+    });
+
+    test('should handle missing baseline metrics', () => {
+      const current: Record<string, BenchmarkStats> = {
+        new_metric: statsOf(100, 200, 150, 25),
+      };
+      const baseline: Record<string, BenchmarkStats> = {};
+      const thresholds: RegressionThreshold[] = [
+        { metric: 'new_metric', maxIncreasePercent: 10 },
+      ];
+
+      const { hasRegression, regressions } = detectRegressions(current, baseline, thresholds);
+
+      expect(hasRegression).toBe(false);
+      expect(regressions).toHaveLength(0);
+    });
+  });
+
+  // Markdown rendering of a complete report
+  describe('formatReportAsMarkdown', () => {
+    test('should format report correctly', () => {
+      const when = '2024-01-01T00:00:00.000Z';
+      const report: BenchmarkReport = {
+        version: '1.0.0',
+        commitSha: 'abc123def456',
+        branch: 'main',
+        environment: {
+          os: 'Linux 5.4.0',
+          nodeVersion: 'v20.0.0',
+          cpuModel: 'Intel Core i7',
+          cpuCount: 4,
+          totalMemoryMb: 16384,
+        },
+        results: [
+          {
+            name: 'test_benchmark',
+            description: 'Test benchmark',
+            metric: 'test_metric_ms',
+            unit: 'ms',
+            value: 100,
+            timestamp: when,
+            durationMs: 1000,
+          },
+        ],
+        stats: {
+          test_metric_ms: { min: 90, max: 110, mean: 100, median: 100, stdDev: 5, samples: 3 },
+        },
+        generatedAt: when,
+      };
+
+      const markdown = formatReportAsMarkdown(report);
+
+      // Checked in order: heading, short SHA, branch, OS, metric name, formatted mean
+      const expectedFragments = [
+        '## 📊 Performance Benchmark Results',
+        'abc123d',
+        'main',
+        'Linux 5.4.0',
+        'test_metric_ms',
+        '100.00 ms',
+      ];
+      for (const fragment of expectedFragments) {
+        expect(markdown).toContain(fragment);
+      }
+    });
+  });
+
+  // Markdown rendering of a regression analysis
+  describe('formatRegressionAsMarkdown', () => {
+    test('should format regression result correctly', () => {
+      const startupRegression = {
+        metric: 'startup_time_ms',
+        current: 6000,
+        baseline: 5000,
+        changePercent: 20,
+        threshold: 10,
+      };
+      const result = {
+        hasRegression: true,
+        regressions: [startupRegression],
+        improvements: [],
+      };
+      const thresholds: RegressionThreshold[] = [
+        { metric: 'startup_time_ms', maxIncreasePercent: 10 },
+      ];
+
+      const markdown = formatRegressionAsMarkdown(result, thresholds);
+
+      expect(markdown).toContain('⚠️ **Performance Regressions Detected:**');
+      expect(markdown).toContain('startup_time_ms');
+      expect(markdown).toContain('+20.0%');
+    });
+
+    test('should format improvements correctly', () => {
+      const startupImprovement = {
+        metric: 'startup_time_ms',
+        current: 4000,
+        baseline: 5000,
+        changePercent: -20,
+      };
+      const result = {
+        hasRegression: false,
+        regressions: [],
+        improvements: [startupImprovement],
+      };
+      const thresholds: RegressionThreshold[] = [
+        { metric: 'startup_time_ms', maxIncreasePercent: 10, minDecreasePercent: 15 },
+      ];
+
+      const markdown = formatRegressionAsMarkdown(result, thresholds);
+
+      expect(markdown).toContain('🎉 **Performance Improvements:**');
+      expect(markdown).toContain('-20.0%');
+    });
+
+    test('should indicate no changes when no regressions or improvements', () => {
+      const result = {
+        hasRegression: false,
+        regressions: [],
+        improvements: [],
+      };
+      const thresholds: RegressionThreshold[] = [];
+
+      const markdown = formatRegressionAsMarkdown(result, thresholds);
+
+      expect(markdown).toContain('✅ **No significant performance changes detected.**');
+    });
+  });
+});
diff --git a/src/benchmarks/benchmark-runner.ts b/src/benchmarks/benchmark-runner.ts
new file mode 100644
index 000000000..315b38452
--- /dev/null
+++ b/src/benchmarks/benchmark-runner.ts
@@ -0,0 +1,371 @@
+/**
+ * Benchmark Runner Utility
+ *
+ * Provides infrastructure for running and measuring performance benchmarks
+ * for the awf firewall.
+ */
+
+import * as os from 'os';
+import execa = require('execa');
+import {
+  BenchmarkResult,
+  BenchmarkStats,
+  BenchmarkReport,
+  BenchmarkOptions,
+  RegressionThreshold,
+  RegressionResult,
+} from './benchmark-types';
+
+/**
+ * Get current git commit SHA
+ *
+ * Asks git first. When git cannot be run, fails or prints nothing, the
+ * `GITHUB_SHA` environment variable (set by GitHub Actions) is used instead.
+ *
+ * @returns The commit SHA, or 'unknown' when neither source provides one
+ */
+async function getGitCommitSha(): Promise<string> {
+  try {
+    const { stdout } = await execa('git', ['rev-parse', 'HEAD']);
+    const sha = stdout.trim();
+    if (sha) {
+      return sha;
+    }
+  } catch {
+    // git is unavailable or this is not a usable checkout; try the environment
+  }
+
+  return process.env.GITHUB_SHA || 'unknown';
+}
+
+/**
+ * Get current git branch name
+ *
+ * `git rev-parse --abbrev-ref HEAD` prints the literal `HEAD` when the checkout is
+ * detached, which is not a useful branch name. In that case, and when git fails,
+ * the GitHub Actions variables `GITHUB_HEAD_REF` (source branch of a pull request)
+ * and `GITHUB_REF_NAME` are consulted, in that order.
+ *
+ * @returns The branch name; `HEAD` for a detached checkout with no environment
+ *          information; 'unknown' when nothing could be determined
+ */
+async function getGitBranch(): Promise<string> {
+  let branch = '';
+  try {
+    const { stdout } = await execa('git', ['rev-parse', '--abbrev-ref', 'HEAD']);
+    branch = stdout.trim();
+  } catch {
+    // git is unavailable or this is not a usable checkout; try the environment
+  }
+
+  if (branch !== '' && branch !== 'HEAD') {
+    return branch;
+  }
+
+  return process.env.GITHUB_HEAD_REF || process.env.GITHUB_REF_NAME || branch || 'unknown';
+}
+
+/**
+ * Report the model string of the first CPU listed by the operating system.
+ *
+ * @returns The model of the first entry of `os.cpus()`, or 'unknown' when the list is empty
+ */
+function getCpuModel(): string {
+  const [firstCpu] = os.cpus();
+  if (!firstCpu) {
+    return 'unknown';
+  }
+  return firstCpu.model;
+}
+
+/**
+ * Summarise a list of samples.
+ *
+ * @param values - Samples to summarise; the array itself is left untouched
+ * @returns Minimum, maximum, mean, median (average of the two middle samples when
+ *          the count is even), population standard deviation and sample count.
+ *          Every field is 0 for an empty list.
+ */
+export function calculateStats(values: number[]): BenchmarkStats {
+  const samples = values.length;
+  if (samples === 0) {
+    return { min: 0, max: 0, mean: 0, median: 0, stdDev: 0, samples: 0 };
+  }
+
+  // Work on an ascending copy so min, max and median can be read by position
+  const ascending = values.slice().sort((left, right) => left - right);
+
+  let total = 0;
+  for (const sample of ascending) {
+    total += sample;
+  }
+  const mean = total / samples;
+
+  const upperMiddle = Math.floor(samples / 2);
+  let median = ascending[upperMiddle];
+  if (samples % 2 === 0) {
+    median = (ascending[upperMiddle - 1] + ascending[upperMiddle]) / 2;
+  }
+
+  // Population variance: squared deviations divided by the sample count
+  let squaredDeviationTotal = 0;
+  for (const sample of ascending) {
+    squaredDeviationTotal += Math.pow(sample - mean, 2);
+  }
+  const stdDev = Math.sqrt(squaredDeviationTotal / samples);
+
+  return {
+    min: ascending[0],
+    max: ascending[samples - 1],
+    mean,
+    median,
+    stdDev,
+    samples,
+  };
+}
+
+/**
+ * Benchmark runner class
+ *
+ * Runs benchmark functions repeatedly, keeps every successful measurement and
+ * turns the collected measurements into per-metric statistics and a report.
+ *
+ * NOTE(review): the `timeout` option is stored with its default but is not
+ * enforced anywhere in this class.
+ */
+export class BenchmarkRunner {
+  private results: BenchmarkResult[] = [];
+  private options: Required<BenchmarkOptions>;
+
+  /**
+   * @param options - Optional overrides; defaults are 3 iterations, 1 warmup run,
+   *                  a timeout of 120000 ms and non-verbose logging
+   */
+  constructor(options: BenchmarkOptions = {}) {
+    this.options = {
+      iterations: options.iterations ?? 3,
+      warmupRuns: options.warmupRuns ?? 1,
+      timeout: options.timeout ?? 120000,
+      verbose: options.verbose ?? false,
+    };
+  }
+
+  /** Progress output, printed only when the `verbose` option is on */
+  private log(message: string): void {
+    if (this.options.verbose) {
+      console.log(`[Benchmark] ${message}`);
+    }
+  }
+
+  /**
+   * Failure output, printed regardless of `verbose`: a failed iteration silently
+   * reduces the sample count, so it must be visible in the log.
+   */
+  private warn(message: string): void {
+    console.warn(`[Benchmark] ${message}`);
+  }
+
+  /**
+   * Run a benchmark function and record the result
+   *
+   * Warmup runs are executed first and their values are discarded. Each measured
+   * iteration that resolves is recorded; an iteration that throws is reported
+   * and skipped, so fewer results than `iterations` may be returned.
+   *
+   * @param name - Benchmark name
+   * @param description - What the benchmark measures
+   * @param metric - Metric name (e.g., 'startup_time_ms')
+   * @param unit - Unit of measurement
+   * @param fn - Function that returns the measured value
+   * @returns The results recorded by this call, one per successful iteration
+   */
+  async run(
+    name: string,
+    description: string,
+    metric: string,
+    unit: string,
+    fn: () => Promise<number>
+  ): Promise<BenchmarkResult[]> {
+    const iterationResults: BenchmarkResult[] = [];
+
+    // Warmup runs
+    for (let i = 0; i < this.options.warmupRuns; i++) {
+      this.log(`${name}: Warmup run ${i + 1}/${this.options.warmupRuns}`);
+      try {
+        await fn();
+      } catch (error) {
+        this.warn(`${name}: Warmup run ${i + 1} failed: ${error}`);
+      }
+    }
+
+    // Measured runs
+    for (let i = 0; i < this.options.iterations; i++) {
+      this.log(`${name}: Iteration ${i + 1}/${this.options.iterations}`);
+      const startTime = Date.now();
+
+      try {
+        const value = await fn();
+        const durationMs = Date.now() - startTime;
+
+        const result: BenchmarkResult = {
+          name,
+          description,
+          metric,
+          unit,
+          value,
+          timestamp: new Date().toISOString(),
+          durationMs,
+          metadata: {
+            iteration: i + 1,
+          },
+        };
+
+        iterationResults.push(result);
+        this.results.push(result);
+        this.log(`${name}: Iteration ${i + 1} completed - ${value} ${unit}`);
+      } catch (error) {
+        this.warn(`${name}: Iteration ${i + 1} failed: ${error}`);
+      }
+    }
+
+    return iterationResults;
+  }
+
+  /**
+   * Get all collected results
+   *
+   * @returns A shallow copy, so callers cannot change the runner's own list
+   */
+  getResults(): BenchmarkResult[] {
+    return [...this.results];
+  }
+
+  /**
+   * Calculate statistics for all metrics
+   *
+   * @returns Statistics keyed by metric name, computed over every recorded value
+   *          that shares that metric
+   */
+  calculateAllStats(): Record<string, BenchmarkStats> {
+    // Group results by metric
+    const valuesByMetric = new Map<string, number[]>();
+    for (const { metric, value } of this.results) {
+      const bucket = valuesByMetric.get(metric);
+      if (bucket) {
+        bucket.push(value);
+      } else {
+        valuesByMetric.set(metric, [value]);
+      }
+    }
+
+    // Calculate stats for each metric
+    const statsByMetric: Record<string, BenchmarkStats> = {};
+    valuesByMetric.forEach((values, metric) => {
+      statsByMetric[metric] = calculateStats(values);
+    });
+
+    return statsByMetric;
+  }
+
+  /**
+   * Generate a complete benchmark report
+   *
+   * @returns Report with commit and branch information, a description of the
+   *          machine, every recorded result and the per-metric statistics
+   */
+  async generateReport(): Promise<BenchmarkReport> {
+    const [commitSha, branch] = await Promise.all([
+      getGitCommitSha(),
+      getGitBranch(),
+    ]);
+
+    return {
+      version: '1.0.0',
+      commitSha,
+      branch,
+      environment: {
+        os: `${os.type()} ${os.release()}`,
+        nodeVersion: process.version,
+        cpuModel: getCpuModel(),
+        cpuCount: os.cpus().length,
+        totalMemoryMb: Math.round(os.totalmem() / (1024 * 1024)),
+      },
+      results: this.getResults(),
+      stats: this.calculateAllStats(),
+      generatedAt: new Date().toISOString(),
+    };
+  }
+
+  /**
+   * Clear all collected results
+   */
+  clear(): void {
+    this.results = [];
+  }
+}
+
+/**
+ * Compare current statistics with a baseline, one configured threshold at a time.
+ *
+ * A threshold is skipped when its metric is missing on either side or when the
+ * baseline mean is 0. A metric regresses when its mean grew by strictly more than
+ * `maxIncreasePercent`; it counts as an improvement when `minDecreasePercent` is
+ * set and the mean shrank by strictly more than that percentage.
+ *
+ * @param current - Statistics of the run under test, keyed by metric name
+ * @param baseline - Reference statistics, keyed by metric name
+ * @param thresholds - Metrics to check together with their limits
+ * @returns The regressions and improvements found; `hasRegression` is true when
+ *          at least one regression was found
+ */
+export function detectRegressions(
+  current: Record<string, BenchmarkStats>,
+  baseline: Record<string, BenchmarkStats>,
+  thresholds: RegressionThreshold[]
+): RegressionResult {
+  const regressions: RegressionResult['regressions'] = [];
+  const improvements: RegressionResult['improvements'] = [];
+
+  for (const { metric, maxIncreasePercent, minDecreasePercent } of thresholds) {
+    const now = current[metric];
+    const before = baseline[metric];
+
+    // Nothing to compare, or a percentage change cannot be computed
+    if (!now || !before || before.mean === 0) {
+      continue;
+    }
+
+    const delta = now.mean - before.mean;
+    const changePercent = (delta / before.mean) * 100;
+    const comparison = {
+      metric,
+      current: now.mean,
+      baseline: before.mean,
+      changePercent,
+    };
+
+    if (changePercent > maxIncreasePercent) {
+      regressions.push({ ...comparison, threshold: maxIncreasePercent });
+      continue;
+    }
+
+    if (minDecreasePercent !== undefined && changePercent < -minDecreasePercent) {
+      improvements.push(comparison);
+    }
+  }
+
+  return {
+    hasRegression: regressions.length > 0,
+    regressions,
+    improvements,
+  };
+}
+
+/**
+ * Render a benchmark report as Markdown.
+ *
+ * The output has a header (short commit, branch, generation time), an environment
+ * table and one statistics row per metric. The unit shown for a metric comes from
+ * the first result that carries that metric; it is empty when there is none.
+ *
+ * @param report - Report to render
+ * @returns Markdown text ending with a blank line
+ */
+export function formatReportAsMarkdown(report: BenchmarkReport): string {
+  const { environment } = report;
+
+  // Remember the unit of the first result seen for each metric
+  const unitByMetric = new Map<string, string>();
+  for (const { metric, unit } of report.results) {
+    if (!unitByMetric.has(metric)) {
+      unitByMetric.set(metric, unit);
+    }
+  }
+
+  const rows = Object.entries(report.stats).map(([metric, stats]) => {
+    const unit = unitByMetric.get(metric) ?? '';
+    const cells = [
+      metric,
+      `${stats.mean.toFixed(2)} ${unit}`,
+      `${stats.min.toFixed(2)} ${unit}`,
+      `${stats.max.toFixed(2)} ${unit}`,
+      stats.stdDev.toFixed(2),
+      `${stats.samples}`,
+    ];
+    return `| ${cells.join(' | ')} |`;
+  });
+
+  const lines = [
+    '## 📊 Performance Benchmark Results',
+    '',
+    `**Commit:** \`${report.commitSha.substring(0, 7)}\``,
+    `**Branch:** \`${report.branch}\``,
+    `**Generated:** ${report.generatedAt}`,
+    '',
+    '### Environment',
+    '',
+    `| Property | Value |`,
+    `|----------|-------|`,
+    `| OS | ${environment.os} |`,
+    `| Node.js | ${environment.nodeVersion} |`,
+    `| CPU | ${environment.cpuModel} |`,
+    `| CPU Cores | ${environment.cpuCount} |`,
+    `| Memory | ${environment.totalMemoryMb} MB |`,
+    '',
+    '### Benchmark Results',
+    '',
+    '| Metric | Mean | Min | Max | Std Dev | Samples |',
+    '|--------|------|-----|-----|---------|--------|',
+    ...rows,
+    '',
+    '',
+  ];
+
+  return lines.join('\n');
+}
+
+/**
+ * Render the outcome of a regression check as Markdown.
+ *
+ * When there is neither a regression nor an improvement a single "no changes"
+ * line is produced; otherwise a table is emitted for each non-empty category.
+ *
+ * @param result - Outcome of the regression check
+ * @param _thresholds - Accepted for interface compatibility; not used
+ * @returns Markdown text starting with the "Regression Analysis" heading
+ */
+export function formatRegressionAsMarkdown(
+  result: RegressionResult,
+  _thresholds: RegressionThreshold[]
+): string {
+  const parts: string[] = ['### Regression Analysis\n\n'];
+  const hasImprovements = result.improvements.length > 0;
+
+  if (!(result.hasRegression || hasImprovements)) {
+    parts.push('✅ **No significant performance changes detected.**\n\n');
+    return parts.join('');
+  }
+
+  if (result.hasRegression) {
+    parts.push(
+      '⚠️ **Performance Regressions Detected:**\n\n',
+      '| Metric | Current | Baseline | Change | Threshold |\n',
+      '|--------|---------|----------|--------|----------|\n'
+    );
+    for (const { metric, current, baseline, changePercent, threshold } of result.regressions) {
+      // Regressions are increases, so the sign is written out explicitly
+      parts.push(
+        `| ${metric} | ${current.toFixed(2)} | ${baseline.toFixed(2)} | +${changePercent.toFixed(1)}% | ${threshold}% |\n`
+      );
+    }
+    parts.push('\n');
+  }
+
+  if (hasImprovements) {
+    parts.push(
+      '🎉 **Performance Improvements:**\n\n',
+      '| Metric | Current | Baseline | Change |\n',
+      '|--------|---------|----------|--------|\n'
+    );
+    for (const { metric, current, baseline, changePercent } of result.improvements) {
+      parts.push(
+        `| ${metric} | ${current.toFixed(2)} | ${baseline.toFixed(2)} | ${changePercent.toFixed(1)}% |\n`
+      );
+    }
+    parts.push('\n');
+  }
+
+  return parts.join('');
+}
diff --git a/src/benchmarks/benchmark-types.ts b/src/benchmarks/benchmark-types.ts
new file mode 100644
index 000000000..f3dbc7e6a
--- /dev/null
+++ b/src/benchmarks/benchmark-types.ts
@@ -0,0 +1,118 @@
+/**
+ * Type definitions for performance benchmarks
+ */
+
+/**
+ * Result of a single measured benchmark iteration
+ */
+export interface BenchmarkResult {
+  /** Benchmark name */
+  name: string;
+  /** What the benchmark measures */
+  description: string;
+  /** Metric name (e.g., 'startup_time_ms', 'memory_mb'); results sharing a metric are aggregated together */
+  metric: string;
+  /** Unit of measurement (e.g., 'ms') */
+  unit: string;
+  /** Measured value, as returned by the benchmark function */
+  value: number;
+  /** ISO 8601 timestamp recorded when the iteration finished */
+  timestamp: string;
+  /** Wall-clock time spent inside the benchmark function for this iteration, in milliseconds */
+  durationMs: number;
+  /** Additional metadata (BenchmarkRunner.run records the 1-based `iteration` number here) */
+  metadata?: Record<string, string | number | boolean>;
+}
+
+/**
+ * Statistical summary of multiple benchmark runs (every field is 0 when there are no samples)
+ */
+export interface BenchmarkStats {
+  /** Minimum value */
+  min: number;
+  /** Maximum value */
+  max: number;
+  /** Arithmetic mean; the value regression detection compares against the baseline */
+  mean: number;
+  /** Median value (average of the two middle samples when the count is even) */
+  median: number;
+  /** Population standard deviation (squared deviations divided by the sample count) */
+  stdDev: number;
+  /** Number of samples */
+  samples: number;
+}
+
+/**
+ * Complete benchmark report, as produced by BenchmarkRunner.generateReport
+ */
+export interface BenchmarkReport {
+  /** Report format version for future compatibility (the runner writes '1.0.0') */
+  version: string;
+  /** Git commit SHA, or 'unknown' when it could not be determined */
+  commitSha: string;
+  /** Git branch name, or 'unknown' when it could not be determined */
+  branch: string;
+  /** Description of the machine the benchmarks ran on */
+  environment: {
+    os: string;
+    nodeVersion: string;
+    cpuModel: string;
+    cpuCount: number;
+    totalMemoryMb: number;
+  };
+  /** All benchmark results, one entry per successful measured iteration */
+  results: BenchmarkResult[];
+  /** Statistical summaries keyed by metric name */
+  stats: Record<string, BenchmarkStats>;
+  /** ISO 8601 timestamp of when the report was generated */
+  generatedAt: string;
+}
+
+/**
+ * Threshold configuration for regression detection, one entry per metric to check
+ */
+export interface RegressionThreshold {
+  /** Metric name */
+  metric: string;
+  /** Maximum allowed increase of the mean, in percent (e.g., 10: an increase above 10% is a regression) */
+  maxIncreasePercent: number;
+  /** Decrease of the mean, in percent, that must be exceeded to report an improvement (none reported when omitted) */
+  minDecreasePercent?: number;
+}
+
+/**
+ * Regression detection result, as returned by detectRegressions
+ */
+export interface RegressionResult {
+  /** Whether at least one regression was detected */
+  hasRegression: boolean;
+  /** Metrics that regressed; `current` and `baseline` are means, `threshold` is the exceeded limit in percent */
+  regressions: Array<{
+    metric: string;
+    current: number;
+    baseline: number;
+    changePercent: number;
+    threshold: number;
+  }>;
+  /** Metrics that improved; `current` and `baseline` are means, `changePercent` is negative */
+  improvements: Array<{
+    metric: string;
+    current: number;
+    baseline: number;
+    changePercent: number;
+  }>;
+}
+
+/**
+ * Options for running benchmarks (all optional; BenchmarkRunner fills in the defaults)
+ */
+export interface BenchmarkOptions {
+  /** Number of measured iterations to run (default: 3) */
+  iterations?: number;
+  /** Warmup runs before measuring; their values are discarded (default: 1) */
+  warmupRuns?: number;
+  /** Timeout for each benchmark in milliseconds (default: 120000). NOTE(review): stored but not enforced by BenchmarkRunner */
+  timeout?: number;
+  /** Whether to print `[Benchmark]` progress messages to the console (default: false) */
+  verbose?: boolean;
+}
diff --git a/tests/benchmarks/README.md b/tests/benchmarks/README.md
new file mode 100644
index 000000000..c605f6de4
--- /dev/null
+++ b/tests/benchmarks/README.md
@@ -0,0 +1,142 @@
+# Performance Benchmarks
+
+This directory contains performance benchmarks for the awf (Agentic Workflow Firewall) tool.
+
+## Overview
+
+The benchmark suite measures key performance metrics to track and prevent performance regressions over time:
+
+- **Container Startup Time**: Time from command invocation to container ready
+- **Network Throughput**: Time to make HTTP requests through the proxy
+- **Memory Usage**: Combined memory consumption of firewall containers
+- **Blocked Domain Performance**: Time for proxy to reject blocked domains
+
+## Running Benchmarks
+
+### Locally
+
+```bash
+# Run all benchmarks (requires sudo for iptables)
+sudo -E npm run test:benchmark
+
+# View the benchmark report
+cat /tmp/awf-benchmark-report.json | jq
+```
+
+### In CI
+
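+The benchmarks run automatically on the following events (and can also be started manually via `workflow_dispatch`):
+- Push to `main` branch
+- Pull requests to `main` branch (skipped when only Markdown files change)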
+
+The GitHub Actions workflow (`benchmark.yml`) generates a summary with:
+- Performance metrics table
+- Environment information
+- Comparison with baseline (if available)
+
+## Benchmark Files
+
+| File | Description |
+|------|-------------|
+| `performance.benchmark.ts` | Main benchmark test file with all performance tests |
+
+## Metrics Tracked
+
+| Metric | Description | Unit |
+|--------|-------------|------|
+| `startup_time_ms` | Container startup and simple command execution | milliseconds |
+| `request_time_ms` | HTTP request time through proxy | milliseconds |
+| `download_time_ms` | Small file download through proxy | milliseconds |
+| `memory_mb` | Combined container memory usage | megabytes |
+| `reject_time_ms` | Time to reject blocked domain | milliseconds |
+
+## Regression Detection
+
+The benchmark suite includes regression detection with configurable thresholds:
+
+```typescript
+const thresholds: RegressionThreshold[] = [
+  { metric: 'startup_time_ms', maxIncreasePercent: 20 },
+  { metric: 'request_time_ms', maxIncreasePercent: 15 },
+  { metric: 'memory_mb', maxIncreasePercent: 25 },
+];
+```
+
+When a metric increases beyond its threshold compared to the baseline, a warning is generated in the CI summary.
+
+## Interpreting Results
+
+### Good Performance Indicators
+
+- **Startup time < 30s**: Fast container initialization
+- **Request time < 5s**: Efficient proxy overhead
+- **Memory < 200MB**: Low resource footprint
+
+### Common Performance Issues
+
+1. **Slow startup**: May indicate Docker image pull issues or container init problems
+2. **High request latency**: Could be DNS resolution or proxy configuration issues
+3. **High memory usage**: Check for memory leaks in long-running tests
+
+## Benchmark Report Format
+
+The benchmark report is saved as JSON to `awf-benchmark-report.json` in the OS temp directory (`/tmp/awf-benchmark-report.json` on Linux):
+
+```json
+{
+  "version": "1.0.0",
+  "commitSha": "abc123",
+  "branch": "main",
+  "environment": {
+    "os": "Linux 5.4.0",
+    "nodeVersion": "v20.0.0",
+    "cpuModel": "Intel Core i7",
+    "cpuCount": 4,
+    "totalMemoryMb": 16384
+  },
+  "results": [...],
+  "stats": {
+    "startup_time_ms": {
+      "min": 5000,
+      "max": 6000,
+      "mean": 5500,
+      "median": 5500,
+      "stdDev": 250,
+      "samples": 3
+    }
+  },
+  "generatedAt": "2024-01-01T00:00:00.000Z"
+}
+```
+
+## Adding New Benchmarks
+
+To add a new benchmark:
+
+1. Open `performance.benchmark.ts`
+2. Add a new test using the benchmark runner:
+
+```typescript
+await benchmarkRunner.run(
+  'your_benchmark_name',
+  'Description of what this measures',
+  'metric_name',
+  'unit',
+  async () => {
+    // Your benchmark code here
+    // Return the measured value
+    return measuredValue;
+  }
+);
+```
+
+3. Update the threshold configuration if needed
+4. Update this README with the new metric
+
+## Best Practices
+
+1. **Run multiple iterations**: The default is 3 iterations to reduce noise
+2. **Use warmup runs**: First run often includes cold start overhead
+3. **Clean state**: Always clean up between iterations
+4. **Minimize variables**: Run on consistent hardware/environment
+5. **Track trends**: Compare against historical baselines
diff --git a/tests/benchmarks/performance.benchmark.ts b/tests/benchmarks/performance.benchmark.ts
new file mode 100644
index 000000000..c162e5819
--- /dev/null
+++ b/tests/benchmarks/performance.benchmark.ts
@@ -0,0 +1,270 @@
+/**
+ * Performance Benchmark Tests
+ *
+ * This file contains performance benchmarks for the awf firewall.
+ * Key metrics tracked:
+ * - Container startup time
+ * - Network throughput (allowed domains)
+ * - Memory usage
+ * - Blocked domain rejection time
+ */
+
+import { describe, test, expect, beforeAll, afterAll } from '@jest/globals';
+import { BenchmarkRunner, formatReportAsMarkdown } from '../../src/benchmarks/benchmark-runner';
+import { createRunner, AwfRunner } from '../fixtures/awf-runner';
+import { cleanup } from '../fixtures/cleanup';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import execa = require('execa');
+
+describe('Performance Benchmarks', () => {
+  let runner: AwfRunner;
+  let benchmarkRunner: BenchmarkRunner;
+  const iterations = 3; // Number of iterations for each benchmark
+
+  beforeAll(async () => {
+    // Ensure a clean state before the first benchmark (cleanup comes from ../fixtures/cleanup)
+    await cleanup(false);
+
+    runner = createRunner();
+    benchmarkRunner = new BenchmarkRunner({
+      iterations, // every test below expects exactly this many results
+      warmupRuns: 1, // README: the first run often includes cold start overhead
+      verbose: true,
+    });
+  });
+
+  afterAll(async () => {
+    try {
+      // Save the JSON report to the OS temp dir, then print its markdown summary
+      const report = await benchmarkRunner.generateReport();
+      const reportPath = path.join(os.tmpdir(), 'awf-benchmark-report.json');
+      fs.writeFileSync(reportPath, JSON.stringify(report, null, 2));
+      console.log(`\nBenchmark report saved to: ${reportPath}`);
+      console.log('\n' + formatReportAsMarkdown(report));
+    } finally {
+      // Always clean up, even if report generation or writing threw (the error still propagates)
+      await cleanup(false);
+    }
+  });
+
+  describe('1. Container Startup Time', () => {
+    test('measures time to start containers and execute simple command', async () => {
+      const results = await benchmarkRunner.run(
+        'container_startup',
+        'Time from command invocation to container ready and simple command executed',
+        'startup_time_ms',
+        'ms',
+        async () => {
+          const startTime = Date.now(); // wall-clock timer around the whole runWithSudo call
+
+          const result = await runner.runWithSudo('echo "ready"', {
+            allowDomains: ['github.com'],
+            logLevel: 'error', // Minimize log overhead
+            timeout: 60000,
+          });
+
+          const elapsed = Date.now() - startTime;
+
+          // Ensure the command succeeded
+          expect(result.exitCode).toBe(0);
+          expect(result.stdout).toContain('ready');
+
+          // Clean up for next iteration (runs after the timer stopped, so it is not part of the measurement)
+          await cleanup(false);
+
+          return elapsed;
+        }
+      );
+
+      expect(results.length).toBe(iterations); // implies the warmup run is not among the results — confirm in BenchmarkRunner
+
+      // Sanity check: startup should take less than 60 seconds (same bound as the runWithSudo timeout)
+      for (const result of results) {
+        expect(result.value).toBeLessThan(60000);
+      }
+    }, 300000); // 5 min timeout
+  });
+
+  describe('2. Network Throughput', () => {
+    test('measures time to make HTTP request through proxy', async () => {
+      const results = await benchmarkRunner.run(
+        'http_request_time',
+        'Time to make an HTTP request to an allowed domain through the proxy',
+        'request_time_ms',
+        'ms',
+        async () => {
+          const startTime = Date.now();
+
+          // curl discards the body (-o /dev/null) and prints only its total time in seconds (-w "%{time_total}")
+          const result = await runner.runWithSudo(
+            'curl -s -o /dev/null -w "%{time_total}" https://api.github.com',
+            {
+              allowDomains: ['github.com'],
+              logLevel: 'error',
+              timeout: 60000,
+            }
+          );
+
+          const elapsed = Date.now() - startTime;
+
+          expect(result.exitCode).toBe(0);
+
+          // Clean up for next iteration
+          await cleanup(false);
+
+          // Prefer the curl-reported time (s -> ms); the fallback is wall-clock time around the whole runWithSudo call
+          const curlTime = parseFloat(result.stdout.trim()) * 1000;
+          return isNaN(curlTime) ? elapsed : curlTime;
+        }
+      );
+
+      expect(results.length).toBe(iterations);
+    }, 300000);
+
+    test('measures time to download small file', async () => {
+      const results = await benchmarkRunner.run(
+        'download_time',
+        'Time to download a small file (robots.txt) through the proxy',
+        'download_time_ms',
+        'ms',
+        async () => {
+          const startTime = Date.now();
+
+          const result = await runner.runWithSudo(
+            'curl -s -o /dev/null -w "%{time_total}" https://github.com/robots.txt',
+            {
+              allowDomains: ['github.com'],
+              logLevel: 'error',
+              timeout: 60000,
+            }
+          );
+
+          const elapsed = Date.now() - startTime;
+
+          expect(result.exitCode).toBe(0);
+
+          await cleanup(false); // clean up for next iteration
+
+          const curlTime = parseFloat(result.stdout.trim()) * 1000; // curl prints seconds; convert to ms
+          return isNaN(curlTime) ? elapsed : curlTime; // wall-clock fallback when curl printed no number
+        }
+      );
+
+      expect(results.length).toBe(iterations);
+    }, 300000);
+  });
+
+  describe('3. Memory Usage', () => {
+    test('measures container memory usage during idle', async () => {
+      const results = await benchmarkRunner.run(
+        'memory_usage_idle',
+        'Memory usage of containers while idle',
+        'memory_mb',
+        'MB',
+        async () => {
+          // Start containers and keep them running
+          await runner.runWithSudo(
+            // Wait a bit for containers to stabilize
+            'sleep 2',
+            {
+              allowDomains: ['github.com'],
+              keepContainers: true,
+              logLevel: 'error',
+              timeout: 60000,
+            }
+          );
+
+          // Get memory stats from Docker
+          let memoryMb = -1; // Use -1 as sentinel for measurement failure
+          try {
+            const { stdout } = await execa('docker', [
+              'stats',
+              '--no-stream',
+              '--format',
+              '{{.MemUsage}}',
+              'awf-squid',
+              'awf-agent',
+            ]);
+
+            // Parse memory usage (format: "15.2MiB / 1.9GiB"), one line per container
+            memoryMb = 0;
+            const lines = stdout.trim().split('\n');
+            for (const line of lines) {
+              // Anchor on the usage side (before " / "). Unanchored, a usage printed in
+              // B or KiB was skipped and the memory *limit* after the slash matched instead.
+              const match = line.match(/^\s*([\d.]+)\s*([KkMGT]?)i?B\b/);
+              if (match) {
+                const value = parseFloat(match[1]);
+                const prefix = match[2].toUpperCase();
+                // Power of 1024 relative to MiB: B = -2, K = -1, M = 0, G = 1, T = 2
+                const exponent = ['', 'K', 'M', 'G', 'T'].indexOf(prefix) - 2;
+                // Decimal units (MB/GB) are treated like their binary counterparts
+                memoryMb += value * Math.pow(1024, exponent);
+              }
+            }
+
+            // If we parsed lines but got 0, that's suspicious
+            if (memoryMb === 0 && lines.length > 0) {
+              console.warn('Warning: Docker stats returned data but memory parsing yielded 0');
+            }
+          } catch (error) {
+            console.error('Failed to get Docker memory stats for containers:', error);
+          }
+
+          // Clean up
+          await cleanup(false);
+
+          // Return sentinel value if measurement completely failed
+          if (memoryMb < 0) {
+            throw new Error('Memory measurement failed - Docker stats unavailable');
+          }
+
+          return memoryMb;
+        }
+      );
+
+      expect(results.length).toBe(iterations);
+
+      // Sanity check: memory should be reasonable (less than 1GB total)
+      for (const result of results) {
+        expect(result.value).toBeLessThan(1024);
+      }
+    }, 300000);
+  });
+
+  describe('4. Blocked Domain Performance', () => {
+    test('measures time for proxy to reject blocked domain', async () => {
+      const results = await benchmarkRunner.run(
+        'blocked_domain_time',
+        'Time for proxy to reject a request to a non-allowed domain',
+        'reject_time_ms',
+        'ms',
+        async () => {
+          const startTime = Date.now();
+
+          // This should fail quickly since example.com is not allowed; `|| true` masks curl's failure exit code
+          const result = await runner.runWithSudo(
+            'curl -s -o /dev/null -w "%{time_total}" --max-time 10 https://example.com || true',
+            {
+              allowDomains: ['github.com'],
+              logLevel: 'error',
+              timeout: 60000,
+            }
+          );
+
+          const elapsed = Date.now() - startTime;
+
+          await cleanup(false); // clean up for next iteration
+
+          // Return curl time if available, otherwise elapsed. NOTE(review): nothing here asserts the request was actually rejected
+          const curlTime = parseFloat(result.stdout.trim()) * 1000;
+          return isNaN(curlTime) ? elapsed : curlTime;
+        }
+      );
+
+      expect(results.length).toBe(iterations);
+    }, 300000);
+  });
+});
diff --git a/tests/setup/jest.benchmark.config.js b/tests/setup/jest.benchmark.config.js
new file mode 100644
index 000000000..0e96547fc
--- /dev/null
+++ b/tests/setup/jest.benchmark.config.js
@@ -0,0 +1,11 @@
+module.exports = { // Jest configuration for the performance benchmark suite
+  preset: 'ts-jest', // benchmark files are TypeScript
+  testEnvironment: 'node',
+  roots: ['<rootDir>/../benchmarks'], // this config lives in tests/setup; benchmarks are in tests/benchmarks
+  testMatch: ['**/*.benchmark.ts'], // only *.benchmark.ts files are collected
+  moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
+  setupFilesAfterEnv: ['<rootDir>/jest.setup.ts'], // jest.setup.ts next to this config
+  testTimeout: 300000, // 5 minutes per test (benchmarks can be slow)
+  verbose: true,
+  maxWorkers: 1, // Run tests serially to get accurate benchmark measurements
+};