# Runs the awf performance benchmark suite, publishes a job summary, and
# uploads the raw log plus the JSON report as an artifact.
name: Performance Benchmarks

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
    paths-ignore:
      - '**/*.md'
  workflow_dispatch:
    inputs:
      # NOTE(review): no step in this workflow reads `inputs.iterations`;
      # confirm whether the benchmark suite is supposed to consume it.
      iterations:
        description: 'Number of benchmark iterations'
        required: false
        default: '3'
        type: string

permissions:
  contents: read

jobs:
  benchmark:
    name: Performance Benchmarks
    runs-on: ubuntu-latest
    timeout-minutes: 30

    steps:
      - name: Checkout repository
        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955 # v4

      - name: Setup Node.js
        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
        with:
          node-version: '20'
          cache: 'npm'

      - name: Install dependencies
        run: npm ci

      - name: Build project
        run: npm run build

      - name: Pre-benchmark cleanup
        run: sudo ./scripts/ci/cleanup.sh

      - name: Run performance benchmarks
        id: run-benchmarks
        # The command is piped into `tee`. Without pipefail the step's exit
        # status is tee's (always 0), so a failing benchmark run would never
        # set `outcome` to 'failure' and the "Check benchmark results" gate
        # below would never fire.
        shell: bash
        run: |
          set -o pipefail
          sudo -E npm run test:benchmark 2>&1 | tee benchmark-output.log
        # Keep going so the summary, cleanup and upload steps still run; the
        # failure is re-raised by "Check benchmark results".
        continue-on-error: true

      - name: Generate benchmark summary
        if: always()
        run: |
          npx tsx scripts/ci/generate-benchmark-summary.ts benchmark-output.log

      - name: Check benchmark results
        if: steps.run-benchmarks.outcome == 'failure'
        run: exit 1

      - name: Post-benchmark cleanup
        if: always()
        run: sudo ./scripts/ci/cleanup.sh

      - name: Upload benchmark report
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
        with:
          name: benchmark-report
          path: |
            /tmp/awf-benchmark-report.json
            benchmark-output.log
          retention-days: 30
/** Samples collected for one benchmark from `[Benchmark]` log lines. */
interface BenchmarkMetric {
  /** Benchmark name exactly as printed in the log line. */
  name: string;
  /** Populated with the same value as `name`; the log line carries nothing else. */
  metric: string;
  /** Unit token printed after the value (e.g. `ms`). */
  unit: string;
  /** One entry per completed iteration, in log order. */
  values: number[];
}

/** Everything extracted from the captured Jest output. */
interface ParsedResults {
  metrics: BenchmarkMetric[];
  passed: number;
  failed: number;
  /** Text captured from Jest's `Time:` line, or 'unknown' when absent. */
  duration: string;
}

/** Report location used when `generateSummary` is called without a path. */
const DEFAULT_REPORT_PATH = '/tmp/awf-benchmark-report.json';

/**
 * Extract pass/fail counts, total duration and per-benchmark samples from
 * captured Jest output.
 *
 * @param output - Full text of the benchmark run log.
 * @returns Counts default to 0 and duration to 'unknown' when the
 *   corresponding Jest summary line is missing.
 */
function parseJestOutput(output: string): ParsedResults {
  // Tolerate CRLF logs and leading whitespace in front of the summary line.
  const lines = output.split(/\r?\n/);

  const testsLine = lines.find((line) => line.trimStart().startsWith('Tests:'));
  const passedMatch = testsLine?.match(/(\d+) passed/);
  const failedMatch = testsLine?.match(/(\d+) failed/);
  const passed = passedMatch ? parseInt(passedMatch[1], 10) : 0;
  const failed = failedMatch ? parseInt(failedMatch[1], 10) : 0;

  let duration = 'unknown';
  for (const line of lines) {
    const timeMatch = line.match(/Time:\s+([\d.]+\s*s)/);
    if (timeMatch) {
      duration = timeMatch[1];
      break;
    }
  }

  // Benchmark lines look like:
  //   "[Benchmark] container_startup: Iteration 1 completed - 5234 ms"
  // A Map keeps first-seen order while avoiding a linear search per line.
  const byName = new Map<string, BenchmarkMetric>();
  const benchmarkPattern = /\[Benchmark\] (\w+): Iteration \d+ completed - ([\d.]+) (\w+)/g;
  for (const [, name, rawValue, unit] of output.matchAll(benchmarkPattern)) {
    const value = parseFloat(rawValue);
    const known = byName.get(name);
    if (known) {
      known.values.push(value);
    } else {
      byName.set(name, { name, metric: name, unit, values: [value] });
    }
  }

  return { metrics: [...byName.values()], passed, failed, duration };
}

/**
 * Compute min, max, mean and population standard deviation.
 *
 * @param values - Samples; not mutated.
 * @returns All-zero statistics for an empty input.
 */
function calculateStats(values: number[]): { min: number; max: number; mean: number; stdDev: number } {
  if (values.length === 0) {
    return { min: 0, max: 0, mean: 0, stdDev: 0 };
  }

  const sorted = [...values].sort((a, b) => a - b);
  const mean = sorted.reduce((acc, v) => acc + v, 0) / sorted.length;
  // Divides by N (not N - 1): population standard deviation.
  const variance = sorted.reduce((acc, v) => acc + (v - mean) ** 2, 0) / sorted.length;

  return {
    min: sorted[0],
    max: sorted[sorted.length - 1],
    mean,
    stdDev: Math.sqrt(variance),
  };
}

/**
 * Choose the headline marker. A run with no parsed passing or failing tests
 * (e.g. Jest died before printing its summary) must not be shown as a success.
 */
function pickStatusEmoji(results: ParsedResults): string {
  if (results.failed > 0) {
    return '❌';
  }
  return results.passed > 0 ? '✅' : '⚠️';
}

/** Render a report field, falling back to 'unknown' for missing/empty values. */
function displayValue(value: unknown): string {
  if (typeof value === 'string' && value !== '') {
    return value;
  }
  if (typeof value === 'number' && Number.isFinite(value) && value !== 0) {
    return String(value);
  }
  return 'unknown';
}

/** Narrow a parsed JSON value to a plain key/value object. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Build the "Environment" section from the JSON report, if one exists.
 * Best-effort: an unreadable or malformed report yields an empty string (and
 * a warning on stderr) rather than a partially written section.
 */
function renderEnvironmentSection(reportPath: string): string {
  if (!fs.existsSync(reportPath)) {
    return '';
  }

  try {
    const parsed: unknown = JSON.parse(fs.readFileSync(reportPath, 'utf-8'));
    const report = isRecord(parsed) ? parsed : {};
    const env = isRecord(report.environment) ? report.environment : {};
    const commit =
      typeof report.commitSha === 'string' && report.commitSha !== ''
        ? report.commitSha
        : 'unknown';

    return [
      '### Environment\n\n',
      `- **OS:** ${displayValue(env.os)}\n`,
      `- **Node.js:** ${displayValue(env.nodeVersion)}\n`,
      `- **CPU:** ${displayValue(env.cpuModel)}\n`,
      `- **CPU Cores:** ${displayValue(env.cpuCount)}\n`,
      `- **Memory:** ${displayValue(env.totalMemoryMb)} MB\n`,
      `- **Commit:** \`${commit.substring(0, 7)}\`\n`,
      '\n',
    ].join('');
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    console.error(`Warning: could not read benchmark report ${reportPath}: ${reason}`);
    return '';
  }
}

/**
 * Build the Markdown job summary for a benchmark run.
 *
 * @param output - Captured Jest output of the benchmark run.
 * @param reportPath - Optional JSON report used for the Environment section;
 *   defaults to `/tmp/awf-benchmark-report.json`.
 * @returns Markdown text ready to append to the step summary.
 */
function generateSummary(output: string, reportPath: string = DEFAULT_REPORT_PATH): string {
  const results = parseJestOutput(output);

  let summary = `## ${pickStatusEmoji(results)} Performance Benchmark Results\n\n`;
  summary += `**Results:** ${results.passed} passed, ${results.failed} failed in ${results.duration}\n\n`;

  if (results.metrics.length > 0) {
    summary += '### Benchmark Metrics\n\n';
    summary += '| Metric | Mean | Min | Max | Std Dev | Samples |\n';
    summary += '|--------|------|-----|-----|---------|--------|\n';

    // NOTE: rows are keyed by the benchmark *name* found in the log, while
    // the description table below lists *metric* identifiers.
    for (const metric of results.metrics) {
      const stats = calculateStats(metric.values);
      summary += `| ${metric.name} | ${stats.mean.toFixed(2)} ${metric.unit} | `;
      summary += `${stats.min.toFixed(2)} ${metric.unit} | `;
      summary += `${stats.max.toFixed(2)} ${metric.unit} | `;
      summary += `${stats.stdDev.toFixed(2)} | `;
      summary += `${metric.values.length} |\n`;
    }

    summary += '\n';
  }

  summary += '### Metric Descriptions\n\n';
  summary += '| Metric | Description |\n';
  summary += '|--------|-------------|\n';
  summary += '| startup_time_ms | Time to start containers and execute a simple command |\n';
  summary += '| request_time_ms | Time to make an HTTP request through the proxy |\n';
  summary += '| download_time_ms | Time to download a small file through the proxy |\n';
  summary += '| memory_mb | Combined memory usage of containers |\n';
  summary += '| reject_time_ms | Time for proxy to reject a blocked domain request |\n';
  summary += '\n';

  summary += renderEnvironmentSection(reportPath);

  return summary;
}
Running outside GitHub Actions?'); + console.log('\n--- Benchmark Summary ---'); + console.log(summary); + } +} + +main(); diff --git a/src/benchmarks/benchmark-runner.test.ts b/src/benchmarks/benchmark-runner.test.ts new file mode 100644 index 000000000..d7197ba01 --- /dev/null +++ b/src/benchmarks/benchmark-runner.test.ts @@ -0,0 +1,316 @@ +/** + * Unit tests for benchmark runner utilities + */ + +import { describe, test, expect } from '@jest/globals'; +import { + calculateStats, + detectRegressions, + formatReportAsMarkdown, + formatRegressionAsMarkdown, +} from './benchmark-runner'; +import { BenchmarkReport, BenchmarkStats, RegressionThreshold } from './benchmark-types'; + +describe('Benchmark Runner', () => { + describe('calculateStats', () => { + test('should calculate correct statistics for simple array', () => { + const values = [10, 20, 30, 40, 50]; + const stats = calculateStats(values); + + expect(stats.min).toBe(10); + expect(stats.max).toBe(50); + expect(stats.mean).toBe(30); + expect(stats.median).toBe(30); + expect(stats.samples).toBe(5); + }); + + test('should calculate median correctly for even-length arrays', () => { + const values = [10, 20, 30, 40]; + const stats = calculateStats(values); + + expect(stats.median).toBe(25); // (20 + 30) / 2 + }); + + test('should handle single value', () => { + const values = [42]; + const stats = calculateStats(values); + + expect(stats.min).toBe(42); + expect(stats.max).toBe(42); + expect(stats.mean).toBe(42); + expect(stats.median).toBe(42); + expect(stats.stdDev).toBe(0); + expect(stats.samples).toBe(1); + }); + + test('should handle empty array', () => { + const values: number[] = []; + const stats = calculateStats(values); + + expect(stats.min).toBe(0); + expect(stats.max).toBe(0); + expect(stats.mean).toBe(0); + expect(stats.median).toBe(0); + expect(stats.stdDev).toBe(0); + expect(stats.samples).toBe(0); + }); + + test('should calculate standard deviation correctly', () => { + // Values: [2, 4, 4, 4, 5, 
5, 7, 9] + // Mean = 5 + // Variance = ((2-5)² + (4-5)² + (4-5)² + (4-5)² + (5-5)² + (5-5)² + (7-5)² + (9-5)²) / 8 + // = (9 + 1 + 1 + 1 + 0 + 0 + 4 + 16) / 8 = 32 / 8 = 4 + // StdDev = √4 = 2 + const values = [2, 4, 4, 4, 5, 5, 7, 9]; + const stats = calculateStats(values); + + expect(stats.mean).toBe(5); + expect(stats.stdDev).toBe(2); + }); + }); + + describe('detectRegressions', () => { + test('should detect regression when threshold exceeded', () => { + const current: Record = { + startup_time_ms: { + min: 5500, + max: 6000, + mean: 5750, + median: 5750, + stdDev: 100, + samples: 3, + }, + }; + + const baseline: Record = { + startup_time_ms: { + min: 4500, + max: 5000, + mean: 4750, + median: 4750, + stdDev: 100, + samples: 3, + }, + }; + + const thresholds: RegressionThreshold[] = [ + { metric: 'startup_time_ms', maxIncreasePercent: 10 }, + ]; + + const result = detectRegressions(current, baseline, thresholds); + + expect(result.hasRegression).toBe(true); + expect(result.regressions).toHaveLength(1); + expect(result.regressions[0].metric).toBe('startup_time_ms'); + // Change is (5750 - 4750) / 4750 * 100 = 21.05% + expect(result.regressions[0].changePercent).toBeCloseTo(21.05, 1); + }); + + test('should not detect regression when within threshold', () => { + const current: Record = { + startup_time_ms: { + min: 4900, + max: 5100, + mean: 5000, + median: 5000, + stdDev: 50, + samples: 3, + }, + }; + + const baseline: Record = { + startup_time_ms: { + min: 4800, + max: 5000, + mean: 4900, + median: 4900, + stdDev: 50, + samples: 3, + }, + }; + + const thresholds: RegressionThreshold[] = [ + { metric: 'startup_time_ms', maxIncreasePercent: 10 }, + ]; + + const result = detectRegressions(current, baseline, thresholds); + + expect(result.hasRegression).toBe(false); + expect(result.regressions).toHaveLength(0); + }); + + test('should detect improvements', () => { + const current: Record = { + startup_time_ms: { + min: 4000, + max: 4500, + mean: 4250, + median: 
4250, + stdDev: 100, + samples: 3, + }, + }; + + const baseline: Record = { + startup_time_ms: { + min: 5000, + max: 5500, + mean: 5250, + median: 5250, + stdDev: 100, + samples: 3, + }, + }; + + const thresholds: RegressionThreshold[] = [ + { metric: 'startup_time_ms', maxIncreasePercent: 10, minDecreasePercent: 15 }, + ]; + + const result = detectRegressions(current, baseline, thresholds); + + expect(result.hasRegression).toBe(false); + expect(result.improvements).toHaveLength(1); + expect(result.improvements[0].metric).toBe('startup_time_ms'); + // Change is (4250 - 5250) / 5250 * 100 = -19.05% + expect(result.improvements[0].changePercent).toBeCloseTo(-19.05, 1); + }); + + test('should handle missing baseline metrics', () => { + const current: Record = { + new_metric: { + min: 100, + max: 200, + mean: 150, + median: 150, + stdDev: 25, + samples: 3, + }, + }; + + const baseline: Record = {}; + + const thresholds: RegressionThreshold[] = [ + { metric: 'new_metric', maxIncreasePercent: 10 }, + ]; + + const result = detectRegressions(current, baseline, thresholds); + + expect(result.hasRegression).toBe(false); + expect(result.regressions).toHaveLength(0); + }); + }); + + describe('formatReportAsMarkdown', () => { + test('should format report correctly', () => { + const report: BenchmarkReport = { + version: '1.0.0', + commitSha: 'abc123def456', + branch: 'main', + environment: { + os: 'Linux 5.4.0', + nodeVersion: 'v20.0.0', + cpuModel: 'Intel Core i7', + cpuCount: 4, + totalMemoryMb: 16384, + }, + results: [ + { + name: 'test_benchmark', + description: 'Test benchmark', + metric: 'test_metric_ms', + unit: 'ms', + value: 100, + timestamp: '2024-01-01T00:00:00.000Z', + durationMs: 1000, + }, + ], + stats: { + test_metric_ms: { + min: 90, + max: 110, + mean: 100, + median: 100, + stdDev: 5, + samples: 3, + }, + }, + generatedAt: '2024-01-01T00:00:00.000Z', + }; + + const markdown = formatReportAsMarkdown(report); + + expect(markdown).toContain('## 📊 Performance 
Benchmark Results'); + expect(markdown).toContain('abc123d'); + expect(markdown).toContain('main'); + expect(markdown).toContain('Linux 5.4.0'); + expect(markdown).toContain('test_metric_ms'); + expect(markdown).toContain('100.00 ms'); + }); + }); + + describe('formatRegressionAsMarkdown', () => { + test('should format regression result correctly', () => { + const result = { + hasRegression: true, + regressions: [ + { + metric: 'startup_time_ms', + current: 6000, + baseline: 5000, + changePercent: 20, + threshold: 10, + }, + ], + improvements: [], + }; + + const thresholds: RegressionThreshold[] = [ + { metric: 'startup_time_ms', maxIncreasePercent: 10 }, + ]; + + const markdown = formatRegressionAsMarkdown(result, thresholds); + + expect(markdown).toContain('⚠️ **Performance Regressions Detected:**'); + expect(markdown).toContain('startup_time_ms'); + expect(markdown).toContain('+20.0%'); + }); + + test('should format improvements correctly', () => { + const result = { + hasRegression: false, + regressions: [], + improvements: [ + { + metric: 'startup_time_ms', + current: 4000, + baseline: 5000, + changePercent: -20, + }, + ], + }; + + const thresholds: RegressionThreshold[] = [ + { metric: 'startup_time_ms', maxIncreasePercent: 10, minDecreasePercent: 15 }, + ]; + + const markdown = formatRegressionAsMarkdown(result, thresholds); + + expect(markdown).toContain('🎉 **Performance Improvements:**'); + expect(markdown).toContain('-20.0%'); + }); + + test('should indicate no changes when no regressions or improvements', () => { + const result = { + hasRegression: false, + regressions: [], + improvements: [], + }; + + const thresholds: RegressionThreshold[] = []; + + const markdown = formatRegressionAsMarkdown(result, thresholds); + + expect(markdown).toContain('✅ **No significant performance changes detected.**'); + }); + }); +}); diff --git a/src/benchmarks/benchmark-runner.ts b/src/benchmarks/benchmark-runner.ts new file mode 100644 index 000000000..315b38452 --- 
/**
 * Get current git commit SHA.
 *
 * @returns Trimmed output of `git rev-parse HEAD`, or 'unknown' if it fails.
 */
async function getGitCommitSha(): Promise<string> {
  try {
    const { stdout } = await execa('git', ['rev-parse', 'HEAD']);
    return stdout.trim();
  } catch {
    return 'unknown';
  }
}

/**
 * Get current git branch name.
 *
 * @returns Trimmed output of `git rev-parse --abbrev-ref HEAD`, or 'unknown'
 *   if it fails.
 */
async function getGitBranch(): Promise<string> {
  try {
    const { stdout } = await execa('git', ['rev-parse', '--abbrev-ref', 'HEAD']);
    return stdout.trim();
  } catch {
    return 'unknown';
  }
}

/**
 * Get CPU model string of the first reported CPU, or 'unknown' when
 * `os.cpus()` returns an empty list.
 */
function getCpuModel(): string {
  const cpus = os.cpus();
  return cpus.length > 0 ? cpus[0].model : 'unknown';
}

/**
 * Calculate statistical summary from an array of numbers.
 *
 * @param values - Samples; not mutated.
 * @returns min/max/mean/median, population standard deviation and sample
 *   count. All fields are 0 for an empty input.
 */
export function calculateStats(values: number[]): BenchmarkStats {
  if (values.length === 0) {
    return { min: 0, max: 0, mean: 0, median: 0, stdDev: 0, samples: 0 };
  }

  const sorted = [...values].sort((a, b) => a - b);
  const count = sorted.length;
  const mean = sorted.reduce((acc, v) => acc + v, 0) / count;

  // Even length: average the two middle values; odd length: the middle one.
  const mid = Math.floor(count / 2);
  const median = count % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];

  // Divides by N (not N - 1): population standard deviation.
  const variance = sorted.reduce((acc, v) => acc + (v - mean) ** 2, 0) / count;

  return {
    min: sorted[0],
    max: sorted[count - 1],
    mean,
    median,
    stdDev: Math.sqrt(variance),
    samples: count,
  };
}

/**
 * Benchmark runner class.
 *
 * Runs benchmark functions for a configured number of warmup and measured
 * iterations and accumulates the measured results across calls to `run`.
 */
export class BenchmarkRunner {
  private results: BenchmarkResult[] = [];
  private readonly options: Required<BenchmarkOptions>;

  /**
   * @param options - Missing fields default to 3 iterations, 1 warmup run,
   *   a 120000 ms timeout and non-verbose logging.
   */
  constructor(options: BenchmarkOptions = {}) {
    this.options = {
      iterations: options.iterations ?? 3,
      warmupRuns: options.warmupRuns ?? 1,
      // NOTE: stored for callers, but nothing in this class enforces it.
      timeout: options.timeout ?? 120000,
      verbose: options.verbose ?? false,
    };
  }

  /** Print a `[Benchmark]`-prefixed line; a no-op unless `verbose` is set. */
  private log(message: string): void {
    if (this.options.verbose) {
      console.log(`[Benchmark] ${message}`);
    }
  }

  /**
   * Run a benchmark function and record the result.
   *
   * Warmup failures and measured-iteration failures are logged (verbose only)
   * and skipped, so the returned array may be shorter than `iterations`.
   * A measurement that is not a finite number is treated as a failed
   * iteration so that it cannot turn the aggregated statistics into NaN.
   *
   * @param name - Benchmark name
   * @param description - What the benchmark measures
   * @param metric - Metric name (e.g., 'startup_time_ms')
   * @param unit - Unit of measurement
   * @param fn - Function that returns the measured value
   * @returns The results recorded by this call, one per successful iteration.
   */
  async run(
    name: string,
    description: string,
    metric: string,
    unit: string,
    fn: () => Promise<number>
  ): Promise<BenchmarkResult[]> {
    const iterationResults: BenchmarkResult[] = [];

    // Warmup runs: executed for their side effects only, never recorded.
    for (let i = 0; i < this.options.warmupRuns; i++) {
      this.log(`${name}: Warmup run ${i + 1}/${this.options.warmupRuns}`);
      try {
        await fn();
      } catch (error) {
        this.log(`${name}: Warmup run ${i + 1} failed: ${error}`);
      }
    }

    // Measured runs
    for (let i = 0; i < this.options.iterations; i++) {
      this.log(`${name}: Iteration ${i + 1}/${this.options.iterations}`);
      const startTime = Date.now();

      try {
        const value = await fn();
        if (!Number.isFinite(value)) {
          throw new Error(`non-finite measurement: ${value}`);
        }
        const durationMs = Date.now() - startTime;

        const result: BenchmarkResult = {
          name,
          description,
          metric,
          unit,
          value,
          timestamp: new Date().toISOString(),
          durationMs,
          metadata: {
            iteration: i + 1,
          },
        };

        iterationResults.push(result);
        this.results.push(result);
        this.log(`${name}: Iteration ${i + 1} completed - ${value} ${unit}`);
      } catch (error) {
        this.log(`${name}: Iteration ${i + 1} failed: ${error}`);
      }
    }

    return iterationResults;
  }

  /**
   * Get all collected results (a shallow copy of the internal list).
   */
  getResults(): BenchmarkResult[] {
    return [...this.results];
  }

  /**
   * Calculate statistics for all metrics, keyed by metric name.
   */
  calculateAllStats(): Record<string, BenchmarkStats> {
    // Group measured values by metric.
    const valuesByMetric: Record<string, number[]> = {};
    for (const result of this.results) {
      (valuesByMetric[result.metric] ??= []).push(result.value);
    }

    const statsByMetric: Record<string, BenchmarkStats> = {};
    for (const [metric, values] of Object.entries(valuesByMetric)) {
      statsByMetric[metric] = calculateStats(values);
    }

    return statsByMetric;
  }

  /**
   * Generate a complete benchmark report: git/environment information plus
   * every recorded result and the per-metric statistics.
   */
  async generateReport(): Promise<BenchmarkReport> {
    const [commitSha, branch] = await Promise.all([getGitCommitSha(), getGitBranch()]);

    return {
      version: '1.0.0',
      commitSha,
      branch,
      environment: {
        os: `${os.type()} ${os.release()}`,
        nodeVersion: process.version,
        cpuModel: getCpuModel(),
        cpuCount: os.cpus().length,
        totalMemoryMb: Math.round(os.totalmem() / (1024 * 1024)),
      },
      results: this.getResults(),
      stats: this.calculateAllStats(),
      generatedAt: new Date().toISOString(),
    };
  }

  /**
   * Clear all collected results.
   */
  clear(): void {
    this.results = [];
  }
}

/**
 * Detect regressions by comparing current stats to baseline.
 *
 * Only metrics named in `thresholds` are examined. A metric is skipped when
 * it is missing on either side or when the baseline mean is 0 (the relative
 * change would be undefined).
 *
 * @param current - Stats of the current run, keyed by metric.
 * @param baseline - Stats of the reference run, keyed by metric.
 * @param thresholds - Per-metric limits on the change of the mean.
 */
export function detectRegressions(
  current: Record<string, BenchmarkStats>,
  baseline: Record<string, BenchmarkStats>,
  thresholds: RegressionThreshold[]
): RegressionResult {
  const regressions: RegressionResult['regressions'] = [];
  const improvements: RegressionResult['improvements'] = [];

  for (const threshold of thresholds) {
    const currentStats = current[threshold.metric];
    const baselineStats = baseline[threshold.metric];

    if (!currentStats || !baselineStats || baselineStats.mean === 0) {
      continue;
    }

    const changePercent =
      ((currentStats.mean - baselineStats.mean) / baselineStats.mean) * 100;

    if (changePercent > threshold.maxIncreasePercent) {
      regressions.push({
        metric: threshold.metric,
        current: currentStats.mean,
        baseline: baselineStats.mean,
        changePercent,
        threshold: threshold.maxIncreasePercent,
      });
    } else if (
      threshold.minDecreasePercent !== undefined &&
      changePercent < -threshold.minDecreasePercent
    ) {
      // Improvements are only reported when a minimum decrease is configured.
      improvements.push({
        metric: threshold.metric,
        current: currentStats.mean,
        baseline: baselineStats.mean,
        changePercent,
      });
    }
  }

  return {
    hasRegression: regressions.length > 0,
    regressions,
    improvements,
  };
}

/**
 * Format benchmark report as Markdown for GitHub Actions summary.
 *
 * @param report - Report as produced by `BenchmarkRunner.generateReport`.
 * @returns Heading, commit/branch lines, an environment table and one stats
 *   row per metric.
 */
export function formatReportAsMarkdown(report: BenchmarkReport): string {
  let md = '## 📊 Performance Benchmark Results\n\n';

  md += `**Commit:** \`${report.commitSha.substring(0, 7)}\`\n`;
  md += `**Branch:** \`${report.branch}\`\n`;
  md += `**Generated:** ${report.generatedAt}\n\n`;

  md += '### Environment\n\n';
  md += '| Property | Value |\n';
  md += '|----------|-------|\n';
  md += `| OS | ${report.environment.os} |\n`;
  md += `| Node.js | ${report.environment.nodeVersion} |\n`;
  md += `| CPU | ${report.environment.cpuModel} |\n`;
  md += `| CPU Cores | ${report.environment.cpuCount} |\n`;
  md += `| Memory | ${report.environment.totalMemoryMb} MB |\n\n`;

  md += '### Benchmark Results\n\n';
  md += '| Metric | Mean | Min | Max | Std Dev | Samples |\n';
  md += '|--------|------|-----|-----|---------|--------|\n';

  // Unit of the first result seen for each metric, collected in one pass.
  const unitByMetric = new Map<string, string>();
  for (const result of report.results) {
    if (!unitByMetric.has(result.metric)) {
      unitByMetric.set(result.metric, result.unit);
    }
  }

  for (const [metric, stats] of Object.entries(report.stats)) {
    const unit = unitByMetric.get(metric) ?? '';

    md += `| ${metric} | ${stats.mean.toFixed(2)} ${unit} | `;
    md += `${stats.min.toFixed(2)} ${unit} | `;
    md += `${stats.max.toFixed(2)} ${unit} | `;
    md += `${stats.stdDev.toFixed(2)} | `;
    md += `${stats.samples} |\n`;
  }

  md += '\n';

  return md;
}

/**
 * Format regression result as Markdown.
 *
 * @param result - Output of `detectRegressions`.
 * @param _thresholds - Accepted for interface compatibility; not used.
 * @returns A "Regression Analysis" section with a regressions table, an
 *   improvements table, or a "no significant changes" line.
 */
export function formatRegressionAsMarkdown(
  result: RegressionResult,
  _thresholds: RegressionThreshold[]
): string {
  let md = '### Regression Analysis\n\n';

  if (!result.hasRegression && result.improvements.length === 0) {
    md += '✅ **No significant performance changes detected.**\n\n';
    return md;
  }

  if (result.hasRegression) {
    md += '⚠️ **Performance Regressions Detected:**\n\n';
    md += '| Metric | Current | Baseline | Change | Threshold |\n';
    md += '|--------|---------|----------|--------|----------|\n';

    for (const reg of result.regressions) {
      md += `| ${reg.metric} | ${reg.current.toFixed(2)} | `;
      md += `${reg.baseline.toFixed(2)} | `;
      md += `+${reg.changePercent.toFixed(1)}% | `;
      md += `${reg.threshold}% |\n`;
    }

    md += '\n';
  }

  if (result.improvements.length > 0) {
    md += '🎉 **Performance Improvements:**\n\n';
    md += '| Metric | Current | Baseline | Change |\n';
    md += '|--------|---------|----------|--------|\n';

    for (const imp of result.improvements) {
      md += `| ${imp.metric} | ${imp.current.toFixed(2)} | `;
      md += `${imp.baseline.toFixed(2)} | `;
      // changePercent is negative here, so the sign is already present.
      md += `${imp.changePercent.toFixed(1)}% |\n`;
    }

    md += '\n';
  }

  return md;
}
/**
 * Result of a single benchmark run
 */
export interface BenchmarkResult {
  /** Benchmark name */
  name: string;
  /** What the benchmark measures */
  description: string;
  /** Metric name (e.g., 'startup_time_ms', 'memory_mb') */
  metric: string;
  /** Unit of measurement */
  unit: string;
  /** Measured value */
  value: number;
  /** Timestamp when the benchmark was run */
  timestamp: string;
  /** Duration of the benchmark in milliseconds */
  durationMs: number;
  /** Additional metadata (free-form key/value pairs) */
  metadata?: Record<string, unknown>;
}

/**
 * Statistical summary of multiple benchmark runs
 */
export interface BenchmarkStats {
  /** Minimum value */
  min: number;
  /** Maximum value */
  max: number;
  /** Mean value */
  mean: number;
  /** Median value */
  median: number;
  /** Standard deviation */
  stdDev: number;
  /** Number of samples */
  samples: number;
}

/**
 * Complete benchmark report
 */
export interface BenchmarkReport {
  /** Report version for future compatibility */
  version: string;
  /** Git commit SHA */
  commitSha: string;
  /** Git branch name */
  branch: string;
  /** Runner environment info */
  environment: {
    os: string;
    nodeVersion: string;
    cpuModel: string;
    cpuCount: number;
    totalMemoryMb: number;
  };
  /** All benchmark results */
  results: BenchmarkResult[];
  /** Statistical summaries, keyed by metric name */
  stats: Record<string, BenchmarkStats>;
  /** Report generation timestamp */
  generatedAt: string;
}

/**
 * Threshold configuration for regression detection
 */
export interface RegressionThreshold {
  /** Metric name */
  metric: string;
  /** Maximum allowed increase percentage (e.g., 10 means 10% regression allowed) */
  maxIncreasePercent: number;
  /** Minimum required decrease percentage for improvement (optional) */
  minDecreasePercent?: number;
}

/**
 * Regression detection result
 */
export interface RegressionResult {
  /** Whether a regression was detected */
  hasRegression: boolean;
  /** List of metrics that regressed */
  regressions: Array<{
    metric: string;
    current: number;
    baseline: number;
    changePercent: number;
    threshold: number;
  }>;
  /** List of metrics that improved */
  improvements: Array<{
    metric: string;
    current: number;
    baseline: number;
    changePercent: number;
  }>;
}

/**
 * Options for running benchmarks
 */
export interface BenchmarkOptions {
  /** Number of iterations to run (default: 3) */
  iterations?: number;
  /** Warmup runs before measuring (default: 1) */
  warmupRuns?: number;
  /** Timeout for each benchmark in milliseconds (default: 120000) */
  timeout?: number;
  /** Whether to output verbose logging */
  verbose?: boolean;
}
+|--------|-------------|------| +| `startup_time_ms` | Container startup and simple command execution | milliseconds | +| `request_time_ms` | HTTP request time through proxy | milliseconds | +| `download_time_ms` | Small file download through proxy | milliseconds | +| `memory_mb` | Combined container memory usage | megabytes | +| `reject_time_ms` | Time to reject blocked domain | milliseconds | + +## Regression Detection + +The benchmark suite includes regression detection with configurable thresholds: + +```typescript +const thresholds: RegressionThreshold[] = [ + { metric: 'startup_time_ms', maxIncreasePercent: 20 }, + { metric: 'request_time_ms', maxIncreasePercent: 15 }, + { metric: 'memory_mb', maxIncreasePercent: 25 }, +]; +``` + +When a metric increases beyond its threshold compared to the baseline, a warning is generated in the CI summary. + +## Interpreting Results + +### Good Performance Indicators + +- **Startup time < 30s**: Fast container initialization +- **Request time < 5s**: Efficient proxy overhead +- **Memory < 200MB**: Low resource footprint + +### Common Performance Issues + +1. **Slow startup**: May indicate Docker image pull issues or container init problems +2. **High request latency**: Could be DNS resolution or proxy configuration issues +3. **High memory usage**: Check for memory leaks in long-running tests + +## Benchmark Report Format + +The benchmark report is saved as JSON in `/tmp/awf-benchmark-report.json`: + +```json +{ + "version": "1.0.0", + "commitSha": "abc123", + "branch": "main", + "environment": { + "os": "Linux 5.4.0", + "nodeVersion": "v20.0.0", + "cpuModel": "Intel Core i7", + "cpuCount": 4, + "totalMemoryMb": 16384 + }, + "results": [...], + "stats": { + "startup_time_ms": { + "min": 5000, + "max": 6000, + "mean": 5500, + "median": 5500, + "stdDev": 250, + "samples": 3 + } + }, + "generatedAt": "2024-01-01T00:00:00.000Z" +} +``` + +## Adding New Benchmarks + +To add a new benchmark: + +1. 
Open `performance.benchmark.ts` +2. Add a new test using the benchmark runner: + +```typescript +await benchmarkRunner.run( + 'your_benchmark_name', + 'Description of what this measures', + 'metric_name', + 'unit', + async () => { + // Your benchmark code here + // Return the measured value + return measuredValue; + } +); +``` + +3. Update the threshold configuration if needed +4. Update this README with the new metric + +## Best Practices + +1. **Run multiple iterations**: The default is 3 iterations to reduce noise +2. **Use warmup runs**: First run often includes cold start overhead +3. **Clean state**: Always clean up between iterations +4. **Minimize variables**: Run on consistent hardware/environment +5. **Track trends**: Compare against historical baselines diff --git a/tests/benchmarks/performance.benchmark.ts b/tests/benchmarks/performance.benchmark.ts new file mode 100644 index 000000000..c162e5819 --- /dev/null +++ b/tests/benchmarks/performance.benchmark.ts @@ -0,0 +1,270 @@ +/** + * Performance Benchmark Tests + * + * This file contains performance benchmarks for the awf firewall. 
+ * Key metrics tracked:
+ * - Container startup time
+ * - Network throughput (allowed domains)
+ * - Memory usage
+ * - Blocked domain rejection time
+ */
+
+import { describe, test, expect, beforeAll, afterAll } from '@jest/globals';
+import { BenchmarkRunner, formatReportAsMarkdown } from '../../src/benchmarks/benchmark-runner';
+import { createRunner, AwfRunner } from '../fixtures/awf-runner';
+import { cleanup } from '../fixtures/cleanup';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import execa = require('execa');
+
+/**
+ * Sums the "usage" side of `docker stats --format '{{.MemUsage}}'` output.
+ *
+ * Each line is expected to look like "15.2MiB / 1.9GiB". Only the value before
+ * the slash is counted; lines that do not start with a recognised size are
+ * skipped.
+ *
+ * @param statsOutput Raw stdout of `docker stats`, one container per line.
+ * @returns Total memory usage across all parsed lines, in MB (0 if none parsed).
+ */
+function sumDockerMemUsageMb(statsOutput: string): number {
+  // Multipliers that convert each unit into MB. Decimal and binary spellings
+  // are treated alike, matching how this suite has always reported memory.
+  const toMb: Record<string, number> = {
+    B: 1 / (1024 * 1024),
+    KiB: 1 / 1024,
+    kB: 1 / 1024,
+    MiB: 1,
+    MB: 1,
+    GiB: 1024,
+    GB: 1024,
+  };
+
+  let totalMb = 0;
+  for (const line of statsOutput.trim().split('\n')) {
+    // Anchored at the start of the line so that the limit ("/ 1.9GiB") can
+    // never be mistaken for the usage when the usage is printed in KiB or B.
+    const match = line.trim().match(/^([\d.]+)\s*(B|KiB|kB|MiB|MB|GiB|GB)\b/);
+    if (match) {
+      totalMb += parseFloat(match[1]) * toMb[match[2]];
+    }
+  }
+  return totalMb;
+}
+
+describe('Performance Benchmarks', () => {
+  let runner: AwfRunner;
+  let benchmarkRunner: BenchmarkRunner;
+  const iterations = 3; // Number of iterations for each benchmark
+
+  beforeAll(async () => {
+    // Ensure clean state
+    await cleanup(false);
+
+    runner = createRunner();
+    benchmarkRunner = new BenchmarkRunner({
+      iterations,
+      warmupRuns: 1,
+      verbose: true,
+    });
+  });
+
+  afterAll(async () => {
+    // Generate and save benchmark report
+    const report = await benchmarkRunner.generateReport();
+    const reportPath = path.join(os.tmpdir(), 'awf-benchmark-report.json');
+    fs.writeFileSync(reportPath, JSON.stringify(report, null, 2));
+    console.log(`\nBenchmark report saved to: ${reportPath}`);
+
+    // Print markdown summary
+    console.log('\n' + formatReportAsMarkdown(report));
+
+    // Clean up after all tests
+    await cleanup(false);
+  });
+
+  describe('1. Container Startup Time', () => {
+    test('measures time to start containers and execute simple command', async () => {
+      const results = await benchmarkRunner.run(
+        'container_startup',
+        'Time from command invocation to container ready and simple command executed',
+        'startup_time_ms',
+        'ms',
+        async () => {
+          const startTime = Date.now();
+
+          const result = await runner.runWithSudo('echo "ready"', {
+            allowDomains: ['github.com'],
+            logLevel: 'error', // Minimize log overhead
+            timeout: 60000,
+          });
+
+          const elapsed = Date.now() - startTime;
+
+          // Ensure the command succeeded
+          expect(result.exitCode).toBe(0);
+          expect(result.stdout).toContain('ready');
+
+          // Clean up for next iteration
+          await cleanup(false);
+
+          return elapsed;
+        }
+      );
+
+      expect(results.length).toBe(iterations);
+
+      // Sanity check: startup should take less than 60 seconds
+      for (const result of results) {
+        expect(result.value).toBeLessThan(60000);
+      }
+    }, 300000); // 5 min timeout
+  });
+
+  describe('2. Network Throughput', () => {
+    test('measures time to make HTTP request through proxy', async () => {
+      const results = await benchmarkRunner.run(
+        'http_request_time',
+        'Time to make an HTTP request to an allowed domain through the proxy',
+        'request_time_ms',
+        'ms',
+        async () => {
+          const startTime = Date.now();
+
+          // Use curl with timing info
+          const result = await runner.runWithSudo(
+            'curl -s -o /dev/null -w "%{time_total}" https://api.github.com',
+            {
+              allowDomains: ['github.com'],
+              logLevel: 'error',
+              timeout: 60000,
+            }
+          );
+
+          const elapsed = Date.now() - startTime;
+
+          expect(result.exitCode).toBe(0);
+
+          // Clean up for next iteration
+          await cleanup(false);
+
+          // Return the curl-reported time in ms; fall back to wall-clock time
+          // when curl's output cannot be parsed as a number
+          const curlTime = parseFloat(result.stdout.trim()) * 1000;
+          return isNaN(curlTime) ? elapsed : curlTime;
+        }
+      );
+
+      expect(results.length).toBe(iterations);
+    }, 300000);
+
+    test('measures time to download small file', async () => {
+      const results = await benchmarkRunner.run(
+        'download_time',
+        'Time to download a small file (robots.txt) through the proxy',
+        'download_time_ms',
+        'ms',
+        async () => {
+          const startTime = Date.now();
+
+          const result = await runner.runWithSudo(
+            'curl -s -o /dev/null -w "%{time_total}" https://github.com/robots.txt',
+            {
+              allowDomains: ['github.com'],
+              logLevel: 'error',
+              timeout: 60000,
+            }
+          );
+
+          const elapsed = Date.now() - startTime;
+
+          expect(result.exitCode).toBe(0);
+
+          await cleanup(false);
+
+          const curlTime = parseFloat(result.stdout.trim()) * 1000;
+          return isNaN(curlTime) ? elapsed : curlTime;
+        }
+      );
+
+      expect(results.length).toBe(iterations);
+    }, 300000);
+  });
+
+  describe('3. Memory Usage', () => {
+    test('measures container memory usage during idle', async () => {
+      const results = await benchmarkRunner.run(
+        'memory_usage_idle',
+        'Memory usage of containers while idle',
+        'memory_mb',
+        'MB',
+        async () => {
+          let memoryMb = -1; // Use -1 as sentinel for measurement failure
+
+          try {
+            // Start containers and keep them running
+            await runner.runWithSudo(
+              // Wait a bit for containers to stabilize
+              'sleep 2',
+              {
+                allowDomains: ['github.com'],
+                keepContainers: true,
+                logLevel: 'error',
+                timeout: 60000,
+              }
+            );
+
+            // Get memory stats from Docker
+            try {
+              const { stdout } = await execa('docker', [
+                'stats',
+                '--no-stream',
+                '--format',
+                '{{.MemUsage}}',
+                'awf-squid',
+                'awf-agent',
+              ]);
+
+              // Parse memory usage (format: "15.2MiB / 1.9GiB")
+              memoryMb = sumDockerMemUsageMb(stdout);
+
+              // Docker answered but nothing could be parsed: that's suspicious
+              if (memoryMb === 0) {
+                console.warn('Warning: Docker stats returned data but memory parsing yielded 0');
+              }
+            } catch (error) {
+              console.error('Failed to get Docker memory stats for containers:', error);
+            }
+          } finally {
+            // Containers were started with keepContainers, so always tear
+            // them down, even when startup or measurement failed
+            await cleanup(false);
+          }
+
+          // The sentinel survives only if the stats call itself failed
+          if (memoryMb < 0) {
+            throw new Error('Memory measurement failed - Docker stats unavailable');
+          }
+
+          return memoryMb;
+        }
+      );
+
+      expect(results.length).toBe(iterations);
+
+      // Sanity check: memory should be reasonable (less than 1GB total)
+      for (const result of results) {
+        expect(result.value).toBeLessThan(1024);
+      }
+    }, 300000);
+  });
+
+  describe('4. Blocked Domain Performance', () => {
+    test('measures time for proxy to reject blocked domain', async () => {
+      const results = await benchmarkRunner.run(
+        'blocked_domain_time',
+        'Time for proxy to reject a request to a non-allowed domain',
+        'reject_time_ms',
+        'ms',
+        async () => {
+          const startTime = Date.now();
+
+          // This should fail quickly since example.com is not allowed;
+          // "|| true" keeps the shell exit code at 0 so the run is not
+          // reported as a failure
+          const result = await runner.runWithSudo(
+            'curl -s -o /dev/null -w "%{time_total}" --max-time 10 https://example.com || true',
+            {
+              allowDomains: ['github.com'],
+              logLevel: 'error',
+              timeout: 60000,
+            }
+          );
+
+          const elapsed = Date.now() - startTime;
+
+          await cleanup(false);
+
+          // Return curl time if available, otherwise elapsed
+          const curlTime = parseFloat(result.stdout.trim()) * 1000;
+          return isNaN(curlTime) ? elapsed : curlTime;
+        }
+      );
+
+      expect(results.length).toBe(iterations);
+    }, 300000);
+  });
+});
diff --git a/tests/setup/jest.benchmark.config.js b/tests/setup/jest.benchmark.config.js
new file mode 100644
index 000000000..0e96547fc
--- /dev/null
+++ b/tests/setup/jest.benchmark.config.js
@@ -0,0 +1,11 @@
+module.exports = {
+  preset: 'ts-jest',
+  testEnvironment: 'node',
+  roots: ['<rootDir>/../benchmarks'],
+  testMatch: ['**/*.benchmark.ts'],
+  moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
+  setupFilesAfterEnv: ['<rootDir>/jest.setup.ts'],
+  testTimeout: 300000, // 5 minutes per test (benchmarks can be slow)
+  verbose: true,
+  maxWorkers: 1, // Run tests serially to get accurate benchmark measurements
+};