Skip to content

Commit f598014

Browse files
Mossaka and claude authored
feat: add unit tests for benchmark statistics and threshold logic (#1766)
* feat: extract benchmark utils and add comprehensive unit tests (#1761)

  Extract pure logic (stats, parseMb, checkRegressions) from benchmark-performance.ts into benchmark-utils.ts for testability. Add 28 test cases covering statistics computation, memory parsing, and threshold regression detection.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: address review feedback on parseMb docs and test clarity

  - Update parseMb() docstring to say MiB instead of MB since it operates on binary units
  - Split "returns 0 for unrecognized format" test to separate the zero-valued MiB case from truly unrecognized strings

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent df63435 commit f598014

4 files changed

Lines changed: 282 additions & 53 deletions

File tree

jest.config.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
module.exports = {
22
preset: 'ts-jest',
33
testEnvironment: 'node',
4-
roots: ['<rootDir>/src'],
4+
roots: ['<rootDir>/src', '<rootDir>/scripts'],
55
testMatch: ['**/__tests__/**/*.ts', '**/*.test.ts'],
66
collectCoverageFrom: [
77
'src/**/*.ts',

scripts/ci/benchmark-performance.ts

Lines changed: 2 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
*/
1717

1818
import { execSync, ExecSyncOptions, spawn, ChildProcess } from "child_process";
19+
import { stats, parseMb, checkRegressions, BenchmarkResult, BenchmarkReport } from "./benchmark-utils";
1920

2021
// ── Configuration ──────────────────────────────────────────────────
2122

@@ -24,25 +25,6 @@ const AWF_CMD = "sudo awf";
2425
const ALLOWED_DOMAIN = "api.github.com";
2526
const CLEANUP_CMD = "sudo docker compose down -v 2>/dev/null; sudo docker rm -f awf-squid awf-agent 2>/dev/null; sudo docker network prune -f 2>/dev/null";
2627

27-
interface BenchmarkResult {
28-
metric: string;
29-
unit: string;
30-
values: number[];
31-
mean: number;
32-
median: number;
33-
p95: number;
34-
p99: number;
35-
}
36-
37-
interface BenchmarkReport {
38-
timestamp: string;
39-
commitSha: string;
40-
iterations: number;
41-
results: BenchmarkResult[];
42-
thresholds: Record<string, { target: number; critical: number }>;
43-
regressions: string[];
44-
}
45-
4628
// ── Thresholds (milliseconds or MB) ───────────────────────────────
4729

4830
const THRESHOLDS: Record<string, { target: number; critical: number }> = {
@@ -65,17 +47,6 @@ function timeMs(fn: () => void): number {
6547
return Math.round(performance.now() - start);
6648
}
6749

68-
function stats(values: number[]): Pick<BenchmarkResult, "mean" | "median" | "p95" | "p99"> {
69-
const sorted = [...values].sort((a, b) => a - b);
70-
const n = sorted.length;
71-
return {
72-
mean: Math.round(sorted.reduce((a, b) => a + b, 0) / n),
73-
median: sorted[Math.floor(n / 2)],
74-
p95: sorted[Math.min(Math.floor(n * 0.95), n - 1)],
75-
p99: sorted[Math.min(Math.floor(n * 0.99), n - 1)],
76-
};
77-
}
78-
7950
function cleanup(): void {
8051
try {
8152
execSync(CLEANUP_CMD, { stdio: "ignore", timeout: 30_000 });
@@ -202,19 +173,6 @@ function waitForContainers(containerNames: string[], timeoutMs: number): Promise
202173
});
203174
}
204175

205-
/**
206-
* Parse a Docker memory usage string like "123.4MiB / 7.773GiB" into MB.
207-
*/
208-
function parseMb(s: string): number {
209-
const match = s.match(/([\d.]+)\s*(MiB|GiB|KiB)/i);
210-
if (!match) return 0;
211-
const val = parseFloat(match[1]);
212-
const unit = match[2].toLowerCase();
213-
if (unit === "gib") return val * 1024;
214-
if (unit === "kib") return val / 1024;
215-
return val;
216-
}
217-
218176
/**
219177
* Kill a spawned background process and its entire process group, best-effort.
220178
* Sends SIGTERM then SIGKILL to the process group so descendant processes
@@ -343,15 +301,7 @@ async function main(): Promise<void> {
343301
cleanup();
344302

345303
// Check for regressions against critical thresholds
346-
const regressions: string[] = [];
347-
for (const r of results) {
348-
const threshold = THRESHOLDS[r.metric];
349-
if (threshold && r.p95 > threshold.critical) {
350-
regressions.push(
351-
`${r.metric}: p95=${r.p95}${r.unit} exceeds critical threshold of ${threshold.critical}${r.unit}`
352-
);
353-
}
354-
}
304+
const regressions = checkRegressions(results, THRESHOLDS);
355305

356306
const report: BenchmarkReport = {
357307
timestamp: new Date().toISOString(),

scripts/ci/benchmark-utils.test.ts

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
import { stats, parseMb, checkRegressions, BenchmarkResult } from "./benchmark-utils";
2+
3+
// ── stats() ──────────────────────────────────────────────────────
4+
5+
// Unit tests for stats(): pins the rounding, upper-median and floor-percentile conventions.
describe("stats()", () => {
  it("throws on empty array", () => {
    const callWithNoSamples = (): unknown => stats([]);
    expect(callWithNoSamples).toThrow("stats() requires at least one value");
  });

  it("handles single element", () => {
    expect(stats([42])).toEqual({ mean: 42, median: 42, p95: 42, p99: 42 });
  });

  it("handles two elements", () => {
    const { mean, median, p95, p99 } = stats([10, 20]);
    expect(mean).toBe(15);
    // Median index is floor(2 / 2) = 1, i.e. the upper of the two values.
    expect(median).toBe(20);
    expect(p95).toBe(20);
    expect(p99).toBe(20);
  });

  it("handles odd count", () => {
    // Input is unsorted on purpose; ascending order is [1, 2, 3].
    const { mean, median, p95, p99 } = stats([3, 1, 2]);
    expect(mean).toBe(2);
    expect(median).toBe(2); // index floor(3 / 2) = 1
    expect(p95).toBe(3); // index floor(3 * 0.95) = 2
    expect(p99).toBe(3); // index floor(3 * 0.99) = 2
  });

  it("handles even count", () => {
    // Ascending order is [1, 2, 3, 4].
    const { mean, median, p95, p99 } = stats([4, 2, 1, 3]);
    expect(mean).toBe(3); // Math.round(10 / 4): 2.5 rounds up to 3
    expect(median).toBe(3); // index floor(4 / 2) = 2
    expect(p95).toBe(4); // index floor(4 * 0.95) = 3
    expect(p99).toBe(4); // index floor(4 * 0.99) = 3
  });

  it("handles all same values", () => {
    const summary = stats([7, 7, 7, 7, 7]);
    expect(summary).toEqual({ mean: 7, median: 7, p95: 7, p99: 7 });
  });

  it("rounds mean correctly", () => {
    const exact = stats([1, 2, 3]).mean; // 6 / 3 = 2, nothing to round
    const half = stats([1, 2]).mean; // 3 / 2 = 1.5, rounds up
    const third = stats([1, 2, 4]).mean; // 7 / 3 = 2.33…, rounds down
    expect(exact).toBe(2);
    expect(half).toBe(2);
    expect(third).toBe(2);
  });

  it("does not mutate input array", () => {
    const input = [5, 3, 1, 4, 2];
    const snapshot = input.slice();
    stats(input);
    expect(input).toEqual(snapshot);
  });

  it("handles large array with correct percentiles", () => {
    // Build the sequence 1..100.
    const oneToHundred: number[] = [];
    for (let v = 1; v <= 100; v++) {
      oneToHundred.push(v);
    }
    const { mean, median, p95, p99 } = stats(oneToHundred);
    expect(mean).toBe(51); // Math.round(5050 / 100)
    expect(median).toBe(51); // index floor(100 / 2) = 50 holds 51
    expect(p95).toBe(96); // index floor(100 * 0.95) = 95 holds 96
    expect(p99).toBe(100); // index floor(100 * 0.99) = 99 holds 100
  });

  it("handles negative values", () => {
    const { mean, median } = stats([-10, -5, 0, 5, 10]);
    expect(mean).toBe(0);
    expect(median).toBe(0);
  });
});
78+
79+
// ── parseMb() ────────────────────────────────────────────────────
80+
81+
// Unit tests for parseMb(): unit conversion to MiB, fallbacks and case handling.
describe("parseMb()", () => {
  it("parses MiB values", () => {
    const used = parseMb("123.4MiB / 7.773GiB");
    expect(used).toBe(123.4);
  });

  it("parses GiB values", () => {
    const used = parseMb("2GiB / 8GiB");
    expect(used).toBe(2048);
  });

  it("parses KiB values", () => {
    const used = parseMb("512KiB / 8GiB");
    expect(used).toBe(0.5);
  });

  it("parses zero-valued MiB input", () => {
    const used = parseMb("0MiB");
    expect(used).toBe(0);
  });

  it("returns 0 for unrecognized or empty format", () => {
    const fromGarbage = parseMb("unknown");
    const fromEmpty = parseMb("");
    expect(fromGarbage).toBe(0);
    expect(fromEmpty).toBe(0);
  });

  it("is case insensitive", () => {
    const lowerMib = parseMb("100mib");
    const lowerGib = parseMb("1gib");
    const lowerKib = parseMb("1024kib");
    expect(lowerMib).toBe(100);
    expect(lowerGib).toBe(1024);
    expect(lowerKib).toBe(1);
  });

  it("handles decimal values", () => {
    const fractionalGib = parseMb("1.5GiB / 8GiB");
    const fractionalMib = parseMb("0.5MiB / 8GiB");
    expect(fractionalGib).toBe(1536);
    expect(fractionalMib).toBe(0.5);
  });
});
114+
115+
// ── checkRegressions() ──────────────────────────────────────────
116+
117+
// Unit tests for checkRegressions(): strict ">" comparison of p95 against the critical threshold.
describe("checkRegressions()", () => {
  const thresholds: Record<string, { target: number; critical: number }> = {
    container_startup_cold: { target: 15000, critical: 20000 },
    squid_https_latency: { target: 100, critical: 200 },
    memory_footprint_mb: { target: 500, critical: 1024 },
  };

  // Build a result whose every summary field equals the given p95.
  const makeResult = (metric: string, p95: number, unit = "ms"): BenchmarkResult => ({
    metric,
    unit,
    values: [p95],
    mean: p95,
    median: p95,
    p95,
    p99: p95,
  });

  it("returns empty array when all within thresholds", () => {
    const withinLimits = [
      makeResult("container_startup_cold", 19000),
      makeResult("squid_https_latency", 150),
      makeResult("memory_footprint_mb", 800, "MB"),
    ];
    const found = checkRegressions(withinLimits, thresholds);
    expect(found).toEqual([]);
  });

  it("detects single regression", () => {
    const found = checkRegressions([makeResult("container_startup_cold", 25000)], thresholds);
    expect(found).toHaveLength(1);
    const [message] = found;
    expect(message).toContain("container_startup_cold");
    expect(message).toContain("p95=25000");
    expect(message).toContain("critical threshold of 20000");
  });

  it("detects multiple regressions", () => {
    const overLimits = [
      makeResult("container_startup_cold", 25000),
      makeResult("squid_https_latency", 300),
    ];
    const found = checkRegressions(overLimits, thresholds);
    expect(found).toHaveLength(2);
  });

  it("ignores metrics without thresholds", () => {
    const found = checkRegressions([makeResult("unknown_metric", 999999)], thresholds);
    expect(found).toEqual([]);
  });

  it("p95 exactly at critical is not a regression", () => {
    const found = checkRegressions([makeResult("container_startup_cold", 20000)], thresholds);
    expect(found).toEqual([]);
  });

  it("p95 one unit above critical is a regression", () => {
    const found = checkRegressions([makeResult("container_startup_cold", 20001)], thresholds);
    expect(found).toHaveLength(1);
  });

  it("returns empty array for empty results", () => {
    const found = checkRegressions([], thresholds);
    expect(found).toEqual([]);
  });

  it("returns empty array for empty thresholds", () => {
    const found = checkRegressions([makeResult("container_startup_cold", 99999)], {});
    expect(found).toEqual([]);
  });

  it("includes unit in regression message", () => {
    const found = checkRegressions([makeResult("memory_footprint_mb", 2000, "MB")], thresholds);
    const [message] = found;
    expect(message).toContain("MB");
  });
});

scripts/ci/benchmark-utils.ts

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/**
2+
* Pure utility functions extracted from benchmark-performance.ts
3+
* for testability. No Docker/exec dependencies.
4+
*/
5+
6+
// ── Types ─────────────────────────────────────────────────────────
7+
8+
/**
 * Summary of one benchmarked metric: its identity, unit label, raw values and
 * the four summary statistics whose field names match what stats() returns.
 */
export interface BenchmarkResult {
9+
metric: string; // metric name; checkRegressions() uses it as the key into the thresholds map
10+
unit: string; // unit label appended to numbers in regression messages (tests use "ms" and "MB")
11+
values: number[]; // presumably the raw samples the summary fields were computed from (TODO confirm against caller)
12+
mean: number; // same field name as stats() output; stats() rounds it with Math.round
13+
median: number; // same field name as stats() output
14+
p95: number; // the value checkRegressions() compares against the critical threshold
15+
p99: number; // same field name as stats() output
16+
}
17+
18+
/**
 * Shape of the report object assembled by the benchmark script:
 * run metadata, per-metric results, the thresholds, and regression messages.
 */
export interface BenchmarkReport {
19+
timestamp: string; // the benchmark script fills this with new Date().toISOString()
20+
commitSha: string; // presumably the commit the benchmark ran against (TODO confirm against caller)
21+
iterations: number; // presumably the number of samples taken per metric (TODO confirm against caller)
22+
results: BenchmarkResult[]; // one entry per measured metric
23+
thresholds: Record<string, { target: number; critical: number }>; // keyed by metric name; same shape checkRegressions() accepts
24+
regressions: string[]; // human-readable messages; same type checkRegressions() returns
25+
}
26+
27+
// ── Statistics ────────────────────────────────────────────────────
28+
29+
/**
 * Summarise a sample of measurements as mean, median, p95 and p99.
 *
 * The input is copied before sorting, so the caller's array is never mutated;
 * the parameter is typed `readonly` so frozen or `as const` samples are accepted.
 *
 * Conventions (deliberately simple, and pinned by the unit tests):
 * - mean is rounded to the nearest integer with Math.round;
 * - median is the element at index floor(n / 2), i.e. the UPPER middle element
 *   for an even-sized sample (no averaging of the two middle values);
 * - a percentile p is the element at index floor(n * p), clamped to the last
 *   element, so small samples report their maximum for p95/p99.
 *
 * @param values Non-empty list of numeric samples, in any order.
 * @returns The rounded mean plus median, p95 and p99 picked from the sorted sample.
 * @throws Error if `values` is empty (callers must guard).
 */
export function stats(values: readonly number[]): Pick<BenchmarkResult, "mean" | "median" | "p95" | "p99"> {
  const n = values.length;
  if (n === 0) {
    throw new Error("stats() requires at least one value");
  }

  // Sort a copy numerically; the default sort would compare as strings.
  const sorted = [...values].sort((a, b) => a - b);

  // Floor-based percentile pick, clamped so the index never runs past the end.
  const percentile = (fraction: number): number =>
    sorted[Math.min(Math.floor(n * fraction), n - 1)];

  const total = sorted.reduce((sum, v) => sum + v, 0);

  return {
    mean: Math.round(total / n),
    median: sorted[Math.floor(n / 2)],
    p95: percentile(0.95),
    p99: percentile(0.99),
  };
}
49+
50+
// ── Memory parsing ───────────────────────────────────────────────
51+
52+
/**
 * Parse a Docker memory usage string like "123.4MiB / 7.773GiB"
 * and return the used amount (the first "<number><unit>" token) in MiB.
 *
 * Recognised binary units, case-insensitive: B, KiB, MiB, GiB, TiB.
 * Plain bytes are recognised so that a usage such as "0B / 7.773GiB" is read
 * as 0 rather than skipping ahead and returning the limit on the right.
 *
 * @param s Memory usage text; only the first recognised token is used.
 * @returns The used amount in MiB, or 0 when no recognised token is present
 *          or its numeric part cannot be parsed (e.g. "...MiB").
 */
export function parseMb(s: string): number {
  // The lookahead stops a bare "B" from matching the start of a word ("5 bytes").
  const match = s.match(/([\d.]+)\s*(TiB|GiB|MiB|KiB|B)(?![a-z])/i);
  if (!match) return 0;

  const val = parseFloat(match[1]);
  // "[\d.]+" also matches dot-only runs, which parseFloat turns into NaN.
  if (!Number.isFinite(val)) return 0;

  switch (match[2].toLowerCase()) {
    case "tib":
      return val * 1024 * 1024;
    case "gib":
      return val * 1024;
    case "kib":
      return val / 1024;
    case "b":
      return val / (1024 * 1024);
    default:
      // "mib": already in the target unit.
      return val;
  }
}
66+
67+
// ── Threshold checking ───────────────────────────────────────────
68+
69+
/**
 * Compare benchmark results against critical thresholds.
 *
 * A result is reported when its p95 is strictly greater than the `critical`
 * value configured for its metric; a p95 exactly equal to the threshold passes.
 * Metrics with no entry in `thresholds` are ignored. The lookup only considers
 * the map's own keys, so metric names that collide with inherited object
 * members (e.g. "constructor", "toString") are never treated as configured.
 * Neither argument is modified.
 *
 * @param results Benchmark results to check.
 * @param thresholds Per-metric limits keyed by metric name; only `critical` is read.
 * @returns Human-readable regression descriptions, in the order of `results`.
 */
export function checkRegressions(
  results: readonly BenchmarkResult[],
  thresholds: Readonly<Record<string, { target: number; critical: number }>>,
): string[] {
  const regressions: string[] = [];
  for (const r of results) {
    if (!Object.prototype.hasOwnProperty.call(thresholds, r.metric)) continue;
    const threshold = thresholds[r.metric];
    if (!threshold) continue;
    if (r.p95 > threshold.critical) {
      regressions.push(
        `${r.metric}: p95=${r.p95}${r.unit} exceeds critical threshold of ${threshold.critical}${r.unit}`,
      );
    }
  }
  return regressions;
}

0 commit comments

Comments
 (0)