Launches VS Code via Playwright Electron, opens the chat panel, sends a message with a mock LLM response, and measures timing, layout, and rendering metrics. By default, it downloads VS Code 1.115.0 as a baseline, benchmarks it, then benchmarks the local dev build and compares the two.
| Flag | Default | Description |
| --- | --- | --- |
| `--build <path\|ver>` | local dev | Build to test. Accepts path or version (`1.110.0`, `insiders`). |
| `--baseline-build <ver>` | `1.115.0` | Version to download and compare against. |
| `--resume <path>` | — | Resume a previous run, adding more iterations to increase confidence. |
| `--threshold <frac>` | `0.2` | Regression threshold (0.2 = flag if 20% slower). |
| `--verbose` | — | Print per-run details including response content. |
```bash
npm run perf:chat -- --build 1.110.0 --baseline-build 1.115.0 --runs 5
```
### Resuming a run for more confidence
When results exceed the threshold but aren't statistically significant, the tool prints a `--resume` hint. Use it to add more iterations to an existing run:
```bash
# Initial run with 3 iterations — may be inconclusive:
npm run perf:chat -- --scenario text-only --runs 3

# Add 3 more runs to the same results file (both test + baseline):
npm run perf:chat -- --resume .chat-perf-data/2026-04-14T02-15-14/results.json --runs 3

# Keep adding until confidence is reached:
npm run perf:chat -- --resume .chat-perf-data/2026-04-14T02-15-14/results.json --runs 5
```
`--resume` loads the previous `results.json` and its associated `baseline-*.json`, runs N more iterations for both builds, merges `rawRuns`, recomputes stats, and re-runs the comparison. The updated files are written back in place. You can resume multiple times — samples accumulate.
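The merge can be sketched as follows; the `ResultsFile` shape and names like `mergeRuns` are illustrative assumptions, not the tool's actual schema:

```typescript
// Sketch of the --resume merge: append the new samples to the stored
// rawRuns, then recompute summary stats. The ResultsFile shape here is
// an assumption; the real schema may differ.
interface ResultsFile {
  rawRuns: number[]; // one timing sample per iteration
  stats: { n: number; median: number };
}

function median(xs: number[]): number {
  const s = [...xs].sort((a, b) => a - b);
  const mid = Math.floor(s.length / 2);
  return s.length % 2 ? s[mid] : (s[mid - 1] + s[mid]) / 2;
}

function mergeRuns(prev: ResultsFile, newRuns: number[]): ResultsFile {
  const rawRuns = [...prev.rawRuns, ...newRuns]; // samples accumulate
  return { rawRuns, stats: { n: rawRuns.length, median: median(rawRuns) } };
}
```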
### Statistical significance
Regression detection uses **Welch's t-test** to avoid false positives from noisy measurements. A metric is only flagged as `REGRESSION` when it both exceeds the threshold AND is statistically significant (p < 0.05). Otherwise it's reported as `(likely noise — p=X, not significant)`.
With typical variance (cv ≈ 20%), you need:
- **n ≥ 5** per build to detect a 35% regression at 95% confidence
- **n ≥ 10** per build to detect a 20% regression reliably
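A minimal sketch of the statistic behind this gate, assuming plain two-sample input arrays; the final p-value step needs the Student-t CDF, which a stats library would provide and is omitted here:

```typescript
// Sketch of Welch's t statistic and the Welch–Satterthwaite degrees
// of freedom used for the significance gate. Turning (t, df) into a
// p-value requires the Student-t CDF, not shown here.
function meanVar(xs: number[]): { mean: number; variance: number } {
  const mean = xs.reduce((a, b) => a + b, 0) / xs.length;
  // Sample (n - 1) variance
  const variance = xs.reduce((a, x) => a + (x - mean) ** 2, 0) / (xs.length - 1);
  return { mean, variance };
}

function welch(a: number[], b: number[]): { t: number; df: number } {
  const A = meanVar(a);
  const B = meanVar(b);
  const sa = A.variance / a.length;
  const sb = B.variance / b.length;
  const t = (A.mean - B.mean) / Math.sqrt(sa + sb);
  // Welch–Satterthwaite approximation for the degrees of freedom
  const df = (sa + sb) ** 2 / (sa ** 2 / (a.length - 1) + sb ** 2 / (b.length - 1));
  return { t, df };
}
```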
Results use **IQR-based outlier removal** and **median** (not mean) to handle startup jitter. The **coefficient of variation (cv)** is reported — under 15% is stable, over 15% gets a ⚠ warning. Baseline comparison uses **Welch's t-test** on raw run values to determine statistical significance before flagging regressions. Use 5+ runs to get stable results.
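The outlier filtering and cv check can be sketched as follows; the linear-interpolation quantile is one common choice, and the tool's exact method is an assumption:

```typescript
// Sketch of IQR-based outlier removal and the coefficient-of-variation
// stability check described above. Quantile interpolation details are
// an assumption; implementations vary.
function quantile(sorted: number[], q: number): number {
  const pos = (sorted.length - 1) * q;
  const lo = Math.floor(pos);
  const hi = Math.ceil(pos);
  return sorted[lo] + (sorted[hi] - sorted[lo]) * (pos - lo);
}

function dropOutliers(xs: number[]): number[] {
  const s = [...xs].sort((a, b) => a - b);
  const q1 = quantile(s, 0.25);
  const q3 = quantile(s, 0.75);
  const iqr = q3 - q1;
  // Standard 1.5 × IQR fences
  return s.filter((x) => x >= q1 - 1.5 * iqr && x <= q3 + 1.5 * iqr);
}

function cv(xs: number[]): number {
  const mean = xs.reduce((a, b) => a + b, 0) / xs.length;
  const sd = Math.sqrt(xs.reduce((a, x) => a + (x - mean) ** 2, 0) / (xs.length - 1));
  return sd / mean; // values above 0.15 would get the warning
}
```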
Launches one VS Code session, sends N messages sequentially, forces GC between each, and measures renderer heap and DOM node count. Uses **linear regression** on the samples to compute per-message growth rate, which is compared against a threshold.
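The per-message growth rate reduces to an ordinary least-squares slope of the samples against message index; a sketch with illustrative names:

```typescript
// Sketch of the growth-rate estimate: OLS slope of heap (or DOM-node)
// samples versus message index. Names are illustrative, not the tool's
// actual API.
function growthRate(samples: number[]): number {
  const n = samples.length;
  const xMean = (n - 1) / 2; // mean of indices 0..n-1
  const yMean = samples.reduce((a, b) => a + b, 0) / n;
  let num = 0;
  let den = 0;
  for (let i = 0; i < n; i++) {
    num += (i - xMean) * (samples[i] - yMean);
    den += (i - xMean) ** 2;
  }
  return num / den; // growth per message
}
```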