Skip to content

Commit d3074e3

Browse files
JohnMcLear and claude authored
test(ci): remove the silent-ELIFECYCLE flake investigation scaffolding (#7868)
* test(ci): remove the silent-ELIFECYCLE investigation scaffolding

  The Windows backend flake is root-caused (server.ts handler gate for the in-process process.exit path; Windows pinned to Node 24.16.0 for the libuv TCP-connect overrun, tracked upstream as nodejs/node#63620). Remove the temporary diagnostics added while hunting it:

  - delete src/tests/backend/diagnostics.ts (per-test heartbeat + node-report snapshots) and its `--require` from the backend `test` script;
  - drop the `--report-on-fatalerror`/`-on-signal`/`-uncaught-exception` `NODE_OPTIONS` and the "Upload Node diagnostic reports" steps;
  - drop the Windows OS-level netstat/tasklist sidecar watcher.

  Kept: the real fixes — the log-only unhandledRejection guard in tests/backend/common.ts, the lowerCasePadIds socket-teardown tracking (comment de-referenced from the deleted file), and `pnpm test -- --exit` on Windows.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* test: drop the obsolete report-on-fatalerror NODE_OPTIONS guard assertion

  The backend-tests-flake-mitigation source-lint guard required every backend step to keep the --report-on-fatalerror NODE_OPTIONS + node-report upload. Those diagnostics are removed in this PR now that the flake is root-caused, so drop that assertion. Retain the Windows-only `--exit` checks (still a live invariant) and reframe the file around the resolved root cause.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent d582398 commit d3074e3

5 files changed

Lines changed: 25 additions & 440 deletions

File tree

.github/workflows/backend-tests.yml

Lines changed: 8 additions & 139 deletions
Original file line numberDiff line numberDiff line change
@@ -66,24 +66,7 @@ jobs:
6666
run: pnpm build
6767
-
6868
name: Run the backend tests
69-
env:
70-
# --report-on-fatalerror and friends write a Node diagnostic report
71-
# (V8 stack, libuv handles, OS info) on fatal errors that bypass JS
72-
# handlers — the failure mode we've been chasing on Windows + Node
73-
# 24 since PR #7663. Reports land in node-report/ and are uploaded
74-
# as an artifact if the step fails.
75-
NODE_OPTIONS: "--report-on-fatalerror --report-uncaught-exception --report-on-signal --report-compact --report-directory=${{ github.workspace }}/node-report"
76-
run: |
77-
mkdir -p "${{ github.workspace }}/node-report"
78-
pnpm test
79-
- name: Upload Node diagnostic reports on failure
80-
if: ${{ failure() }}
81-
uses: actions/upload-artifact@v7
82-
with:
83-
name: node-diagnostic-report-${{ runner.os }}-node${{ matrix.node }}-${{ github.job }}
84-
path: node-report/
85-
if-no-files-found: ignore
86-
retention-days: 7
69+
run: pnpm test
8770
- name: Run the new vitest tests
8871
working-directory: src
8972
run: pnpm run test:vitest
@@ -153,19 +136,7 @@ jobs:
153136
ep_table_of_contents
154137
-
155138
name: Run the backend tests
156-
env:
157-
NODE_OPTIONS: "--report-on-fatalerror --report-uncaught-exception --report-on-signal --report-compact --report-directory=${{ github.workspace }}/node-report"
158-
run: |
159-
mkdir -p "${{ github.workspace }}/node-report"
160-
pnpm test
161-
- name: Upload Node diagnostic reports on failure
162-
if: ${{ failure() }}
163-
uses: actions/upload-artifact@v7
164-
with:
165-
name: node-diagnostic-report-${{ runner.os }}-node${{ matrix.node }}-${{ github.job }}
166-
path: node-report/
167-
if-no-files-found: ignore
168-
retention-days: 7
139+
run: pnpm test
169140
- name: Run the new vitest tests
170141
working-directory: src
171142
run: pnpm run test:vitest
@@ -229,60 +200,9 @@ jobs:
229200
name: Run the backend tests
230201
shell: bash
231202
working-directory: src
232-
env:
233-
NODE_OPTIONS: "--report-on-fatalerror --report-uncaught-exception --report-on-signal --report-compact --report-directory=${{ github.workspace }}/node-report"
234-
run: |
235-
mkdir -p "${{ github.workspace }}/node-report"
236-
OUT="${{ github.workspace }}/node-report"
237-
# Out-of-process OS-level watcher for the silent-ELIFECYCLE flake.
238-
# In-process diagnostics (diagnostics.ts heartbeat + node-report
239-
# snapshots) showed that during the death window the V8 main
240-
# isolate is starved — heartbeat stops firing entirely, then the
241-
# process is externally terminated, bypassing all JS handlers and
242-
# Node's --report-on-fatalerror. To capture state during that
243-
# starvation we need a process that doesn't depend on the dying
244-
# process's event loop. A bash background loop polling Windows
245-
# OS state every 500 ms gives us that:
246-
# - netstat.log: localhost TCP socket states over time
247-
# (TIME_WAIT/CLOSE_WAIT accumulation, handle exhaustion)
248-
# - tasklist.log: node.exe process handle count, working set,
249-
# CPU time — captured by the OS independent of V8.
250-
# Both logs are appended to node-report/ which already gets
251-
# uploaded as an artifact on failure.
252-
(
253-
while true; do
254-
ts=$(date '+%H:%M:%S.%3N')
255-
{
256-
echo "=== $ts ==="
257-
netstat -an 2>/dev/null | grep -E "TCP\s+(127\.0\.0\.1|\[::1\])" || true
258-
} >> "$OUT/netstat.log"
259-
{
260-
echo "=== $ts ==="
261-
tasklist /v /fi "imagename eq node.exe" /fo csv 2>/dev/null || true
262-
} >> "$OUT/tasklist.log"
263-
sleep 0.5
264-
done
265-
) &
266-
WATCHER_PID=$!
267-
# --exit forces process.exit(failures) after the suite completes,
268-
# closing the post-suite event-loop drain window where Windows +
269-
# Node 24 hard-kills the process. Scoped to Windows so Linux/local
270-
# runs still surface real handle leaks via natural drain.
271-
set +e
272-
pnpm test -- --exit
273-
EXIT=$?
274-
set -e
275-
kill "$WATCHER_PID" 2>/dev/null || true
276-
wait "$WATCHER_PID" 2>/dev/null || true
277-
exit $EXIT
278-
- name: Upload Node diagnostic reports on failure
279-
if: ${{ failure() }}
280-
uses: actions/upload-artifact@v7
281-
with:
282-
name: node-diagnostic-report-${{ runner.os }}-node${{ matrix.node }}-${{ github.job }}
283-
path: node-report/
284-
if-no-files-found: ignore
285-
retention-days: 7
203+
# --exit makes mocha call process.exit() after the run so a leaked handle
204+
# cannot hang the job on Windows.
205+
run: pnpm test -- --exit
286206
- name: Run the new vitest tests
287207
working-directory: src
288208
run: pnpm run test:vitest
@@ -374,60 +294,9 @@ jobs:
374294
name: Run the backend tests
375295
shell: bash
376296
working-directory: src
377-
env:
378-
NODE_OPTIONS: "--report-on-fatalerror --report-uncaught-exception --report-on-signal --report-compact --report-directory=${{ github.workspace }}/node-report"
379-
run: |
380-
mkdir -p "${{ github.workspace }}/node-report"
381-
OUT="${{ github.workspace }}/node-report"
382-
# Out-of-process OS-level watcher for the silent-ELIFECYCLE flake.
383-
# In-process diagnostics (diagnostics.ts heartbeat + node-report
384-
# snapshots) showed that during the death window the V8 main
385-
# isolate is starved — heartbeat stops firing entirely, then the
386-
# process is externally terminated, bypassing all JS handlers and
387-
# Node's --report-on-fatalerror. To capture state during that
388-
# starvation we need a process that doesn't depend on the dying
389-
# process's event loop. A bash background loop polling Windows
390-
# OS state every 500 ms gives us that:
391-
# - netstat.log: localhost TCP socket states over time
392-
# (TIME_WAIT/CLOSE_WAIT accumulation, handle exhaustion)
393-
# - tasklist.log: node.exe process handle count, working set,
394-
# CPU time — captured by the OS independent of V8.
395-
# Both logs are appended to node-report/ which already gets
396-
# uploaded as an artifact on failure.
397-
(
398-
while true; do
399-
ts=$(date '+%H:%M:%S.%3N')
400-
{
401-
echo "=== $ts ==="
402-
netstat -an 2>/dev/null | grep -E "TCP\s+(127\.0\.0\.1|\[::1\])" || true
403-
} >> "$OUT/netstat.log"
404-
{
405-
echo "=== $ts ==="
406-
tasklist /v /fi "imagename eq node.exe" /fo csv 2>/dev/null || true
407-
} >> "$OUT/tasklist.log"
408-
sleep 0.5
409-
done
410-
) &
411-
WATCHER_PID=$!
412-
# --exit forces process.exit(failures) after the suite completes,
413-
# closing the post-suite event-loop drain window where Windows +
414-
# Node 24 hard-kills the process. Scoped to Windows so Linux/local
415-
# runs still surface real handle leaks via natural drain.
416-
set +e
417-
pnpm test -- --exit
418-
EXIT=$?
419-
set -e
420-
kill "$WATCHER_PID" 2>/dev/null || true
421-
wait "$WATCHER_PID" 2>/dev/null || true
422-
exit $EXIT
423-
- name: Upload Node diagnostic reports on failure
424-
if: ${{ failure() }}
425-
uses: actions/upload-artifact@v7
426-
with:
427-
name: node-diagnostic-report-${{ runner.os }}-node${{ matrix.node }}-${{ github.job }}
428-
path: node-report/
429-
if-no-files-found: ignore
430-
retention-days: 7
297+
# --exit makes mocha call process.exit() after the run so a leaked handle
298+
# cannot hang the job on Windows.
299+
run: pnpm test -- --exit
431300
- name: Run the new vitest tests
432301
working-directory: src
433302
run: pnpm run test:vitest

src/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@
149149
},
150150
"scripts": {
151151
"lint": "eslint .",
152-
"test": "cross-env NODE_ENV=production mocha --import=tsx --require ./tests/backend/diagnostics.ts --timeout 120000 --extension ts --recursive tests/backend/specs ../node_modules/ep_*/static/tests/backend/specs",
152+
"test": "cross-env NODE_ENV=production mocha --import=tsx --timeout 120000 --extension ts --recursive tests/backend/specs ../node_modules/ep_*/static/tests/backend/specs",
153153
"test-utils": "cross-env NODE_ENV=production mocha --import=tsx --timeout 5000 --recursive tests/backend/specs/*utils.ts",
154154
"test-container": "mocha --import=tsx --timeout 30000 --extension ts,js tests/container/specs/api",
155155
"dev": "cross-env NODE_ENV=development node --require tsx/cjs node/server.ts",

src/tests/backend-new/specs/backend-tests-flake-mitigation.test.ts

Lines changed: 13 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
'use strict';
22

3-
// Source-level lint pinning the Windows + Node 24 backend-test flake
4-
// mitigations from PR #7748. Two independent attacks at the failure:
3+
// Source-level lint pinning the one remaining Windows backend-test CI
4+
// invariant after the silent-ELIFECYCLE flake was root-caused and fixed
5+
// (the in-process process.exit path is gated in src/node/server.ts, and the
6+
// Windows jobs run on Node 24.16.0 to avoid the libuv TCP-connect stack
7+
// overrun in 24.15.0 — tracked upstream as nodejs/node#63620):
58
//
6-
// 1. Mocha --exit on the Windows CI jobs so the post-suite event-loop
7-
// drain — where Windows + Node 24 hard-kills the process — never
8-
// executes. Scoped to Windows so Linux/local runs still surface
9-
// real handle leaks via natural drain.
10-
// 2. NODE_OPTIONS=--report-on-fatalerror (and friends) on every
11-
// Backend tests step, with the resulting node-report/ directory
12-
// uploaded as an artifact on failure. If the flake recurs we
13-
// finally get a V8 stack + libuv handle table.
9+
// mocha --exit on the Windows CI jobs, and ONLY there, so a leaked handle
10+
// can't hang the job at post-suite drain. Linux/local keep natural drain so
11+
// real handle leaks stay visible. Easy to silently revert in a workflow
12+
// refactor or leak into the shared test script; this test fails fast if it
13+
// disappears or spreads.
1414
//
15-
// Both pieces are easy to silently revert in a workflow refactor; this
16-
// test fails fast if either disappears.
15+
// (The earlier --report-on-fatalerror NODE_OPTIONS + node-report uploads were
16+
// diagnostics for hunting the flake; removed once the cause was found.)
1717

1818
import {readFileSync} from 'fs';
1919
import {join} from 'path';
@@ -24,27 +24,7 @@ const read = (rel: string) => readFileSync(join(repoRoot, rel), 'utf8');
2424

2525
const workflow = read('.github/workflows/backend-tests.yml');
2626

27-
describe('backend-tests flake mitigation (PR #7748)', () => {
28-
it('every Backend tests step exposes Node diagnostic reports via NODE_OPTIONS', () => {
29-
// Count the "Run the backend tests" steps so the expected-count is
30-
// explicit — if a job is added later, this test reminds the author
31-
// to wire the diag flags into it too.
32-
const runStepCount = (workflow.match(/name: Run the backend tests/g) || []).length;
33-
expect(runStepCount, 'expected 4 Backend tests step blocks (Linux × 2, Windows × 2)')
34-
.toBe(4);
35-
const nodeOptionsCount = (workflow.match(
36-
/--report-on-fatalerror --report-uncaught-exception --report-on-signal --report-compact/g,
37-
) || []).length;
38-
expect(nodeOptionsCount,
39-
'every Backend tests step must set NODE_OPTIONS with the report-on-fatalerror diag flags')
40-
.toBe(runStepCount);
41-
const uploadCount = (workflow.match(/name: Upload Node diagnostic reports on failure/g) || [])
42-
.length;
43-
expect(uploadCount,
44-
'every Backend tests step must be followed by an Upload Node diagnostic reports step')
45-
.toBe(runStepCount);
46-
});
47-
27+
describe('backend-tests Windows --exit invariant', () => {
4828
it('Windows backend-test steps invoke pnpm test with --exit', () => {
4929
// --exit is the Windows-only mitigation. Linux still runs natural-drain
5030
// so leaked-handle regressions stay visible there.

0 commit comments

Comments
 (0)