1010// in a way that bypassed JS handlers — SIGKILL, OOM, or a fatal native
1111// error — OR mocha itself called process.exit before the handlers ran.
1212//
13+ // Subsequent runs on develop (e.g. 26311025244, Windows with plugins,
14+ // Node 24) reproduced the same fingerprint: `[diag +0ms] diagnostics
15+ // loaded` lands but no other diag line appears before the silent
16+ // ELIFECYCLE. Death lands 300±50 ms after the previous test's teardown
17+ // and the lastSeenTest pointer (only updated in afterEach) tells us
18+ // nothing about whether the next test had started or whether mocha was
19+ // between tests at the kill moment.
20+ //
1321// This file:
1422// 1. Registers handlers UNCONDITIONALLY at mocha startup (common.ts is
1523// only imported by ~27 of 47 specs, so its handlers may register
1826// complete before the kernel returns from the syscall, so the line
1927// lands in the runner log even if the process is killed
2028// milliseconds later.
21- // 3. Tracks the last-seen test via a mocha root afterEach hook so the
22- // death point is identified.
23- // 4. Logs exit-related events so we can discriminate:
29+ // 3. Tracks BOTH the currently-running test (set in beforeEach) and the
30+ // last-finished test (set in afterEach), so the death point can be
31+ // bracketed even when the kill lands inside the next test's setup or
32+ // body, before its own afterEach has a chance to update the pointer.
33+ // 4. Emits a 1Hz heartbeat carrying the running-test name, RSS / heap
34+ // usage, and active handle / request counts. If the process dies
35+ // without firing any JS handler, the last heartbeat narrows the
36+ // kill window to <=1s and the handle-count trace exposes leaks
37+ // (sockets, timers, native bindings) that would otherwise be
38+ // invisible at the runner-log level.
39+ // 5. Logs exit-related events so we can discriminate:
2440// beforeExit + exit -> clean event-loop drain (Linux CI, local)
2541// only exit -> process.exit() called — expected when mocha
2642// is launched with --exit (the Windows CI
3551import { writeSync } from 'node:fs' ;
3652
3753const t0 = Date . now ( ) ;
38- let lastSeenTest = '<no test seen yet>' ;
54+ let currentTest = '<no test running>' ;
55+ let lastFinishedTest = '<no test finished yet>' ;
3956
4057const diag = ( msg : string ) : void => {
4158 const line = `[diag +${ Date . now ( ) - t0 } ms] ${ msg } \n` ;
@@ -48,18 +65,92 @@ const diag = (msg: string): void => {
4865
4966diag ( 'diagnostics loaded' ) ;
5067
68+ // Heartbeat. unref()'d so it never holds the event loop open by itself —
69+ // it only fires if mocha is otherwise alive. The interval cadence (1Hz) is
70+ // the trade-off between log noise (~60-120 extra lines per run) and how
71+ // tightly we can bracket the kill timestamp.
72+ //
73+ // When the backend-test workflow has `--report-directory` set (only the
74+ // Windows jobs do at time of writing), every heartbeat also writes a Node
75+ // diagnostic report into that directory. The previous two failing CI runs
76+ // proved the kill bypasses all JS handlers (uncaughtException, signal,
77+ // beforeExit — none fire), so we can't capture stack state at the moment
78+ // of death. The next-best thing is a rolling 1Hz snapshot of:
79+ // - V8 / native call stacks (all threads)
80+ // - libuv active handles (open TCP connections, timers, file handles)
81+ // - JS heap statistics
82+ // - System info (CPU, memory, environment)
83+ // On the next failure the workflow uploads node-report/ as an artifact,
84+ // and the latest report before the kill bracket gives us 0-1s of pre-death
85+ // state — including, critically, whether the V8 stack is inside jose's
86+ // JWT signing path, supertest's TCP roundtrip, or somewhere else.
87+ // Honor NODE_REPORT_DIR as a local-repro override by pushing it into
88+ // process.report.directory, which is the documented config knob. We can NOT
89+ // pass an absolute path into writeReport(): on Windows the runner sets
90+ // `--report-directory=D:\a\etherpad\etherpad/node-report` (mixed slashes),
91+ // and Node's report writer rejects any subsequent absolute path with errno
92+ // 22 / EINVAL. Pass a bare filename and let Node concatenate it against the
93+ // configured directory using its own platform-correct separator.
94+ if ( process . env . NODE_REPORT_DIR && ( process as any ) . report ) {
95+ ( process as any ) . report . directory = process . env . NODE_REPORT_DIR ;
96+ }
97+ const canWriteReport =
98+ typeof ( process as any ) . report ?. writeReport === 'function'
99+ && ! ! ( ( process as any ) . report ?. directory
100+ || ( process . env . NODE_OPTIONS || '' ) . includes ( '--report-directory=' ) ) ;
101+ let reportCounter = 0 ;
102+ let lastReportT = 0 ;
103+
104+ // Shared writer used by both the heartbeat tick and the beforeEach hook.
105+ // Throttled by minGapMs so a burst of fast tests doesn't produce hundreds of
106+ // reports — we just need dense enough coverage to bracket the kill.
107+ const tryWriteReport = ( prefix : string , minGapMs : number ) : void => {
108+ if ( ! canWriteReport ) return ;
109+ const now = Date . now ( ) ;
110+ if ( now - lastReportT < minGapMs ) return ;
111+ lastReportT = now ;
112+ reportCounter += 1 ;
113+ const safeTest = currentTest
114+ . replace ( / [ ^ a - z A - Z 0 - 9 . _ - ] + / g, '_' )
115+ . slice ( 0 , 80 ) ;
116+ const name = `${ prefix } -${ String ( reportCounter ) . padStart ( 4 , '0' ) } -${ safeTest } .json` ;
117+ try {
118+ // Bare filename only — see comment at canWriteReport definition above.
119+ ( process as any ) . report . writeReport ( name ) ;
120+ } catch { /* swallow — diagnostics must not throw */ }
121+ } ;
122+ const heartbeat = setInterval ( ( ) => {
123+ const mem = process . memoryUsage ( ) ;
124+ // _getActiveHandles / _getActiveRequests are undocumented Node internals.
125+ // The earlier shape `_getActiveHandles?.().length ?? -1` was a bug: `?.()`
126+ // only guards the call, so a missing method returns `undefined` and then
127+ // `.length` throws TypeError — which would take down the whole test run.
128+ // Capture the array first, then read .length only when it actually exists.
129+ const handlesArr = ( process as any ) . _getActiveHandles ?.( ) ;
130+ const handles = handlesArr ? handlesArr . length : - 1 ;
131+ const requestsArr = ( process as any ) . _getActiveRequests ?.( ) ;
132+ const requests = requestsArr ? requestsArr . length : - 1 ;
133+ diag ( `hb running="${ currentTest } " lastFinished="${ lastFinishedTest } " ` +
134+ `rss=${ Math . round ( mem . rss / 1024 / 1024 ) } M ` +
135+ `heap=${ Math . round ( mem . heapUsed / 1024 / 1024 ) } M ` +
136+ `handles=${ handles } requests=${ requests } ` ) ;
137+ // Heartbeat always writes — its 1Hz cadence is the floor.
138+ tryWriteReport ( 'hb' , 0 ) ;
139+ } , 1000 ) ;
140+ heartbeat . unref ( ) ;
141+
51142process . on ( 'unhandledRejection' , ( reason : any ) => {
52143 diag ( `unhandledRejection: ${
53144 reason && reason . stack ? reason . stack : String ( reason )
54- } (lastTest ="${ lastSeenTest } ")`) ;
145+ } (running ="${ currentTest } ", lastFinished=" ${ lastFinishedTest } ")`) ;
55146 // Re-throw so existing common.ts handlers / mocha behavior is preserved.
56147 throw reason ;
57148} ) ;
58149
59150process . on ( 'uncaughtException' , ( err : any ) => {
60151 diag ( `uncaughtException: ${
61152 err && err . stack ? err . stack : String ( err )
62- } (lastTest ="${ lastSeenTest } ")`) ;
153+ } (running ="${ currentTest } ", lastFinished=" ${ lastFinishedTest } ")`) ;
63154 // Force fail-fast. Specs that don't import common.ts only have THIS handler,
64155 // and Node won't exit on its own once an uncaughtException listener is
65156 // registered. Without the explicit exit a fatal error would be swallowed.
@@ -69,18 +160,18 @@ process.on('uncaughtException', (err: any) => {
69160
70161process . on ( 'beforeExit' , ( code : number ) => {
71162 diag ( `beforeExit code=${ code } exitCode=${ process . exitCode } ` +
72- `lastTest ="${ lastSeenTest } "` ) ;
163+ `running ="${ currentTest } " lastFinished=" ${ lastFinishedTest } "` ) ;
73164} ) ;
74165
75166process . on ( 'exit' , ( code : number ) => {
76- diag ( `exit code=${ code } lastTest ="${ lastSeenTest } "` ) ;
167+ diag ( `exit code=${ code } running ="${ currentTest } " lastFinished=" ${ lastFinishedTest } "` ) ;
77168} ) ;
78169
79170for ( const sig of [ 'SIGINT' , 'SIGTERM' , 'SIGHUP' , 'SIGBREAK' ] as const ) {
80171 // SIGHUP / SIGBREAK don't exist on every platform; ignore registration errors.
81172 try {
82173 process . on ( sig as any , ( ) => {
83- diag ( `received ${ sig } (lastTest ="${ lastSeenTest } ")` ) ;
174+ diag ( `received ${ sig } (running ="${ currentTest } ", lastFinished=" ${ lastFinishedTest } ")` ) ;
84175 // Let the default behavior (exit) happen.
85176 process . exit ( 128 ) ;
86177 } ) ;
@@ -89,12 +180,36 @@ for (const sig of ['SIGINT', 'SIGTERM', 'SIGHUP', 'SIGBREAK'] as const) {
89180 }
90181}
91182
92- // Mocha root hook — only registered if mocha picks up this file via --require.
93- // We track the most recently-finished test so the death point is visible.
183+ // Mocha root hooks — only registered if mocha picks up this file via --require.
184+ // beforeEach sets the running pointer so a mid-test kill is attributable to a
185+ // specific test, not just the previous one that successfully finished.
186+ //
187+ // We also emit a synchronous diag line on every test start. The 1Hz heartbeat
188+ // misses tests that take less than a second, and the silent backend-test
189+ // kills land ~300 ms after a test boundary — exactly the gap where heartbeat
190+ // resolution fails us. A `start` line per test gives sub-millisecond
191+ // resolution on which test was on the rails when the process died.
94192export const mochaHooks = {
193+ beforeEach ( this : any ) {
194+ if ( this . currentTest ) {
195+ currentTest = this . currentTest . fullTitle ( ) ;
196+ diag ( `test start: ${ currentTest } ` ) ;
197+ // Drop a node-report at test-boundary granularity when the inter-report
198+ // gap is wide enough. Run 26399285213's rerun caught the kill on the
199+ // socketio.ts duplicate-author test, but the previous boundary write
200+ // had landed 128 ms earlier — inside our 250 ms throttle, so the
201+ // dying test's own beforeEach was suppressed. 100 ms is tighter than
202+ // the inter-test cadence of fast burst suites (~2-5 ms per test, so
203+ // ~20-50× throttled = max ~10 writes/sec) yet still captures
204+ // boundary writes for any test whose neighbour fired ≥100 ms ago,
205+ // including the socketio tests in the dying-test pattern.
206+ tryWriteReport ( 'be' , 100 ) ;
207+ }
208+ } ,
95209 afterEach ( this : any ) {
96210 if ( this . currentTest ) {
97- lastSeenTest = this . currentTest . fullTitle ( ) ;
211+ lastFinishedTest = this . currentTest . fullTitle ( ) ;
212+ currentTest = '<no test running>' ;
98213 }
99214 } ,
100215} ;
0 commit comments