Skip to content

Commit f02d19d

Browse files
heiskr and Copilot authored
Add Node.js runtime metrics (heap, GC, event loop) to Datadog (#60367)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 85b2028 commit f02d19d

File tree

3 files changed

+180
-0
lines changed

3 files changed

+180
-0
lines changed

src/frame/start-server.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import dotenv from 'dotenv'
55

66
import { checkNodeVersion } from './lib/check-node-version'
77
import '../observability/lib/handle-exceptions'
8+
import { startRuntimeMetrics } from '@/observability/lib/runtime-metrics'
89
import createApp from './lib/app'
910
import warmServer from './lib/warm-server'
1011
import { createLogger } from '@/observability/logger'
@@ -55,6 +56,8 @@ async function startServer() {
5556
// Workaround for https://github.com/expressjs/express/issues/1101
5657
const server = http.createServer(app)
5758

59+
startRuntimeMetrics()
60+
5861
process.once('SIGTERM', () => {
5962
logger.info('Received SIGTERM, beginning graceful shutdown', { pid: process.pid, port })
6063
server.close(() => {
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
/**
2+
* Periodically emits Node.js runtime metrics to Datadog via StatsD.
3+
*
4+
* Covers three categories that are otherwise invisible:
5+
* 1. V8 heap — used vs limit, so we can spot memory pressure before OOMs.
6+
* 2. GC — pause duration, so we can correlate latency spikes with GC.
7+
* 3. Event-loop delay — p50/p99, so we can see when the loop is blocked.
8+
*
9+
* Only activates when StatsD is sending real metrics (MODA_PROD_SERVICE_ENV).
10+
*/
11+
import v8 from 'node:v8'
12+
import { constants, monitorEventLoopDelay, PerformanceObserver } from 'node:perf_hooks'
13+
14+
import statsd from './statsd'
15+
16+
export const INTERVAL_MS = 10_000
17+
18+
let started = false
19+
20+
/**
 * Reports whether runtime metrics should be collected.
 *
 * @returns `true` only when `MODA_PROD_SERVICE_ENV` is exactly `'true'` and
 *   `NODE_ENV` is anything other than `'test'`.
 */
function isMetricsEnabled(): boolean {
  const { MODA_PROD_SERVICE_ENV: prodServiceEnv, NODE_ENV: nodeEnv } = process.env
  if (prodServiceEnv !== 'true') return false
  return nodeEnv !== 'test'
}
23+
24+
/**
 * Maps the `detail.kind` of a GC performance entry to the `gc_type` tag value.
 *
 * The comparison uses the `NODE_PERFORMANCE_GC_*` constants exported by
 * `node:perf_hooks` instead of literal numbers, so the mapping stays correct
 * whatever numeric values the running Node.js/V8 version assigns to each kind.
 *
 * @param kind - GC kind reported on the entry, or `undefined` when absent.
 * @returns `'minor'` for scavenges, `'major'` for mark-sweep-compact cycles,
 *   and `'other'` for every remaining kind (incremental marking, weak
 *   callbacks, unknown or missing values).
 */
function gcTypeTag(kind: number | undefined): 'minor' | 'major' | 'other' {
  if (kind === constants.NODE_PERFORMANCE_GC_MINOR) return 'minor'
  if (kind === constants.NODE_PERFORMANCE_GC_MAJOR) return 'major'
  return 'other'
}

/**
 * Starts periodic collection of Node.js runtime metrics and reports them
 * through StatsD.
 *
 * Call once at server start. Repeated calls are no-ops: the `started` flag is
 * set on the first call, before the enablement check, so a call made while
 * metrics are disabled also counts as "started".
 *
 * When enabled, this registers:
 *  - an interval that gauges V8 heap statistics every `INTERVAL_MS`,
 *  - a `PerformanceObserver` that records each GC entry's duration as a
 *    histogram tagged with `gc_type`,
 *  - an event-loop delay monitor whose p50/p99/max are gauged and then reset
 *    every `INTERVAL_MS`.
 *
 * Both intervals are `unref()`-ed so they never keep the process alive.
 */
export function startRuntimeMetrics(): void {
  if (started) return
  started = true

  if (!isMetricsEnabled()) return

  // --- V8 heap stats (sampled on an interval) ---
  setInterval(() => {
    const heap = v8.getHeapStatistics()
    statsd.gauge('node.heap.used', heap.used_heap_size)
    statsd.gauge('node.heap.total', heap.total_heap_size)
    statsd.gauge('node.heap.limit', heap.heap_size_limit)
    statsd.gauge('node.heap.external', heap.external_memory)
    // Percentage of heap limit currently in use; guard against a zero limit.
    const pct = heap.heap_size_limit > 0 ? (heap.used_heap_size / heap.heap_size_limit) * 100 : 0
    statsd.gauge('node.heap.used_pct', pct)
  }, INTERVAL_MS).unref()

  // --- GC pause durations ---
  const gcObserver = new PerformanceObserver((list) => {
    for (const entry of list.getEntries()) {
      // The GC kind lives on `detail`; the cast only reads that one field.
      const kind = (entry as unknown as { detail?: { kind?: number } }).detail?.kind
      statsd.histogram('node.gc.pause', entry.duration, [`gc_type:${gcTypeTag(kind)}`])
    }
  })
  gcObserver.observe({ entryTypes: ['gc'] })

  // --- Event-loop delay (histogram sampled every 20 ms) ---
  const eld = monitorEventLoopDelay({ resolution: 20 })
  eld.enable()

  setInterval(() => {
    // Values are in nanoseconds; convert to milliseconds for readability.
    statsd.gauge('node.eventloop.delay.p50', eld.percentile(50) / 1e6)
    statsd.gauge('node.eventloop.delay.p99', eld.percentile(99) / 1e6)
    statsd.gauge('node.eventloop.delay.max', eld.max / 1e6)
    // Start a fresh window so each report covers only the last interval.
    eld.reset()
  }, INTERVAL_MS).unref()
}
70+
71+
/**
 * Test-only hook: clears the module-level `started` flag so that the next
 * `startRuntimeMetrics()` call runs its setup again.
 *
 * It only flips the flag and does nothing else.
 */
export function _resetForTesting(): void {
  started = false
}
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'
2+
3+
import statsd from '@/observability/lib/statsd'
4+
import {
5+
startRuntimeMetrics,
6+
_resetForTesting,
7+
INTERVAL_MS,
8+
} from '@/observability/lib/runtime-metrics'
9+
10+
// Replace the StatsD client with spies so tests can inspect emitted metrics
// without sending anything.
vi.mock('@/observability/lib/statsd', () => {
  const fakeClient = { gauge: vi.fn(), histogram: vi.fn() }
  return { default: fakeClient }
})
16+
17+
describe('startRuntimeMetrics', () => {
  /** Stubs the environment so that metrics collection is enabled. */
  function enableMetrics(): void {
    vi.stubEnv('MODA_PROD_SERVICE_ENV', 'true')
    vi.stubEnv('NODE_ENV', 'production')
  }

  /** Returns the recorded `statsd.gauge` calls since the mocks were last cleared. */
  function gaugeCalls() {
    return (statsd.gauge as ReturnType<typeof vi.fn>).mock.calls
  }

  /** Returns the metric name (first argument) of every recorded gauge call. */
  function gaugeNames() {
    return gaugeCalls().map((c: unknown[]) => c[0])
  }

  beforeEach(() => {
    // Each test starts from an un-started module, fake timers, and clean spies.
    _resetForTesting()
    vi.useFakeTimers()
    vi.clearAllMocks()
  })

  afterEach(() => {
    vi.unstubAllEnvs()
    vi.useRealTimers()
  })

  it('is a no-op in test / non-prod environments', () => {
    vi.stubEnv('MODA_PROD_SERVICE_ENV', 'false')
    startRuntimeMetrics()
    vi.advanceTimersByTime(INTERVAL_MS + 1)
    expect(statsd.gauge).not.toHaveBeenCalled()
  })

  it('is idempotent — second call does nothing extra', () => {
    enableMetrics()
    startRuntimeMetrics()
    // Second call without reset — should be a no-op
    startRuntimeMetrics()
    vi.advanceTimersByTime(INTERVAL_MS + 1)

    const firstTick = gaugeNames()
    expect(firstTick.length).toBeGreaterThan(0)
    // A single set of timers emits each gauge name once per tick. Had the
    // second call registered duplicate timers, every name would appear twice.
    // Comparing tick-to-tick counts alone cannot detect that, because both
    // ticks would double together.
    expect(new Set(firstTick).size).toBe(firstTick.length)

    vi.clearAllMocks()
    vi.advanceTimersByTime(INTERVAL_MS)
    // Same number of calls each tick — the emission rate is stable
    expect(gaugeNames().length).toBe(firstTick.length)
  })

  it('emits heap gauges when enabled', () => {
    enableMetrics()
    startRuntimeMetrics()
    vi.advanceTimersByTime(INTERVAL_MS + 1)

    const names = gaugeNames()
    expect(names).toContain('node.heap.used')
    expect(names).toContain('node.heap.total')
    expect(names).toContain('node.heap.limit')
    expect(names).toContain('node.heap.external')
    expect(names).toContain('node.heap.used_pct')
  })

  it('emits event-loop delay gauges when enabled', () => {
    enableMetrics()
    startRuntimeMetrics()
    vi.advanceTimersByTime(INTERVAL_MS + 1)

    const names = gaugeNames()
    expect(names).toContain('node.eventloop.delay.p50')
    expect(names).toContain('node.eventloop.delay.p99')
    expect(names).toContain('node.eventloop.delay.max')
  })

  it('emits heap values that are positive numbers', () => {
    enableMetrics()
    startRuntimeMetrics()
    vi.advanceTimersByTime(INTERVAL_MS + 1)

    const heapUsedCall = gaugeCalls().find((c: unknown[]) => c[0] === 'node.heap.used')
    expect(heapUsedCall).toBeDefined()
    expect(heapUsedCall![1]).toBeGreaterThan(0)

    const heapPctCall = gaugeCalls().find((c: unknown[]) => c[0] === 'node.heap.used_pct')
    expect(heapPctCall).toBeDefined()
    expect(heapPctCall![1]).toBeGreaterThan(0)
    expect(heapPctCall![1]).toBeLessThan(100)
  })
})

0 commit comments

Comments
 (0)