Skip to content

Commit 9967454

Browse files
committed
Add 10ms cooperative timeout circuit breaker for snapshot capture
Thread a CaptureContext with a performance.now() deadline through the capture walker so that snapshot serialization aborts early when the budget is exceeded. A single deadline is shared across all probes in a hook invocation; non-snapshot probes are unaffected. When the timeout fires, the snapshot is dropped entirely and capture stops at the next logical checkpoint (property, array element, map entry, etc.). The timeout value uses the real typeof the value, consistent with how all other Datadog tracers handle notCapturedReason.
1 parent 0040ee8 commit 9967454

4 files changed

Lines changed: 520 additions & 117 deletions

File tree

packages/debugger/src/domain/api.spec.ts

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,246 @@ describe('api', () => {
601601
})
602602
})
603603

604+
describe('snapshot timeout', () => {
605+
function createSnapshotProbe(methodName: string): Probe {
606+
return {
607+
id: `timeout-probe-${methodName}`,
608+
version: 0,
609+
type: 'LOG_PROBE',
610+
where: { typeName: 'TestClass', methodName },
611+
template: 'Test',
612+
captureSnapshot: true,
613+
capture: { maxReferenceDepth: 3 },
614+
sampling: { snapshotsPerSecond: 5000 },
615+
evaluateAt: 'ENTRY',
616+
}
617+
}
618+
619+
it('should drop snapshot when entry capture exceeds timeout', () => {
620+
const probe = createSnapshotProbe('entryTimeout')
621+
addProbe(probe)
622+
623+
let callCount = 0
624+
const realNow = performance.now.bind(performance)
625+
spyOn(performance, 'now').and.callFake(() => {
626+
callCount++
627+
// Let the first few calls (start time, deadline creation) use real time,
628+
// then jump past the deadline to simulate slow capture.
629+
if (callCount <= 3) {
630+
return realNow()
631+
}
632+
return realNow() + 20
633+
})
634+
635+
const probes = getProbes('TestClass;entryTimeout')!
636+
const deepObj = { level1: { level2: { level3: { level4: 'deep' } } } }
637+
onEntry(probes, {}, { arg: deepObj })
638+
onReturn(probes, null, {}, { arg: deepObj }, {})
639+
640+
// The entry capture timed out, so onEntry pushed null.
641+
// onReturn still gets an active entry from its own onEntry call, but
642+
// the entry snapshot is dropped. The return capture has its own timeout.
643+
// Since performance.now is still returning future values, the return
644+
// capture also times out and no snapshot is sent.
645+
expect(mockBatchAdd).not.toHaveBeenCalled()
646+
})
647+
648+
it('should drop snapshot when return capture exceeds timeout', () => {
649+
const probe = createSnapshotProbe('returnTimeout')
650+
addProbe(probe)
651+
652+
const probes = getProbes('TestClass;returnTimeout')!
653+
654+
// Let onEntry succeed with real time
655+
onEntry(probes, {}, { x: 1 })
656+
657+
// Now make performance.now jump forward so the return capture times out
658+
let callCount = 0
659+
const realNow = performance.now.bind(performance)
660+
spyOn(performance, 'now').and.callFake(() => {
661+
callCount++
662+
if (callCount <= 2) {
663+
return realNow()
664+
}
665+
return realNow() + 20
666+
})
667+
668+
onReturn(probes, null, {}, { x: 1 }, { local: 'value' })
669+
670+
expect(mockBatchAdd).not.toHaveBeenCalled()
671+
})
672+
673+
it('should drop snapshot when throw capture exceeds timeout', () => {
674+
const probe = createSnapshotProbe('throwTimeout')
675+
addProbe(probe)
676+
677+
const probes = getProbes('TestClass;throwTimeout')!
678+
679+
// Let onEntry succeed with real time
680+
onEntry(probes, {}, { x: 1 })
681+
682+
// Now make performance.now jump forward so the throw capture times out
683+
let callCount = 0
684+
const realNow = performance.now.bind(performance)
685+
spyOn(performance, 'now').and.callFake(() => {
686+
callCount++
687+
if (callCount <= 2) {
688+
return realNow()
689+
}
690+
return realNow() + 20
691+
})
692+
693+
onThrow(probes, new Error('test'), {}, { x: 1 })
694+
695+
expect(mockBatchAdd).not.toHaveBeenCalled()
696+
})
697+
698+
it('should not affect non-snapshot probes', () => {
699+
const probe: Probe = {
700+
id: 'non-snapshot-timeout',
701+
version: 0,
702+
type: 'LOG_PROBE',
703+
where: { typeName: 'TestClass', methodName: 'nonSnapshot' },
704+
template: 'Test',
705+
captureSnapshot: false,
706+
capture: {},
707+
sampling: { snapshotsPerSecond: 5000 },
708+
evaluateAt: 'ENTRY',
709+
}
710+
addProbe(probe)
711+
712+
// Spike performance.now to simulate slow execution
713+
let callCount = 0
714+
const realNow = performance.now.bind(performance)
715+
spyOn(performance, 'now').and.callFake(() => {
716+
callCount++
717+
if (callCount <= 2) {
718+
return realNow()
719+
}
720+
return realNow() + 20
721+
})
722+
723+
const probes = getProbes('TestClass;nonSnapshot')!
724+
onEntry(probes, {}, {})
725+
onReturn(probes, null, {}, {}, {})
726+
727+
expect(mockBatchAdd).toHaveBeenCalledTimes(1)
728+
})
729+
730+
it('should not leak active entries when entry capture times out', () => {
731+
const probe = createSnapshotProbe('entryLeakTest')
732+
addProbe(probe)
733+
734+
let shouldTimeout = true
735+
let callCount = 0
736+
const realNow = performance.now.bind(performance)
737+
spyOn(performance, 'now').and.callFake(() => {
738+
callCount++
739+
if (!shouldTimeout || callCount <= 3) {
740+
return realNow()
741+
}
742+
return realNow() + 20
743+
})
744+
745+
const probes = getProbes('TestClass;entryLeakTest')!
746+
// This onEntry will time out and push null
747+
onEntry(probes, {}, { x: 1 })
748+
749+
// onReturn should handle the null entry gracefully (no snapshot sent)
750+
shouldTimeout = false
751+
callCount = 0
752+
onReturn(probes, null, {}, { x: 1 }, {})
753+
754+
expect(mockBatchAdd).not.toHaveBeenCalled()
755+
})
756+
757+
it('should skip subsequent snapshot probes after timeout but still process non-snapshot probes', () => {
758+
const snapshotProbe1: Probe = {
759+
id: 'timeout-shared-1',
760+
version: 0,
761+
type: 'LOG_PROBE',
762+
where: { typeName: 'TestClass', methodName: 'sharedDeadline' },
763+
template: 'Snapshot probe',
764+
captureSnapshot: true,
765+
capture: { maxReferenceDepth: 3 },
766+
sampling: { snapshotsPerSecond: 5000 },
767+
evaluateAt: 'ENTRY',
768+
}
769+
const nonSnapshotProbe: Probe = {
770+
id: 'timeout-shared-2',
771+
version: 0,
772+
type: 'LOG_PROBE',
773+
where: { typeName: 'TestClass', methodName: 'sharedDeadline' },
774+
template: 'Non-snapshot probe',
775+
captureSnapshot: false,
776+
capture: {},
777+
sampling: { snapshotsPerSecond: 5000 },
778+
evaluateAt: 'ENTRY',
779+
}
780+
const snapshotProbe2: Probe = {
781+
id: 'timeout-shared-3',
782+
version: 0,
783+
type: 'LOG_PROBE',
784+
where: { typeName: 'TestClass', methodName: 'sharedDeadline' },
785+
template: 'Second snapshot probe',
786+
captureSnapshot: true,
787+
capture: { maxReferenceDepth: 3 },
788+
sampling: { snapshotsPerSecond: 5000 },
789+
evaluateAt: 'ENTRY',
790+
}
791+
addProbe(snapshotProbe1)
792+
addProbe(nonSnapshotProbe)
793+
addProbe(snapshotProbe2)
794+
795+
let callCount = 0
796+
const realNow = performance.now.bind(performance)
797+
spyOn(performance, 'now').and.callFake(() => {
798+
callCount++
799+
if (callCount <= 3) {
800+
return realNow()
801+
}
802+
return realNow() + 20
803+
})
804+
805+
const probes = getProbes('TestClass;sharedDeadline')!
806+
onEntry(probes, {}, {})
807+
onReturn(probes, null, {}, {}, {})
808+
809+
// The non-snapshot probe should still send, but both snapshot probes should be dropped
810+
const calls = mockBatchAdd.calls.allArgs()
811+
expect(calls.length).toBe(1)
812+
expect(calls[0][0].message).toBe('Non-snapshot probe')
813+
})
814+
815+
it('should share deadline across probes so second snapshot probe exits immediately', () => {
816+
const probe1 = createSnapshotProbe('sharedDeadline1')
817+
const probe2: Probe = {
818+
...createSnapshotProbe('sharedDeadline2'),
819+
id: 'timeout-probe-sharedDeadline2',
820+
where: { typeName: 'TestClass', methodName: 'sharedDeadline1' },
821+
}
822+
addProbe(probe1)
823+
addProbe(probe2)
824+
825+
let callCount = 0
826+
const realNow = performance.now.bind(performance)
827+
spyOn(performance, 'now').and.callFake(() => {
828+
callCount++
829+
if (callCount <= 3) {
830+
return realNow()
831+
}
832+
return realNow() + 20
833+
})
834+
835+
const probes = getProbes('TestClass;sharedDeadline1')!
836+
onEntry(probes, {}, { x: 1 })
837+
onReturn(probes, null, {}, { x: 1 }, {})
838+
839+
// Both snapshot probes share the deadline -- neither should send
840+
expect(mockBatchAdd).not.toHaveBeenCalled()
841+
})
842+
})
843+
604844
describe('error handling', () => {
605845
it('should handle missing DD_RUM gracefully', () => {
606846
delete (window as any).DD_RUM

packages/debugger/src/domain/api.ts

Lines changed: 46 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import type { Batch, Context, RumInternalContext } from '@datadog/browser-core'
33
import { timeStampNow, display, buildTag, generateUUID, getGlobalObject } from '@datadog/browser-core'
44
import type { BrowserWindow, DebuggerInitConfiguration } from '../entries/main'
55
import { capture, captureFields } from './capture'
6+
import type { CaptureContext } from './capture'
67
import type { InitializedProbe } from './probes'
78
import { checkGlobalSnapshotBudget } from './probes'
89
import type { ActiveEntry } from './activeEntries'
@@ -21,6 +22,8 @@ const hostname = 'location' in globalObj ? globalObj.location.hostname : 'unknow
2122

2223
const threadName = detectThreadName() // eslint-disable-line local-rules/disallow-side-effects
2324

25+
const SNAPSHOT_TIMEOUT_MS = 10
26+
2427
let debuggerBatch: Batch | undefined
2528
let debuggerConfig: DebuggerInitConfiguration | undefined
2629

@@ -44,6 +47,7 @@ export function resetDebuggerTransport(): void {
4447
*/
4548
export function onEntry(probes: InitializedProbe[], self: any, args: Record<string, any>): void {
4649
const start = performance.now()
50+
const captureCtx: CaptureContext = { deadline: start + SNAPSHOT_TIMEOUT_MS, timedOut: false }
4751

4852
// TODO: A lot of repeated work performed for each probe that could be shared between probes
4953
for (const probe of probes) {
@@ -84,14 +88,19 @@ export function onEntry(probes: InitializedProbe[], self: any, args: Record<stri
8488

8589
// Special case for evaluateAt=EXIT with a condition: we only capture the return snapshot
8690
const shouldCaptureEntrySnapshot = probe.captureSnapshot && (probe.evaluateAt === 'ENTRY' || !probe.condition)
87-
const entry = shouldCaptureEntrySnapshot
88-
? {
89-
arguments: {
90-
...captureFields(args, probe.capture),
91-
this: capture(self, probe.capture),
92-
},
93-
}
94-
: undefined
91+
let entry: { arguments: Record<string, any> } | undefined
92+
if (shouldCaptureEntrySnapshot) {
93+
entry = {
94+
arguments: {
95+
...captureFields(args, probe.capture, captureCtx),
96+
this: capture(self, probe.capture, captureCtx),
97+
},
98+
}
99+
if (captureCtx.timedOut) {
100+
stack.push(null)
101+
continue
102+
}
103+
}
95104

96105
stack.push({
97106
start,
@@ -121,6 +130,7 @@ export function onReturn(
121130
locals: Record<string, any>
122131
): any {
123132
const end = performance.now()
133+
const captureCtx: CaptureContext = { deadline: performance.now() + SNAPSHOT_TIMEOUT_MS, timedOut: false }
124134

125135
// TODO: A lot of repeated work performed for each probe that could be shared between probes
126136
for (const probe of probes) {
@@ -156,18 +166,21 @@ export function onReturn(
156166
result.message = evaluateProbeMessage(probe, context)
157167
}
158168

159-
result.return = probe.captureSnapshot
160-
? {
161-
arguments: {
162-
...captureFields(args, probe.capture),
163-
this: capture(self, probe.capture),
164-
},
165-
locals: {
166-
...captureFields(locals, probe.capture),
167-
'@return': capture(value, probe.capture),
168-
},
169-
}
170-
: undefined
169+
if (probe.captureSnapshot) {
170+
result.return = {
171+
arguments: {
172+
...captureFields(args, probe.capture, captureCtx),
173+
this: capture(self, probe.capture, captureCtx),
174+
},
175+
locals: {
176+
...captureFields(locals, probe.capture, captureCtx),
177+
'@return': capture(value, probe.capture, captureCtx),
178+
},
179+
}
180+
if (captureCtx.timedOut) {
181+
continue
182+
}
183+
}
171184

172185
sendDebuggerSnapshot(probe, result)
173186
}
@@ -185,6 +198,7 @@ export function onReturn(
185198
*/
186199
export function onThrow(probes: InitializedProbe[], error: Error, self: any, args: Record<string, any>): void {
187200
const end = performance.now()
201+
const captureCtx: CaptureContext = { deadline: performance.now() + SNAPSHOT_TIMEOUT_MS, timedOut: false }
188202

189203
// TODO: A lot of repeated work performed for each probe that could be shared between probes
190204
for (const probe of probes) {
@@ -220,13 +234,19 @@ export function onThrow(probes: InitializedProbe[], error: Error, self: any, arg
220234
result.message = evaluateProbeMessage(probe, context)
221235
}
222236

237+
let throwArguments: Record<string, any> | undefined
238+
if (probe.captureSnapshot) {
239+
throwArguments = {
240+
...captureFields(args, probe.capture, captureCtx),
241+
this: capture(self, probe.capture, captureCtx),
242+
}
243+
if (captureCtx.timedOut) {
244+
continue
245+
}
246+
}
247+
223248
result.return = {
224-
arguments: probe.captureSnapshot
225-
? {
226-
...captureFields(args, probe.capture),
227-
this: capture(self, probe.capture),
228-
}
229-
: undefined,
249+
arguments: throwArguments,
230250
throwable: {
231251
message: error.message,
232252
stacktrace: parseStackTrace(error),

0 commit comments

Comments
 (0)