Skip to content

Commit 46622ab

Browse files
committed
Add 10ms cooperative timeout circuit breaker for snapshot capture
Thread a CaptureContext with a performance.now() deadline through the capture walker so that snapshot serialization aborts early when the budget is exceeded. A single deadline is shared across all probes in a hook invocation; non-snapshot probes are unaffected. When the timeout fires, the snapshot is dropped entirely and capture stops at the next logical checkpoint (property, array element, map entry, etc.). The timeout value uses the real typeof the value, consistent with how all other Datadog tracers handle notCapturedReason.
1 parent 7d1efe7 commit 46622ab

4 files changed

Lines changed: 553 additions & 118 deletions

File tree

packages/debugger/src/domain/api.spec.ts

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -632,6 +632,246 @@ describe('api', () => {
632632
})
633633
})
634634

635+
describe('snapshot timeout', () => {
636+
function createSnapshotProbe(methodName: string): Probe {
637+
return {
638+
id: `timeout-probe-${methodName}`,
639+
version: 0,
640+
type: 'LOG_PROBE',
641+
where: { typeName: 'TestClass', methodName },
642+
template: 'Test',
643+
captureSnapshot: true,
644+
capture: { maxReferenceDepth: 3 },
645+
sampling: { snapshotsPerSecond: 5000 },
646+
evaluateAt: 'ENTRY',
647+
}
648+
}
649+
650+
it('should drop snapshot when entry capture exceeds timeout', () => {
651+
const probe = createSnapshotProbe('entryTimeout')
652+
addProbe(probe)
653+
654+
let callCount = 0
655+
const realNow = performance.now.bind(performance)
656+
spyOn(performance, 'now').and.callFake(() => {
657+
callCount++
658+
// Let the first few calls (start time, deadline creation) use real time,
659+
// then jump past the deadline to simulate slow capture.
660+
if (callCount <= 3) {
661+
return realNow()
662+
}
663+
return realNow() + 20
664+
})
665+
666+
const probes = getProbes('TestClass;entryTimeout')!
667+
const deepObj = { level1: { level2: { level3: { level4: 'deep' } } } }
668+
onEntry(probes, {}, { arg: deepObj })
669+
onReturn(probes, null, {}, { arg: deepObj }, {})
670+
671+
// The entry capture timed out, so onEntry pushed null.
672+
// onReturn still gets an active entry from its own onEntry call, but
673+
// the entry snapshot is dropped. The return capture has its own timeout.
674+
// Since performance.now is still returning future values, the return
675+
// capture also times out and no snapshot is sent.
676+
expect(mockBatchAdd).not.toHaveBeenCalled()
677+
})
678+
679+
it('should drop snapshot when return capture exceeds timeout', () => {
680+
const probe = createSnapshotProbe('returnTimeout')
681+
addProbe(probe)
682+
683+
const probes = getProbes('TestClass;returnTimeout')!
684+
685+
// Let onEntry succeed with real time
686+
onEntry(probes, {}, { x: 1 })
687+
688+
// Now make performance.now jump forward so the return capture times out
689+
let callCount = 0
690+
const realNow = performance.now.bind(performance)
691+
spyOn(performance, 'now').and.callFake(() => {
692+
callCount++
693+
if (callCount <= 2) {
694+
return realNow()
695+
}
696+
return realNow() + 20
697+
})
698+
699+
onReturn(probes, null, {}, { x: 1 }, { local: 'value' })
700+
701+
expect(mockBatchAdd).not.toHaveBeenCalled()
702+
})
703+
704+
it('should drop snapshot when throw capture exceeds timeout', () => {
705+
const probe = createSnapshotProbe('throwTimeout')
706+
addProbe(probe)
707+
708+
const probes = getProbes('TestClass;throwTimeout')!
709+
710+
// Let onEntry succeed with real time
711+
onEntry(probes, {}, { x: 1 })
712+
713+
// Now make performance.now jump forward so the throw capture times out
714+
let callCount = 0
715+
const realNow = performance.now.bind(performance)
716+
spyOn(performance, 'now').and.callFake(() => {
717+
callCount++
718+
if (callCount <= 2) {
719+
return realNow()
720+
}
721+
return realNow() + 20
722+
})
723+
724+
onThrow(probes, new Error('test'), {}, { x: 1 })
725+
726+
expect(mockBatchAdd).not.toHaveBeenCalled()
727+
})
728+
729+
it('should not affect non-snapshot probes', () => {
730+
const probe: Probe = {
731+
id: 'non-snapshot-timeout',
732+
version: 0,
733+
type: 'LOG_PROBE',
734+
where: { typeName: 'TestClass', methodName: 'nonSnapshot' },
735+
template: 'Test',
736+
captureSnapshot: false,
737+
capture: {},
738+
sampling: { snapshotsPerSecond: 5000 },
739+
evaluateAt: 'ENTRY',
740+
}
741+
addProbe(probe)
742+
743+
// Spike performance.now to simulate slow execution
744+
let callCount = 0
745+
const realNow = performance.now.bind(performance)
746+
spyOn(performance, 'now').and.callFake(() => {
747+
callCount++
748+
if (callCount <= 2) {
749+
return realNow()
750+
}
751+
return realNow() + 20
752+
})
753+
754+
const probes = getProbes('TestClass;nonSnapshot')!
755+
onEntry(probes, {}, {})
756+
onReturn(probes, null, {}, {}, {})
757+
758+
expect(mockBatchAdd).toHaveBeenCalledTimes(1)
759+
})
760+
761+
it('should not leak active entries when entry capture times out', () => {
762+
const probe = createSnapshotProbe('entryLeakTest')
763+
addProbe(probe)
764+
765+
let shouldTimeout = true
766+
let callCount = 0
767+
const realNow = performance.now.bind(performance)
768+
spyOn(performance, 'now').and.callFake(() => {
769+
callCount++
770+
if (!shouldTimeout || callCount <= 3) {
771+
return realNow()
772+
}
773+
return realNow() + 20
774+
})
775+
776+
const probes = getProbes('TestClass;entryLeakTest')!
777+
// This onEntry will time out and push null
778+
onEntry(probes, {}, { x: 1 })
779+
780+
// onReturn should handle the null entry gracefully (no snapshot sent)
781+
shouldTimeout = false
782+
callCount = 0
783+
onReturn(probes, null, {}, { x: 1 }, {})
784+
785+
expect(mockBatchAdd).not.toHaveBeenCalled()
786+
})
787+
788+
it('should skip subsequent snapshot probes after timeout but still process non-snapshot probes', () => {
789+
const snapshotProbe1: Probe = {
790+
id: 'timeout-shared-1',
791+
version: 0,
792+
type: 'LOG_PROBE',
793+
where: { typeName: 'TestClass', methodName: 'sharedDeadline' },
794+
template: 'Snapshot probe',
795+
captureSnapshot: true,
796+
capture: { maxReferenceDepth: 3 },
797+
sampling: { snapshotsPerSecond: 5000 },
798+
evaluateAt: 'ENTRY',
799+
}
800+
const nonSnapshotProbe: Probe = {
801+
id: 'timeout-shared-2',
802+
version: 0,
803+
type: 'LOG_PROBE',
804+
where: { typeName: 'TestClass', methodName: 'sharedDeadline' },
805+
template: 'Non-snapshot probe',
806+
captureSnapshot: false,
807+
capture: {},
808+
sampling: { snapshotsPerSecond: 5000 },
809+
evaluateAt: 'ENTRY',
810+
}
811+
const snapshotProbe2: Probe = {
812+
id: 'timeout-shared-3',
813+
version: 0,
814+
type: 'LOG_PROBE',
815+
where: { typeName: 'TestClass', methodName: 'sharedDeadline' },
816+
template: 'Second snapshot probe',
817+
captureSnapshot: true,
818+
capture: { maxReferenceDepth: 3 },
819+
sampling: { snapshotsPerSecond: 5000 },
820+
evaluateAt: 'ENTRY',
821+
}
822+
addProbe(snapshotProbe1)
823+
addProbe(nonSnapshotProbe)
824+
addProbe(snapshotProbe2)
825+
826+
let callCount = 0
827+
const realNow = performance.now.bind(performance)
828+
spyOn(performance, 'now').and.callFake(() => {
829+
callCount++
830+
if (callCount <= 3) {
831+
return realNow()
832+
}
833+
return realNow() + 20
834+
})
835+
836+
const probes = getProbes('TestClass;sharedDeadline')!
837+
onEntry(probes, {}, { x: 1 })
838+
onReturn(probes, null, {}, { x: 1 }, {})
839+
840+
// The non-snapshot probe should still send, but both snapshot probes should be dropped
841+
const calls = mockBatchAdd.calls.allArgs()
842+
expect(calls.length).toBe(1)
843+
expect(calls[0][0].message).toBe('Non-snapshot probe')
844+
})
845+
846+
it('should share deadline across probes so second snapshot probe exits immediately', () => {
847+
const probe1 = createSnapshotProbe('sharedDeadline1')
848+
const probe2: Probe = {
849+
...createSnapshotProbe('sharedDeadline2'),
850+
id: 'timeout-probe-sharedDeadline2',
851+
where: { typeName: 'TestClass', methodName: 'sharedDeadline1' },
852+
}
853+
addProbe(probe1)
854+
addProbe(probe2)
855+
856+
let callCount = 0
857+
const realNow = performance.now.bind(performance)
858+
spyOn(performance, 'now').and.callFake(() => {
859+
callCount++
860+
if (callCount <= 3) {
861+
return realNow()
862+
}
863+
return realNow() + 20
864+
})
865+
866+
const probes = getProbes('TestClass;sharedDeadline1')!
867+
onEntry(probes, {}, { x: 1 })
868+
onReturn(probes, null, {}, { x: 1 }, {})
869+
870+
// Both snapshot probes share the deadline -- neither should send
871+
expect(mockBatchAdd).not.toHaveBeenCalled()
872+
})
873+
})
874+
635875
describe('error handling', () => {
636876
it('should handle missing DD_RUM gracefully', () => {
637877
delete (window as any).DD_RUM

packages/debugger/src/domain/api.ts

Lines changed: 46 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import type { Batch, Context, RumInternalContext } from '@datadog/browser-core'
33
import { timeStampNow, display, buildTag, generateUUID, getGlobalObject } from '@datadog/browser-core'
44
import type { BrowserWindow, DebuggerInitConfiguration } from '../entries/main'
55
import { capture, captureFields } from './capture'
6+
import type { CaptureContext } from './capture'
67
import type { InitializedProbe } from './probes'
78
import { checkGlobalSnapshotBudget } from './probes'
89
import type { ActiveEntry } from './activeEntries'
@@ -26,6 +27,8 @@ const hostname = 'location' in globalObj ? globalObj.location.hostname : 'unknow
2627

2728
const threadName = detectThreadName() // eslint-disable-line local-rules/disallow-side-effects
2829

30+
const SNAPSHOT_TIMEOUT_MS = 10
31+
2932
let debuggerBatch: Batch | undefined
3033
let debuggerConfig: DebuggerInitConfiguration | undefined
3134

@@ -49,6 +52,7 @@ export function resetDebuggerTransport(): void {
4952
*/
5053
export function onEntry(probes: InitializedProbe[], self: any, args: Record<string, any>): void {
5154
const start = performance.now()
55+
const captureCtx: CaptureContext = { deadline: start + SNAPSHOT_TIMEOUT_MS, timedOut: false }
5256

5357
// TODO: A lot of repeated work performed for each probe that could be shared between probes
5458
for (const probe of probes) {
@@ -89,14 +93,19 @@ export function onEntry(probes: InitializedProbe[], self: any, args: Record<stri
8993

9094
// Special case for evaluateAt=EXIT with a condition: we only capture the return snapshot
9195
const shouldCaptureEntrySnapshot = probe.captureSnapshot && (probe.evaluateAt === 'ENTRY' || !probe.condition)
92-
const entry = shouldCaptureEntrySnapshot
93-
? {
94-
arguments: {
95-
...captureFields(args, probe.capture),
96-
this: capture(self, probe.capture),
97-
},
98-
}
99-
: undefined
96+
let entry: { arguments: Record<string, any> } | undefined
97+
if (shouldCaptureEntrySnapshot) {
98+
entry = {
99+
arguments: {
100+
...captureFields(args, probe.capture, captureCtx),
101+
this: capture(self, probe.capture, captureCtx),
102+
},
103+
}
104+
if (captureCtx.timedOut) {
105+
stack.push(null)
106+
continue
107+
}
108+
}
100109

101110
stack.push({
102111
start,
@@ -126,6 +135,7 @@ export function onReturn(
126135
locals: Record<string, any>
127136
): any {
128137
const end = performance.now()
138+
const captureCtx: CaptureContext = { deadline: performance.now() + SNAPSHOT_TIMEOUT_MS, timedOut: false }
129139

130140
// TODO: A lot of repeated work performed for each probe that could be shared between probes
131141
for (const probe of probes) {
@@ -161,18 +171,21 @@ export function onReturn(
161171
result.message = evaluateProbeMessage(probe, context)
162172
}
163173

164-
result.return = probe.captureSnapshot
165-
? {
166-
arguments: {
167-
...captureFields(args, probe.capture),
168-
this: capture(self, probe.capture),
169-
},
170-
locals: {
171-
...captureFields(locals, probe.capture),
172-
'@return': capture(value, probe.capture),
173-
},
174-
}
175-
: undefined
174+
if (probe.captureSnapshot) {
175+
result.return = {
176+
arguments: {
177+
...captureFields(args, probe.capture, captureCtx),
178+
this: capture(self, probe.capture, captureCtx),
179+
},
180+
locals: {
181+
...captureFields(locals, probe.capture, captureCtx),
182+
'@return': capture(value, probe.capture, captureCtx),
183+
},
184+
}
185+
if (captureCtx.timedOut) {
186+
continue
187+
}
188+
}
176189

177190
sendDebuggerSnapshot(probe, result)
178191
}
@@ -190,6 +203,7 @@ export function onReturn(
190203
*/
191204
export function onThrow(probes: InitializedProbe[], error: Error, self: any, args: Record<string, any>): void {
192205
const end = performance.now()
206+
const captureCtx: CaptureContext = { deadline: performance.now() + SNAPSHOT_TIMEOUT_MS, timedOut: false }
193207

194208
// TODO: A lot of repeated work performed for each probe that could be shared between probes
195209
for (const probe of probes) {
@@ -225,13 +239,19 @@ export function onThrow(probes: InitializedProbe[], error: Error, self: any, arg
225239
result.message = evaluateProbeMessage(probe, context)
226240
}
227241

242+
let throwArguments: Record<string, any> | undefined
243+
if (probe.captureSnapshot) {
244+
throwArguments = {
245+
...captureFields(args, probe.capture, captureCtx),
246+
this: capture(self, probe.capture, captureCtx),
247+
}
248+
if (captureCtx.timedOut) {
249+
continue
250+
}
251+
}
252+
228253
result.return = {
229-
arguments: probe.captureSnapshot
230-
? {
231-
...captureFields(args, probe.capture),
232-
this: capture(self, probe.capture),
233-
}
234-
: undefined,
254+
arguments: throwArguments,
235255
throwable: {
236256
message: error.message,
237257
stacktrace: parseStackTrace(error),

0 commit comments

Comments
 (0)