Skip to content

Commit 34d65c0

Browse files
authored
[Live Debugger] Add 10ms snapshot capture timeout circuit breaker (#4512)
Co-authored-by: thomas.watson <thomas.watson@datadoghq.com>
1 parent a140b07 commit 34d65c0

4 files changed

Lines changed: 553 additions & 118 deletions

File tree

packages/debugger/src/domain/api.spec.ts

Lines changed: 240 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,246 @@ describe('api', () => {
647647
})
648648
})
649649

650+
describe('snapshot timeout', () => {
651+
function createSnapshotProbe(methodName: string): Probe {
652+
return {
653+
id: `timeout-probe-${methodName}`,
654+
version: 0,
655+
type: 'LOG_PROBE',
656+
where: { typeName: 'TestClass', methodName },
657+
template: 'Test',
658+
captureSnapshot: true,
659+
capture: { maxReferenceDepth: 3 },
660+
sampling: { snapshotsPerSecond: 5000 },
661+
evaluateAt: 'ENTRY',
662+
}
663+
}
664+
665+
it('should drop snapshot when entry capture exceeds timeout', () => {
666+
const probe = createSnapshotProbe('entryTimeout')
667+
addProbe(probe)
668+
669+
let callCount = 0
670+
const realNow = performance.now.bind(performance)
671+
spyOn(performance, 'now').and.callFake(() => {
672+
callCount++
673+
// Let the first few calls (start time, deadline creation) use real time,
674+
// then jump past the deadline to simulate slow capture.
675+
if (callCount <= 3) {
676+
return realNow()
677+
}
678+
return realNow() + 20
679+
})
680+
681+
const probes = getProbes('TestClass;entryTimeout')!
682+
const deepObj = { level1: { level2: { level3: { level4: 'deep' } } } }
683+
onEntry(probes, {}, { arg: deepObj })
684+
onReturn(probes, null, {}, { arg: deepObj }, {})
685+
686+
// The entry capture timed out, so onEntry pushed null.
687+
// onReturn still gets an active entry from its own onEntry call, but
688+
// the entry snapshot is dropped. The return capture has its own timeout.
689+
// Since performance.now is still returning future values, the return
690+
// capture also times out and no snapshot is sent.
691+
expect(mockBatchAdd).not.toHaveBeenCalled()
692+
})
693+
694+
it('should drop snapshot when return capture exceeds timeout', () => {
695+
const probe = createSnapshotProbe('returnTimeout')
696+
addProbe(probe)
697+
698+
const probes = getProbes('TestClass;returnTimeout')!
699+
700+
// Let onEntry succeed with real time
701+
onEntry(probes, {}, { x: 1 })
702+
703+
// Now make performance.now jump forward so the return capture times out
704+
let callCount = 0
705+
const realNow = performance.now.bind(performance)
706+
spyOn(performance, 'now').and.callFake(() => {
707+
callCount++
708+
if (callCount <= 2) {
709+
return realNow()
710+
}
711+
return realNow() + 20
712+
})
713+
714+
onReturn(probes, null, {}, { x: 1 }, { local: 'value' })
715+
716+
expect(mockBatchAdd).not.toHaveBeenCalled()
717+
})
718+
719+
it('should drop snapshot when throw capture exceeds timeout', () => {
720+
const probe = createSnapshotProbe('throwTimeout')
721+
addProbe(probe)
722+
723+
const probes = getProbes('TestClass;throwTimeout')!
724+
725+
// Let onEntry succeed with real time
726+
onEntry(probes, {}, { x: 1 })
727+
728+
// Now make performance.now jump forward so the throw capture times out
729+
let callCount = 0
730+
const realNow = performance.now.bind(performance)
731+
spyOn(performance, 'now').and.callFake(() => {
732+
callCount++
733+
if (callCount <= 2) {
734+
return realNow()
735+
}
736+
return realNow() + 20
737+
})
738+
739+
onThrow(probes, new Error('test'), {}, { x: 1 })
740+
741+
expect(mockBatchAdd).not.toHaveBeenCalled()
742+
})
743+
744+
it('should not affect non-snapshot probes', () => {
745+
const probe: Probe = {
746+
id: 'non-snapshot-timeout',
747+
version: 0,
748+
type: 'LOG_PROBE',
749+
where: { typeName: 'TestClass', methodName: 'nonSnapshot' },
750+
template: 'Test',
751+
captureSnapshot: false,
752+
capture: {},
753+
sampling: { snapshotsPerSecond: 5000 },
754+
evaluateAt: 'ENTRY',
755+
}
756+
addProbe(probe)
757+
758+
// Spike performance.now to simulate slow execution
759+
let callCount = 0
760+
const realNow = performance.now.bind(performance)
761+
spyOn(performance, 'now').and.callFake(() => {
762+
callCount++
763+
if (callCount <= 2) {
764+
return realNow()
765+
}
766+
return realNow() + 20
767+
})
768+
769+
const probes = getProbes('TestClass;nonSnapshot')!
770+
onEntry(probes, {}, {})
771+
onReturn(probes, null, {}, {}, {})
772+
773+
expect(mockBatchAdd).toHaveBeenCalledTimes(1)
774+
})
775+
776+
it('should not leak active entries when entry capture times out', () => {
777+
const probe = createSnapshotProbe('entryLeakTest')
778+
addProbe(probe)
779+
780+
let shouldTimeout = true
781+
let callCount = 0
782+
const realNow = performance.now.bind(performance)
783+
spyOn(performance, 'now').and.callFake(() => {
784+
callCount++
785+
if (!shouldTimeout || callCount <= 3) {
786+
return realNow()
787+
}
788+
return realNow() + 20
789+
})
790+
791+
const probes = getProbes('TestClass;entryLeakTest')!
792+
// This onEntry will time out and push null
793+
onEntry(probes, {}, { x: 1 })
794+
795+
// onReturn should handle the null entry gracefully (no snapshot sent)
796+
shouldTimeout = false
797+
callCount = 0
798+
onReturn(probes, null, {}, { x: 1 }, {})
799+
800+
expect(mockBatchAdd).not.toHaveBeenCalled()
801+
})
802+
803+
it('should skip subsequent snapshot probes after timeout but still process non-snapshot probes', () => {
804+
const snapshotProbe1: Probe = {
805+
id: 'timeout-shared-1',
806+
version: 0,
807+
type: 'LOG_PROBE',
808+
where: { typeName: 'TestClass', methodName: 'sharedDeadline' },
809+
template: 'Snapshot probe',
810+
captureSnapshot: true,
811+
capture: { maxReferenceDepth: 3 },
812+
sampling: { snapshotsPerSecond: 5000 },
813+
evaluateAt: 'ENTRY',
814+
}
815+
const nonSnapshotProbe: Probe = {
816+
id: 'timeout-shared-2',
817+
version: 0,
818+
type: 'LOG_PROBE',
819+
where: { typeName: 'TestClass', methodName: 'sharedDeadline' },
820+
template: 'Non-snapshot probe',
821+
captureSnapshot: false,
822+
capture: {},
823+
sampling: { snapshotsPerSecond: 5000 },
824+
evaluateAt: 'ENTRY',
825+
}
826+
const snapshotProbe2: Probe = {
827+
id: 'timeout-shared-3',
828+
version: 0,
829+
type: 'LOG_PROBE',
830+
where: { typeName: 'TestClass', methodName: 'sharedDeadline' },
831+
template: 'Second snapshot probe',
832+
captureSnapshot: true,
833+
capture: { maxReferenceDepth: 3 },
834+
sampling: { snapshotsPerSecond: 5000 },
835+
evaluateAt: 'ENTRY',
836+
}
837+
addProbe(snapshotProbe1)
838+
addProbe(nonSnapshotProbe)
839+
addProbe(snapshotProbe2)
840+
841+
let callCount = 0
842+
const realNow = performance.now.bind(performance)
843+
spyOn(performance, 'now').and.callFake(() => {
844+
callCount++
845+
if (callCount <= 3) {
846+
return realNow()
847+
}
848+
return realNow() + 20
849+
})
850+
851+
const probes = getProbes('TestClass;sharedDeadline')!
852+
onEntry(probes, {}, { x: 1 })
853+
onReturn(probes, null, {}, { x: 1 }, {})
854+
855+
// The non-snapshot probe should still send, but both snapshot probes should be dropped
856+
const calls = mockBatchAdd.calls.allArgs()
857+
expect(calls.length).toBe(1)
858+
expect(calls[0][0].message).toBe('Non-snapshot probe')
859+
})
860+
861+
it('should share deadline across probes so second snapshot probe exits immediately', () => {
862+
const probe1 = createSnapshotProbe('sharedDeadline1')
863+
const probe2: Probe = {
864+
...createSnapshotProbe('sharedDeadline2'),
865+
id: 'timeout-probe-sharedDeadline2',
866+
where: { typeName: 'TestClass', methodName: 'sharedDeadline1' },
867+
}
868+
addProbe(probe1)
869+
addProbe(probe2)
870+
871+
let callCount = 0
872+
const realNow = performance.now.bind(performance)
873+
spyOn(performance, 'now').and.callFake(() => {
874+
callCount++
875+
if (callCount <= 3) {
876+
return realNow()
877+
}
878+
return realNow() + 20
879+
})
880+
881+
const probes = getProbes('TestClass;sharedDeadline1')!
882+
onEntry(probes, {}, { x: 1 })
883+
onReturn(probes, null, {}, { x: 1 }, {})
884+
885+
// Both snapshot probes share the deadline -- neither should send
886+
expect(mockBatchAdd).not.toHaveBeenCalled()
887+
})
888+
})
889+
650890
describe('error handling', () => {
651891
it('should handle missing DD_RUM gracefully', () => {
652892
delete (window as any).DD_RUM

packages/debugger/src/domain/api.ts

Lines changed: 46 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import type { Batch, Context } from '@datadog/browser-core'
33
import { timeStampNow, display, buildTag, generateUUID, getGlobalObject } from '@datadog/browser-core'
44
import type { BrowserWindow, DebuggerInitConfiguration } from '../entries/main'
55
import { capture, captureFields } from './capture'
6+
import type { CaptureContext } from './capture'
67
import type { InitializedProbe } from './probes'
78
import { checkGlobalSnapshotBudget, resetProbeBudgetConfiguration, setProbeBudgetConfiguration } from './probes'
89
import type { ActiveEntry } from './activeEntries'
@@ -15,6 +16,8 @@ const globalObj = getGlobalObject<BrowserWindow>() // eslint-disable-line local-
1516

1617
const threadName = detectThreadName() // eslint-disable-line local-rules/disallow-side-effects
1718

19+
const SNAPSHOT_TIMEOUT_MS = 10
20+
1821
let debuggerBatch: Batch | undefined
1922
let debuggerConfig: DebuggerInitConfiguration | undefined
2023
let cachedDDtags: string | undefined
@@ -43,6 +46,7 @@ export function resetDebuggerTransport(): void {
4346
*/
4447
export function onEntry(probes: InitializedProbe[], self: any, args: Record<string, any>): void {
4548
const start = performance.now()
49+
const captureCtx: CaptureContext = { deadline: start + SNAPSHOT_TIMEOUT_MS, timedOut: false }
4650

4751
// TODO: A lot of repeated work performed for each probe that could be shared between probes
4852
for (const probe of probes) {
@@ -83,14 +87,19 @@ export function onEntry(probes: InitializedProbe[], self: any, args: Record<stri
8387

8488
// Special case for evaluateAt=EXIT with a condition: we only capture the return snapshot
8589
const shouldCaptureEntrySnapshot = probe.captureSnapshot && (probe.evaluateAt === 'ENTRY' || !probe.condition)
86-
const entry = shouldCaptureEntrySnapshot
87-
? {
88-
arguments: {
89-
...captureFields(args, probe.capture),
90-
this: capture(self, probe.capture),
91-
},
92-
}
93-
: undefined
90+
let entry: { arguments: Record<string, any> } | undefined
91+
if (shouldCaptureEntrySnapshot) {
92+
entry = {
93+
arguments: {
94+
...captureFields(args, probe.capture, captureCtx),
95+
this: capture(self, probe.capture, captureCtx),
96+
},
97+
}
98+
if (captureCtx.timedOut) {
99+
stack.push(null)
100+
continue
101+
}
102+
}
94103

95104
stack.push({
96105
start,
@@ -120,6 +129,7 @@ export function onReturn(
120129
locals: Record<string, any>
121130
): any {
122131
const end = performance.now()
132+
const captureCtx: CaptureContext = { deadline: performance.now() + SNAPSHOT_TIMEOUT_MS, timedOut: false }
123133

124134
// TODO: A lot of repeated work performed for each probe that could be shared between probes
125135
for (const probe of probes) {
@@ -155,18 +165,21 @@ export function onReturn(
155165
result.message = evaluateProbeMessage(probe, context)
156166
}
157167

158-
result.return = probe.captureSnapshot
159-
? {
160-
arguments: {
161-
...captureFields(args, probe.capture),
162-
this: capture(self, probe.capture),
163-
},
164-
locals: {
165-
...captureFields(locals, probe.capture),
166-
'@return': capture(value, probe.capture),
167-
},
168-
}
169-
: undefined
168+
if (probe.captureSnapshot) {
169+
result.return = {
170+
arguments: {
171+
...captureFields(args, probe.capture, captureCtx),
172+
this: capture(self, probe.capture, captureCtx),
173+
},
174+
locals: {
175+
...captureFields(locals, probe.capture, captureCtx),
176+
'@return': capture(value, probe.capture, captureCtx),
177+
},
178+
}
179+
if (captureCtx.timedOut) {
180+
continue
181+
}
182+
}
170183

171184
sendDebuggerSnapshot(probe, result)
172185
}
@@ -184,6 +197,7 @@ export function onReturn(
184197
*/
185198
export function onThrow(probes: InitializedProbe[], error: Error, self: any, args: Record<string, any>): void {
186199
const end = performance.now()
200+
const captureCtx: CaptureContext = { deadline: performance.now() + SNAPSHOT_TIMEOUT_MS, timedOut: false }
187201

188202
// TODO: A lot of repeated work performed for each probe that could be shared between probes
189203
for (const probe of probes) {
@@ -219,13 +233,19 @@ export function onThrow(probes: InitializedProbe[], error: Error, self: any, arg
219233
result.message = evaluateProbeMessage(probe, context)
220234
}
221235

236+
let throwArguments: Record<string, any> | undefined
237+
if (probe.captureSnapshot) {
238+
throwArguments = {
239+
...captureFields(args, probe.capture, captureCtx),
240+
this: capture(self, probe.capture, captureCtx),
241+
}
242+
if (captureCtx.timedOut) {
243+
continue
244+
}
245+
}
246+
222247
result.return = {
223-
arguments: probe.captureSnapshot
224-
? {
225-
...captureFields(args, probe.capture),
226-
this: capture(self, probe.capture),
227-
}
228-
: undefined,
248+
arguments: throwArguments,
229249
throwable: {
230250
message: error.message,
231251
stacktrace: parseStackTrace(error),

0 commit comments

Comments
 (0)