Skip to content

Commit 8b6704c

Browse files
committed
[SVLS-8230] Fix SnapStart cold_start tag using restore_time
SnapStart restore invocations were misclassified as proactive_initialization because the time elapsed since sandbox_init_time (recorded at snapshot creation) always exceeded the 10s threshold. Fix this by recording restore_time when PlatformRestoreStart telemetry is received and using it, instead of sandbox_init_time, for proactive-initialization detection in SnapStart functions. When restore_time is None (telemetry not yet delivered), assume a cold start, since the restore and the invocation happened close together. Ticket: https://datadoghq.atlassian.net/browse/SVLS-8230
1 parent b67655d commit 8b6704c

3 files changed

Lines changed: 81 additions & 5 deletions

File tree

bottlecap/src/config/aws.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
use std::env;
22
use tokio::time::Instant;
33

4+
use crate::tags::lambda::tags::SNAP_START_VALUE;
5+
46
const AWS_DEFAULT_REGION: &str = "AWS_DEFAULT_REGION";
57
const AWS_ACCESS_KEY_ID: &str = "AWS_ACCESS_KEY_ID";
68
const AWS_SECRET_ACCESS_KEY: &str = "AWS_SECRET_ACCESS_KEY";
@@ -46,6 +48,11 @@ impl AwsConfig {
4648
self.initialization_type
4749
.eq(LAMBDA_MANAGED_INSTANCES_INIT_TYPE)
4850
}
51+
52+
#[must_use]
53+
pub fn is_snapstart(&self) -> bool {
54+
self.initialization_type.eq(SNAP_START_VALUE)
55+
}
4956
}
5057

5158
#[allow(clippy::module_name_repetitions)]

bottlecap/src/lifecycle/invocation/processor.rs

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ use crate::{
4949
pub const MS_TO_NS: f64 = 1_000_000.0;
5050
pub const S_TO_MS: u64 = 1_000;
5151
pub const S_TO_NS: f64 = 1_000_000_000.0;
52+
/// Threshold, in milliseconds, for classifying a Lambda cold start as proactive initialization.
53+
///
54+
/// Proactive initialization is a Lambda optimization where the runtime pre-initializes
55+
/// a sandbox before any invocation is scheduled, to reduce cold start latency for future
56+
/// requests. When the gap between sandbox init and the first invocation exceeds this
57+
/// threshold, Lambda initialized the sandbox speculatively rather than in response to an
58+
/// incoming request — making it a proactive initialization rather than a true cold start.
5259
pub const PROACTIVE_INITIALIZATION_THRESHOLD_MS: u64 = 10_000;
5360

5461
pub const DATADOG_INVOCATION_ERROR_MESSAGE_KEY: &str = "x-datadog-invocation-error-msg";
@@ -96,6 +103,8 @@ pub struct Processor {
96103
/// logs agent. Decouples the trace agent from the logs agent: the trace agent sends spans
97104
/// to the lifecycle processor, which extracts durable context and relays it here.
98105
durable_context_tx: mpsc::Sender<DurableContextUpdate>,
106+
/// Instant at which the `SnapStart` `PlatformRestoreStart` telemetry event was received; `None` until then.
107+
restore_time: Option<Instant>,
99108
}
100109

101110
impl Processor {
@@ -137,6 +146,7 @@ impl Processor {
137146
active_invocations: 0,
138147
awaiting_first_invocation: false,
139148
durable_context_tx,
149+
restore_time: None,
140150
}
141151
}
142152

@@ -252,12 +262,35 @@ impl Processor {
252262

253263
// If it's empty, this is the first invocation: either a cold start or a proactive initialization
254264
if self.context_buffer.is_empty() {
255-
let now = Instant::now();
256-
let time_since_sandbox_init = now.duration_since(self.aws_config.sandbox_init_time);
257-
if time_since_sandbox_init.as_millis() > PROACTIVE_INITIALIZATION_THRESHOLD_MS.into() {
258-
proactive_initialization = true;
265+
if self.aws_config.is_snapstart() {
266+
match self.restore_time {
267+
None => {
268+
// PlatformRestoreStart hasn't arrived yet — restore and invoke
269+
// happened close together, so this is a cold start (not proactive).
270+
cold_start = true;
271+
}
272+
Some(restore_time) => {
273+
let now = Instant::now();
274+
let time_since_restore = now.duration_since(restore_time);
275+
if time_since_restore.as_millis()
276+
> PROACTIVE_INITIALIZATION_THRESHOLD_MS.into()
277+
{
278+
proactive_initialization = true;
279+
} else {
280+
cold_start = true;
281+
}
282+
}
283+
}
259284
} else {
260-
cold_start = true;
285+
let now = Instant::now();
286+
let time_since_sandbox_init = now.duration_since(self.aws_config.sandbox_init_time);
287+
if time_since_sandbox_init.as_millis()
288+
> PROACTIVE_INITIALIZATION_THRESHOLD_MS.into()
289+
{
290+
proactive_initialization = true;
291+
} else {
292+
cold_start = true;
293+
}
261294
}
262295

263296
// Resolve runtime only once
@@ -383,6 +416,8 @@ impl Processor {
383416
/// This is used to create a `snapstart_restore` span, since this telemetry event does not
384417
/// provide a `request_id`, we try to guess which invocation is the restore similar to init.
385418
pub fn on_platform_restore_start(&mut self, time: DateTime<Utc>) {
419+
self.restore_time = Some(Instant::now());
420+
386421
let start_time: i64 = SystemTime::from(time)
387422
.duration_since(UNIX_EPOCH)
388423
.expect("time went backwards")

integration-tests/tests/snapstart.test.ts

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,23 @@ describe('Snapstart Integration Tests', () => {
101101
);
102102
expect(coldStartSpan).toBeUndefined();
103103
});
104+
105+
it('should have aws.lambda span with cold_start=true', () => {
106+
const result = getRestoreInvocation();
107+
expect(result).toBeDefined();
108+
const trace = result.traces![0];
109+
const awsLambdaSpan = trace.spans.find((span: any) =>
110+
span.attributes.operation_name === 'aws.lambda'
111+
);
112+
expect(awsLambdaSpan).toBeDefined();
113+
expect(awsLambdaSpan).toMatchObject({
114+
attributes: {
115+
custom: {
116+
cold_start: 'true'
117+
}
118+
}
119+
});
120+
});
104121
});
105122

106123
describe('second invocation (warm)', () => {
@@ -146,6 +163,23 @@ describe('Snapstart Integration Tests', () => {
146163
);
147164
expect(coldStartSpan).toBeUndefined();
148165
});
166+
167+
it('should have aws.lambda span with cold_start=false', () => {
168+
const result = getWarmInvocation();
169+
expect(result).toBeDefined();
170+
const trace = result.traces![0];
171+
const awsLambdaSpan = trace.spans.find((span: any) =>
172+
span.attributes.operation_name === 'aws.lambda'
173+
);
174+
expect(awsLambdaSpan).toBeDefined();
175+
expect(awsLambdaSpan).toMatchObject({
176+
attributes: {
177+
custom: {
178+
cold_start: 'false'
179+
}
180+
}
181+
});
182+
});
149183
});
150184

151185
describe('trace isolation', () => {

0 commit comments

Comments
 (0)