Skip to content

Commit b67655d

Browse files
fix: [SLES-2810] prune sorted_reparenting_info on context release to stop warning flood (#1161)
## Summary

- **Root cause:** After `on_platform_report` removes a context from `context_buffer`, the corresponding `ReparentingInfo` entry was left in `sorted_reparenting_info` indefinitely (capacity 500). Every subsequent trace batch caused `update_reparenting` to iterate over all stale entries and emit a `WARN` for each one, producing a flood of `"Mismatched request info. Context not found for request_id"` messages in CloudWatch.
- **Fix:** Call `sorted_reparenting_info.retain(...)` immediately after `context_buffer.remove(request_id)` in `on_platform_report` to prune the completed invocation's entry.
- **Tests:** Two regression tests added — one verifying that the entry is pruned after `on_platform_report`, and one reproducing the exact production sequence (invoke → add_reparenting → report → trace batch) to confirm that stale entries no longer appear in `get_reparenting_info()`.

Fixes https://datadoghq.atlassian.net/browse/SLES-2810

## Test plan

- [x] `cargo test test_reparenting_info_pruned_after_on_platform_report` passes
- [x] `cargo test test_update_reparenting_ignores_completed_invocations` passes
- [x] Full lifecycle test suite: 218 passed, 0 failed

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: tianning.li <tianning.li@datadoghq.com>
1 parent ce3c8e3 commit b67655d

1 file changed

Lines changed: 134 additions & 0 deletions

File tree

bottlecap/src/lifecycle/invocation/processor.rs

Lines changed: 134 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -793,6 +793,11 @@ impl Processor {
793793
// Release the context now that all processing for this invocation is complete.
794794
// This prevents unbounded memory growth across warm invocations.
795795
self.context_buffer.remove(request_id);
796+
// Prune the corresponding reparenting entry so that update_reparenting does not
797+
// warn about a missing context for already-completed invocations.
798+
self.context_buffer
799+
.sorted_reparenting_info
800+
.retain(|info| info.request_id != *request_id);
796801
trace!(
797802
"Context released (buffer size after remove: {})",
798803
self.context_buffer.size()
@@ -2266,4 +2271,133 @@ mod tests {
22662271
);
22672272
}
22682273
}
2274+
2275+
// Test helper: builds an `Arc<SendingTraceProcessor>` from the given config so tests can
// call `on_platform_report`, which takes a trace sender argument.
// It calls `tokio::spawn` to run the stats concentrator service, so it must be invoked
// from within a Tokio runtime (e.g. inside a `#[tokio::test]`).
fn make_trace_sender(config: Arc<config::Config>) -> Arc<SendingTraceProcessor> {
2276+
use libdd_trace_obfuscation::obfuscation_config::ObfuscationConfig;
2277+
let (stats_concentrator_service, stats_concentrator_handle) =
2278+
StatsConcentratorService::new(Arc::clone(&config));
2279+
tokio::spawn(stats_concentrator_service.run());
2280+
Arc::new(SendingTraceProcessor {
2281+
appsec: None,
2282+
processor: Arc::new(trace_processor::ServerlessTraceProcessor {
2283+
obfuscation_config: Arc::new(
2284+
ObfuscationConfig::new().expect("Failed to create ObfuscationConfig"),
2285+
),
2286+
}),
2287+
// Only the sender half of the channel is kept; the receiver half is dropped right here.
trace_tx: tokio::sync::mpsc::channel(1).0,
2288+
stats_generator: Arc::new(StatsGenerator::new(stats_concentrator_handle)),
2289+
})
2290+
}
2291+
2292+
// Regression test for SLES-2810: sorted_reparenting_info must be pruned when the
2293+
// context is released in on_platform_report, so that update_reparenting does not
2294+
// emit spurious warnings for already-completed invocations.
2295+
#[tokio::test]
2296+
async fn test_reparenting_info_pruned_after_on_platform_report() {
2297+
let mut p = setup();
2298+
let request_id = String::from("test-request-id");
2299+
2300+
// Arrange: start an invocation and register exactly one reparenting entry for it.
p.on_invoke_event(request_id.clone());
2301+
p.add_reparenting(request_id.clone(), 42, 0);
2302+
assert_eq!(
2303+
p.context_buffer.sorted_reparenting_info.len(),
2304+
1,
2305+
"reparenting entry must exist before report"
2306+
);
2307+
2308+
// Collaborators that on_platform_report takes as arguments.
let config = Arc::new(config::Config::default());
2309+
let tags_provider = Arc::new(provider::Provider::new(
2310+
Arc::clone(&config),
2311+
LAMBDA_RUNTIME_SLUG.to_string(),
2312+
&HashMap::from([("function_arn".to_string(), "test-arn".to_string())]),
2313+
));
2314+
let trace_sender = make_trace_sender(config);
2315+
2316+
// Act: report the invocation for the same request id; this is the call that is
// expected to prune the reparenting entry.
p.on_platform_report(
2317+
&request_id,
2318+
ReportMetrics::OnDemand(OnDemandReportMetrics {
2319+
duration_ms: 10.0,
2320+
billed_duration_ms: 11,
2321+
memory_size_mb: 128,
2322+
max_memory_used_mb: 64,
2323+
init_duration_ms: None,
2324+
restore_duration_ms: None,
2325+
}),
2326+
chrono::Utc::now().timestamp(),
2327+
Status::Success,
2328+
None,
2329+
None,
2330+
tags_provider,
2331+
trace_sender,
2332+
)
2333+
.await;
2334+
2335+
// Assert: the only entry (the one added above) must be gone.
assert!(
2336+
p.context_buffer.sorted_reparenting_info.is_empty(),
2337+
"reparenting entry must be pruned after on_platform_report"
2338+
);
2339+
}
2340+
2341+
// Regression test for SLES-2810: update_reparenting must not visit stale entries
2342+
// whose context has already been released, preventing the warning flood.
2343+
#[tokio::test]
2344+
async fn test_update_reparenting_ignores_completed_invocations() {
2345+
let mut p = setup();
2346+
2347+
// Invocation A completes fully.
2348+
let request_id_a = String::from("request-a");
2349+
p.on_invoke_event(request_id_a.clone());
2350+
p.add_reparenting(request_id_a.clone(), 11, 0);
2351+
2352+
// Collaborators that on_platform_report takes as arguments.
let config = Arc::new(config::Config::default());
2353+
let tags_provider = Arc::new(provider::Provider::new(
2354+
Arc::clone(&config),
2355+
LAMBDA_RUNTIME_SLUG.to_string(),
2356+
&HashMap::from([("function_arn".to_string(), "test-arn".to_string())]),
2357+
));
2358+
let trace_sender = make_trace_sender(Arc::clone(&config));
2359+
2360+
// Reporting A is the step expected to remove A's entry from the reparenting info.
p.on_platform_report(
2361+
&request_id_a,
2362+
ReportMetrics::OnDemand(OnDemandReportMetrics {
2363+
duration_ms: 10.0,
2364+
billed_duration_ms: 11,
2365+
memory_size_mb: 128,
2366+
max_memory_used_mb: 64,
2367+
init_duration_ms: None,
2368+
restore_duration_ms: None,
2369+
}),
2370+
chrono::Utc::now().timestamp(),
2371+
Status::Success,
2372+
None,
2373+
None,
2374+
tags_provider,
2375+
trace_sender,
2376+
)
2377+
.await;
2378+
2379+
// Invocation B is in-flight (context still live).
2380+
let request_id_b = String::from("request-b");
2381+
p.on_invoke_event(request_id_b.clone());
2382+
p.add_reparenting(request_id_b.clone(), 22, 0);
2383+
2384+
// Simulate the trace agent path: clone reparenting_info, then call update_reparenting.
2385+
// Before the fix this clone contained the stale entry for request-a, causing a warning.
2386+
let reparenting_info = p.get_reparenting_info();
2387+
2388+
// Only the live invocation B should be present.
2389+
assert_eq!(
2390+
reparenting_info.len(),
2391+
1,
2392+
"only the in-flight invocation must remain in reparenting_info"
2393+
);
2394+
// Safe to index: the length was asserted to be 1 just above.
assert_eq!(reparenting_info[0].request_id, request_id_b);
2395+
2396+
// update_reparenting must return no contexts to send (span IDs still unset).
2397+
let ctx_to_send = p.update_reparenting(reparenting_info);
2398+
assert!(
2399+
ctx_to_send.is_empty(),
2400+
"no contexts should be ready to send yet"
2401+
);
2402+
}
22692403
}

0 commit comments

Comments
 (0)