Skip to content

Commit da979a7

Browse files
fix(lifecycle): release invocation context after platform report to prevent memory leak
Contexts added to ContextBuffer on every invocation were never removed after processing, causing unbounded memory growth across warm invocations. The growth was most visible when DD_CAPTURE_LAMBDA_PAYLOAD=true with large response payloads (issue #1049), but affects all invocations.

Remove the context at the end of on_platform_report, which is the last point in the lifecycle where downstream code still reads context fields (runtime_duration_ms for post-runtime metrics, enhanced_metric_data for network/CPU metrics). Both on-demand and managed instance paths are fixed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent de0e7a0 commit da979a7

File tree

2 files changed

+87
-9
lines changed

2 files changed

+87
-9
lines changed

bottlecap/Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bottlecap/src/lifecycle/invocation/processor.rs

Lines changed: 86 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -768,6 +768,14 @@ impl Processor {
768768
self.enhanced_metrics
769769
.set_cpu_time_enhanced_metrics(offsets.cpu_offset.clone());
770770
}
771+
772+
// Release the context now that all processing for this invocation is complete.
773+
// This prevents unbounded memory growth across warm invocations.
774+
self.context_buffer.remove(request_id);
775+
debug!(
776+
"Context released (buffer size after remove: {})",
777+
self.context_buffer.size()
778+
);
771779
}
772780

773781
/// Handles Managed Instance mode platform report processing.
@@ -1609,8 +1617,7 @@ mod tests {
16091617
duration_ms,
16101618
status,
16111619
error_type,
1612-
should_have_context_after,
1613-
): (&str, bool, f64, Status, Option<String>, bool) = $value;
1620+
): (&str, bool, f64, Status, Option<String>) = $value;
16141621

16151622
let mut processor = setup();
16161623

@@ -1652,6 +1659,17 @@ mod tests {
16521659
stats_generator: Arc::new(StatsGenerator::new(stats_concentrator_handle)),
16531660
});
16541661

1662+
// Verify context state before on_platform_report
1663+
let request_id_string_for_get = request_id.to_string();
1664+
assert_eq!(
1665+
processor.context_buffer.get(&request_id_string_for_get).is_some(),
1666+
// Use setup_context because it dictates whether the request is handled up front,
1667+
// which in turn signals whether the request is valid/processed.
1668+
setup_context,
1669+
"Context existence mismatch for request_id: {}",
1670+
request_id
1671+
);
1672+
16551673
// Call on_platform_report
16561674
let request_id_string = request_id.to_string();
16571675
processor.on_platform_report(
@@ -1669,7 +1687,7 @@ mod tests {
16691687
let request_id_string_for_get = request_id.to_string();
16701688
assert_eq!(
16711689
processor.context_buffer.get(&request_id_string_for_get).is_some(),
1672-
should_have_context_after,
1690+
false,
16731691
"Context existence mismatch for request_id: {}",
16741692
request_id
16751693
);
@@ -1679,14 +1697,13 @@ mod tests {
16791697
}
16801698

16811699
platform_report_managed_instance_tests! {
1682-
// (request_id, setup_context, duration_ms, status, error_type, should_have_context_after)
1700+
// (request_id, setup_context, duration_ms, status, error_type)
16831701
test_on_platform_report_managed_instance_mode_with_valid_context: (
16841702
"test-request-id",
16851703
true, // setup context
16861704
123.45,
16871705
Status::Success,
16881706
None,
1689-
true, // context should still exist
16901707
),
16911708

16921709
test_on_platform_report_managed_instance_mode_without_context: (
@@ -1695,7 +1712,6 @@ mod tests {
16951712
123.45,
16961713
Status::Success,
16971714
None,
1698-
false, // context should not exist
16991715
),
17001716

17011717
test_on_platform_report_managed_instance_mode_with_error_status: (
@@ -1704,7 +1720,6 @@ mod tests {
17041720
200.0,
17051721
Status::Error,
17061722
Some("RuntimeError".to_string()),
1707-
true, // context should still exist
17081723
),
17091724

17101725
test_on_platform_report_managed_instance_mode_with_timeout: (
@@ -1713,10 +1728,73 @@ mod tests {
17131728
30000.0,
17141729
Status::Timeout,
17151730
None,
1716-
true, // context should still exist
17171731
),
17181732
}
17191733

1734+
#[tokio::test]
1735+
async fn test_context_removed_after_on_platform_report_on_demand() {
1736+
use libdd_trace_obfuscation::obfuscation_config::ObfuscationConfig;
1737+
1738+
let mut p = setup();
1739+
let request_id = String::from("test-request-id");
1740+
1741+
p.on_invoke_event(request_id.clone());
1742+
let start_time = chrono::Utc::now();
1743+
p.on_platform_start(request_id.clone(), start_time);
1744+
assert!(
1745+
p.context_buffer.get(&request_id).is_some(),
1746+
"context must exist before report"
1747+
);
1748+
1749+
let config = Arc::new(config::Config {
1750+
service: Some("test-service".to_string()),
1751+
..config::Config::default()
1752+
});
1753+
let tags_provider = Arc::new(provider::Provider::new(
1754+
Arc::clone(&config),
1755+
LAMBDA_RUNTIME_SLUG.to_string(),
1756+
&HashMap::from([("function_arn".to_string(), "test-arn".to_string())]),
1757+
));
1758+
let (stats_concentrator_service, stats_concentrator_handle) =
1759+
StatsConcentratorService::new(Arc::clone(&config));
1760+
tokio::spawn(stats_concentrator_service.run());
1761+
let trace_sender = Arc::new(SendingTraceProcessor {
1762+
appsec: None,
1763+
processor: Arc::new(trace_processor::ServerlessTraceProcessor {
1764+
obfuscation_config: Arc::new(
1765+
ObfuscationConfig::new().expect("Failed to create ObfuscationConfig"),
1766+
),
1767+
}),
1768+
trace_tx: tokio::sync::mpsc::channel(1).0,
1769+
stats_generator: Arc::new(StatsGenerator::new(stats_concentrator_handle)),
1770+
});
1771+
1772+
p.on_platform_report(
1773+
&request_id,
1774+
ReportMetrics::OnDemand(OnDemandReportMetrics {
1775+
duration_ms: 123.45,
1776+
billed_duration_ms: 124,
1777+
memory_size_mb: 256,
1778+
max_memory_used_mb: 128,
1779+
init_duration_ms: None,
1780+
restore_duration_ms: None,
1781+
}),
1782+
chrono::Utc::now().timestamp(),
1783+
Status::Success,
1784+
None,
1785+
None,
1786+
tags_provider,
1787+
trace_sender,
1788+
)
1789+
.await;
1790+
1791+
assert!(
1792+
p.context_buffer.get(&request_id).is_none(),
1793+
"context must be removed after on_platform_report completes"
1794+
);
1795+
assert_eq!(p.context_buffer.size(), 0);
1796+
}
1797+
17201798
#[tokio::test]
17211799
async fn test_on_platform_init_start_sets_durable_function_tag() {
17221800
let mut processor = setup();

0 commit comments

Comments
 (0)