Skip to content

Commit db59a28

Browse files
fix(lifecycle): release invocation context after platform report to prevent memory leak (#1050)
JIRA: https://datadoghq.atlassian.net/browse/SVLS-8625 Git issue reported: #1049 ## What does this PR do? Fixes a memory leak in the extension where `ContextBuffer` accumulated `Context` objects indefinitely across warm Lambda invocations. **Root cause:** `on_platform_report()` reads the `Context` for runtime duration and CPU/network enhanced metrics, but never called `context_buffer.remove()` afterwards. Each warm invocation appended a new `Context` to the buffer; none were ever freed. **Fix:** Call `self.context_buffer.remove(request_id)` at the end of `on_platform_report()`, after all processing for the invocation is complete. **Impact:** Most visible when `DD_CAPTURE_LAMBDA_PAYLOAD=true` — each retained `Context` holds the full captured request/response payload as span metadata (~500 KB per invocation). Without the fix, a Lambda function invoked thousands of times would accumulate hundreds of MB of leaked memory in the extension process. ## Testing - Unit tests - A test with 50+ invocations shows | without the fix | with the fix | |--------|--------| | `buffer size after remove: 52`|`buffer size after remove: 1`| Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 1559722 commit db59a28

File tree

2 files changed

+89
-11
lines changed

2 files changed

+89
-11
lines changed

bottlecap/src/lifecycle/invocation/context.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use std::{
99

1010
use libdd_trace_protobuf::pb::Span;
1111
use serde_json::Value;
12-
use tracing::debug;
12+
use tracing::{debug, warn};
1313

1414
#[derive(Debug, Clone, PartialEq)]
1515
pub struct Context {
@@ -199,7 +199,7 @@ impl ContextBuffer {
199199
{
200200
return self.buffer.remove(i);
201201
}
202-
debug!("Context for request_id: {:?} not found", request_id);
202+
warn!("Context for request_id: {:?} not found", request_id);
203203

204204
None
205205
}

bottlecap/src/lifecycle/invocation/processor.rs

Lines changed: 87 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use libdd_trace_protobuf::pb::Span;
99
use libdd_trace_utils::tracer_header_tags;
1010
use serde_json::Value;
1111
use tokio::time::Instant;
12-
use tracing::{debug, warn};
12+
use tracing::{debug, trace, warn};
1313

1414
use crate::{
1515
config::{self, aws::AwsConfig},
@@ -771,6 +771,14 @@ impl Processor {
771771
self.enhanced_metrics
772772
.set_cpu_time_enhanced_metrics(offsets.cpu_offset.clone());
773773
}
774+
775+
// Release the context now that all processing for this invocation is complete.
776+
// This prevents unbounded memory growth across warm invocations.
777+
self.context_buffer.remove(request_id);
778+
trace!(
779+
"Context released (buffer size after remove: {})",
780+
self.context_buffer.size()
781+
);
774782
}
775783

776784
/// Handles Managed Instance mode platform report processing.
@@ -1612,8 +1620,7 @@ mod tests {
16121620
duration_ms,
16131621
status,
16141622
error_type,
1615-
should_have_context_after,
1616-
): (&str, bool, f64, Status, Option<String>, bool) = $value;
1623+
): (&str, bool, f64, Status, Option<String>) = $value;
16171624

16181625
let mut processor = setup();
16191626

@@ -1655,6 +1662,17 @@ mod tests {
16551662
stats_generator: Arc::new(StatsGenerator::new(stats_concentrator_handle)),
16561663
});
16571664

1665+
// Verify context state before on_platform_report
1666+
let request_id_string_for_get = request_id.to_string();
1667+
assert_eq!(
1668+
processor.context_buffer.get(&request_id_string_for_get).is_some(),
1669+
// Use setup_context because it dictates whether the request is handled up front,
1670+
// which in turn signals whether the request is valid/processed.
1671+
setup_context,
1672+
"Context existence mismatch for request_id: {}",
1673+
request_id
1674+
);
1675+
16581676
// Call on_platform_report
16591677
let request_id_string = request_id.to_string();
16601678
processor.on_platform_report(
@@ -1672,7 +1690,7 @@ mod tests {
16721690
let request_id_string_for_get = request_id.to_string();
16731691
assert_eq!(
16741692
processor.context_buffer.get(&request_id_string_for_get).is_some(),
1675-
should_have_context_after,
1693+
false,
16761694
"Context existence mismatch for request_id: {}",
16771695
request_id
16781696
);
@@ -1682,14 +1700,13 @@ mod tests {
16821700
}
16831701

16841702
platform_report_managed_instance_tests! {
1685-
// (request_id, setup_context, duration_ms, status, error_type, should_have_context_after)
1703+
// (request_id, setup_context, duration_ms, status, error_type)
16861704
test_on_platform_report_managed_instance_mode_with_valid_context: (
16871705
"test-request-id",
16881706
true, // setup context
16891707
123.45,
16901708
Status::Success,
16911709
None,
1692-
true, // context should still exist
16931710
),
16941711

16951712
test_on_platform_report_managed_instance_mode_without_context: (
@@ -1698,7 +1715,6 @@ mod tests {
16981715
123.45,
16991716
Status::Success,
17001717
None,
1701-
false, // context should not exist
17021718
),
17031719

17041720
test_on_platform_report_managed_instance_mode_with_error_status: (
@@ -1707,7 +1723,6 @@ mod tests {
17071723
200.0,
17081724
Status::Error,
17091725
Some("RuntimeError".to_string()),
1710-
true, // context should still exist
17111726
),
17121727

17131728
test_on_platform_report_managed_instance_mode_with_timeout: (
@@ -1716,10 +1731,73 @@ mod tests {
17161731
30000.0,
17171732
Status::Timeout,
17181733
None,
1719-
true, // context should still exist
17201734
),
17211735
}
17221736

1737+
#[tokio::test]
1738+
async fn test_context_removed_after_on_platform_report_on_demand() {
1739+
use libdd_trace_obfuscation::obfuscation_config::ObfuscationConfig;
1740+
1741+
let mut p = setup();
1742+
let request_id = String::from("test-request-id");
1743+
1744+
p.on_invoke_event(request_id.clone());
1745+
let start_time = chrono::Utc::now();
1746+
p.on_platform_start(request_id.clone(), start_time);
1747+
assert!(
1748+
p.context_buffer.get(&request_id).is_some(),
1749+
"context must exist before report"
1750+
);
1751+
1752+
let config = Arc::new(config::Config {
1753+
service: Some("test-service".to_string()),
1754+
..config::Config::default()
1755+
});
1756+
let tags_provider = Arc::new(provider::Provider::new(
1757+
Arc::clone(&config),
1758+
LAMBDA_RUNTIME_SLUG.to_string(),
1759+
&HashMap::from([("function_arn".to_string(), "test-arn".to_string())]),
1760+
));
1761+
let (stats_concentrator_service, stats_concentrator_handle) =
1762+
StatsConcentratorService::new(Arc::clone(&config));
1763+
tokio::spawn(stats_concentrator_service.run());
1764+
let trace_sender = Arc::new(SendingTraceProcessor {
1765+
appsec: None,
1766+
processor: Arc::new(trace_processor::ServerlessTraceProcessor {
1767+
obfuscation_config: Arc::new(
1768+
ObfuscationConfig::new().expect("Failed to create ObfuscationConfig"),
1769+
),
1770+
}),
1771+
trace_tx: tokio::sync::mpsc::channel(1).0,
1772+
stats_generator: Arc::new(StatsGenerator::new(stats_concentrator_handle)),
1773+
});
1774+
1775+
p.on_platform_report(
1776+
&request_id,
1777+
ReportMetrics::OnDemand(OnDemandReportMetrics {
1778+
duration_ms: 123.45,
1779+
billed_duration_ms: 124,
1780+
memory_size_mb: 256,
1781+
max_memory_used_mb: 128,
1782+
init_duration_ms: None,
1783+
restore_duration_ms: None,
1784+
}),
1785+
chrono::Utc::now().timestamp(),
1786+
Status::Success,
1787+
None,
1788+
None,
1789+
tags_provider,
1790+
trace_sender,
1791+
)
1792+
.await;
1793+
1794+
assert!(
1795+
p.context_buffer.get(&request_id).is_none(),
1796+
"context must be removed after on_platform_report completes"
1797+
);
1798+
assert_eq!(p.context_buffer.size(), 0);
1799+
}
1800+
17231801
#[tokio::test]
17241802
async fn test_on_platform_init_start_sets_durable_function_tag() {
17251803
let mut processor = setup();

0 commit comments

Comments
 (0)