Skip to content

Commit 03f60b4

Browse files
fix(lifecycle): release invocation context after platform report to prevent memory leak
Contexts added to ContextBuffer on every invocation were never removed after processing, causing unbounded memory growth across warm invocations. The growth was most visible when DD_CAPTURE_LAMBDA_PAYLOAD=true with large response payloads (issue #1049), but affects all invocations. Remove the context at the end of on_platform_report, which is the last point in the lifecycle where downstream code still reads context fields (runtime_duration_ms for post-runtime metrics, enhanced_metric_data for network/CPU metrics). Both on-demand and managed instance paths are fixed. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent de0e7a0 commit 03f60b4

File tree

3 files changed

+94
-13
lines changed

3 files changed

+94
-13
lines changed

bottlecap/Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bottlecap/src/lifecycle/invocation/context.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use std::{
99

1010
use libdd_trace_protobuf::pb::Span;
1111
use serde_json::Value;
12-
use tracing::debug;
12+
use tracing::{debug, warn};
1313

1414
#[derive(Debug, Clone, PartialEq)]
1515
pub struct Context {
@@ -199,7 +199,7 @@ impl ContextBuffer {
199199
{
200200
return self.buffer.remove(i);
201201
}
202-
debug!("Context for request_id: {:?} not found", request_id);
202+
warn!("Context for request_id: {:?} not found", request_id);
203203

204204
None
205205
}

bottlecap/src/lifecycle/invocation/processor.rs

Lines changed: 91 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use libdd_trace_protobuf::pb::Span;
99
use libdd_trace_utils::tracer_header_tags;
1010
use serde_json::Value;
1111
use tokio::time::Instant;
12-
use tracing::{debug, warn};
12+
use tracing::{debug, trace, warn};
1313

1414
use crate::{
1515
config::{self, aws::AwsConfig},
@@ -285,7 +285,10 @@ impl Processor {
285285
/// This is used to create a cold start span, since this telemetry event does not
286286
/// provide a `request_id`, we try to guess which invocation is the cold start.
287287
pub fn on_platform_init_start(&mut self, time: DateTime<Utc>, runtime_version: Option<String>) {
288-
if runtime_version.as_deref().map_or(false, |rv| rv.contains("DurableFunction")) {
288+
if runtime_version
289+
.as_deref()
290+
.is_some_and(|rv| rv.contains("DurableFunction"))
291+
{
289292
self.enhanced_metrics.set_durable_function_tag();
290293
}
291294
let start_time: i64 = SystemTime::from(time)
@@ -768,6 +771,14 @@ impl Processor {
768771
self.enhanced_metrics
769772
.set_cpu_time_enhanced_metrics(offsets.cpu_offset.clone());
770773
}
774+
775+
// Release the context now that all processing for this invocation is complete.
776+
// This prevents unbounded memory growth across warm invocations.
777+
self.context_buffer.remove(request_id);
778+
trace!(
779+
"Context released (buffer size after remove: {})",
780+
self.context_buffer.size()
781+
);
771782
}
772783

773784
/// Handles Managed Instance mode platform report processing.
@@ -1609,8 +1620,7 @@ mod tests {
16091620
duration_ms,
16101621
status,
16111622
error_type,
1612-
should_have_context_after,
1613-
): (&str, bool, f64, Status, Option<String>, bool) = $value;
1623+
): (&str, bool, f64, Status, Option<String>) = $value;
16141624

16151625
let mut processor = setup();
16161626

@@ -1652,6 +1662,17 @@ mod tests {
16521662
stats_generator: Arc::new(StatsGenerator::new(stats_concentrator_handle)),
16531663
});
16541664

1665+
// Verify context state before on_platform_report
1666+
let request_id_string_for_get = request_id.to_string();
1667+
assert_eq!(
1668+
processor.context_buffer.get(&request_id_string_for_get).is_some(),
1669+
// Use setup_context because it dictates whether the request is handled up front,
1670+
// which in turn signals whether the request is valid/processed.
1671+
setup_context,
1672+
"Context existence mismatch for request_id: {}",
1673+
request_id
1674+
);
1675+
16551676
// Call on_platform_report
16561677
let request_id_string = request_id.to_string();
16571678
processor.on_platform_report(
@@ -1669,7 +1690,7 @@ mod tests {
16691690
let request_id_string_for_get = request_id.to_string();
16701691
assert_eq!(
16711692
processor.context_buffer.get(&request_id_string_for_get).is_some(),
1672-
should_have_context_after,
1693+
false,
16731694
"Context existence mismatch for request_id: {}",
16741695
request_id
16751696
);
@@ -1679,14 +1700,13 @@ mod tests {
16791700
}
16801701

16811702
platform_report_managed_instance_tests! {
1682-
// (request_id, setup_context, duration_ms, status, error_type, should_have_context_after)
1703+
// (request_id, setup_context, duration_ms, status, error_type)
16831704
test_on_platform_report_managed_instance_mode_with_valid_context: (
16841705
"test-request-id",
16851706
true, // setup context
16861707
123.45,
16871708
Status::Success,
16881709
None,
1689-
true, // context should still exist
16901710
),
16911711

16921712
test_on_platform_report_managed_instance_mode_without_context: (
@@ -1695,7 +1715,6 @@ mod tests {
16951715
123.45,
16961716
Status::Success,
16971717
None,
1698-
false, // context should not exist
16991718
),
17001719

17011720
test_on_platform_report_managed_instance_mode_with_error_status: (
@@ -1704,7 +1723,6 @@ mod tests {
17041723
200.0,
17051724
Status::Error,
17061725
Some("RuntimeError".to_string()),
1707-
true, // context should still exist
17081726
),
17091727

17101728
test_on_platform_report_managed_instance_mode_with_timeout: (
@@ -1713,10 +1731,73 @@ mod tests {
17131731
30000.0,
17141732
Status::Timeout,
17151733
None,
1716-
true, // context should still exist
17171734
),
17181735
}
17191736

1737+
#[tokio::test]
1738+
async fn test_context_removed_after_on_platform_report_on_demand() {
1739+
use libdd_trace_obfuscation::obfuscation_config::ObfuscationConfig;
1740+
1741+
let mut p = setup();
1742+
let request_id = String::from("test-request-id");
1743+
1744+
p.on_invoke_event(request_id.clone());
1745+
let start_time = chrono::Utc::now();
1746+
p.on_platform_start(request_id.clone(), start_time);
1747+
assert!(
1748+
p.context_buffer.get(&request_id).is_some(),
1749+
"context must exist before report"
1750+
);
1751+
1752+
let config = Arc::new(config::Config {
1753+
service: Some("test-service".to_string()),
1754+
..config::Config::default()
1755+
});
1756+
let tags_provider = Arc::new(provider::Provider::new(
1757+
Arc::clone(&config),
1758+
LAMBDA_RUNTIME_SLUG.to_string(),
1759+
&HashMap::from([("function_arn".to_string(), "test-arn".to_string())]),
1760+
));
1761+
let (stats_concentrator_service, stats_concentrator_handle) =
1762+
StatsConcentratorService::new(Arc::clone(&config));
1763+
tokio::spawn(stats_concentrator_service.run());
1764+
let trace_sender = Arc::new(SendingTraceProcessor {
1765+
appsec: None,
1766+
processor: Arc::new(trace_processor::ServerlessTraceProcessor {
1767+
obfuscation_config: Arc::new(
1768+
ObfuscationConfig::new().expect("Failed to create ObfuscationConfig"),
1769+
),
1770+
}),
1771+
trace_tx: tokio::sync::mpsc::channel(1).0,
1772+
stats_generator: Arc::new(StatsGenerator::new(stats_concentrator_handle)),
1773+
});
1774+
1775+
p.on_platform_report(
1776+
&request_id,
1777+
ReportMetrics::OnDemand(OnDemandReportMetrics {
1778+
duration_ms: 123.45,
1779+
billed_duration_ms: 124,
1780+
memory_size_mb: 256,
1781+
max_memory_used_mb: 128,
1782+
init_duration_ms: None,
1783+
restore_duration_ms: None,
1784+
}),
1785+
chrono::Utc::now().timestamp(),
1786+
Status::Success,
1787+
None,
1788+
None,
1789+
tags_provider,
1790+
trace_sender,
1791+
)
1792+
.await;
1793+
1794+
assert!(
1795+
p.context_buffer.get(&request_id).is_none(),
1796+
"context must be removed after on_platform_report completes"
1797+
);
1798+
assert_eq!(p.context_buffer.size(), 0);
1799+
}
1800+
17201801
#[tokio::test]
17211802
async fn test_on_platform_init_start_sets_durable_function_tag() {
17221803
let mut processor = setup();

0 commit comments

Comments
 (0)