Skip to content

Commit d60314e

Browse files
committed
remove locking on metrics
1 parent 7590bd2 commit d60314e

5 files changed

Lines changed: 55 additions & 79 deletions

File tree

bottlecap/Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bottlecap/Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -71,8 +71,8 @@ libdd-trace-utils = { git = "https://github.com/DataDog/libdatadog", rev = "158b
7171
libdd-trace-normalization = { git = "https://github.com/DataDog/libdatadog", rev = "158b59471f1132e3cb36023fa3c46ccb2dd0eda1" }
7272
libdd-trace-obfuscation = { git = "https://github.com/DataDog/libdatadog", rev = "158b59471f1132e3cb36023fa3c46ccb2dd0eda1" }
7373
libdd-trace-stats = { git = "https://github.com/DataDog/libdatadog", rev = "158b59471f1132e3cb36023fa3c46ccb2dd0eda1" }
74-
dogstatsd = { git = "https://github.com/DataDog/serverless-components", rev = "18b49baba8bfef97060d7edd8b830584d0da3373", default-features = false }
75-
datadog-fips = { git = "https://github.com/DataDog/serverless-components", rev = "18b49baba8bfef97060d7edd8b830584d0da3373", default-features = false }
74+
dogstatsd = { git = "https://github.com/DataDog/serverless-components", rev = "e4f0341c84bf57d7af3784f4bf9e7f33e4c7ecd1", default-features = false }
75+
datadog-fips = { git = "https://github.com/DataDog/serverless-components", rev = "e4f0341c84bf57d7af3784f4bf9e7f33e4c7ecd1", default-features = false }
7676
libddwaf = { version = "1.28.1", git = "https://github.com/DataDog/libddwaf-rust", rev = "d1534a158d976bd4f747bf9fcc58e0712d2d17fc", default-features = false, features = ["serde"] }
7777

7878
[dev-dependencies]

bottlecap/src/bin/bottlecap/main.rs

Lines changed: 12 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -450,10 +450,8 @@ async fn extension_loop_active(
450450
// Wait for any pending flushes
451451
flushing_service.await_handles().await;
452452
// Final flush to capture any data that accumulated since the last
453-
// spawn_non_blocking(). We pass force_stats=true since this is our
454-
// last opportunity to send data before shutdown.
455-
let mut locked_metrics = flushing_service.metrics_flushers().lock().await;
456-
flushing_service.flush_blocking(true, &mut locked_metrics).await;
453+
// spawn_non_blocking(). This is our last opportunity to send data.
454+
flushing_service.flush_blocking_final().await;
457455
break;
458456
}
459457
}
@@ -635,19 +633,13 @@ async fn extension_loop_active(
635633
}
636634
}
637635
_ = race_flush_interval.tick() => {
638-
let mut locked_metrics = metrics_flushers.lock().await;
639-
flushing_service
640-
.flush_blocking(false, &mut locked_metrics)
641-
.await;
636+
flushing_service.flush_blocking().await;
642637
race_flush_interval.reset();
643638
}
644639
}
645640
}
646641
// flush
647-
let mut locked_metrics = metrics_flushers.lock().await;
648-
flushing_service
649-
.flush_blocking(false, &mut locked_metrics)
650-
.await;
642+
flushing_service.flush_blocking().await;
651643
race_flush_interval.reset();
652644
let next_response =
653645
extension::next_event(client, &aws_config.runtime_api, &r.extension_id).await;
@@ -664,10 +656,7 @@ async fn extension_loop_active(
664656
}
665657
}
666658
FlushDecision::Periodic => {
667-
let mut locked_metrics = metrics_flushers.lock().await;
668-
flushing_service
669-
.flush_blocking(false, &mut locked_metrics)
670-
.await;
659+
flushing_service.flush_blocking().await;
671660
race_flush_interval.reset();
672661
}
673662
_ => {
@@ -695,10 +684,7 @@ async fn extension_loop_active(
695684
}
696685
_ = race_flush_interval.tick() => {
697686
if flush_control.flush_strategy == FlushStrategy::Default {
698-
let mut locked_metrics = metrics_flushers.lock().await;
699-
flushing_service
700-
.flush_blocking(false, &mut locked_metrics)
701-
.await;
687+
flushing_service.flush_blocking().await;
702688
race_flush_interval.reset();
703689
}
704690
}
@@ -744,11 +730,8 @@ async fn extension_loop_active(
744730
&lifecycle_listener_shutdown_token,
745731
);
746732

747-
// Final flush with force_stats=true since this is our last opportunity
748-
let mut locked_metrics = metrics_flushers.lock().await;
749-
flushing_service
750-
.flush_blocking(true, &mut locked_metrics)
751-
.await;
733+
// Final flush - this is our last opportunity to send data before shutdown
734+
flushing_service.flush_blocking_final().await;
752735

753736
// Even though we're shutting down, we need to reset the flush interval to prevent any future flushes
754737
race_flush_interval.reset();
@@ -1178,7 +1161,7 @@ async fn start_dogstatsd(
11781161
api_key_factory: Arc<ApiKeyFactory>,
11791162
config: &Arc<Config>,
11801163
) -> (
1181-
Arc<TokioMutex<Vec<MetricsFlusher>>>,
1164+
Arc<Vec<MetricsFlusher>>,
11821165
MetricsAggregatorHandle,
11831166
CancellationToken,
11841167
) {
@@ -1200,17 +1183,18 @@ async fn start_dogstatsd(
12001183
});
12011184

12021185
// Get flushers with aggregator handle
1203-
let flushers = Arc::new(TokioMutex::new(start_metrics_flushers(
1186+
let flushers = Arc::new(start_metrics_flushers(
12041187
Arc::clone(&api_key_factory),
12051188
&aggregator_handle,
12061189
config,
1207-
)));
1190+
));
12081191

12091192
// Create Dogstatsd server
12101193
let dogstatsd_config = DogStatsDConfig {
12111194
host: EXTENSION_HOST.to_string(),
12121195
port: DOGSTATSD_PORT,
12131196
metric_namespace: config.statsd_metric_namespace.clone(),
1197+
windows_pipe_name: None,
12141198
};
12151199
let cancel_token = tokio_util::sync::CancellationToken::new();
12161200
let dogstatsd_agent = DogStatsD::new(

bottlecap/src/flushing/service.rs

Lines changed: 38 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22
33
use std::sync::Arc;
44

5-
use tokio::sync::Mutex as TokioMutex;
65
use tracing::{debug, error};
76

87
use dogstatsd::{
@@ -29,7 +28,7 @@ pub struct FlushingService {
2928
trace_flusher: Arc<TraceFlusher>,
3029
stats_flusher: Arc<StatsFlusher>,
3130
proxy_flusher: Arc<ProxyFlusher>,
32-
metrics_flushers: Arc<TokioMutex<Vec<MetricsFlusher>>>,
31+
metrics_flushers: Arc<Vec<MetricsFlusher>>,
3332

3433
// Metrics aggregator handle for getting data to flush
3534
metrics_aggr_handle: MetricsAggregatorHandle,
@@ -46,7 +45,7 @@ impl FlushingService {
4645
trace_flusher: Arc<TraceFlusher>,
4746
stats_flusher: Arc<StatsFlusher>,
4847
proxy_flusher: Arc<ProxyFlusher>,
49-
metrics_flushers: Arc<TokioMutex<Vec<MetricsFlusher>>>,
48+
metrics_flushers: Arc<Vec<MetricsFlusher>>,
5049
metrics_aggr_handle: MetricsAggregatorHandle,
5150
) -> Self {
5251
Self {
@@ -90,22 +89,17 @@ impl FlushingService {
9089

9190
// Spawn metrics flush
9291
// First get the data from aggregator, then spawn flush tasks for each flusher
93-
let (metrics_flushers_copy, series, sketches) = {
94-
let locked_metrics = self.metrics_flushers.lock().await;
95-
let flush_response = self
96-
.metrics_aggr_handle
97-
.clone()
98-
.flush()
99-
.await
100-
.expect("can't flush metrics handle");
101-
(
102-
locked_metrics.clone(),
103-
flush_response.series,
104-
flush_response.distributions,
105-
)
106-
};
92+
let flush_response = self
93+
.metrics_aggr_handle
94+
.clone()
95+
.flush()
96+
.await
97+
.expect("can't flush metrics handle");
98+
let series = flush_response.series;
99+
let sketches = flush_response.distributions;
107100

108-
for (idx, mut flusher) in metrics_flushers_copy.into_iter().enumerate() {
101+
for (idx, flusher) in self.metrics_flushers.iter().enumerate() {
102+
let flusher = flusher.clone();
109103
let series_clone = series.clone();
110104
let sketches_clone = sketches.clone();
111105
let handle = tokio::spawn(async move {
@@ -240,8 +234,7 @@ impl FlushingService {
240234
retry_batch.sketches.len()
241235
);
242236
joinset.spawn(async move {
243-
let mut locked_flushers = mf.lock().await;
244-
if let Some(flusher) = locked_flushers.get_mut(retry_batch.flusher_id) {
237+
if let Some(flusher) = mf.get(retry_batch.flusher_id) {
245238
flusher
246239
.flush_metrics(retry_batch.series, retry_batch.sketches)
247240
.await;
@@ -288,34 +281,42 @@ impl FlushingService {
288281
flush_error
289282
}
290283

291-
/// Performs a blocking flush of all data.
284+
/// Performs a blocking flush of all telemetry data.
292285
///
293-
/// This method flushes all data synchronously using `tokio::join!` for parallelism.
294-
/// Unlike `spawn_non_blocking`, this waits for all flushes to complete before returning.
286+
/// Flushes logs, metrics (series and distributions), traces, stats, and APM proxy
287+
/// data in parallel using `tokio::join!`. Unlike `spawn_non_blocking`, this waits
288+
/// for all flushes to complete before returning.
295289
///
296-
/// # Arguments
290+
/// The stats flusher respects its normal timing constraints (time-based bucketing),
291+
/// which may result in some stats being held back until the next flush cycle.
292+
pub async fn flush_blocking(&self) {
293+
self.flush_blocking_inner(false).await;
294+
}
295+
296+
/// Performs a final blocking flush of all telemetry data before shutdown.
297297
///
298-
/// * `force_stats` - If `true`, forces the stats flusher to flush immediately
299-
/// regardless of timing constraints.
300-
/// * `metrics_flushers` - Mutable slice of metrics flushers. The caller must acquire
301-
/// the lock before calling this method.
298+
/// Flushes logs, metrics (series and distributions), traces, stats, and APM proxy
299+
/// data in parallel. Unlike `flush_blocking`, this forces the stats flusher to
300+
/// flush immediately regardless of its normal timing constraints.
302301
///
303-
/// # Note
302+
/// Use this during shutdown when this is the last opportunity to send data.
303+
pub async fn flush_blocking_final(&self) {
304+
self.flush_blocking_inner(true).await;
305+
}
306+
307+
/// Internal implementation for blocking flush operations.
304308
///
305-
/// TODO: The caller must acquire the lock on `metrics_flushers` and pass a mutable slice
306-
/// because `MetricsFlusher::flush_metrics` requires `&mut self`. This creates awkward
307-
/// ergonomics. Consider modifying the `dogstatsd` crate to use interior mutability
308-
/// (e.g., `Arc<Mutex<...>>` internally) so `flush_metrics` can take `&self`, allowing
309-
/// this method to handle locking internally.
310-
pub async fn flush_blocking(&self, force_stats: bool, metrics_flushers: &mut [MetricsFlusher]) {
309+
/// Fetches metrics from the aggregator and flushes all data types in parallel.
310+
async fn flush_blocking_inner(&self, force_stats: bool) {
311311
let flush_response = self
312312
.metrics_aggr_handle
313313
.flush()
314314
.await
315315
.expect("can't flush metrics aggr handle");
316316

317-
let metrics_futures: Vec<_> = metrics_flushers
318-
.iter_mut()
317+
let metrics_futures: Vec<_> = self
318+
.metrics_flushers
319+
.iter()
319320
.map(|f| {
320321
f.flush_metrics(
321322
flush_response.series.clone(),
@@ -332,15 +333,6 @@ impl FlushingService {
332333
self.proxy_flusher.flush(None),
333334
);
334335
}
335-
336-
/// Returns a reference to the metrics flushers mutex for external locking.
337-
///
338-
/// This is useful when you need to lock the metrics flushers and pass them
339-
/// to `flush_blocking` or `flush_blocking_with_interval`.
340-
#[must_use]
341-
pub fn metrics_flushers(&self) -> &Arc<TokioMutex<Vec<MetricsFlusher>>> {
342-
&self.metrics_flushers
343-
}
344336
}
345337

346338
impl std::fmt::Debug for FlushingService {

bottlecap/tests/metrics_integration_test.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,7 @@ async fn test_enhanced_metrics() {
5757
retry_strategy: dogstatsd::datadog::RetryStrategy::Immediate(1),
5858
compression_level: 6,
5959
};
60-
let mut metrics_flusher = MetricsFlusher::new(flusher_config);
60+
let metrics_flusher = MetricsFlusher::new(flusher_config);
6161
let lambda_enhanced_metrics =
6262
enhanced_metrics::new(metrics_aggr_handle.clone(), Arc::clone(&arc_config));
6363

0 commit comments

Comments (0)