Merge branch 'main' into joey/handle_msk

joeyzhao2018 · web-flow · commit 85229617ec08 · 2026-03-23T11:46:17.000-04:00
diff --git a/.github/workflows/rs_ci.yml b/.github/workflows/rs_ci.yml
@@ -135,7 +135,7 @@ jobs:
       - uses: actions-rust-lang/setup-rust-toolchain@150fca883cd4034361b621bd4e6a9d34e5143606 # v1.15.4
         with:
           cache: false
-      - uses: taiki-e/install-action@94a7388bec5d4c8dd93e3ebf09e0ff448f3f6f4d # v2.68.35
+      - uses: taiki-e/install-action@c12d62a803cbdfe2e7263af15f5a9548065cb4f2 # v2.69.3
         with:
           tool: nextest@0.9
       - uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # v0.0.9
diff --git a/bottlecap/src/tags/lambda/tags.rs b/bottlecap/src/tags/lambda/tags.rs
@@ -45,7 +45,7 @@ const FUNCTION_TAGS_KEY: &str = "_dd.tags.function";
 // TODO(astuyve) decide what to do with the version
 const EXTENSION_VERSION_KEY: &str = "dd_extension_version";
 // TODO(duncanista) figure out a better way to not hardcode this
-pub const EXTENSION_VERSION: &str = "93-next";
+pub const EXTENSION_VERSION: &str = "94-next";
 
 const REGION_KEY: &str = "region";
 const ACCOUNT_ID_KEY: &str = "account_id";
diff --git a/bottlecap/src/traces/stats_flusher.rs b/bottlecap/src/traces/stats_flusher.rs
@@ -6,6 +6,7 @@ use std::sync::Arc;
 use tokio::sync::Mutex;
 use tokio::sync::OnceCell;
 
+use crate::FLUSH_RETRY_COUNT;
 use crate::config;
 use crate::lifecycle::invocation::processor::S_TO_MS;
 use crate::traces::http_client::HttpClient;
@@ -93,32 +94,36 @@ impl StatsFlusher {
 
         let stats_url = trace_stats_url(&self.config.site);
 
-        let start = std::time::Instant::now();
-
-        let resp = stats_utils::send_stats_payload_with_client(
-            serialized_stats_payload,
-            endpoint,
-            api_key.as_str(),
-            Some(&self.http_client),
-        )
-        .await;
-        let elapsed = start.elapsed();
-        debug!(
-            "STATS | Stats request to {} took {} ms",
-            stats_url,
-            elapsed.as_millis()
-        );
-        match resp {
-            Ok(()) => {
-                debug!("STATS | Successfully flushed stats");
-                None
-            }
-            Err(e) => {
-                // Network/server errors are temporary - return stats for retry
-                error!("STATS | Error sending stats: {e:?}");
-                Some(stats)
+        for attempt in 1..=FLUSH_RETRY_COUNT {
+            let start = std::time::Instant::now();
+            let resp = stats_utils::send_stats_payload_with_client(
+                serialized_stats_payload.clone(),
+                endpoint,
+                api_key.as_str(),
+                Some(&self.http_client),
+            )
+            .await;
+            let elapsed = start.elapsed();
+
+            match resp {
+                Ok(()) => {
+                    debug!(
+                        "STATS | Successfully flushed stats to {stats_url} in {} ms (attempt {attempt}/{FLUSH_RETRY_COUNT})",
+                        elapsed.as_millis()
+                    );
+                    return None;
+                }
+                Err(e) => {
+                    debug!(
+                        "STATS | Failed to send stats to {stats_url} in {} ms (attempt {attempt}/{FLUSH_RETRY_COUNT}): {e:?}",
+                        elapsed.as_millis()
+                    );
+                }
             }
         }
+
+        error!("STATS | Exhausted all {FLUSH_RETRY_COUNT} attempts, returning stats for redrive");
+        Some(stats)
     }
 
     /// Flushes stats from the aggregator.
diff --git a/bottlecap/src/traces/trace_flusher.rs b/bottlecap/src/traces/trace_flusher.rs
@@ -6,6 +6,7 @@ use libdd_common::Endpoint;
 use libdd_trace_utils::{
     config_utils::trace_intake_url_prefixed,
     send_data::SendData,
+    send_with_retry::{RetryBackoffType, RetryStrategy},
     trace_utils::{self},
     tracer_payload::TracerPayloadCollection,
 };
@@ -14,11 +15,25 @@ use std::sync::Arc;
 use tokio::task::JoinSet;
 use tracing::{debug, error};
 
+use crate::FLUSH_RETRY_COUNT;
 use crate::config::Config;
 use crate::lifecycle::invocation::processor::S_TO_MS;
 use crate::traces::http_client::HttpClient;
 use crate::traces::trace_aggregator_service::AggregatorHandle;
 
+/// Builds the retry policy applied to every trace flush: the retry budget
+/// comes from the shared `FLUSH_RETRY_COUNT`, and retries are issued with no
+/// delay in between. In Lambda, every millisecond of wall-clock time matters,
+/// and the per-attempt request timeout already bounds how long each retry can take.
+fn trace_retry_strategy() -> RetryStrategy {
+    // Fall back to 3 if the shared count cannot be converted to a `u32`.
+    let retry_count = u32::try_from(FLUSH_RETRY_COUNT).unwrap_or(3);
+    // Zero delay combined with constant backoff: retries run back-to-back.
+    let delay_ms = 0;
+
+    RetryStrategy::new(retry_count, delay_ms, RetryBackoffType::Constant, None)
+}
+
 pub struct TraceFlusher {
     pub aggregator_handle: AggregatorHandle,
     pub config: Arc<Config>,
@@ -113,7 +128,11 @@ impl TraceFlusher {
             let traces_with_tags: Vec<_> = trace_builders
                 .into_iter()
                 .map(|info| {
-                    let trace = info.builder.with_api_key(api_key.as_str()).build();
+                    let trace = info
+                        .builder
+                        .with_api_key(api_key.as_str())
+                        .with_retry_strategy(trace_retry_strategy())
+                        .build();
                     (trace, info.header_tags)
                 })
                 .collect();
@@ -125,12 +144,16 @@ impl TraceFlusher {
                 let additional_traces: Vec<_> = traces_with_tags
                     .iter()
                     .filter_map(|(trace, tags)| match trace.get_payloads() {
-                        TracerPayloadCollection::V07(payloads) => Some(SendData::new(
-                            trace.len(),
-                            TracerPayloadCollection::V07(payloads.clone()),
-                            tags.to_tracer_header_tags(),
-                            &endpoint,
-                        )),
+                        TracerPayloadCollection::V07(payloads) => {
+                            let mut send_data = SendData::new(
+                                trace.len(),
+                                TracerPayloadCollection::V07(payloads.clone()),
+                                tags.to_tracer_header_tags(),
+                                &endpoint,
+                            );
+                            send_data.set_retry_strategy(trace_retry_strategy());
+                            Some(send_data)
+                        }
                         // All payloads in the extension are V07 (produced by
                         // collect_pb_trace_chunks), so this branch is unreachable.
                         _ => None,
@@ -174,12 +197,23 @@ impl TraceFlusher {
         debug!("TRACES | Flushing {} traces", coalesced_traces.len());
 
         for trace in &coalesced_traces {
-            let send_result = trace.send(&http_client).await.last_result;
-
-            if let Err(e) = send_result {
-                error!("TRACES | Request failed: {e:?}");
+            let result = trace.send(&http_client).await;
+
+            if let Err(e) = &result.last_result {
+                error!(
+                    "TRACES | Request failed after {} attempts ({} timeouts, {} network errors, {} status code errors): {e:?}",
+                    result.requests_count,
+                    result.errors_timeout,
+                    result.errors_network,
+                    result.errors_status_code,
+                );
                 return Some(coalesced_traces);
             }
+
+            debug!(
+                "TRACES | Successfully sent trace ({} attempts, {} bytes)",
+                result.requests_count, result.bytes_sent,
+            );
         }
 
         debug!("TRACES | Flushing took {} ms", start.elapsed().as_millis());
diff --git a/integration-tests/tests/lmi.test.ts b/integration-tests/tests/lmi.test.ts
@@ -9,7 +9,7 @@ const identifier = getIdentifier();
 const stackName = `integ-${identifier}-lmi`;
 
 describe('LMI Integration Tests', () => {
-  let results: Record<string, DatadogTelemetry[][]>;
+  let telemetry: Record<string, DatadogTelemetry>;
 
   beforeAll(async () => {
     const functions: FunctionConfig[] = runtimes.map(runtime => ({
@@ -20,13 +20,13 @@ describe('LMI Integration Tests', () => {
     console.log('Invoking LMI functions...');
 
     // Invoke all LMI functions and collect telemetry
-    results = await invokeAndCollectTelemetry(functions, 1);
+    telemetry = await invokeAndCollectTelemetry(functions, 1);
 
     console.log('LMI invocation and data fetching completed');
   }, 600000);
 
   describe.each(runtimes)('%s Runtime with LMI', (runtime) => {
-    const getResult = () => results[runtime]?.[0]?.[0];
+    const getResult = () => telemetry[runtime]?.threads[0]?.[0];
 
     it('should invoke Lambda successfully', () => {
       const result = getResult();
diff --git a/integration-tests/tests/on-demand.test.ts b/integration-tests/tests/on-demand.test.ts
@@ -1,5 +1,5 @@
 import { invokeAndCollectTelemetry, FunctionConfig } from './utils/default';
-import { DatadogTelemetry } from './utils/datadog';
+import { DatadogTelemetry, DURATION_METRICS } from './utils/datadog';
 import { forceColdStart } from './utils/lambda';
 import { getIdentifier } from '../config';
 
@@ -10,27 +10,25 @@ const identifier = getIdentifier();
 const stackName = `integ-${identifier}-on-demand`;
 
 describe('On-Demand Integration Tests', () => {
-  let results: Record<string, DatadogTelemetry[][]>;
+  let telemetry: Record<string, DatadogTelemetry>;
 
   beforeAll(async () => {
     const functions: FunctionConfig[] = runtimes.map(runtime => ({
       functionName: `${stackName}-${runtime}-lambda`,
       runtime,
     }));
 
-    // Force cold starts
     await Promise.all(functions.map(fn => forceColdStart(fn.functionName)));
 
-    // Add 5s delay between invocations to ensure warm container is reused
-    // Required because there is post-runtime processing with 'end' flush strategy
-    results = await invokeAndCollectTelemetry(functions, 2, 1, 5000);
+    telemetry = await invokeAndCollectTelemetry(functions, 2, 1, 5000);
 
     console.log('All invocations and data fetching completed');
   }, 600000);
 
   describe.each(runtimes)('%s runtime', (runtime) => {
-    const getFirstInvocation = () => results[runtime]?.[0]?.[0];
-    const getSecondInvocation = () => results[runtime]?.[0]?.[1];
+    const getTelemetry = () => telemetry[runtime];
+    const getFirstInvocation = () => getTelemetry()?.threads[0]?.[0];
+    const getSecondInvocation = () => getTelemetry()?.threads[0]?.[1];
 
     describe('first invocation (cold start)', () => {
       it('should invoke Lambda successfully', () => {
@@ -74,7 +72,6 @@ describe('On-Demand Integration Tests', () => {
         });
       });
 
-      // Python has known issues with cold_start spans - mark as failing
       if (runtime === 'python') {
         it.failing('[failing] should have aws.lambda.cold_start span', () => {
           const result = getFirstInvocation();
@@ -151,5 +148,13 @@ describe('On-Demand Integration Tests', () => {
         expect(coldStartSpan).toBeUndefined();
       });
     });
+
+    describe.skip.each(DURATION_METRICS)('%s', (metric) => { // one suite per duration metric (currently skipped)
+      it('should have points with non-negative values', () => {
+        const points = getTelemetry()?.metrics[metric] ?? []; // missing telemetry fails the length check instead of throwing
+        expect(points.length).toBeGreaterThan(0);
+        expect(points.every(p => p.value >= 0)).toBe(true); // zero is accepted; only negative values fail
+      });
+    });
   });
 });
diff --git a/integration-tests/tests/otlp.test.ts b/integration-tests/tests/otlp.test.ts
@@ -9,7 +9,7 @@ const identifier = getIdentifier();
 const stackName = `integ-${identifier}-otlp`;
 
 describe('OTLP Integration Tests', () => {
-  let results: Record<string, DatadogTelemetry[][]>;
+  let telemetry: Record<string, DatadogTelemetry>;
 
   beforeAll(async () => {
     // Build function configs for all runtimes plus response validation
@@ -27,13 +27,13 @@ describe('OTLP Integration Tests', () => {
     console.log('Invoking all OTLP Lambda functions...');
 
     // Invoke all OTLP functions and collect telemetry
-    results = await invokeAndCollectTelemetry(functions, 1, 1, 0, {}, DATADOG_INDEXING_WAIT_5_MIN_MS);
+    telemetry = await invokeAndCollectTelemetry(functions, 1, 1, 0, {}, DATADOG_INDEXING_WAIT_5_MIN_MS);
 
     console.log('All OTLP Lambda invocations and data fetching completed');
   }, 700000);
 
   describe.each(runtimes)('%s Runtime', (runtime) => {
-    const getResult = () => results[runtime]?.[0]?.[0];
+    const getResult = () => telemetry[runtime]?.threads[0]?.[0];
 
     it('should invoke Lambda successfully', () => {
       const result = getResult();
@@ -56,7 +56,7 @@ describe('OTLP Integration Tests', () => {
   });
 
   describe('OTLP Response Validation', () => {
-    const getResult = () => results['responseValidation']?.[0]?.[0];
+    const getResult = () => telemetry['responseValidation']?.threads[0]?.[0];
 
     it('should invoke response validation Lambda successfully', () => {
       const result = getResult();
diff --git a/integration-tests/tests/snapstart.test.ts b/integration-tests/tests/snapstart.test.ts
@@ -10,7 +10,7 @@ const identifier = getIdentifier();
 const stackName = `integ-${identifier}-snapstart`;
 
 describe('Snapstart Integration Tests', () => {
-  let results: Record<string, DatadogTelemetry[][]>;
+  let telemetry: Record<string, DatadogTelemetry>;
 
   beforeAll(async () => {
     // Publish new versions and wait for SnapStart optimization
@@ -43,20 +43,20 @@ describe('Snapstart Integration Tests', () => {
     // - Second invocation: warm (no snapstart_restore span)
     // - 5s delay ensures warm container reuse
     // - 2 threads for trace isolation testing
-    results = await invokeAndCollectTelemetry(functions, 2, 2, 5000);
+    telemetry = await invokeAndCollectTelemetry(functions, 2, 2, 5000);
 
     console.log('All Snapstart Lambda invocations and data fetching completed');
   }, 900000);
 
   describe.each(runtimes)('%s Runtime with SnapStart', (runtime) => {
     // With concurrency=2, invocations=2:
-    // - results[runtime][0][0] = thread 0, first invocation (restore)
-    // - results[runtime][0][1] = thread 0, second invocation (warm)
-    // - results[runtime][1][0] = thread 1, first invocation (restore)
-    // - results[runtime][1][1] = thread 1, second invocation (warm)
-    const getRestoreInvocation = () => results[runtime]?.[0]?.[0];
-    const getWarmInvocation = () => results[runtime]?.[0]?.[1];
-    const getOtherThreadInvocation = () => results[runtime]?.[1]?.[0];
+    // - telemetry[runtime].threads[0][0] = thread 0, first invocation (restore)
+    // - telemetry[runtime].threads[0][1] = thread 0, second invocation (warm)
+    // - telemetry[runtime].threads[1][0] = thread 1, first invocation (restore)
+    // - telemetry[runtime].threads[1][1] = thread 1, second invocation (warm)
+    const getRestoreInvocation = () => telemetry[runtime]?.threads[0]?.[0];
+    const getWarmInvocation = () => telemetry[runtime]?.threads[0]?.[1];
+    const getOtherThreadInvocation = () => telemetry[runtime]?.threads[1]?.[0];
 
     describe('first invocation (restore from snapshot)', () => {
       it('should invoke successfully', () => {
@@ -150,10 +150,10 @@ describe('Snapstart Integration Tests', () => {
 
     describe('trace isolation', () => {
       it('should have different trace IDs for all 4 invocations', () => {
-        const thread0Restore = results[runtime]?.[0]?.[0];
-        const thread0Warm = results[runtime]?.[0]?.[1];
-        const thread1Restore = results[runtime]?.[1]?.[0];
-        const thread1Warm = results[runtime]?.[1]?.[1];
+        const thread0Restore = telemetry[runtime]?.threads[0]?.[0];
+        const thread0Warm = telemetry[runtime]?.threads[0]?.[1];
+        const thread1Restore = telemetry[runtime]?.threads[1]?.[0];
+        const thread1Warm = telemetry[runtime]?.threads[1]?.[1];
 
         expect(thread0Restore).toBeDefined();
         expect(thread0Warm).toBeDefined();
diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts
diff --git a/integration-tests/tests/utils/default.ts b/integration-tests/tests/utils/default.ts