Skip to content

Commit a3d3b1f

Browse files
authored
Combine handle_next_invocation and next_event. Don't block for PlatformReport on timeout shutdowns (#692)
1. Refactors handle_next_invocation to call next_event. The methods are separate because we call next_event in an idle loop, but otherwise we always call them together from main, so this simplifies things. 2. In the shutdown loop, only block for the report line if the shutdown isn't a timeout. On timeouts, we won't get a report log. That's not a problem for most use cases: the sandbox will re-initialize, the telemetry API will re-send events, and then we can forward events on the next invocation. But if a function continuously times out, we may not forward the custom `task timed out` log until a few invocations down the line. ~I'll verify if this is true with OOMs~ OOMs can be variable, so I've added a new line to the END log to explain the status reason. For Node it's a runtimeExit: <img width="941" alt="image" src="https://github.com/user-attachments/assets/e22a56e1-9276-488d-b333-29ec644489bd" />
1 parent 1bd3b67 commit a3d3b1f

3 files changed

Lines changed: 52 additions & 29 deletions

File tree

bottlecap/src/bin/bottlecap/main.rs

Lines changed: 38 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -496,12 +496,11 @@ async fn extension_loop_active(
496496
start_time.elapsed().as_millis().to_string()
497497
);
498498
// first invoke we must call next
499-
let next_lambda_response = next_event(client, &r.extension_id).await;
500499
let mut pending_flush_handles = PendingFlushHandles::new();
501500
let mut last_continuous_flush_error = false;
502-
handle_next_invocation(next_lambda_response, invocation_processor.clone()).await;
501+
handle_next_invocation(client, &r.extension_id, invocation_processor.clone()).await;
503502
loop {
504-
let shutdown;
503+
let maybe_shutdown_event;
505504

506505
let current_flush_decision = flush_control.evaluate_flush_decision();
507506
if current_flush_decision == FlushDecision::End {
@@ -542,8 +541,8 @@ async fn extension_loop_active(
542541
&mut race_flush_interval,
543542
)
544543
.await;
545-
let next_response = next_event(client, &r.extension_id).await;
546-
shutdown = handle_next_invocation(next_response, invocation_processor.clone()).await;
544+
maybe_shutdown_event =
545+
handle_next_invocation(client, &r.extension_id, invocation_processor.clone()).await;
547546
} else {
548547
//Periodic flush scenario, flush at top of invocation
549548
if current_flush_decision == FlushDecision::Continuous && !last_continuous_flush_error {
@@ -593,7 +592,8 @@ async fn extension_loop_active(
593592
// If we get platform.runtimeDone or platform.runtimeReport
594593
// That's fine, we still wait to break until we get the response from next
595594
// and then we break to determine if we'll flush or not
596-
let next_lambda_response = next_event(client, &r.extension_id);
595+
let next_lambda_response =
596+
handle_next_invocation(client, &r.extension_id, invocation_processor.clone());
597597
tokio::pin!(next_lambda_response);
598598
'next_invocation: loop {
599599
tokio::select! {
@@ -607,7 +607,7 @@ async fn extension_loop_active(
607607
race_flush_interval.reset();
608608
// Thank you for not removing race_flush_interval.reset();
609609

610-
shutdown = handle_next_invocation(next_response, invocation_processor.clone()).await;
610+
maybe_shutdown_event= next_response;
611611
// Need to break here to re-call next
612612
break 'next_invocation;
613613
}
@@ -629,19 +629,26 @@ async fn extension_loop_active(
629629
}
630630
}
631631

632-
if shutdown {
632+
if let NextEventResponse::Shutdown {
633+
shutdown_reason, ..
634+
} = maybe_shutdown_event
635+
{
633636
// Redrive/block on any failed payloads
634637
let tf = trace_flusher.clone();
635638
pending_flush_handles
636639
.await_flush_handles(&logs_flusher.clone(), &tf, &metrics_flusher)
637640
.await;
638-
'shutdown: loop {
639-
tokio::select! {
640-
Some(event) = event_bus.rx.recv() => {
641-
if let Some(telemetry_event) = handle_event_bus_event(event, invocation_processor.clone(), tags_provider.clone(), trace_processor.clone(), trace_agent_channel.clone()).await {
642-
if let TelemetryRecord::PlatformReport{ .. } = telemetry_event.record {
643-
// Wait for the report event before shutting down
644-
break 'shutdown;
641+
// The Shutdown event we get during a timeout will
642+
// never include a report log
643+
if shutdown_reason != "timeout" {
644+
'shutdown: loop {
645+
tokio::select! {
646+
Some(event) = event_bus.rx.recv() => {
647+
if let Some(telemetry_event) = handle_event_bus_event(event, invocation_processor.clone(), tags_provider.clone(), trace_processor.clone(), trace_agent_channel.clone()).await {
648+
if let TelemetryRecord::PlatformReport{ .. } = telemetry_event.record {
649+
// Wait for the report event before shutting down
650+
break 'shutdown;
651+
}
645652
}
646653
}
647654
}
@@ -759,39 +766,44 @@ async fn handle_event_bus_event(
759766
}
760767

761768
async fn handle_next_invocation(
762-
next_response: Result<NextEventResponse>,
769+
client: &Client,
770+
ext_id: &str,
763771
invocation_processor: Arc<TokioMutex<InvocationProcessor>>,
764-
) -> bool {
772+
) -> NextEventResponse {
773+
let next_response = next_event(client, ext_id).await;
765774
match next_response {
766775
Ok(NextEventResponse::Invoke {
767-
request_id,
776+
ref request_id,
768777
deadline_ms,
769-
invoked_function_arn,
778+
ref invoked_function_arn,
770779
}) => {
771780
debug!(
772781
"Invoke event {}; deadline: {}, invoked_function_arn: {}",
773-
request_id, deadline_ms, invoked_function_arn
782+
request_id.clone(),
783+
deadline_ms,
784+
invoked_function_arn.clone()
774785
);
775786
let mut p = invocation_processor.lock().await;
776-
p.on_invoke_event(request_id);
787+
p.on_invoke_event(request_id.into());
777788
drop(p);
778-
false
779789
}
780790
Ok(NextEventResponse::Shutdown {
781-
shutdown_reason,
791+
ref shutdown_reason,
782792
deadline_ms,
783793
}) => {
784794
let mut p = invocation_processor.lock().await;
785795
p.on_shutdown_event();
786796
println!("Exiting: {shutdown_reason}, deadline: {deadline_ms}");
787-
true
788797
}
789-
Err(err) => {
798+
Err(ref err) => {
790799
eprintln!("Error: {err:?}");
791800
println!("Exiting");
792-
true
793801
}
794802
}
803+
next_response.unwrap_or(NextEventResponse::Shutdown {
804+
shutdown_reason: "panic".into(),
805+
deadline_ms: 0,
806+
})
795807
}
796808

797809
fn setup_tag_provider(

bottlecap/src/logs/lambda/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ impl Message {
4343
request_id: Option<String>,
4444
function_arn: String,
4545
timestamp: i64,
46+
status: Option<String>,
4647
) -> Message {
4748
Message {
4849
message,
@@ -51,7 +52,7 @@ impl Message {
5152
request_id,
5253
},
5354
timestamp,
54-
status: "info".to_string(),
55+
status: status.unwrap_or("info".to_string()),
5556
}
5657
}
5758
}

bottlecap/src/logs/lambda/processor.rs

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ impl LambdaProcessor {
103103
None,
104104
self.function_arn.clone(),
105105
event.time.timestamp_millis(),
106+
None,
106107
));
107108
}
108109

@@ -125,6 +126,7 @@ impl LambdaProcessor {
125126
None,
126127
self.function_arn.clone(),
127128
event.time.timestamp_millis(),
129+
None,
128130
))
129131
},
130132
// TODO: check if we could do anything with the fields from `PlatformInitReport`
@@ -153,18 +155,24 @@ impl LambdaProcessor {
153155
Some(request_id),
154156
self.function_arn.clone(),
155157
event.time.timestamp_millis(),
158+
None,
156159
))
157160
},
158-
TelemetryRecord::PlatformRuntimeDone { request_id, status, metrics, .. } => { // TODO: check what to do with rest of the fields
161+
TelemetryRecord::PlatformRuntimeDone { request_id, status, metrics, error_type, .. } => { // TODO: check what to do with rest of the fields
159162
if let Err(e) = self.event_bus.send(Event::Telemetry(copy)).await {
160163
error!("Failed to send PlatformRuntimeDone to the main event bus: {}", e);
161164
}
162165

163166
let mut message = format!("END RequestId: {request_id}");
167+
let mut result_status = "info".to_string();
164168
if let Some(metrics) = metrics {
165169
self.invocation_context.runtime_duration_ms = metrics.duration_ms;
166170
if status == Status::Timeout {
167171
message.push_str(&format!(" Task timed out after {:.2} seconds", metrics.duration_ms / 1000.0));
172+
result_status = "error".to_string();
173+
} else if status == Status::Error {
174+
message.push_str(&format!(" Task failed: {:?}", error_type.unwrap_or_default()));
175+
result_status = "error".to_string();
168176
}
169177
}
170178
// Remove the `request_id` since no more orphan logs will be processed with this one
@@ -175,6 +183,7 @@ impl LambdaProcessor {
175183
Some(request_id),
176184
self.function_arn.clone(),
177185
event.time.timestamp_millis(),
186+
Some(result_status),
178187
))
179188
},
180189
TelemetryRecord::PlatformReport { request_id, metrics, .. } => { // TODO: check what to do with rest of the fields
@@ -209,6 +218,7 @@ impl LambdaProcessor {
209218
Some(request_id),
210219
self.function_arn.clone(),
211220
event.time.timestamp_millis(),
221+
None,
212222
))
213223
},
214224
// TODO: PlatformInitRuntimeDone
@@ -522,7 +532,7 @@ mod tests {
522532
request_id: Some("test-request-id".to_string()),
523533
},
524534
timestamp: 1_673_061_827_000,
525-
status: "info".to_string(),
535+
status: "error".to_string(),
526536
},
527537
),
528538

0 commit comments

Comments
 (0)