@@ -532,6 +532,11 @@ func (s *Server) Delete(ctxConn context.Context, in *orchestrator.SandboxDeleteR
532532
533533 sbxlogger .E (sbx ).Info (ctx , "Killing sandbox" , zap .String ("kill_reason" , killReason ))
534534
535+ // Record why the execution is ending before triggering the stop, so the
536+ // lifecycle observer reads the kill reason rather than treating it as a
537+ // crash. The execution-duration sample is emitted there, not here.
538+ sbx .SetEndReason (killReason )
539+
535540 // Check health metrics before stopping the sandbox
536541 sbx .Checks .Healthcheck (ctx , true )
537542
@@ -552,7 +557,6 @@ func (s *Server) Delete(ctxConn context.Context, in *orchestrator.SandboxDeleteR
552557 eventData [executionEventDataKey ] = s .getSandboxExecutionData (sbx )
553558 addKillReason (eventData , killReason )
554559 recordSandboxKill (ctx , s .sandboxKilledCounter , killReason )
555- s .recordExecutionDuration (ctx , sbx , killReason )
556560
557561 eventType := events .SandboxKilledEventPair
558562 go s .sbxEventsService .Publish (
@@ -595,14 +599,47 @@ func recordSandboxKill(ctx context.Context, counter metric.Int64Counter, killRea
595599 counter .Add (ctx , 1 , metric .WithAttributes (attribute .String ("kill_reason" , killReason )))
596600}
597601
598- // endReasonPause labels execution-duration samples for executions that ended
599- // because the sandbox was paused rather than killed.
600- const endReasonPause = "pause"
// End-reason labels that are set on a sandbox before (or instead of) a kill
// reason and later read back when the execution-duration sample is recorded.
const (
	// endReasonPause is the end reason for an execution that stopped because
	// its sandbox was paused.
	endReasonPause = "pause"

	// endReasonCrashed is the end reason for an execution that stopped with
	// no teardown initiated: the Firecracker process exited on its own
	// (crash, OOM, host failure) and neither a kill nor a pause set a reason.
	endReasonCrashed = "crashed"

	// endReasonCheckpoint is set on the old sandbox object that an in-place
	// checkpoint resume replaces. The ExecutionID carries over to the resumed
	// sandbox, so stopping the old object is not an execution boundary and
	// must not produce a duration sample.
	endReasonCheckpoint = "checkpoint"
)
618+
619+ // resolveEndReason maps a sandbox's recorded teardown reason to the
620+ // execution-duration end_reason label. An unset reason means no teardown was
621+ // initiated: a non-nil exit error is a crash, otherwise it is unknown.
622+ func resolveEndReason (reason string , exitErr error ) string {
623+ if reason != "" {
624+ return reason
625+ }
626+
627+ if exitErr != nil {
628+ return endReasonCrashed
629+ }
630+
631+ return killReasonUnknown
632+ }
601633
602634// recordExecutionDuration records the duration of a single sandbox execution
603635// (start/resume until pause, kill, or crash) tagged with the reason the execution
604- // ended. For kills this is the kill reason; for pauses it is endReasonPause.
636+ // ended.
605637func (s * Server ) recordExecutionDuration (ctx context.Context , sbx * sandbox.Sandbox , endReason string ) {
638+ if endReason == endReasonCheckpoint {
639+ // An in-place checkpoint resume is not an execution boundary.
640+ return
641+ }
642+
606643 startedAt := sbx .GetStartedAt ()
607644 if startedAt .IsZero () {
608645 // A zero start time means the sandbox never finished starting (e.g. it
@@ -664,6 +701,11 @@ func (s *Server) Pause(ctx context.Context, in *orchestrator.SandboxPauseRequest
664701
665702 sbxlogger .E (sbx ).Info (ctx , "Pausing sandbox" )
666703
704+ // Mark this execution as ending in a pause before the stop is triggered, so
705+ // the lifecycle observer records it as paused rather than crashed. The
706+ // execution-duration sample is emitted there, not here.
707+ sbx .SetEndReason (endReasonPause )
708+
667709 // Stop the old sandbox in background after we're done
668710 defer s .stopSandboxAsync (context .WithoutCancel (ctx ), sbx )
669711
@@ -677,8 +719,6 @@ func (s *Server) Pause(ctx context.Context, in *orchestrator.SandboxPauseRequest
677719
678720 s .uploadSnapshotAsync (ctx , sbx , res )
679721
680- s .recordExecutionDuration (ctx , sbx , endReasonPause )
681-
682722 teamID , buildId , eventData := s .prepareSandboxEventData (ctx , sbx )
683723 eventData [executionEventDataKey ] = s .getSandboxExecutionData (sbx )
684724
@@ -818,6 +858,14 @@ func (s *Server) Checkpoint(ctx context.Context, in *orchestrator.SandboxCheckpo
818858 // after the last checkpoint.
819859 resumedSbx .SetStartedAt (sbx .GetStartedAt ())
820860
861+ // The resume succeeded and resumedSbx now carries this execution. Mark the
862+ // old sandbox object as a checkpoint hand-off so its impending stop is not
863+ // recorded as an execution end (or a crash) — the execution continues under
864+ // the same ExecutionID and will be recorded when resumedSbx is paused or
865+ // killed. On checkpoint failure we return before this point, so the old
866+ // sandbox's abnormal stop is still recorded as a crash.
867+ sbx .SetEndReason (endReasonCheckpoint )
868+
821869 // Collect prefetch data immediately after resume while it's most accurate
822870 prefetchData , prefetchErr := resumedSbx .MemoryPrefetchData (ctx )
823871 if prefetchErr != nil {
@@ -1063,6 +1111,12 @@ func (s *Server) setupSandboxLifecycle(ctx context.Context, sbx *sandbox.Sandbox
10631111 sbxlogger .I (sbx ).Error (ctx , "failed to wait for sandbox, cleaning up" , zap .Error (waitErr ))
10641112 }
10651113
1114+ // This is the single observer of every execution end (kill, pause, or
1115+ // crash), so record the execution duration here. Delete/Pause set the end
1116+ // reason before stopping and it is used as-is; if it is unset, no teardown
1117+ // was initiated and the exit error decides between crashed and unknown.
1118+ s .recordExecutionDuration (ctx , sbx , resolveEndReason (sbx .GetEndReason (), waitErr ))
1119+
10661120 cleanupErr := sbx .Close (ctx )
10671121 if cleanupErr != nil {
10681122 sbxlogger .I (sbx ).Error (ctx , "failed to cleanup sandbox, will remove from cache" , zap .Error (cleanupErr ))
0 commit comments