Skip to content

Commit 0177b3c

Browse files
committed
feat(orchestrator): classify execution end as killed, paused, or crashed
Record orchestrator.sandbox.execution.duration from a single termination observer instead of inline in Delete and Pause, so abnormal exits are captured too. The end_reason label now distinguishes killed (the kill reason), paused, and crashed. Add Metadata.endReason (guarded like startedAt/endAt) set by the teardown initiator: Delete sets the kill reason, Pause sets "pause", and Checkpoint marks the old sandbox "checkpoint" so its in-place hand-off is not counted. setupSandboxLifecycle, the one goroutine that observes every sandbox's exit, reads the reason and records the sample; an unset reason with a non-nil exit error is classified as crashed, otherwise unknown.
1 parent 594d3db commit 0177b3c

3 files changed

Lines changed: 115 additions & 8 deletions

File tree

packages/orchestrator/pkg/sandbox/sandbox.go

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,9 +222,39 @@ type Metadata struct {
222222
Config *Config
223223
Runtime RuntimeMetadata
224224

225-
rwmu sync.RWMutex // protects startedAt, endAt
225+
rwmu sync.RWMutex // protects startedAt, endAt, endReason
226226
startedAt time.Time
227227
endAt time.Time
228+
229+
// endReason records why the execution ended (e.g. a kill reason, "pause",
230+
// or "checkpoint"). It is set by whoever initiates the teardown and read at
231+
// the single termination observer (setupSandboxLifecycle) to label the
232+
// execution-duration metric. An empty value means the sandbox terminated
233+
// without an initiated teardown — i.e. it crashed.
234+
endReason string
235+
}
236+
237+
// GetEndReason returns the reason the execution ended in a thread-safe manner.
238+
// An empty string means no teardown reason was set (the sandbox crashed).
239+
func (m *Metadata) GetEndReason() string {
240+
m.rwmu.RLock()
241+
defer m.rwmu.RUnlock()
242+
243+
return m.endReason
244+
}
245+
246+
// SetEndReason records why the execution ended in a thread-safe manner. The
247+
// first non-empty reason wins; later calls are ignored so an initiated teardown
248+
// (kill/pause/checkpoint) is never overwritten by a subsequent stop.
249+
func (m *Metadata) SetEndReason(reason string) {
250+
m.rwmu.Lock()
251+
defer m.rwmu.Unlock()
252+
253+
if m.endReason != "" {
254+
return
255+
}
256+
257+
m.endReason = reason
228258
}
229259

230260
// GetEndAt returns the sandbox end time in a thread-safe manner.

packages/orchestrator/pkg/server/sandboxes.go

Lines changed: 61 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,11 @@ func (s *Server) Delete(ctxConn context.Context, in *orchestrator.SandboxDeleteR
532532

533533
sbxlogger.E(sbx).Info(ctx, "Killing sandbox", zap.String("kill_reason", killReason))
534534

535+
// Record why the execution is ending before triggering the stop, so the
536+
// lifecycle observer reads the kill reason rather than treating it as a
537+
// crash. The execution-duration sample is emitted there, not here.
538+
sbx.SetEndReason(killReason)
539+
535540
// Check health metrics before stopping the sandbox
536541
sbx.Checks.Healthcheck(ctx, true)
537542

@@ -552,7 +557,6 @@ func (s *Server) Delete(ctxConn context.Context, in *orchestrator.SandboxDeleteR
552557
eventData[executionEventDataKey] = s.getSandboxExecutionData(sbx)
553558
addKillReason(eventData, killReason)
554559
recordSandboxKill(ctx, s.sandboxKilledCounter, killReason)
555-
s.recordExecutionDuration(ctx, sbx, killReason)
556560

557561
eventType := events.SandboxKilledEventPair
558562
go s.sbxEventsService.Publish(
@@ -595,14 +599,47 @@ func recordSandboxKill(ctx context.Context, counter metric.Int64Counter, killRea
595599
counter.Add(ctx, 1, metric.WithAttributes(attribute.String("kill_reason", killReason)))
596600
}
597601

598-
// endReasonPause labels execution-duration samples for executions that ended
599-
// because the sandbox was paused rather than killed.
600-
const endReasonPause = "pause"
602+
const (
603+
// endReasonPause labels execution-duration samples for executions that
604+
// ended because the sandbox was paused.
605+
endReasonPause = "pause"
606+
607+
// endReasonCrashed labels executions that ended without an initiated
608+
// teardown — the Firecracker process exited on its own (crash, OOM, host
609+
// failure) and no kill/pause set an end reason.
610+
endReasonCrashed = "crashed"
611+
612+
// endReasonCheckpoint marks the old sandbox object replaced by an
613+
// in-place checkpoint resume. The ExecutionID continues on the resumed
614+
// sandbox, so this stop is not an execution boundary and must not emit a
615+
// duration sample.
616+
endReasonCheckpoint = "checkpoint"
617+
)
618+
619+
// resolveEndReason maps a sandbox's recorded teardown reason to the
620+
// execution-duration end_reason label. An unset reason means no teardown was
621+
// initiated: a non-nil exit error is a crash, otherwise it is unknown.
622+
func resolveEndReason(reason string, exitErr error) string {
623+
if reason != "" {
624+
return reason
625+
}
626+
627+
if exitErr != nil {
628+
return endReasonCrashed
629+
}
630+
631+
return killReasonUnknown
632+
}
601633

602634
// recordExecutionDuration records the duration of a single sandbox execution
603635
// (start/resume until pause or kill) tagged with the reason the execution
604-
// ended. For kills this is the kill reason; for pauses it is endReasonPause.
636+
// ended.
605637
func (s *Server) recordExecutionDuration(ctx context.Context, sbx *sandbox.Sandbox, endReason string) {
638+
if endReason == endReasonCheckpoint {
639+
// An in-place checkpoint resume is not an execution boundary.
640+
return
641+
}
642+
606643
startedAt := sbx.GetStartedAt()
607644
if startedAt.IsZero() {
608645
// A zero start time means the sandbox never finished starting (e.g. it
@@ -664,6 +701,11 @@ func (s *Server) Pause(ctx context.Context, in *orchestrator.SandboxPauseRequest
664701

665702
sbxlogger.E(sbx).Info(ctx, "Pausing sandbox")
666703

704+
// Mark this execution as ending in a pause before the stop is triggered, so
705+
// the lifecycle observer records it as paused rather than crashed. The
706+
// execution-duration sample is emitted there, not here.
707+
sbx.SetEndReason(endReasonPause)
708+
667709
// Stop the old sandbox in background after we're done
668710
defer s.stopSandboxAsync(context.WithoutCancel(ctx), sbx)
669711

@@ -677,8 +719,6 @@ func (s *Server) Pause(ctx context.Context, in *orchestrator.SandboxPauseRequest
677719

678720
s.uploadSnapshotAsync(ctx, sbx, res)
679721

680-
s.recordExecutionDuration(ctx, sbx, endReasonPause)
681-
682722
teamID, buildId, eventData := s.prepareSandboxEventData(ctx, sbx)
683723
eventData[executionEventDataKey] = s.getSandboxExecutionData(sbx)
684724

@@ -818,6 +858,14 @@ func (s *Server) Checkpoint(ctx context.Context, in *orchestrator.SandboxCheckpo
818858
// after the last checkpoint.
819859
resumedSbx.SetStartedAt(sbx.GetStartedAt())
820860

861+
// The resume succeeded and resumedSbx now carries this execution. Mark the
862+
// old sandbox object as a checkpoint hand-off so its impending stop is not
863+
// recorded as an execution end (or a crash) — the execution continues under
864+
// the same ExecutionID and will be recorded when resumedSbx is paused or
865+
// killed. On checkpoint failure we return before this point, so the old
866+
// sandbox's abnormal stop is still recorded as a crash.
867+
sbx.SetEndReason(endReasonCheckpoint)
868+
821869
// Collect prefetch data immediately after resume while it's most accurate
822870
prefetchData, prefetchErr := resumedSbx.MemoryPrefetchData(ctx)
823871
if prefetchErr != nil {
@@ -1063,6 +1111,12 @@ func (s *Server) setupSandboxLifecycle(ctx context.Context, sbx *sandbox.Sandbox
10631111
sbxlogger.I(sbx).Error(ctx, "failed to wait for sandbox, cleaning up", zap.Error(waitErr))
10641112
}
10651113

1114+
// This is the single observer of every execution end (kill, pause, or
1115+
// crash), so record the execution duration here. An unset end reason
1116+
// means no teardown was initiated: derive killed/paused from the reason
1117+
// set by Delete/Pause, otherwise classify by the exit error as crashed.
1118+
s.recordExecutionDuration(ctx, sbx, resolveEndReason(sbx.GetEndReason(), waitErr))
1119+
10661120
cleanupErr := sbx.Close(ctx)
10671121
if cleanupErr != nil {
10681122
sbxlogger.I(sbx).Error(ctx, "failed to cleanup sandbox, will remove from cache", zap.Error(cleanupErr))

packages/orchestrator/pkg/server/sandboxes_test.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ package server
44

55
import (
66
"context"
7+
"errors"
78
"net"
89
"reflect"
910
"testing"
@@ -212,12 +213,17 @@ func TestRecordExecutionDuration(t *testing.T) {
212213

213214
s.recordExecutionDuration(context.Background(), newSbx(), "timeout")
214215
s.recordExecutionDuration(context.Background(), newSbx(), endReasonPause)
216+
s.recordExecutionDuration(context.Background(), newSbx(), endReasonCrashed)
215217
s.recordExecutionDuration(context.Background(), newSbx(), "")
216218

217219
// A sandbox that never finished starting has a zero start time and must not
218220
// emit a sample (it would be a massive time.Since(zero) outlier).
219221
s.recordExecutionDuration(context.Background(), &sandbox.Sandbox{Metadata: &sandbox.Metadata{}}, "timeout")
220222

223+
// An in-place checkpoint hand-off is not an execution boundary and must not
224+
// emit a sample.
225+
s.recordExecutionDuration(context.Background(), newSbx(), endReasonCheckpoint)
226+
221227
var rm metricdata.ResourceMetrics
222228
require.NoError(t, reader.Collect(context.Background(), &rm))
223229

@@ -241,5 +247,22 @@ func TestRecordExecutionDuration(t *testing.T) {
241247

242248
assert.Equal(t, uint64(1), got["timeout"])
243249
assert.Equal(t, uint64(1), got[endReasonPause])
250+
assert.Equal(t, uint64(1), got[endReasonCrashed])
244251
assert.Equal(t, uint64(1), got[killReasonUnknown])
252+
assert.Zero(t, got[endReasonCheckpoint])
253+
}
254+
255+
func TestResolveEndReason(t *testing.T) {
256+
t.Parallel()
257+
258+
someErr := errors.New("fc exited")
259+
260+
// An explicit reason always wins, regardless of the exit error.
261+
assert.Equal(t, "timeout", resolveEndReason("timeout", nil))
262+
assert.Equal(t, endReasonPause, resolveEndReason(endReasonPause, someErr))
263+
assert.Equal(t, endReasonCheckpoint, resolveEndReason(endReasonCheckpoint, someErr))
264+
265+
// No reason set: a non-nil exit error is a crash, otherwise unknown.
266+
assert.Equal(t, endReasonCrashed, resolveEndReason("", someErr))
267+
assert.Equal(t, killReasonUnknown, resolveEndReason("", nil))
245268
}

0 commit comments

Comments
 (0)