Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 51 additions & 3 deletions packages/orchestrator/pkg/sandbox/sandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,16 @@ const (
SandboxTypeBuild SandboxType = "build"
)

type StopReason string

const (
StopReasonKilled StopReason = "killed"
StopReasonPaused StopReason = "paused"
StopReasonCrashed StopReason = "crashed"
StopReasonCheckpointing StopReason = "checkpointing"
StopReasonUnknown StopReason = "unknown"
)

// String returns the sandbox type as a string, defaulting to "sandbox" if empty.
func (t SandboxType) String() string {
if t == "" {
Expand Down Expand Up @@ -222,9 +232,10 @@ type Metadata struct {
Config *Config
Runtime RuntimeMetadata

rwmu sync.RWMutex // protects startedAt, endAt
startedAt time.Time
endAt time.Time
rwmu sync.RWMutex // protects startedAt, endAt, endReason
startedAt time.Time
endAt time.Time
stopReason *StopReason
}

// GetEndAt returns the sandbox end time in a thread-safe manner.
Expand Down Expand Up @@ -290,6 +301,43 @@ type Sandbox struct {
startupStatsOnce sync.Once
}

// GetStopReason returns stop reason, in case of sandbox exiting sooner, the reason is returned as crashed
func (s *Sandbox) GetStopReason(ctx context.Context) StopReason {
s.Metadata.rwmu.RLock()
defer s.Metadata.rwmu.RUnlock()

if s.Metadata.stopReason == nil {
select {
case <-s.process.Exit.Done():
// The Firecracker process already exited but no reason was recorded,
// so the sandbox went down on its own rather than via an explicit
// pause/checkpoint/kill.

logger.L().Error(ctx, "sandbox crashed", logger.WithSandboxID(s.Runtime.SandboxID), logger.WithExecutionID(s.Runtime.ExecutionID), logger.WithTemplateID(s.Runtime.TemplateID))

return StopReasonCrashed
default:
logger.L().Warn(ctx, "unknown sandbox stop reason", logger.WithSandboxID(s.Runtime.SandboxID), logger.WithExecutionID(s.Runtime.ExecutionID), logger.WithTemplateID(s.Runtime.TemplateID))

return StopReasonUnknown
}
}

return *s.Metadata.stopReason
}
Comment thread
cursor[bot] marked this conversation as resolved.

// SetStopReason records why the execution ended. The first call wins
func (m *Metadata) SetStopReason(reason StopReason) {
m.rwmu.Lock()
defer m.rwmu.Unlock()

if m.stopReason != nil {
return
}

m.stopReason = &reason
}

func (s *Sandbox) LoggerMetadata() sbxlogger.SandboxMetadata {
return sbxlogger.SandboxMetadata{
SandboxID: s.Runtime.SandboxID,
Expand Down
41 changes: 24 additions & 17 deletions packages/orchestrator/pkg/server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,23 +47,24 @@ type Server struct {
orchestrator.UnimplementedSandboxServiceServer
orchestrator.UnimplementedChunkServiceServer

config cfg.Config
sandboxFactory *sandbox.Factory
info *service.ServiceInfo
proxy *proxy.SandboxProxy
networkPool *network.Pool
templateCache *template.Cache
devicePool *nbd.DevicePool
persistence storage.StorageProvider
featureFlags *featureflags.Client
sbxEventsService *events.EventsService
startingSandboxes *utils.AdjustableSemaphore
peerRegistry peerclient.Registry
uploadedBuilds *ttlcache.Cache[string, struct{}]
uploads *sandbox.Uploads
sandboxCreateDuration metric.Int64Histogram
sandboxKilledCounter metric.Int64Counter
uploadFailedCounter metric.Int64Counter
config cfg.Config
sandboxFactory *sandbox.Factory
info *service.ServiceInfo
proxy *proxy.SandboxProxy
networkPool *network.Pool
templateCache *template.Cache
devicePool *nbd.DevicePool
persistence storage.StorageProvider
featureFlags *featureflags.Client
sbxEventsService *events.EventsService
startingSandboxes *utils.AdjustableSemaphore
peerRegistry peerclient.Registry
uploadedBuilds *ttlcache.Cache[string, struct{}]
uploads *sandbox.Uploads
sandboxCreateDuration metric.Int64Histogram
sandboxExecutionDuration metric.Int64Histogram
sandboxKilledCounter metric.Int64Counter
uploadFailedCounter metric.Int64Counter

// uploadsWG tracks in-flight async snapshot uploads so a graceful shutdown
// can wait for them to finish instead of dropping them. uploadsInFlight is
Expand Down Expand Up @@ -129,6 +130,12 @@ func New(ctx context.Context, cfg ServiceConfig) (*Server, error) {
}
server.sandboxCreateDuration = sandboxCreateDuration

sandboxExecutionDuration, err := telemetry.GetHistogram(meter, telemetry.OrchestratorSandboxExecutionDurationName)
if err != nil {
return nil, fmt.Errorf("failed to register sandbox execution duration histogram: %w", err)
}
server.sandboxExecutionDuration = sandboxExecutionDuration

sandboxKilledCounter, err := telemetry.GetCounter(meter, telemetry.OrchestratorSandboxKilledCounterName)
if err != nil {
return nil, fmt.Errorf("failed to register sandbox kills counter: %w", err)
Expand Down
19 changes: 19 additions & 0 deletions packages/orchestrator/pkg/server/sandboxes.go
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,8 @@ func (s *Server) Delete(ctxConn context.Context, in *orchestrator.SandboxDeleteR

sbxlogger.E(sbx).Info(ctx, "Killing sandbox", zap.String("kill_reason", killReason))

sbx.SetStopReason(sandbox.StopReasonKilled)

// Check health metrics before stopping the sandbox
sbx.Checks.Healthcheck(ctx, true)

Expand Down Expand Up @@ -594,6 +596,14 @@ func recordSandboxKill(ctx context.Context, counter metric.Int64Counter, killRea
counter.Add(ctx, 1, metric.WithAttributes(attribute.String("kill_reason", killReason)))
}

// recordExecutionDuration records the duration of a single sandbox execution
func (s *Server) recordExecutionDuration(ctx context.Context, sbx *sandbox.Sandbox, stopReason sandbox.StopReason) {
startedAt := sbx.GetStartedAt()

s.sandboxExecutionDuration.Record(ctx, time.Since(startedAt).Milliseconds(),
metric.WithAttributes(attribute.String("stop_reason", string(stopReason))))
}

func (s *Server) Pause(ctx context.Context, in *orchestrator.SandboxPauseRequest) (*orchestrator.SandboxPauseResponse, error) {
ctx, childSpan := tracer.Start(ctx, "sandbox-pause")
defer childSpan.End()
Expand Down Expand Up @@ -637,6 +647,7 @@ func (s *Server) Pause(ctx context.Context, in *orchestrator.SandboxPauseRequest
}

sbxlogger.E(sbx).Info(ctx, "Pausing sandbox")
sbx.SetStopReason(sandbox.StopReasonPaused)

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Failed pause labeled as pause

Medium Severity

SetEndReason(endReasonPause) runs before snapshotAndCacheSandbox. If snapshotting fails, the handler still returns an error but the deferred stopSandboxAsync tears the sandbox down, and setupSandboxLifecycle records orchestrator.sandbox.execution.duration with end_reason=pause even though the pause RPC failed and no paused event was published.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 0177b3c. Configure here.

// Stop the old sandbox in background after we're done
defer s.stopSandboxAsync(context.WithoutCancel(ctx), sbx)
Expand Down Expand Up @@ -737,6 +748,7 @@ func (s *Server) Checkpoint(ctx context.Context, in *orchestrator.SandboxCheckpo
defer s.stopSandboxAsync(context.WithoutCancel(ctx), sbx)

sbxlogger.E(sbx).Info(ctx, "Checkpointing sandbox")
sbx.SetStopReason(sandbox.StopReasonCheckpointing)

// Checkpoint always takes a full memory snapshot; filesystem-only checkpoint
// (resume-in-place would need to reboot) is not supported yet.
Expand Down Expand Up @@ -821,6 +833,11 @@ func (s *Server) Checkpoint(ctx context.Context, in *orchestrator.SandboxCheckpo
if err != nil {
telemetry.ReportCriticalError(ctx, "error uploading snapshot for checkpoint", err, telemetry.WithSandboxID(in.GetSandboxId()))

// Orchestrator-initiated teardown: the snapshot upload failed so the
// resumed sandbox is unusable and we stop it ourselves. Record the
// reason so the lifecycle doesn't mislabel this as a crash.
resumedSbx.SetStopReason(sandbox.StopReasonKilled)

s.sandboxFactory.Sandboxes.MarkStopping(ctx, resumedSbx.Runtime.SandboxID, resumedSbx.LifecycleID)
s.stopSandboxAsync(context.WithoutCancel(ctx), resumedSbx)

Expand Down Expand Up @@ -1027,6 +1044,8 @@ func (s *Server) setupSandboxLifecycle(ctx context.Context, sbx *sandbox.Sandbox
sbxlogger.I(sbx).Error(ctx, "failed to wait for sandbox, cleaning up", zap.Error(waitErr))
}

s.recordExecutionDuration(ctx, sbx, sbx.GetStopReason(ctx))

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Record execution duration at pause time

When a sandbox is paused or checkpointed, sbx.Pause has already paused the VM, but this lifecycle path does not sample the new histogram until after stopSandboxAsync runs and sbx.Wait returns. In the checkpoint flow the stop is deferred before snapshotting and, with synchronous checkpoint uploads, the handler can also wait in res.upload.Run before that defer fires, so stop_reason=checkpointing samples include snapshot/resume/upload time after the old VM stopped executing. Record the duration when the VM is paused, or pass that end timestamp into the lifecycle recorder, so the new execution-duration metric is not inflated for pause/checkpoint paths.

Useful? React with 👍 / 👎.


cleanupErr := sbx.Close(ctx)
if cleanupErr != nil {
sbxlogger.I(sbx).Error(ctx, "failed to cleanup sandbox, will remove from cache", zap.Error(cleanupErr))
Expand Down
47 changes: 47 additions & 0 deletions packages/orchestrator/pkg/server/sandboxes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,3 +192,50 @@ func TestRecordSandboxKill(t *testing.T) {
assert.Equal(t, int64(1), got["timeout"])
assert.Equal(t, int64(1), got[killReasonUnknown])
}

func TestRecordExecutionDuration(t *testing.T) {
t.Parallel()

reader := sdkmetric.NewManualReader()
meter := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)).Meter("github.com/e2b-dev/infra/packages/orchestrator/pkg/server")
histogram, err := telemetry.GetHistogram(meter, telemetry.OrchestratorSandboxExecutionDurationName)
require.NoError(t, err)

s := &Server{sandboxExecutionDuration: histogram}

newSbx := func() *sandbox.Sandbox {
sbx := &sandbox.Sandbox{Metadata: &sandbox.Metadata{}}
sbx.SetStartedAt(time.Now().Add(-time.Minute))

return sbx
}

s.recordExecutionDuration(context.Background(), newSbx(), sandbox.StopReasonKilled)
s.recordExecutionDuration(context.Background(), newSbx(), sandbox.StopReasonPaused)
s.recordExecutionDuration(context.Background(), newSbx(), sandbox.StopReasonCrashed)

var rm metricdata.ResourceMetrics
require.NoError(t, reader.Collect(context.Background(), &rm))

got := map[string]uint64{}
for _, sm := range rm.ScopeMetrics {
for _, m := range sm.Metrics {
if m.Name != string(telemetry.OrchestratorSandboxExecutionDurationName) {
continue
}

hist, ok := m.Data.(metricdata.Histogram[int64])
require.True(t, ok)

for _, dp := range hist.DataPoints {
v, ok := dp.Attributes.Value(attribute.Key("stop_reason"))
require.True(t, ok)
got[v.AsString()] += dp.Count
}
}
}

assert.Equal(t, uint64(1), got[string(sandbox.StopReasonKilled)])
assert.Equal(t, uint64(1), got[string(sandbox.StopReasonPaused)])
assert.Equal(t, uint64(1), got[string(sandbox.StopReasonCrashed)])
}
25 changes: 14 additions & 11 deletions packages/shared/pkg/telemetry/meters.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,10 @@ const (
BuildStepDurationHistogramName HistogramType = "template.build.step.duration"

// Sandbox timing histograms
OrchestratorSandboxCreateDurationName HistogramType = "orchestrator.sandbox.create.duration"
WaitForEnvdDurationHistogramName HistogramType = "orchestrator.sandbox.envd.init.duration"
GuestSyncDurationHistogramName HistogramType = "orchestrator.sandbox.guest_sync.duration"
OrchestratorSandboxCreateDurationName HistogramType = "orchestrator.sandbox.create.duration"
OrchestratorSandboxExecutionDurationName HistogramType = "orchestrator.sandbox.execution.duration"
WaitForEnvdDurationHistogramName HistogramType = "orchestrator.sandbox.envd.init.duration"
GuestSyncDurationHistogramName HistogramType = "orchestrator.sandbox.guest_sync.duration"

// Pre-pause envd heap collapse round-trip duration (the pause-path cost of
// POST /collapse: network plus envd's madvise work), recorded once per pause
Expand Down Expand Up @@ -406,14 +407,15 @@ func GetGaugeInt(meter metric.Meter, name GaugeIntType) (metric.Int64ObservableG
var histogramDesc = map[HistogramType]string{
ApiRedisStoragePublisherPublishDuration: "Duration of a single Redis PUBLISH round-trip from the storage publisher",

BuildDurationHistogramName: "Time taken to build a template",
BuildPhaseDurationHistogramName: "Time taken to build each phase of a template",
BuildStepDurationHistogramName: "Time taken to build each step of a template",
BuildRootfsSizeHistogramName: "Size of the built template rootfs in bytes",
OrchestratorSandboxCreateDurationName: "Time taken to create a sandbox",
WaitForEnvdDurationHistogramName: "Time taken for Envd to initialize successfully",
EnvdCollapseDurationHistogramName: "Time taken for the pre-pause envd heap collapse round-trip",
GuestSyncDurationHistogramName: "Time taken for the mandatory pre-pause guest sync (filesystem-only pause)",
BuildDurationHistogramName: "Time taken to build a template",
BuildPhaseDurationHistogramName: "Time taken to build each phase of a template",
BuildStepDurationHistogramName: "Time taken to build each step of a template",
BuildRootfsSizeHistogramName: "Size of the built template rootfs in bytes",
OrchestratorSandboxCreateDurationName: "Time taken to create a sandbox",
OrchestratorSandboxExecutionDurationName: "Duration of a single sandbox execution, from start/resume until it ends",
WaitForEnvdDurationHistogramName: "Time taken for Envd to initialize successfully",
EnvdCollapseDurationHistogramName: "Time taken for the pre-pause envd heap collapse round-trip",
GuestSyncDurationHistogramName: "Time taken for the mandatory pre-pause guest sync (filesystem-only pause)",

UffdStartupPagesHistogramName: "Demand-fault pages a guest needed to reach a successful envd init, per start",
UffdStartupSourcePagesHistogramName: "Subset of startup demand-fault pages pulled from the source (e.g. GCS), per start",
Expand Down Expand Up @@ -458,6 +460,7 @@ var histogramUnits = map[HistogramType]string{
BuildStepDurationHistogramName: "ms",
BuildRootfsSizeHistogramName: "{By}",
OrchestratorSandboxCreateDurationName: "ms",
OrchestratorSandboxExecutionDurationName: "ms",
WaitForEnvdDurationHistogramName: "ms",
EnvdCollapseDurationHistogramName: "ms",
GuestSyncDurationHistogramName: "ms",
Expand Down
Loading