From 34b178341b2e279601ab2a5a32d586ad43ff28ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= Date: Mon, 29 Jun 2026 11:59:12 +0200 Subject: [PATCH 1/4] feat(orchestrator): classify envd-init by exit type Add an exit_type attribute (success/timeout/canceled/other) to the orchestrator.sandbox.envd.init.calls and orchestrator.sandbox.envd.init.duration meters so init outcomes can be told apart instead of only knowing success vs failure. The success bool is kept for backward compatibility until consumers move to exit_type. WaitForEnvd signals its timeout via context cancellation, so a dedicated ErrWaitForEnvdTimeout cause is used to distinguish a real timeout from a generic cancellation. Intermediate retried attempts that preceded an eventual success are tagged as other. --- packages/orchestrator/pkg/sandbox/envd.go | 44 ++++++++++++++++--- .../orchestrator/pkg/sandbox/envd_test.go | 35 +++++++++++++++ packages/orchestrator/pkg/sandbox/sandbox.go | 7 ++- 3 files changed, 79 insertions(+), 7 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/envd.go b/packages/orchestrator/pkg/sandbox/envd.go index b5246e4b03..78753b598c 100644 --- a/packages/orchestrator/pkg/sandbox/envd.go +++ b/packages/orchestrator/pkg/sandbox/envd.go @@ -6,6 +6,7 @@ import ( "bytes" "context" "encoding/json" + "errors" "fmt" "io" "net/http" @@ -28,6 +29,30 @@ const ( loopDelay = 5 * time.Millisecond ) +// envdInitExitType classifies the outcome of an envd init call. +type envdInitExitType string + +const ( + envdInitExitSuccess envdInitExitType = "success" + envdInitExitTimeout envdInitExitType = "timeout" + envdInitExitCanceled envdInitExitType = "canceled" + envdInitExitOther envdInitExitType = "other" +) + +// classifyEnvdInitExit maps an init error to an exit_type. +func classifyEnvdInitExit(err error) envdInitExitType { + switch { + case err == nil: + return envdInitExitSuccess + case errors.Is(err, ErrWaitForEnvdTimeout), errors.Is(err, context.DeadlineExceeded): + return envdInitExitTimeout + case errors.Is(err, context.Canceled): + return envdInitExitCanceled + default: + return envdInitExitOther + } +} + // envdOp is the path segment of a parameterless envd POST endpoint. type envdOp string @@ -237,8 +262,14 @@ func (s *Sandbox) initEnvd(ctx context.Context, startType StartType) (e error) { }() attributes := []attribute.KeyValue{telemetry.WithEnvdVersion(s.Config.Envd.Version), attribute.Int64("timeout_ms", s.internalConfig.EnvdInitRequestTimeout.Milliseconds()), attribute.String("start_type", string(startType))} - attributesFail := append(attributes, attribute.Bool("success", false)) - attributesSuccess := append(attributes, attribute.Bool("success", true)) + + // success is kept for backward compatibility until consumers move to exit_type. + callAttributes := func(exit envdInitExitType) []attribute.KeyValue { + return append(attributes, + attribute.Bool("success", exit == envdInitExitSuccess), + attribute.String("exit_type", string(exit)), + ) + } address := fmt.Sprintf("http://%s:%d/init", s.Slot.HostIPString(), consts.DefaultEnvdServerPort) @@ -252,18 +283,19 @@ func (s *Sandbox) initEnvd(ctx context.Context, startType StartType) (e error) { zap.Error(err), ) - envdInitCalls.Add(ctx, count, metric.WithAttributes(attributesFail...)) + exit := classifyEnvdInitExit(err) + envdInitCalls.Add(ctx, count, metric.WithAttributes(callAttributes(exit)...)) return fmt.Errorf("failed to init envd: %w", err) } if count > 1 { - // Track failed envd init calls - envdInitCalls.Add(ctx, count-1, metric.WithAttributes(attributesFail...)) + // Retried attempts were transient per-request failures, classify as other. + envdInitCalls.Add(ctx, count-1, metric.WithAttributes(callAttributes(envdInitExitOther)...)) } // Track successful envd init - envdInitCalls.Add(ctx, 1, metric.WithAttributes(attributesSuccess...)) + envdInitCalls.Add(ctx, 1, metric.WithAttributes(callAttributes(envdInitExitSuccess)...)) defer response.Body.Close() body, err := io.ReadAll(response.Body) diff --git a/packages/orchestrator/pkg/sandbox/envd_test.go b/packages/orchestrator/pkg/sandbox/envd_test.go index 48641628fc..f3052550c0 100644 --- a/packages/orchestrator/pkg/sandbox/envd_test.go +++ b/packages/orchestrator/pkg/sandbox/envd_test.go @@ -5,6 +5,8 @@ package sandbox import ( "context" "encoding/json" + "errors" + "fmt" "net/http" "net/http/httptest" "testing" @@ -18,6 +20,39 @@ import ( "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/network" ) +func TestClassifyEnvdInitExit(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + want envdInitExitType + }{ + {"nil", nil, envdInitExitSuccess}, + {"deadline_exceeded", context.DeadlineExceeded, envdInitExitTimeout}, + {"wrapped_deadline", fmt.Errorf("init: %w", context.DeadlineExceeded), envdInitExitTimeout}, + {"wait_for_envd_timeout", ErrWaitForEnvdTimeout, envdInitExitTimeout}, + { + "wrapped_wait_for_envd_timeout", + // Mirrors doRequestWithInfiniteRetries: ctx.Err() is Canceled, the + // cause is the timeout sentinel, both wrapped together. + fmt.Errorf("%w with cause: %w", context.Canceled, ErrWaitForEnvdTimeout), + envdInitExitTimeout, + }, + {"canceled", context.Canceled, envdInitExitCanceled}, + {"wrapped_canceled", fmt.Errorf("init: %w", context.Canceled), envdInitExitCanceled}, + {"other", errors.New("connection refused"), envdInitExitOther}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + assert.Equal(t, tt.want, classifyEnvdInitExit(tt.err)) + }) + } +} + // mockEgressProxy is a test EgressProxy that returns a fixed CA bundle string. type mockEgressProxy struct { bundle string diff --git a/packages/orchestrator/pkg/sandbox/sandbox.go b/packages/orchestrator/pkg/sandbox/sandbox.go index b7c30b2fbb..367212fd41 100644 --- a/packages/orchestrator/pkg/sandbox/sandbox.go +++ b/packages/orchestrator/pkg/sandbox/sandbox.go @@ -67,6 +67,9 @@ const ( StartTypeReboot StartType = "reboot" // cold boot from a snapshot rootfs (filesystem-only resume) ) +// ErrWaitForEnvdTimeout is the cancel cause used when WaitForEnvd exceeds its timeout. +var ErrWaitForEnvdTimeout = errors.New("syncing took too long") + var SandboxHttpTransport = otelhttp.NewTransport( &http.Transport{ DisableKeepAlives: true, @@ -1753,11 +1756,13 @@ func (s *Sandbox) WaitForEnvd( // cover its timing/size. if !s.skipStartupMetrics { duration := time.Since(start).Milliseconds() + // success is kept for backward compatibility until consumers move to exit_type. waitForEnvdDurationHistogram.Record(ctx, duration, metric.WithAttributes( telemetry.WithEnvdVersion(s.Config.Envd.Version), attribute.Int64("timeout_ms", s.internalConfig.EnvdInitRequestTimeout.Milliseconds()), attribute.Bool("success", e == nil), attribute.String("start_type", string(startType)), + attribute.String("exit_type", string(classifyEnvdInitExit(e))), )) // Record the demand-fault working set the guest needed to reach this @@ -1794,7 +1799,7 @@ func (s *Sandbox) WaitForEnvd( select { // Ensure the syncing takes at most timeout seconds. case <-time.After(timeout): - cancel(errors.New("syncing took too long")) + cancel(ErrWaitForEnvdTimeout) case <-ctx.Done(): return case <-s.process.Exit.Done(): From bdcd0f624c4d0977fa08bee45649c4b800bc0a7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= Date: Mon, 29 Jun 2026 13:49:43 +0200 Subject: [PATCH 2/4] style(orchestrator): wrap envd-init attributes slice --- packages/orchestrator/pkg/sandbox/envd.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/packages/orchestrator/pkg/sandbox/envd.go b/packages/orchestrator/pkg/sandbox/envd.go index 78753b598c..cfa3918231 100644 --- a/packages/orchestrator/pkg/sandbox/envd.go +++ b/packages/orchestrator/pkg/sandbox/envd.go @@ -261,7 +261,11 @@ func (s *Sandbox) initEnvd(ctx context.Context, startType StartType) (e error) { span.End() }() - attributes := []attribute.KeyValue{telemetry.WithEnvdVersion(s.Config.Envd.Version), attribute.Int64("timeout_ms", s.internalConfig.EnvdInitRequestTimeout.Milliseconds()), attribute.String("start_type", string(startType))} + attributes := []attribute.KeyValue{ + telemetry.WithEnvdVersion(s.Config.Envd.Version), + attribute.Int64("timeout_ms", s.internalConfig.EnvdInitRequestTimeout.Milliseconds()), + attribute.String("start_type", string(startType)), + } // success is kept for backward compatibility until consumers move to exit_type. callAttributes := func(exit envdInitExitType) []attribute.KeyValue { From de17270a5ea7a81f55401e14f399f811f4a61b38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= Date: Mon, 29 Jun 2026 15:03:07 +0200 Subject: [PATCH 3/4] fix(orchestrator): classify fc process exit as other, not canceled WaitForEnvd cancels its context when the Firecracker process exits, but WithCancelCause forces ctx.Err() to context.Canceled, so a real FC crash (OOM-kill, panic, segfault) was misclassified as exit_type=canceled. Introduce an ErrFcProcessExited cancel cause and classify it as other so FC deaths are no longer conflated with caller-initiated cancellations. --- packages/orchestrator/pkg/sandbox/envd.go | 2 ++ packages/orchestrator/pkg/sandbox/envd_test.go | 9 +++++++++ packages/orchestrator/pkg/sandbox/sandbox.go | 5 ++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/packages/orchestrator/pkg/sandbox/envd.go b/packages/orchestrator/pkg/sandbox/envd.go index cfa3918231..7440205700 100644 --- a/packages/orchestrator/pkg/sandbox/envd.go +++ b/packages/orchestrator/pkg/sandbox/envd.go @@ -46,6 +46,8 @@ func classifyEnvdInitExit(err error) envdInitExitType { return envdInitExitSuccess case errors.Is(err, ErrWaitForEnvdTimeout), errors.Is(err, context.DeadlineExceeded): return envdInitExitTimeout + case errors.Is(err, ErrFcProcessExited): + return envdInitExitOther case errors.Is(err, context.Canceled): return envdInitExitCanceled default: diff --git a/packages/orchestrator/pkg/sandbox/envd_test.go b/packages/orchestrator/pkg/sandbox/envd_test.go index f3052550c0..0329ee69ac 100644 --- a/packages/orchestrator/pkg/sandbox/envd_test.go +++ b/packages/orchestrator/pkg/sandbox/envd_test.go @@ -41,6 +41,15 @@ func TestClassifyEnvdInitExit(t *testing.T) { }, {"canceled", context.Canceled, envdInitExitCanceled}, {"wrapped_canceled", fmt.Errorf("init: %w", context.Canceled), envdInitExitCanceled}, + {"fc_process_exited", ErrFcProcessExited, envdInitExitOther}, + { + "wrapped_fc_process_exited", + // Mirrors doRequestWithInfiniteRetries: ctx.Err() is Canceled, the + // cause is the fc-exit sentinel, both wrapped together. Must not be + // misclassified as canceled. + fmt.Errorf("%w with cause: %w", context.Canceled, ErrFcProcessExited), + envdInitExitOther, + }, {"other", errors.New("connection refused"), envdInitExitOther}, } diff --git a/packages/orchestrator/pkg/sandbox/sandbox.go b/packages/orchestrator/pkg/sandbox/sandbox.go index 367212fd41..6ee3d25fc6 100644 --- a/packages/orchestrator/pkg/sandbox/sandbox.go +++ b/packages/orchestrator/pkg/sandbox/sandbox.go @@ -70,6 +70,9 @@ const ( // ErrWaitForEnvdTimeout is the cancel cause used when WaitForEnvd exceeds its timeout. var ErrWaitForEnvdTimeout = errors.New("syncing took too long") +// ErrFcProcessExited is the cancel cause used when the Firecracker process exits during WaitForEnvd. +var ErrFcProcessExited = errors.New("fc process exited prematurely") + var SandboxHttpTransport = otelhttp.NewTransport( &http.Transport{ DisableKeepAlives: true, @@ -1805,7 +1808,7 @@ func (s *Sandbox) WaitForEnvd( case <-s.process.Exit.Done(): err := s.process.Exit.Error() - cancel(fmt.Errorf("fc process exited prematurely: %w", err)) + cancel(fmt.Errorf("%w: %w", ErrFcProcessExited, err)) } }() From d9f8d648bb24869cc2438c98bee09e69a05f223e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Van=C4=9Bk?= Date: Tue, 30 Jun 2026 09:34:30 +0200 Subject: [PATCH 4/4] feat(orchestrator): tag retried envd-init attempts as transient Retried attempts that preceded a successful init were tagged exit_type=other, overloading the residual terminal-failure bucket. Introduce a dedicated transient value so a successful init episode reports its count-1 retries distinctly, keeping other for unclassified terminal failures only. --- packages/orchestrator/pkg/sandbox/envd.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/envd.go b/packages/orchestrator/pkg/sandbox/envd.go index 7440205700..c83de86235 100644 --- a/packages/orchestrator/pkg/sandbox/envd.go +++ b/packages/orchestrator/pkg/sandbox/envd.go @@ -37,6 +37,9 @@ const ( envdInitExitTimeout envdInitExitType = "timeout" envdInitExitCanceled envdInitExitType = "canceled" envdInitExitOther envdInitExitType = "other" + // envdInitExitTransient marks a retried attempt that failed but was not the + // terminal outcome of the init episode. + envdInitExitTransient envdInitExitType = "transient" ) // classifyEnvdInitExit maps an init error to an exit_type. @@ -296,8 +299,8 @@ func (s *Sandbox) initEnvd(ctx context.Context, startType StartType) (e error) { } if count > 1 { - // Retried attempts were transient per-request failures, classify as other. - envdInitCalls.Add(ctx, count-1, metric.WithAttributes(callAttributes(envdInitExitOther)...)) + // Retried attempts were transient per-request failures that preceded the success. + envdInitCalls.Add(ctx, count-1, metric.WithAttributes(callAttributes(envdInitExitTransient)...)) } // Track successful envd init