diff --git a/packages/orchestrator/pkg/sandbox/envd.go b/packages/orchestrator/pkg/sandbox/envd.go index b5246e4b03..c83de86235 100644 --- a/packages/orchestrator/pkg/sandbox/envd.go +++ b/packages/orchestrator/pkg/sandbox/envd.go @@ -6,6 +6,7 @@ import ( "bytes" "context" "encoding/json" + "errors" "fmt" "io" "net/http" @@ -28,6 +29,35 @@ const ( loopDelay = 5 * time.Millisecond ) +// envdInitExitType classifies the outcome of an envd init call. +type envdInitExitType string + +const ( + envdInitExitSuccess envdInitExitType = "success" + envdInitExitTimeout envdInitExitType = "timeout" + envdInitExitCanceled envdInitExitType = "canceled" + envdInitExitOther envdInitExitType = "other" + // envdInitExitTransient marks a retried attempt that failed but was not the + // terminal outcome of the init episode. + envdInitExitTransient envdInitExitType = "transient" +) + +// classifyEnvdInitExit maps an init error to an exit_type; the timeout and fc-exit sentinels are matched before context.Canceled so an error wrapping both is not reported as canceled. +func classifyEnvdInitExit(err error) envdInitExitType { + switch { + case err == nil: + return envdInitExitSuccess + case errors.Is(err, ErrWaitForEnvdTimeout), errors.Is(err, context.DeadlineExceeded): + return envdInitExitTimeout + case errors.Is(err, ErrFcProcessExited): + return envdInitExitOther + case errors.Is(err, context.Canceled): + return envdInitExitCanceled + default: + return envdInitExitOther + } +} + // envdOp is the path segment of a parameterless envd POST endpoint. 
type envdOp string @@ -236,9 +266,19 @@ func (s *Sandbox) initEnvd(ctx context.Context, startType StartType) (e error) { span.End() }() - attributes := []attribute.KeyValue{telemetry.WithEnvdVersion(s.Config.Envd.Version), attribute.Int64("timeout_ms", s.internalConfig.EnvdInitRequestTimeout.Milliseconds()), attribute.String("start_type", string(startType))} - attributesFail := append(attributes, attribute.Bool("success", false)) - attributesSuccess := append(attributes, attribute.Bool("success", true)) + attributes := []attribute.KeyValue{ + telemetry.WithEnvdVersion(s.Config.Envd.Version), + attribute.Int64("timeout_ms", s.internalConfig.EnvdInitRequestTimeout.Milliseconds()), + attribute.String("start_type", string(startType)), + } + + // success is kept for backward compatibility until consumers move to exit_type. + callAttributes := func(exit envdInitExitType) []attribute.KeyValue { + return append(attributes, + attribute.Bool("success", exit == envdInitExitSuccess), + attribute.String("exit_type", string(exit)), + ) + } address := fmt.Sprintf("http://%s:%d/init", s.Slot.HostIPString(), consts.DefaultEnvdServerPort) @@ -252,18 +292,19 @@ func (s *Sandbox) initEnvd(ctx context.Context, startType StartType) (e error) { zap.Error(err), ) - envdInitCalls.Add(ctx, count, metric.WithAttributes(attributesFail...)) + exit := classifyEnvdInitExit(err) + envdInitCalls.Add(ctx, count, metric.WithAttributes(callAttributes(exit)...)) return fmt.Errorf("failed to init envd: %w", err) } if count > 1 { - // Track failed envd init calls - envdInitCalls.Add(ctx, count-1, metric.WithAttributes(attributesFail...)) + // Retried attempts were transient per-request failures that preceded the success. 
+ envdInitCalls.Add(ctx, count-1, metric.WithAttributes(callAttributes(envdInitExitTransient)...)) } // Track successful envd init - envdInitCalls.Add(ctx, 1, metric.WithAttributes(attributesSuccess...)) + envdInitCalls.Add(ctx, 1, metric.WithAttributes(callAttributes(envdInitExitSuccess)...)) defer response.Body.Close() body, err := io.ReadAll(response.Body) diff --git a/packages/orchestrator/pkg/sandbox/envd_test.go b/packages/orchestrator/pkg/sandbox/envd_test.go index 48641628fc..0329ee69ac 100644 --- a/packages/orchestrator/pkg/sandbox/envd_test.go +++ b/packages/orchestrator/pkg/sandbox/envd_test.go @@ -5,6 +5,8 @@ package sandbox import ( "context" "encoding/json" + "errors" + "fmt" "net/http" "net/http/httptest" "testing" @@ -18,6 +20,48 @@ import ( "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/network" ) +func TestClassifyEnvdInitExit(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + want envdInitExitType + }{ + {"nil", nil, envdInitExitSuccess}, + {"deadline_exceeded", context.DeadlineExceeded, envdInitExitTimeout}, + {"wrapped_deadline", fmt.Errorf("init: %w", context.DeadlineExceeded), envdInitExitTimeout}, + {"wait_for_envd_timeout", ErrWaitForEnvdTimeout, envdInitExitTimeout}, + { + "wrapped_wait_for_envd_timeout", + // Mirrors doRequestWithInfiniteRetries: ctx.Err() is Canceled, the + // cause is the timeout sentinel, both wrapped together. + fmt.Errorf("%w with cause: %w", context.Canceled, ErrWaitForEnvdTimeout), + envdInitExitTimeout, + }, + {"canceled", context.Canceled, envdInitExitCanceled}, + {"wrapped_canceled", fmt.Errorf("init: %w", context.Canceled), envdInitExitCanceled}, + {"fc_process_exited", ErrFcProcessExited, envdInitExitOther}, + { + "wrapped_fc_process_exited", + // Mirrors doRequestWithInfiniteRetries: ctx.Err() is Canceled, the + // cause is the fc-exit sentinel, both wrapped together. Must not be + // misclassified as canceled. 
+ fmt.Errorf("%w with cause: %w", context.Canceled, ErrFcProcessExited), + envdInitExitOther, + }, + {"other", errors.New("connection refused"), envdInitExitOther}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + assert.Equal(t, tt.want, classifyEnvdInitExit(tt.err)) + }) + } +} + // mockEgressProxy is a test EgressProxy that returns a fixed CA bundle string. type mockEgressProxy struct { bundle string diff --git a/packages/orchestrator/pkg/sandbox/sandbox.go b/packages/orchestrator/pkg/sandbox/sandbox.go index b7c30b2fbb..6ee3d25fc6 100644 --- a/packages/orchestrator/pkg/sandbox/sandbox.go +++ b/packages/orchestrator/pkg/sandbox/sandbox.go @@ -67,6 +67,12 @@ const ( StartTypeReboot StartType = "reboot" // cold boot from a snapshot rootfs (filesystem-only resume) ) +// ErrWaitForEnvdTimeout is the cancel cause used when WaitForEnvd exceeds its timeout. +var ErrWaitForEnvdTimeout = errors.New("syncing took too long") + +// ErrFcProcessExited is the sentinel wrapped into the cancel cause when the Firecracker process exits during WaitForEnvd; match it with errors.Is. +var ErrFcProcessExited = errors.New("fc process exited prematurely") + var SandboxHttpTransport = otelhttp.NewTransport( &http.Transport{ DisableKeepAlives: true, @@ -1753,11 +1759,13 @@ func (s *Sandbox) WaitForEnvd( // cover its timing/size. if !s.skipStartupMetrics { duration := time.Since(start).Milliseconds() + // success is kept for backward compatibility until consumers move to exit_type. 
 waitForEnvdDurationHistogram.Record(ctx, duration, metric.WithAttributes( telemetry.WithEnvdVersion(s.Config.Envd.Version), attribute.Int64("timeout_ms", s.internalConfig.EnvdInitRequestTimeout.Milliseconds()), attribute.Bool("success", e == nil), attribute.String("start_type", string(startType)), + attribute.String("exit_type", string(classifyEnvdInitExit(e))), )) // Record the demand-fault working set the guest needed to reach this @@ -1794,13 +1802,13 @@ func (s *Sandbox) WaitForEnvd( select { // Ensure the syncing takes at most timeout seconds. case <-time.After(timeout): - cancel(errors.New("syncing took too long")) + cancel(ErrWaitForEnvdTimeout) case <-ctx.Done(): return case <-s.process.Exit.Done(): err := s.process.Exit.Error() - cancel(fmt.Errorf("fc process exited prematurely: %w", err)) + cancel(fmt.Errorf("%w: %w", ErrFcProcessExited, err)) } }()