Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 48 additions & 7 deletions packages/orchestrator/pkg/sandbox/envd.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
Expand All @@ -28,6 +29,35 @@ const (
loopDelay = 5 * time.Millisecond
)

// envdInitExitType is the label describing how an envd init call ended. Its
// string value is what gets reported as the exit_type metric attribute.
type envdInitExitType string

const (
	// envdInitExitSuccess means the init call completed without an error.
	envdInitExitSuccess envdInitExitType = "success"

	// envdInitExitTimeout means the call ran out of time.
	envdInitExitTimeout envdInitExitType = "timeout"

	// envdInitExitCanceled means the surrounding context was canceled.
	envdInitExitCanceled envdInitExitType = "canceled"

	// envdInitExitOther is the catch-all for every remaining failure.
	envdInitExitOther envdInitExitType = "other"

	// envdInitExitTransient labels a retried attempt that failed without being
	// the terminal outcome of the init episode.
	envdInitExitTransient envdInitExitType = "transient"
)

// classifyEnvdInitExit translates the error returned by an init attempt into
// the exit_type it should be reported under. A nil error is a success.
//
// The checks are ordered on purpose: an error chain may carry more than one
// recognizable error (for example context.Canceled together with a more
// specific sentinel), and the first match below wins.
func classifyEnvdInitExit(err error) envdInitExitType {
	if err == nil {
		return envdInitExitSuccess
	}

	if errors.Is(err, ErrWaitForEnvdTimeout) || errors.Is(err, context.DeadlineExceeded) {
		return envdInitExitTimeout
	}

	// Checked ahead of context.Canceled so that a chain containing both is
	// reported as "other" rather than "canceled".
	if errors.Is(err, ErrFcProcessExited) {
		return envdInitExitOther
	}

	if errors.Is(err, context.Canceled) {
		return envdInitExitCanceled
	}

	return envdInitExitOther
}

// envdOp is the path segment of a parameterless envd POST endpoint.
type envdOp string

Expand Down Expand Up @@ -236,9 +266,19 @@ func (s *Sandbox) initEnvd(ctx context.Context, startType StartType) (e error) {
span.End()
}()

attributes := []attribute.KeyValue{telemetry.WithEnvdVersion(s.Config.Envd.Version), attribute.Int64("timeout_ms", s.internalConfig.EnvdInitRequestTimeout.Milliseconds()), attribute.String("start_type", string(startType))}
attributesFail := append(attributes, attribute.Bool("success", false))
attributesSuccess := append(attributes, attribute.Bool("success", true))
attributes := []attribute.KeyValue{
telemetry.WithEnvdVersion(s.Config.Envd.Version),
attribute.Int64("timeout_ms", s.internalConfig.EnvdInitRequestTimeout.Milliseconds()),
attribute.String("start_type", string(startType)),
}

// success is kept for backward compatibility until consumers move to exit_type.
callAttributes := func(exit envdInitExitType) []attribute.KeyValue {
return append(attributes,
attribute.Bool("success", exit == envdInitExitSuccess),
attribute.String("exit_type", string(exit)),
)
}
Comment thread
arkamar marked this conversation as resolved.

address := fmt.Sprintf("http://%s:%d/init", s.Slot.HostIPString(), consts.DefaultEnvdServerPort)

Expand All @@ -252,18 +292,19 @@ func (s *Sandbox) initEnvd(ctx context.Context, startType StartType) (e error) {
zap.Error(err),
)

envdInitCalls.Add(ctx, count, metric.WithAttributes(attributesFail...))
exit := classifyEnvdInitExit(err)
envdInitCalls.Add(ctx, count, metric.WithAttributes(callAttributes(exit)...))
Comment thread
arkamar marked this conversation as resolved.

return fmt.Errorf("failed to init envd: %w", err)
}

if count > 1 {
// Track failed envd init calls
envdInitCalls.Add(ctx, count-1, metric.WithAttributes(attributesFail...))
// Retried attempts were transient per-request failures that preceded the success.
envdInitCalls.Add(ctx, count-1, metric.WithAttributes(callAttributes(envdInitExitTransient)...))
}

// Track successful envd init
envdInitCalls.Add(ctx, 1, metric.WithAttributes(attributesSuccess...))
envdInitCalls.Add(ctx, 1, metric.WithAttributes(callAttributes(envdInitExitSuccess)...))
Comment thread
arkamar marked this conversation as resolved.

defer response.Body.Close()
body, err := io.ReadAll(response.Body)
Expand Down
44 changes: 44 additions & 0 deletions packages/orchestrator/pkg/sandbox/envd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ package sandbox
import (
"context"
"encoding/json"
"errors"
"fmt"
"net/http"
"net/http/httptest"
"testing"
Expand All @@ -18,6 +20,48 @@ import (
"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/network"
)

// TestClassifyEnvdInitExit verifies the exit_type picked for a nil error, for
// each bare sentinel, and for wrapped variants of those sentinels.
func TestClassifyEnvdInitExit(t *testing.T) {
	t.Parallel()

	// canceledWithCause builds an error that wraps both context.Canceled and
	// the given cause. Mirrors doRequestWithInfiniteRetries: ctx.Err() is
	// Canceled and the cancel cause is wrapped alongside it.
	canceledWithCause := func(cause error) error {
		return fmt.Errorf("%w with cause: %w", context.Canceled, cause)
	}

	cases := []struct {
		name string
		err  error
		want envdInitExitType
	}{
		{name: "nil", err: nil, want: envdInitExitSuccess},
		{name: "deadline_exceeded", err: context.DeadlineExceeded, want: envdInitExitTimeout},
		{name: "wrapped_deadline", err: fmt.Errorf("init: %w", context.DeadlineExceeded), want: envdInitExitTimeout},
		{name: "wait_for_envd_timeout", err: ErrWaitForEnvdTimeout, want: envdInitExitTimeout},
		{
			// The timeout sentinel must win over the accompanying Canceled.
			name: "wrapped_wait_for_envd_timeout",
			err:  canceledWithCause(ErrWaitForEnvdTimeout),
			want: envdInitExitTimeout,
		},
		{name: "canceled", err: context.Canceled, want: envdInitExitCanceled},
		{name: "wrapped_canceled", err: fmt.Errorf("init: %w", context.Canceled), want: envdInitExitCanceled},
		{name: "fc_process_exited", err: ErrFcProcessExited, want: envdInitExitOther},
		{
			// The fc-exit sentinel must not be misclassified as canceled.
			name: "wrapped_fc_process_exited",
			err:  canceledWithCause(ErrFcProcessExited),
			want: envdInitExitOther,
		},
		{name: "other", err: errors.New("connection refused"), want: envdInitExitOther},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			t.Parallel()

			got := classifyEnvdInitExit(tc.err)
			assert.Equal(t, tc.want, got)
		})
	}
}

// mockEgressProxy is a test EgressProxy that returns a fixed CA bundle string.
type mockEgressProxy struct {
bundle string
Expand Down
12 changes: 10 additions & 2 deletions packages/orchestrator/pkg/sandbox/sandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@ const (
StartTypeReboot StartType = "reboot" // cold boot from a snapshot rootfs (filesystem-only resume)
)

// Sentinel cancel causes used by WaitForEnvd. Match them with errors.Is.
var (
	// ErrWaitForEnvdTimeout is the cancel cause set when WaitForEnvd runs
	// past its timeout.
	ErrWaitForEnvdTimeout = errors.New("syncing took too long")

	// ErrFcProcessExited is the cancel cause set when the Firecracker process
	// exits while WaitForEnvd is still waiting.
	ErrFcProcessExited = errors.New("fc process exited prematurely")
)

var SandboxHttpTransport = otelhttp.NewTransport(
&http.Transport{
DisableKeepAlives: true,
Expand Down Expand Up @@ -1753,11 +1759,13 @@ func (s *Sandbox) WaitForEnvd(
// cover its timing/size.
if !s.skipStartupMetrics {
duration := time.Since(start).Milliseconds()
// success is kept for backward compatibility until consumers move to exit_type.
waitForEnvdDurationHistogram.Record(ctx, duration, metric.WithAttributes(
telemetry.WithEnvdVersion(s.Config.Envd.Version),
attribute.Int64("timeout_ms", s.internalConfig.EnvdInitRequestTimeout.Milliseconds()),
attribute.Bool("success", e == nil),
attribute.String("start_type", string(startType)),
attribute.String("exit_type", string(classifyEnvdInitExit(e))),
))

// Record the demand-fault working set the guest needed to reach this
Expand Down Expand Up @@ -1794,13 +1802,13 @@ func (s *Sandbox) WaitForEnvd(
select {
// Ensure the syncing takes at most timeout seconds.
case <-time.After(timeout):
cancel(errors.New("syncing took too long"))
cancel(ErrWaitForEnvdTimeout)
case <-ctx.Done():
return
case <-s.process.Exit.Done():
Comment thread
arkamar marked this conversation as resolved.
err := s.process.Exit.Error()

cancel(fmt.Errorf("fc process exited prematurely: %w", err))
cancel(fmt.Errorf("%w: %w", ErrFcProcessExited, err))
}
}()

Expand Down
Loading