Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions packages/orchestrator/pkg/server/sandboxes.go
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,12 @@ func (s *Server) Create(ctx context.Context, req *orchestrator.SandboxCreateRequ
return nil, status.Errorf(codes.FailedPrecondition, "sandbox files for '%s' not found", req.GetSandbox().GetSandboxId())
}

if errors.Is(err, storage.ErrObjectArchived) {
telemetry.ReportError(ctx, "sandbox files archived", err, telemetry.WithSandboxID(req.GetSandbox().GetSandboxId()))

return nil, status.Errorf(codes.FailedPrecondition, "sandbox files for '%s' are archived and not directly accessible, please rebuild the template", req.GetSandbox().GetSandboxId())
Comment on lines +258 to +261

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Handle archived metadata before resuming

When the archived object is the template metadata blob, `template.Metadata()` above returns the `ErrObjectArchived` produced by `awsObject.WriteTo`, but this new check only runs after `RebootSandbox`/`ResumeSandbox` return an error. The metadata path exits earlier with "failed to read template metadata" instead of the new `FailedPrecondition` rebuild response, so archived templates whose metadata object is in GLACIER still surface as a generic `Create` failure.

Useful? React with 👍 / 👎.

}

err = errors.Join(err, context.Cause(ctx))
telemetry.ReportCriticalError(ctx, "failed to create sandbox", err)
logger.L().Error(ctx, "failed to create sandbox", zap.Error(err),
Expand Down
5 changes: 5 additions & 0 deletions packages/shared/pkg/storage/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ const (
OutcomeErrIO = "err_io"
OutcomeErrTimeout = "err_timeout"
OutcomeTransitioned = "transitioned"
// OutcomeArchived means the object exists but is in an archived storage
// class that does not allow direct reads.
OutcomeArchived = "archived"
// OutcomeContended is a writeback skipped because another goroutine held the
// NFS chunk lock — normal cache dedup, nothing written.
OutcomeContended = "contended"
Expand Down Expand Up @@ -80,6 +83,8 @@ func Outcome(err error) string {
return OutcomeOK
case errors.Is(err, ErrObjectNotExist), errors.Is(err, fs.ErrNotExist):
return OutcomeNotFound
case errors.Is(err, ErrObjectArchived):
return OutcomeArchived
case errors.Is(err, context.Canceled):
return OutcomeErrCanceled
case errors.Is(err, context.DeadlineExceeded):
Expand Down
5 changes: 5 additions & 0 deletions packages/shared/pkg/storage/storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ var tracer = otel.Tracer("github.com/e2b-dev/infra/packages/shared/pkg/storage")

// ErrObjectNotExist means the requested object was not found in the storage
// backend. Compare with errors.Is, since the error may be wrapped with
// additional context such as the object path.
var ErrObjectNotExist = errors.New("object does not exist")

// ErrObjectArchived means the object exists but is stored in an archived
// storage class (e.g. GLACIER, ARCHIVE, COLD_ARCHIVE) that does not allow
// direct reads. The object must be restored or rebuilt before it can be used.
var ErrObjectArchived = errors.New("object is archived and not accessible")

// ErrObjectRateLimited means per-object mutation rate limiting —
// multiple concurrent writers racing to write the same content-addressed object.
var ErrObjectRateLimited = errors.New("object access rate limited")
Expand Down
42 changes: 23 additions & 19 deletions packages/shared/pkg/storage/storage_aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,26 @@ func (s *awsStorage) OpenBlob(_ context.Context, path string) (Blob, error) {
}, nil
}

// mapAWSError translates well-known AWS S3 errors into storage-layer sentinels
// and annotates every non-nil error with the object path.
//
// Mapping:
//   - nil                              -> nil
//   - *types.NoSuchKey, *types.NotFound -> wraps ErrObjectNotExist
//   - *types.InvalidObjectState         -> wraps ErrObjectArchived
//   - anything else                     -> the original error, prefixed with path
//
// The original AWS error is always kept in the chain (second %w), so callers
// can match the sentinel with errors.Is and still reach the SDK error with
// errors.As; the SDK error presumably carries the useful diagnostics (request
// metadata, storage class) — see the AWS SDK type docs.
func mapAWSError(err error, path string) error {
	if err == nil {
		return nil
	}

	var (
		nsk *types.NoSuchKey
		nfd *types.NotFound
		ios *types.InvalidObjectState
	)

	switch {
	case errors.As(err, &nsk), errors.As(err, &nfd):
		return fmt.Errorf("%q: %w: %w", path, ErrObjectNotExist, err)
	case errors.As(err, &ios):
		return fmt.Errorf("%q: %w: %w", path, ErrObjectArchived, err)
	default:
		// Unmapped errors still need the path: callers such as
		// OpenRangeReader return this value directly without adding context.
		return fmt.Errorf("%q: %w", path, err)
	}
}

func (o *awsObject) WriteTo(ctx context.Context, dst io.Writer) (n int64, err error) {
start := time.Now()
defer func() { RecordReadBlob(ctx, time.Since(start), n, o.path, SourceAWS, err) }()
Expand All @@ -163,12 +183,7 @@ func (o *awsObject) WriteTo(ctx context.Context, dst io.Writer) (n int64, err er

resp, err := o.client.GetObject(ctx, &s3.GetObjectInput{Bucket: &o.bucketName, Key: &o.path})
if err != nil {
var nsk *types.NoSuchKey
if errors.As(err, &nsk) {
return 0, ErrObjectNotExist
}

return 0, err
return 0, mapAWSError(err, o.path)
}

defer resp.Body.Close()
Expand Down Expand Up @@ -272,12 +287,7 @@ func (o *awsObject) OpenRangeReader(ctx context.Context, off, length int64, fram
Range: readRange,
})
if err != nil {
var nsk *types.NoSuchKey
if errors.As(err, &nsk) {
return nil, SourceAWS, ErrObjectNotExist
}

return nil, SourceAWS, fmt.Errorf("failed to create S3 range reader for %q: %w", o.path, err)
return nil, SourceAWS, mapAWSError(err, o.path)
}

return NewRangeReader(resp.Body), SourceAWS, nil
Expand All @@ -293,13 +303,7 @@ func (o *awsObject) Size(ctx context.Context) (_ int64, err error) {

resp, err := o.client.HeadObject(ctx, &s3.HeadObjectInput{Bucket: &o.bucketName, Key: &o.path})
if err != nil {
var nsk *types.NoSuchKey
var nfd *types.NotFound
if errors.As(err, &nsk) || errors.As(err, &nfd) {
return 0, ErrObjectNotExist
}

return 0, err
return 0, mapAWSError(err, o.path)
}

return *resp.ContentLength, nil
Expand Down