diff --git a/packages/orchestrator/benchmarks/benchmark_test.go b/packages/orchestrator/benchmarks/benchmark_test.go
index 67a70547e7..ca0876f440 100644
--- a/packages/orchestrator/benchmarks/benchmark_test.go
+++ b/packages/orchestrator/benchmarks/benchmark_test.go
@@ -386,7 +386,7 @@ func (tc *testContainer) testOneItem(b *testing.B, buildID, kernelVersion, fcVer
 		KernelVersion:      kernelVersion,
 		FirecrackerVersion: fcVersion,
 	})
-	snap, err := sbx.Pause(ctx, templateMetadata)
+	snap, err := sbx.Pause(ctx, templateMetadata, sandbox.SnapshotUseCasePause)
 	require.NoError(b, err)
 	require.NotNil(b, snap)
 
diff --git a/packages/orchestrator/cmd/resume-build/main.go b/packages/orchestrator/cmd/resume-build/main.go
index 3af18330ba..dd780e13da 100644
--- a/packages/orchestrator/cmd/resume-build/main.go
+++ b/packages/orchestrator/cmd/resume-build/main.go
@@ -640,7 +640,7 @@ func (r *runner) pauseOnce(ctx context.Context, opts pauseOptions, verbose bool)
 
 	// Pause and create snapshot
 	pauseStart := time.Now()
-	snapshot, err := sbx.Pause(ctx, newMeta)
+	snapshot, err := sbx.Pause(ctx, newMeta, sandbox.SnapshotUseCasePause)
 	pauseDur := time.Since(pauseStart)
 	totalDur := time.Since(t0)
 
diff --git a/packages/orchestrator/pkg/sandbox/sandbox.go b/packages/orchestrator/pkg/sandbox/sandbox.go
index 167887600d..8cc61d7a73 100644
--- a/packages/orchestrator/pkg/sandbox/sandbox.go
+++ b/packages/orchestrator/pkg/sandbox/sandbox.go
@@ -1032,6 +1032,7 @@ func (s *Sandbox) Shutdown(ctx context.Context) error {
 func (s *Sandbox) Pause(
 	ctx context.Context,
 	m metadata.Template,
+	useCase SnapshotUseCase,
 ) (st *Snapshot, e error) {
 	ctx, span := tracer.Start(ctx, "sandbox-snapshot")
 	defer span.End()
@@ -1092,6 +1093,7 @@ func (s *Sandbox) Pause(
 	if err != nil {
 		return nil, fmt.Errorf("failed to get memfile metadata: %w", err)
 	}
+	recordSnapshotDiff(ctx, "memfile", memfileDiffMetadata, originalMemfile.Header(), useCase)
 
 	// Start POSTPROCESSING
 	memfileDiff, memfileDiffHeader, err := pauseProcessMemory(
@@ -1116,6 +1118,7 @@ func (s *Sandbox) Pause(
 			closeHook: s.Close,
 		},
 		s.config.DefaultCacheDir,
+		useCase,
 	)
 	if err != nil {
 		return nil, fmt.Errorf("error while post processing: %w", err)
@@ -1200,6 +1203,7 @@ func pauseProcessRootfs(
 	originalHeader *header.Header,
 	diffCreator DiffCreator,
 	cacheDir string,
+	useCase SnapshotUseCase,
 ) (d build.Diff, h *header.Header, e error) {
 	ctx, span := tracer.Start(ctx, "process-rootfs")
 	defer span.End()
@@ -1216,6 +1220,7 @@ func pauseProcessRootfs(
 		return nil, nil, fmt.Errorf("error creating diff: %w", err)
 	}
 	telemetry.ReportEvent(ctx, "exported rootfs")
+	recordSnapshotDiff(ctx, "rootfs", rootfsDiffMetadata, originalHeader, useCase)
 
 	rootfsDiff, err := rootfsDiffFile.CloseToDiff(int64(originalHeader.Metadata.BlockSize))
 	if err != nil {
diff --git a/packages/orchestrator/pkg/sandbox/snapshot_metrics.go b/packages/orchestrator/pkg/sandbox/snapshot_metrics.go
new file mode 100644
index 0000000000..4659f6c61a
--- /dev/null
+++ b/packages/orchestrator/pkg/sandbox/snapshot_metrics.go
@@ -0,0 +1,102 @@
+package sandbox
+
+import (
+	"context"
+	"math/bits"
+
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/metric"
+
+	"github.com/e2b-dev/infra/packages/shared/pkg/storage/header"
+	"github.com/e2b-dev/infra/packages/shared/pkg/telemetry"
+	"github.com/e2b-dev/infra/packages/shared/pkg/utils"
+)
+
+// Histograms recorded by recordSnapshotDiff, once per snapshotted file.
+var (
+	snapshotDiffBytes   = utils.Must(telemetry.GetHistogram(meter, telemetry.SnapshotDiffBytes))
+	snapshotDiffRatioBp = utils.Must(telemetry.GetHistogram(meter, telemetry.SnapshotDiffRatioBp))
+	snapshotTotalBytes  = utils.Must(telemetry.GetHistogram(meter, telemetry.SnapshotTotalBytes))
+)
+
+// SnapshotUseCase identifies why a snapshot is being taken. It is attached to
+// the snapshot diff metrics as the "use_case" attribute.
+type SnapshotUseCase string
+
+const (
+	// SnapshotUseCasePause is passed by the sandbox pause/snapshot call sites.
+	SnapshotUseCasePause SnapshotUseCase = "pause"
+	// SnapshotUseCaseBuild is passed by the template build layer executor.
+	SnapshotUseCaseBuild SnapshotUseCase = "build"
+)
+
+// recordSnapshotDiff reports the size of one snapshotted file ("memfile" or
+// "rootfs") and how much of it is dirty or empty, both in bytes and as a
+// share of the total size in basis points. It is a no-op when the diff
+// metadata, the original header, or the header's metadata is missing.
+func recordSnapshotDiff(
+	ctx context.Context,
+	fileType string,
+	dm *header.DiffMetadata,
+	original *header.Header,
+	useCase SnapshotUseCase,
+) {
+	if dm == nil || original == nil || original.Metadata == nil {
+		return
+	}
+
+	blockSize := int64(original.Metadata.BlockSize)
+	total := int64(original.Metadata.Size)
+
+	ft := attribute.String("file_type", fileType)
+	uc := attribute.String("use_case", string(useCase))
+
+	snapshotTotalBytes.Record(ctx, total, metric.WithAttributes(ft, uc))
+
+	// Dirty/Empty cardinalities are treated as block counts and scaled to bytes.
+	var dirtyBytes, emptyBytes int64
+	if dm.Dirty != nil {
+		dirtyBytes = int64(dm.Dirty.GetCardinality()) * blockSize
+	}
+	if dm.Empty != nil {
+		emptyBytes = int64(dm.Empty.GetCardinality()) * blockSize
+	}
+
+	// A fixed array keeps the recording order deterministic and avoids
+	// allocating a map for two constant keys on every snapshot.
+	for _, part := range [...]struct {
+		kind string
+		size int64
+	}{
+		{kind: "dirty", size: dirtyBytes},
+		{kind: "empty", size: emptyBytes},
+	} {
+		attrs := metric.WithAttributes(ft, attribute.String("kind", part.kind), uc)
+		snapshotDiffBytes.Record(ctx, part.size, attrs)
+		snapshotDiffRatioBp.Record(ctx, ratioBp(part.size, total), attrs)
+	}
+}
+
+// ratioBp returns num/denom in basis points (10000 = 100.00%) so we keep
+// sub-percent resolution. Grafana panels divide by 100 to display percent.
+//
+// The result is truncated toward zero and clamped to [0, 10000]; a
+// non-positive num or denom yields 0. The multiplication is carried out in
+// 128 bits so that very large byte counts cannot overflow int64.
+func ratioBp(num, denom int64) int64 {
+	const fullScale = 10000
+
+	if num <= 0 || denom <= 0 {
+		return 0
+	}
+	if num >= denom {
+		return fullScale
+	}
+
+	// 0 < num < denom here, so the high word of the product is smaller than
+	// denom and bits.Div64 cannot panic; the quotient is below fullScale.
+	hi, lo := bits.Mul64(uint64(num), fullScale)
+	bp, _ := bits.Div64(hi, lo, uint64(denom))
+
+	return int64(bp)
+}
diff --git a/packages/orchestrator/pkg/server/sandboxes.go b/packages/orchestrator/pkg/server/sandboxes.go
index b2216e0d19..f0ad1901cc 100644
--- a/packages/orchestrator/pkg/server/sandboxes.go
+++ b/packages/orchestrator/pkg/server/sandboxes.go
@@ -746,7 +746,7 @@ func (s *Server) snapshotAndCacheSandbox(
 		FirecrackerVersion: sbx.Config.FirecrackerConfig.FirecrackerVersion,
 	})
 
-	snapshot, err := sbx.Pause(ctx, meta)
+	snapshot, err := sbx.Pause(ctx, meta, sandbox.SnapshotUseCasePause)
 	if err != nil {
 		return nil, fmt.Errorf("error snapshotting sandbox: %w", err)
 	}
diff --git a/packages/orchestrator/pkg/template/build/layer/layer_executor.go b/packages/orchestrator/pkg/template/build/layer/layer_executor.go
index f9d72fa266..642cd09aa2 100644
--- a/packages/orchestrator/pkg/template/build/layer/layer_executor.go
+++ b/packages/orchestrator/pkg/template/build/layer/layer_executor.go
@@ -260,6 +260,7 @@ func (lb *LayerExecutor) PauseAndUpload(
 	snapshot, err := sbx.Pause(
 		ctx,
 		meta,
+		sandbox.SnapshotUseCaseBuild,
 	)
 	if err != nil {
 		return fmt.Errorf("error processing vm: %w", err)
diff --git a/packages/shared/pkg/telemetry/meters.go b/packages/shared/pkg/telemetry/meters.go
index dd18e03381..e2a539ee7e 100644
--- a/packages/shared/pkg/telemetry/meters.go
+++ b/packages/shared/pkg/telemetry/meters.go
@@ -121,6 +121,10 @@ const (
 	SandboxFCBlockRateLimiterEventCount HistogramType = "orchestrator.sandbox.fc.block.rate_limiter_event_count"
 	SandboxFCBlockIOEngineThrottled     HistogramType = "orchestrator.sandbox.fc.block.io_engine_throttled"
 	SandboxFCBlockRemainingReqs         HistogramType = "orchestrator.sandbox.fc.block.remaining_reqs"
+
+	SnapshotDiffBytes   HistogramType = "orchestrator.sandbox.snapshot.diff.bytes"
+	SnapshotDiffRatioBp HistogramType = "orchestrator.sandbox.snapshot.diff.ratio_bp"
+	SnapshotTotalBytes  HistogramType = "orchestrator.sandbox.snapshot.total.bytes"
 )
 
 const (
@@ -352,6 +356,10 @@ var histogramDesc = map[HistogramType]string{
 	SandboxFCBlockRateLimiterEventCount: "Distribution of Firecracker VMM block rate limiter events per metrics flush",
 	SandboxFCBlockIOEngineThrottled:     "Distribution of Firecracker VMM block ops throttled by io_uring engine per metrics flush",
 	SandboxFCBlockRemainingReqs:         "Distribution of Firecracker VMM block queue remaining-request events per metrics flush",
+
+	SnapshotDiffBytes:   "Per-snapshot dirty/empty bytes per file",
+	SnapshotDiffRatioBp: "Per-snapshot dirty/empty as fraction of total mapped size, in basis points (10000=100%)",
+	SnapshotTotalBytes:  "Per-snapshot total mapped size of the file",
 }
 
 var histogramUnits = map[HistogramType]string{
@@ -382,6 +390,10 @@ var histogramUnits = map[HistogramType]string{
 	SandboxFCBlockRateLimiterEventCount: "{event}",
 	SandboxFCBlockIOEngineThrottled:     "{op}",
 	SandboxFCBlockRemainingReqs:         "{event}",
+
+	SnapshotDiffBytes:   "{By}",
+	SnapshotDiffRatioBp: "{1}",
+	SnapshotTotalBytes:  "{By}",
 }
 
 func GetHistogram(meter metric.Meter, name HistogramType) (metric.Int64Histogram, error) {