Skip to content

Commit 49b8f16

Browse files
arkamar and jakubno authored
feat(orchestrator): improve concurrent benchmark tracing and enable huge pages (#2327)
* feat(orchestrator): add concurrency and sandbox attributes to benchmark spans

  Add concurrency level, sandbox index, and sandbox ID attributes to the bench-resume span so traces can be filtered by concurrency level in Grafana/Tempo (e.g. {span.concurrency=5}).

* feat(orchestrator): enable huge pages by default in concurrent benchmark

  Production uses huge pages, so the benchmark should too. Disable with DISABLE_HUGE_PAGES=true for comparison. Uses a separate build ID per mode to avoid cache collisions.

* feat(orchestrator): add spans for uffd socket and rootfs waits in resume-fc

  During concurrent sandbox creation, resume-fc blocks on several parallel waits before it can load the snapshot. These waits were previously invisible — only covered by point-in-time ReportEvent calls that do not capture duration. Adding duration spans makes them visible as bars in the Grafana waterfall view. This is important because wait-rootfs-path turned out to be the primary bottleneck, growing significantly as more sandboxes are created simultaneously.

* fix(orchestrator): parse DISABLE_HUGE_PAGES as a boolean flag

  Use strconv.ParseBool so that common boolean env values like 1, TRUE, or True are accepted, not just the exact string "true".

* Update packages/orchestrator/pkg/sandbox/fc/process.go

  Co-authored-by: Jakub Novák <jakub@e2b.dev>

---------

Co-authored-by: Jakub Novák <jakub@e2b.dev>
1 parent cdab472 commit 49b8f16

2 files changed

Lines changed: 30 additions & 4 deletions

File tree

packages/orchestrator/benchmarks/concurrent_benchmark_test.go

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,10 @@ import (
3333
"github.com/stretchr/testify/assert"
3434
"github.com/stretchr/testify/require"
3535
"go.opentelemetry.io/otel"
36+
"go.opentelemetry.io/otel/attribute"
3637
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
3738
"go.opentelemetry.io/otel/metric/noop"
39+
"go.opentelemetry.io/otel/trace"
3840

3941
"github.com/e2b-dev/infra/packages/clickhouse/pkg/hoststats"
4042
"github.com/e2b-dev/infra/packages/orchestrator/pkg/cfg"
@@ -137,11 +139,20 @@ func BenchmarkConcurrentResume(b *testing.B) {
137139
kernelVersion = "vmlinux-6.1.158"
138140
fcVersion = featureflags.DefaultFirecrackerVersion
139141
templateID = "fcb33d09-3141-42c4-8d3b-c2df411681db"
140-
buildID = "ba6aae36-74f7-487a-b6f7-74fd7c94e479"
141-
useHugePages = false
142142
templateVersion = "v2.0.0"
143+
144+
buildIDNormal = "ba6aae36-74f7-487a-b6f7-74fd7c94e479"
145+
buildIDHugePages = "ba6aae36-74f7-487a-b6f7-74fd7c94e480"
143146
)
144147

148+
disableHP, _ := strconv.ParseBool(os.Getenv("DISABLE_HUGE_PAGES"))
149+
useHugePages := !disableHP
150+
buildID := buildIDHugePages
151+
if !useHugePages {
152+
buildID = buildIDNormal
153+
b.Log("huge pages disabled")
154+
}
155+
145156
// cache & ephemeral paths
146157
persistenceDir := getPersistenceDir()
147158
kernelsDir := filepath.Join(persistenceDir, "kernels")
@@ -423,9 +434,17 @@ func runConcurrentResume(
423434
// Wait for the barrier.
424435
<-gate
425436

437+
ctx, span := tracer.Start(b.Context(), "bench-resume",
438+
trace.WithAttributes(
439+
attribute.Int("concurrency", n),
440+
attribute.Int("sandbox.index", i),
441+
attribute.String("sandbox.id", runtime.SandboxID),
442+
),
443+
)
444+
426445
start := time.Now()
427446
sbx, err := factory.ResumeSandbox(
428-
b.Context(),
447+
ctx,
429448
tmpl,
430449
config,
431450
runtime,
@@ -434,6 +453,7 @@ func runConcurrentResume(
434453
nil,
435454
)
436455
elapsed := time.Since(start)
456+
span.End()
437457

438458
results[i] = concurrencyResult{
439459
sandboxID: runtime.SandboxID,

packages/orchestrator/pkg/sandbox/fc/process.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,10 @@ func (p *Process) Resume(
489489
})
490490

491491
eg.Go(func() error {
492-
err := socket.Wait(egCtx, uffdSocketPath)
492+
ctx, uffdSpan := tracer.Start(egCtx, "wait-uffd-socket")
493+
err := socket.Wait(ctx, uffdSocketPath)
494+
uffdSpan.End()
495+
493496
if err != nil {
494497
return fmt.Errorf("error waiting for uffd socket: %w", err)
495498
}
@@ -500,7 +503,10 @@ func (p *Process) Resume(
500503
})
501504

502505
eg.Go(func() error {
506+
_, rootfsSpan := tracer.Start(egCtx, "wait-rootfs-path")
503507
rootfsPath, err := p.rootfsProvider.Path()
508+
rootfsSpan.End()
509+
504510
if err != nil {
505511
return fmt.Errorf("error getting rootfs path: %w", err)
506512
}

0 commit comments

Comments
 (0)