diff --git a/packages/orchestrator/cmd/copy-build/main.go b/packages/orchestrator/cmd/copy-build/main.go index 438c75cea6..cd4f2983b1 100644 --- a/packages/orchestrator/cmd/copy-build/main.go +++ b/packages/orchestrator/cmd/copy-build/main.go @@ -28,13 +28,16 @@ import ( type Destination struct { Path string CRC uint32 + exists bool isLocal bool } func NewDestinationFromObject(ctx context.Context, o *googleStorage.ObjectHandle) (*Destination, error) { var crc uint32 + exists := false if attrs, err := o.Attrs(ctx); err == nil { crc = attrs.CRC32C + exists = true } else if !errors.Is(err, googleStorage.ErrObjectNotExist) { return nil, fmt.Errorf("failed to get object attributes: %w", err) } @@ -42,6 +45,7 @@ func NewDestinationFromObject(ctx context.Context, o *googleStorage.ObjectHandle return &Destination{ Path: fmt.Sprintf("gs://%s/%s", o.BucketName(), o.ObjectName()), CRC: crc, + exists: exists, isLocal: false, }, nil } @@ -67,6 +71,7 @@ func NewDestinationFromPath(prefix, file string) (*Destination, error) { return &Destination{ Path: p, CRC: crc, + exists: true, isLocal: true, }, nil } @@ -194,6 +199,116 @@ func gcloudCopy(ctx context.Context, from, to *Destination) error { return nil } +// readTemplateVersions reads the build's metadata.json (from the destination, where it +// has just been copied) and returns its kernel and firecracker version strings. 
+func readTemplateVersions(ctx context.Context, client *googleStorage.Client, to, metadataPath string) (kernelVer, fcVer string, err error) { + var r io.ReadCloser + if strings.HasPrefix(to, "gs://") { + bucketName, _ := strings.CutPrefix(to, "gs://") + r, err = client.Bucket(bucketName).Object(metadataPath).NewReader(ctx) + } else { + r, err = os.Open(path.Join(to, "templates", metadataPath)) + } + if err != nil { + return "", "", err + } + defer r.Close() + + var meta struct { + Template struct { + KernelVersion string `json:"kernel_version"` + FirecrackerVersion string `json:"firecracker_version"` + } `json:"template"` + } + if err := json.NewDecoder(r).Decode(&meta); err != nil { + return "", "", err + } + + return meta.Template.KernelVersion, meta.Template.FirecrackerVersion, nil +} + +// deriveArtifactBuckets maps a template bucket (gs://<prefix>-fc-templates) to the +// sibling firecracker-versions and kernels buckets in the same project/env. The build +// records no environment, so the env is taken from the template bucket location. +func deriveArtifactBuckets(templateLoc string) (fcBucket, kernelBucket string, err error) { + if !strings.HasPrefix(templateLoc, "gs://") { + return "", "", fmt.Errorf("-gdb requires a gs:// location, got %q", templateLoc) + } + bucket := strings.TrimSuffix(strings.TrimPrefix(templateLoc, "gs://"), "/") + prefix, ok := strings.CutSuffix(bucket, "-fc-templates") + if !ok { + return "", "", fmt.Errorf("cannot derive versions/kernels buckets from %q (expected a *-fc-templates bucket)", templateLoc) + } + + return "gs://" + prefix + "-fc-versions", "gs://" + prefix + "-fc-kernels", nil +} + +// copyGdbArtifacts ensures the build's FC + kernel runtime and debug artifacts exist at +// the destination, copying each from the source env's versions/kernels buckets only if +// it is not already present at the destination (so a large vmlinux.debug is not recopied +// for a version already there). 
Required artifacts must exist at the source; the optional +// firecracker-debug.debug (FC's own symbols, not needed for guest-kernel gdb) is skipped +// if absent. +func copyGdbArtifacts(ctx context.Context, client *googleStorage.Client, from, to, arch, fcVer, kernelVer string) error { + fcFrom, kFrom, err := deriveArtifactBuckets(from) + if err != nil { + return err + } + fcTo, kTo, err := deriveArtifactBuckets(to) + if err != nil { + return err + } + + artifacts := []struct { + name string + srcBucket, dstBucket, obj string + required bool + }{ + {"firecracker", fcFrom, fcTo, path.Join(fcVer, arch, "firecracker"), true}, + {"firecracker-debug", fcFrom, fcTo, path.Join(fcVer, arch, "firecracker-debug"), true}, + {"firecracker-debug.debug", fcFrom, fcTo, path.Join(fcVer, arch, "firecracker-debug.debug"), false}, + {"vmlinux.bin", kFrom, kTo, path.Join(kernelVer, arch, "vmlinux.bin"), true}, + {"vmlinux.debug", kFrom, kTo, path.Join(kernelVer, arch, "vmlinux.debug"), true}, + } + + for _, a := range artifacts { + srcBucket := strings.TrimPrefix(a.srcBucket, "gs://") + dstBucket := strings.TrimPrefix(a.dstBucket, "gs://") + src, err := NewDestinationFromObject(ctx, client.Bucket(srcBucket).Object(a.obj)) + if err != nil { + return fmt.Errorf("stat source %s: %w", a.name, err) + } + dst, err := NewDestinationFromObject(ctx, client.Bucket(dstBucket).Object(a.obj)) + if err != nil { + return fmt.Errorf("stat destination %s: %w", a.name, err) + } + + if !src.exists { // not present at source + if a.required { + return fmt.Errorf("required gdb artifact %s not found at %s (is this version published with gdb support?)", a.name, src.Path) + } + fmt.Fprintf(os.Stderr, "-> optional gdb artifact %s not at source, skipping\n", a.name) + + continue + } + // Skip only when the destination already holds identical content (CRC32C match); + // otherwise copy, replacing a divergent/stale or absent artifact rather than + // trusting it. 
The dst.exists guard avoids a false 0 == 0 match when the + // destination is missing and the source's CRC32C is genuinely zero. + if dst.exists && src.CRC == dst.CRC { + fmt.Fprintf(os.Stderr, "-> gdb artifact %s already current at destination, skipping\n", a.name) + + continue + } + fmt.Fprintf(os.Stderr, "+ copying gdb artifact '%s' to '%s'\n", src.Path, dst.Path) + if err := gcloudCopy(ctx, src, dst); err != nil { + return fmt.Errorf("copy %s: %w", a.name, err) + } + } + + return nil +} + func main() { buildId := flag.String("build", "", "build id") from := flag.String("from", "", "from destination") @@ -204,12 +319,19 @@ func main() { memory := flag.Int("memory", 1024, "memory MB") disk := flag.Int("disk", 1024, "disk MB") tag := flag.String("tag", "default", "build assignment tag") + gdb := flag.Bool("gdb", false, "also copy the build's FC + kernel runtime and debug artifacts (firecracker, firecracker-debug, vmlinux.bin, vmlinux.debug) into the matching versions/kernels buckets so the snapshot is gdb-ready at the destination; requires gs:// -from/-to") + arch := flag.String("arch", "amd64", "artifact arch for -gdb (amd64 or arm64)") flag.Parse() if *teamID != "" && *envdVersion == "" { log.Fatal("-envd-version is required when -team is set") } + // Validate -gdb's gs:// precondition up front (deriveArtifactBuckets needs it), so a + // wrong invocation fails instantly rather than after the multi-GB snapshot copy. + if *gdb && (!strings.HasPrefix(*from, "gs://") || !strings.HasPrefix(*to, "gs://")) { + log.Fatal("-gdb requires gs:// -from and -to (it stages debug artifacts between bucket environments)") + } fmt.Fprintf(os.Stderr, "Copying build '%s' from '%s' to '%s'\n", *buildId, *from, *to) @@ -378,48 +500,34 @@ func main() { fmt.Fprintf(os.Stderr, "Build '%s' copied to '%s'\n", *buildId, *to) - if *teamID != "" { - // Read metadata.json from destination to get kernel and firecracker versions. 
- var metadataReader io.ReadCloser - if strings.HasPrefix(*to, "gs://") { - bucketName, _ := strings.CutPrefix(*to, "gs://") - obj := googleStorageClient.Bucket(bucketName).Object(metadataPath) - r, err := obj.NewReader(ctx) - if err != nil { - log.Fatalf("failed to read metadata from GCS: %s", err) - } - metadataReader = r - } else { - f, err := os.Open(path.Join(*to, "templates", metadataPath)) - if err != nil { - log.Fatalf("failed to read metadata from local path: %s", err) - } - metadataReader = f + if *teamID != "" || *gdb { + // metadata.json (just copied to the destination) carries the kernel + FC + // versions; both the -gdb artifact copy and the -team SQL seed need them. + kernelVer, fcVer, err := readTemplateVersions(ctx, googleStorageClient, *to, metadataPath) + if err != nil { + log.Fatalf("failed to read template versions from metadata: %s", err) } - var meta struct { - Template struct { - KernelVersion string `json:"kernel_version"` - FirecrackerVersion string `json:"firecracker_version"` - } `json:"template"` + if *gdb { + if err := copyGdbArtifacts(ctx, googleStorageClient, *from, *to, *arch, fcVer, kernelVer); err != nil { + log.Fatalf("failed to copy gdb artifacts: %s", err) + } + fmt.Fprintf(os.Stderr, "gdb artifacts ensured at destination (arch %s)\n", *arch) } - if err := json.NewDecoder(metadataReader).Decode(&meta); err != nil { - metadataReader.Close() - log.Fatalf("failed to decode metadata.json: %s", err) + + if *teamID != "" { + envID := id.Generate() + fmt.Fprintf(os.Stderr, "\n\nGenerated env ID: %s\n\n", envID) + + fmt.Printf("BEGIN;\n") + fmt.Printf("INSERT INTO public.envs (id, team_id, updated_at, public, source)\n") + fmt.Printf("VALUES ('%s', '%s', NOW(), FALSE, 'template');\n\n", envID, *teamID) + fmt.Printf("INSERT INTO public.env_builds (id, env_id, updated_at, finished_at, status, ram_mb, vcpu, kernel_version, firecracker_version, envd_version, free_disk_size_mb, total_disk_size_mb)\n") + fmt.Printf("VALUES ('%s', '%s', 
NOW(), NOW(), 'uploaded', %d, %d, '%s', '%s', '%s', %d, %d);\n\n", + *buildId, envID, *memory, *vcpu, kernelVer, fcVer, *envdVersion, *disk, *disk) + fmt.Printf("INSERT INTO public.env_build_assignments (env_id, build_id, tag)\n") + fmt.Printf("VALUES ('%s', '%s', '%s');\n", envID, *buildId, *tag) + fmt.Printf("COMMIT;\n") } - metadataReader.Close() - - envID := id.Generate() - fmt.Fprintf(os.Stderr, "\n\nGenerated env ID: %s\n\n", envID) - - fmt.Printf("BEGIN;\n") - fmt.Printf("INSERT INTO public.envs (id, team_id, updated_at, public, source)\n") - fmt.Printf("VALUES ('%s', '%s', NOW(), FALSE, 'template');\n\n", envID, *teamID) - fmt.Printf("INSERT INTO public.env_builds (id, env_id, updated_at, finished_at, status, ram_mb, vcpu, kernel_version, firecracker_version, envd_version, free_disk_size_mb, total_disk_size_mb)\n") - fmt.Printf("VALUES ('%s', '%s', NOW(), NOW(), 'uploaded', %d, %d, '%s', '%s', '%s', %d, %d);\n\n", - *buildId, envID, *memory, *vcpu, meta.Template.KernelVersion, meta.Template.FirecrackerVersion, *envdVersion, *disk, *disk) - fmt.Printf("INSERT INTO public.env_build_assignments (env_id, build_id, tag)\n") - fmt.Printf("VALUES ('%s', '%s', '%s');\n", envID, *buildId, *tag) - fmt.Printf("COMMIT;\n") } } diff --git a/packages/orchestrator/cmd/resume-build/fc-debug.gdb b/packages/orchestrator/cmd/resume-build/fc-debug.gdb index dd4c4abff5..786ab7900a 100644 --- a/packages/orchestrator/cmd/resume-build/fc-debug.gdb +++ b/packages/orchestrator/cmd/resume-build/fc-debug.gdb @@ -23,7 +23,7 @@ define fc-faults else set $_fc_n = $arg0 end - break *handle_mm_fault + hbreak *handle_mm_fault set $_fc_i = 0 while $_fc_i < $_fc_n continue diff --git a/packages/orchestrator/cmd/resume-build/gdb-debugging.md b/packages/orchestrator/cmd/resume-build/gdb-debugging.md index ec01f113cd..cee790582c 100644 --- a/packages/orchestrator/cmd/resume-build/gdb-debugging.md +++ b/packages/orchestrator/cmd/resume-build/gdb-debugging.md @@ -12,20 +12,33 @@ kernel state) 
beyond what host/UFFD telemetry exposes. `gdb` on PATH. The two debug artifacts — `firecracker-debug` (Firecracker built `--features gdb`) and `vmlinux.debug` (the guest kernel's split DWARF symbols) — are -**fetched automatically by version**, matched to the snapshot's `FirecrackerVersion` -/ `KernelVersion` (which `resume-build` prints when it loads the build), from -`https://storage.googleapis.com/e2b-prod-public-builds`. In the common case you pass -nothing. - -Supplying them yourself is only needed when the fetch can't find them — before the -fc-versions / fc-kernels pipelines publish them, or for a locally-built kernel/FC. -Then point `E2B_GDB_ARTIFACTS_URL` at a base that serves them, or pass explicit paths +resolved **locally**, by version (matched to the snapshot's `FirecrackerVersion` / +`KernelVersion`, which `resume-build` prints when it loads the build): `firecracker-debug` +next to the snapshot's `firecracker`, and `vmlinux.debug` next to its `vmlinux.bin`. The +fc-versions / fc-kernels releases publish them into those version dirs (and `copy-build +-gdb` stages them when bridging a snapshot — see step 1), so in the common case they are +already present and you pass nothing. + +They are **not** fetched over the network. If they aren't present, pass explicit paths with `-gdb-fc` / `-gdb-symbols` (see *Preparing the artifacts*). ## Steps 1. **Copy the build chain** to the dev node's local storage (`copy-build`), as for - `resume-prod-snapshot`. + `resume-prod-snapshot` — or resume straight from a bucket with `-storage gs://…`. 
+ + To make the debug artifacts resolvable when the snapshot comes from another + environment, stage them with a **separate `gs://`→`gs://` bridge** (not part of the + local copy above): + + ```bash + copy-build -gdb -from gs://<src>-fc-templates -to gs://<dst>-fc-templates + ``` + + `-gdb` copies `firecracker-debug` / `vmlinux.debug` (plus the runtime FC/kernel) into + the destination's `-fc-versions` / `-fc-kernels` buckets, so they sit next to the + FC/kernel on the node's mounts and resolve locally. It requires `gs://` for both + `-from` and `-to`; it does not write to local paths. 2. **Resume under gdb** (interactive) — the common case needs no extra flags: @@ -51,10 +64,10 @@ connected.) ## Preparing the artifacts -Normally you don't — `resume-build` fetches `firecracker-debug` and `vmlinux.debug` -automatically (see *Prerequisites*). Build them by hand only when the release -pipelines haven't published them for your version, or to debug a locally-built -kernel/FC: +Normally you don't — the fc-versions / fc-kernels releases publish `firecracker-debug` +and `vmlinux.debug` into the FC-version / kernel-version dirs, and `copy-build -gdb` +stages them next to a bridged snapshot (see *Prerequisites*). Build them by hand only to +debug a locally-built kernel/FC, or a version the releases predate: - **`firecracker-debug`** — Firecracker built `--features gdb` (release profile): `cargo build --release --features gdb -p firecracker`. Pass with `-gdb-fc`. @@ -63,10 +76,9 @@ kernel/FC: toolchain as the deployed kernel (gcc 13.x / Ubuntu 24.04) so the symbol addresses match the snapshot. Pass with `-gdb-symbols`. -Then either pass `-gdb-fc` / `-gdb-symbols` explicitly, stage them at the conventional -local paths (`firecracker-debug` in the FC-version dir, `vmlinux.debug` in the -kernel-version dir), or set `E2B_GDB_ARTIFACTS_URL` to a base that serves them at -`firecrackers///firecracker-debug` and `kernels///vmlinux.debug`. 
+Then pass `-gdb-fc` / `-gdb-symbols` explicitly, or stage them at the conventional local +paths (`firecracker-debug` in the FC-version dir, `vmlinux.debug` in the kernel-version +dir). ## Macros (`fc-debug.gdb`) diff --git a/packages/orchestrator/cmd/resume-build/gdb.go b/packages/orchestrator/cmd/resume-build/gdb.go index f6bcca1f24..de9dec60fe 100644 --- a/packages/orchestrator/cmd/resume-build/gdb.go +++ b/packages/orchestrator/cmd/resume-build/gdb.go @@ -1,12 +1,12 @@ package main import ( + "bytes" "context" + _ "embed" "errors" "fmt" "io" - "net/http" - "net/url" "os" "os/exec" "path/filepath" @@ -14,6 +14,7 @@ import ( "time" "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/artifact" "github.com/e2b-dev/infra/packages/shared/pkg/utils" ) @@ -47,7 +48,7 @@ func (r *runner) gdbMode(ctx context.Context, opts gdbOptions) error { if _, err := exec.LookPath("gdb"); err != nil { return fmt.Errorf("gdb not found on PATH: %w", err) } - fcBinary, symbols, err := r.gdbResolveArtifacts(ctx, opts) + fcBinary, symbols, err := r.gdbResolveArtifacts(opts) if err != nil { return err } @@ -58,15 +59,30 @@ func (r *runner) gdbMode(ctx context.Context, opts gdbOptions) error { // CONFIG_RANDOMIZE_BASE and CONFIG_RANDOMIZE_MEMORY stay inert (gated on a flag // only the decompressor sets). So we load symbols at offset 0. - // 3. Stage the gdb-enabled Firecracker binary at the path resume-build resolves - // for this snapshot's FC version, backing up whatever is there and restoring - // it on exit (so the local prod binary is left untouched). - fcPath := r.sbxConfig.FirecrackerConfig.FirecrackerPath(r.config) - restore, err := stageBinary(fcBinary, fcPath) - if err != nil { + // 3. Stage the gdb-enabled Firecracker into the writable temp FirecrackerVersionsDir + // that run() points the factory at for gdb mode. 
The factory (and thus the launch) + // resolves FC from this dir at resume, so we never overwrite the prod binary in the + // real versions dir — which on cluster nodes is a read-only gcsfuse mount where the + // old in-place swap failed. The kernel dir is untouched. + stagedFC := filepath.Join(r.config.FirecrackerVersionsDir, r.sbxConfig.FirecrackerConfig.FirecrackerVersion, utils.TargetArch(), artifact.FirecrackerBinaryName) + if err := os.MkdirAll(filepath.Dir(stagedFC), 0o755); err != nil { + return fmt.Errorf("gdb fc staging dir: %w", err) + } + if err := copyFile(fcBinary, stagedFC, 0o755); err != nil { return fmt.Errorf("stage debug firecracker: %w", err) } - defer restore() + fcPath := r.sbxConfig.FirecrackerConfig.FirecrackerPath(r.config) + + // Backstop: confirm the binary we are about to launch is actually gdb-enabled. This + // guards both a stale/wrong firecracker-debug and any future regression that resolves + // FC from somewhere other than the staging dir — otherwise FC starts but never opens + // the stub, surfacing only as an opaque "gdb socket never bound" later. + if ok, gdbErr := fileContainsGdbStub(fcPath); gdbErr != nil { + return fmt.Errorf("check staged firecracker: %w", gdbErr) + } else if !ok { + return fmt.Errorf("firecracker to launch (%s) is not gdb-enabled (no FIRECRACKER_GDB_SOCKET); "+ + "it must be built with --features gdb (see fc-versions build.sh)", fcPath) + } // 4. Arm the stub via the env var (FC inherits resume-build's env; no jailer // here), and tell the resume path not to wait for envd — the guest never @@ -154,45 +170,26 @@ func (r *runner) gdbMode(ctx context.Context, opts gdbOptions) error { return runGdb(ctx, initScript, opts) } -// defaultDebugArtifactsBaseURL is where the fc-versions / fc-kernels release pipelines -// publish the debug artifacts (firecracker-debug, vmlinux.debug), alongside the prod -// firecracker/vmlinux that create-build already fetches from here. 
-const defaultDebugArtifactsBaseURL = "https://storage.googleapis.com/e2b-prod-public-builds" - -// debugArtifactsBaseURL is the base URL to fetch firecracker-debug / vmlinux.debug from. -// Overridable via E2B_GDB_ARTIFACTS_URL (e.g. to point at a bucket you can read before -// the artifacts are published to the public one). -func debugArtifactsBaseURL() string { - if u := os.Getenv("E2B_GDB_ARTIFACTS_URL"); u != "" { - return strings.TrimRight(u, "/") - } - - return defaultDebugArtifactsBaseURL -} - // gdbResolveArtifacts resolves the debug FC binary and the vmlinux.debug symbols. Each -// is taken from its -gdb-* override if set, else a local staged copy if present, else -// fetched by version from the release buckets (see debugArtifactsBaseURL) — mirroring -// how create-build fetches the prod kernel/FC. -func (r *runner) gdbResolveArtifacts(ctx context.Context, opts gdbOptions) (fcBinary, symbols string, err error) { - arch := utils.TargetArch() +// is taken from its -gdb-* override if set, else a local copy next to the snapshot's FC / +// kernel — where the fc-versions/fc-kernels buckets, and copy-build -gdb, place them. The +// artifacts are not fetched over the network. 
+func (r *runner) gdbResolveArtifacts(opts gdbOptions) (fcBinary, symbols string, err error) { fcVer := r.sbxConfig.FirecrackerConfig.FirecrackerVersion kernelVer := r.sbxConfig.FirecrackerConfig.KernelVersion - fcDir := filepath.Dir(r.sbxConfig.FirecrackerConfig.FirecrackerPath(r.config)) - kernelDir := filepath.Dir(r.sbxConfig.FirecrackerConfig.HostKernelPath(r.config)) - base := debugArtifactsBaseURL() - - fcURL, err := url.JoinPath(base, "firecrackers", fcVer, arch, "firecracker-debug") - if err != nil { - return "", "", fmt.Errorf("firecracker-debug URL: %w", err) - } - symURL, err := url.JoinPath(base, "kernels", kernelVer, arch, "vmlinux.debug") - if err != nil { - return "", "", fmt.Errorf("vmlinux.debug URL: %w", err) - } - - fcBinary, fcErr := resolveOrFetch(ctx, opts.fcBinary, filepath.Join(fcDir, "firecracker-debug"), fcURL, 0o755) - symbols, symErr := resolveOrFetch(ctx, opts.symbols, filepath.Join(kernelDir, "vmlinux.debug"), symURL, 0o644) + // Resolve the debug artifacts from the ORIGINAL versions dir: in gdb mode run() points + // the runner's FirecrackerVersionsDir at a writable temp staging dir, but the published + // firecracker-debug lives in the original (read-only) dir. The kernel dir is not + // overridden. + fcVersionsDir := r.config.FirecrackerVersionsDir + if r.gdbOrigVersionsDir != "" { + fcVersionsDir = r.gdbOrigVersionsDir + } + // Prefer the arch-prefixed layout (where releases and copy-build -gdb publish), falling + // back to the legacy flat layout — independently of FirecrackerPath/HostKernelPath, + // which resolve the prod binary and may sit in a different layout on un-migrated nodes. 
+ fcBinary, fcErr := resolveLocal(opts.fcBinary, archOrLegacyArtifact(fcVersionsDir, fcVer, "firecracker-debug")) + symbols, symErr := resolveLocal(opts.symbols, archOrLegacyArtifact(r.config.HostKernelsDir, kernelVer, "vmlinux.debug")) var missing []string if fcErr != nil { @@ -203,20 +200,30 @@ func (r *runner) gdbResolveArtifacts(ctx context.Context, opts gdbOptions) (fcBi } if len(missing) > 0 { return "", "", fmt.Errorf( - "could not obtain gdb debug artifacts:\n - %s\n"+ - "They are fetched by version from %s. Until the fc-versions/fc-kernels release\n"+ - "pipelines publish them there, build them (a --features gdb firecracker and a DWARF\n"+ - "kernel) and pass -gdb-fc / -gdb-symbols, or set E2B_GDB_ARTIFACTS_URL to a base URL\n"+ - "that serves them", - strings.Join(missing, "\n - "), base) + "could not find gdb debug artifacts locally:\n - %s\n"+ + "firecracker-debug must sit next to the snapshot's firecracker, and vmlinux.debug\n"+ + "next to its vmlinux.bin (the fc-versions/fc-kernels buckets; copy-build -gdb stages\n"+ + "them). Otherwise pass -gdb-fc / -gdb-symbols explicitly", + strings.Join(missing, "\n - ")) } return fcBinary, symbols, nil } -// resolveOrFetch returns the override if it is set (erroring if it does not exist), -// otherwise the local staged path if it already exists, otherwise downloads url to it. -func resolveOrFetch(ctx context.Context, override, localPath, srcURL string, perm os.FileMode) (string, error) { +// archOrLegacyArtifact returns <base>/<ver>/<arch>/<file> when it exists, else the legacy +// flat <base>/<ver>/<file>, mirroring FirecrackerPath/HostKernelPath so a debug artifact +// resolves under either layout (releases and copy-build -gdb publish it arch-prefixed). 
+func archOrLegacyArtifact(base, ver, file string) string { + if archPath := filepath.Join(base, ver, utils.TargetArch(), file); fileExists(archPath) { + return archPath + } + + return filepath.Join(base, ver, file) +} + +// resolveLocal returns the -gdb-* override if set (erroring if it does not exist), +// otherwise the local copy if present, else an error. Artifacts are not fetched. +func resolveLocal(override, localPath string) (string, error) { if override != "" { if fileExists(override) { return override, nil @@ -227,67 +234,15 @@ func resolveOrFetch(ctx context.Context, override, localPath, srcURL string, per if fileExists(localPath) { return localPath, nil } - if err := os.MkdirAll(filepath.Dir(localPath), 0o755); err != nil { - return "", err - } - fmt.Printf("⬇ fetching %s from %s ...\n", filepath.Base(localPath), srcURL) - if err := download(ctx, srcURL, localPath, perm); err != nil { - return "", err - } - return localPath, nil -} - -// download GETs rawURL to path (atomic rename via a .tmp). Mirrors create-build's -// helper; the debug artifacts live in the same public release buckets. 
-func download(ctx context.Context, rawURL, path string, perm os.FileMode) error { - req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil) - if err != nil { - return fmt.Errorf("invalid download URL %s: %w", rawURL, err) - } - resp, err := (&http.Client{Timeout: 10 * time.Minute}).Do(req) - if err != nil { - return err - } - defer resp.Body.Close() - - if resp.StatusCode == http.StatusNotFound { - return fmt.Errorf("not found (HTTP 404): %s", rawURL) - } - if resp.StatusCode != http.StatusOK { - return fmt.Errorf("HTTP %d: %s", resp.StatusCode, rawURL) - } - - tmpPath := path + ".tmp" - f, err := os.OpenFile(tmpPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, perm) - if err != nil { - return err - } - if _, err := io.Copy(f, resp.Body); err != nil { - f.Close() - os.Remove(tmpPath) - - return err - } - if err := f.Close(); err != nil { - os.Remove(tmpPath) - - return err - } - if err := os.Rename(tmpPath, path); err != nil { - os.Remove(tmpPath) - - return err - } - - return nil + return "", fmt.Errorf("not present at %s", localPath) } // writeInitScript generates the parameterized gdb init script: load the versioned // macro library, the symbols (at their link-time addresses — see gdbMode: FC boots the // vmlinux ELF directly, so there is no KASLR image slide), and connect to the stub. func writeInitScript(symbols, socket string) (string, error) { - macroLib, err := macroLibPath() + macros, err := macroLibContent() if err != nil { return "", err } @@ -295,14 +250,22 @@ func writeInitScript(symbols, socket string) (string, error) { if err != nil { return "", err } + // Inline the macros rather than `source`-ing a separate file: the init script is the + // only temp file, and it is removed on exit (see gdbMode), so nothing leaks. if _, err := fmt.Fprintf(f, `set pagination off set confirm off -source %s +%s # FC boots the uncompressed vmlinux ELF directly, so KASLR never relocates the image: # symbols sit at their link-time addresses (offset 0). 
add-symbol-file %s -o 0x0 +# FC binds the gdb socket while still loading the snapshot, so its first packet +# ack can lag past gdb's 2s default. That makes gdb retransmit qSupported, the +# stub double-replies, and the reply stream desyncs (gdb aborts with +# "Remote replied unexpectedly to 'vMustReplyEmpty'"). Raise the timeout so gdb +# does not prematurely retransmit during connect. +set remotetimeout 120 target remote %s -`, macroLib, symbols, socket); err != nil { +`, macros, symbols, socket); err != nil { f.Close() _ = os.Remove(f.Name()) @@ -317,27 +280,28 @@ target remote %s return f.Name(), nil } -// macroLibPath locates the checked-in fc-debug.gdb macro library next to this binary's -// source. resume-build is run via `go run`/built from cmd/resume-build, so the library -// sits beside main.go. -func macroLibPath() (string, error) { - exe, err := os.Executable() - if err == nil { +// fcDebugMacros is the checked-in gdb macro library, embedded so a standalone binary +// is self-contained (resume-build is typically scp'd to a node away from its source). +// +//go:embed fc-debug.gdb +var fcDebugMacros string + +// macroLibContent returns the fc-debug.gdb macro definitions: a copy colocated with the +// binary if present (lets you iterate on macros without rebuilding), otherwise the +// embedded copy. The caller inlines this into the init script, so no temp file is made. +func macroLibContent() (string, error) { + if exe, err := os.Executable(); err == nil { if p := filepath.Join(filepath.Dir(exe), "fc-debug.gdb"); fileExists(p) { - return p, nil - } - } - // Fall back to the source tree (the common `go run ./cmd/resume-build` case). 
- for _, p := range []string{ - "cmd/resume-build/fc-debug.gdb", - "packages/orchestrator/cmd/resume-build/fc-debug.gdb", - } { - if abs, err := filepath.Abs(p); err == nil && fileExists(abs) { - return abs, nil + b, err := os.ReadFile(p) + if err != nil { + return "", err + } + + return string(b), nil } } - return "", errors.New("fc-debug.gdb macro library not found (next to the binary or under cmd/resume-build/)") + return fcDebugMacros, nil } // printGdbContext prints the debug-context block so the session is drivable any way @@ -389,6 +353,18 @@ func fileExists(p string) bool { return err == nil && !info.IsDir() } +// fileContainsGdbStub reports whether the binary at path was built with the gdb feature, +// detected by the FIRECRACKER_GDB_SOCKET env-var literal — present iff the +// #[cfg(feature = "gdb")] code is compiled in, and it survives stripping. +func fileContainsGdbStub(path string) (bool, error) { + b, err := os.ReadFile(path) + if err != nil { + return false, err + } + + return bytes.Contains(b, []byte("FIRECRACKER_GDB_SOCKET")), nil +} + func waitForSocket(ctx context.Context, path string, timeout time.Duration) error { deadline := time.Now().Add(timeout) ticker := time.NewTicker(50 * time.Millisecond) @@ -408,44 +384,6 @@ func waitForSocket(ctx context.Context, path string, timeout time.Duration) erro } } -// stageBinary copies src to dst (preserving any existing dst as a backup) and returns -// a restore func that puts the original back (or removes the staged copy if there was -// none). The running FC keeps its loaded binary, so restoring after launch is safe. 
-func stageBinary(src, dst string) (restore func(), err error) { - if err := os.MkdirAll(filepath.Dir(dst), 0o755); err != nil { - return nil, err - } - bak := dst + ".prodbak" - // If a backup already exists, a previous run was interrupted before it could - // restore — that backup is the real binary, so keep it and overwrite dst (which - // currently holds the staged debug binary) rather than backing dst up over it. - hadOriginal := fileExists(bak) - if !hadOriginal && fileExists(dst) { - if err := os.Rename(dst, bak); err != nil { - return nil, fmt.Errorf("back up %s: %w", dst, err) - } - hadOriginal = true - } - if err := copyFile(src, dst, 0o755); err != nil { - if hadOriginal { - _ = os.Rename(bak, dst) - } else { - // No original to restore: drop the partial/truncated copy so a later - // run can't resolve and execute a corrupt binary at dst. - _ = os.Remove(dst) - } - - return nil, err - } - - return func() { - _ = os.Remove(dst) - if hadOriginal { - _ = os.Rename(bak, dst) - } - }, nil -} - func copyFile(src, dst string, mode os.FileMode) error { in, err := os.Open(src) if err != nil { diff --git a/packages/orchestrator/cmd/resume-build/gdb_test.go b/packages/orchestrator/cmd/resume-build/gdb_test.go index d39735d97b..0f9f3d8c2e 100644 --- a/packages/orchestrator/cmd/resume-build/gdb_test.go +++ b/packages/orchestrator/cmd/resume-build/gdb_test.go @@ -1,42 +1,22 @@ package main import ( - "context" - "net/http" - "net/http/httptest" "os" "path/filepath" "testing" ) -//nolint:paralleltest // uses t.Setenv, which is incompatible with t.Parallel -func TestDebugArtifactsBaseURL(t *testing.T) { - t.Setenv("E2B_GDB_ARTIFACTS_URL", "") - if got := debugArtifactsBaseURL(); got != defaultDebugArtifactsBaseURL { - t.Fatalf("default: got %q, want %q", got, defaultDebugArtifactsBaseURL) - } - - t.Setenv("E2B_GDB_ARTIFACTS_URL", "http://localhost:8077/") - if got, want := debugArtifactsBaseURL(), "http://localhost:8077"; got != want { - t.Fatalf("override (trailing 
slash trimmed): got %q, want %q", got, want) - } -} - -func TestResolveOrFetch(t *testing.T) { +func TestResolveLocal(t *testing.T) { t.Parallel() - ctx := context.Background() - // Resolution paths that must NOT fetch use a fast-failing URL, so a regression - // that wrongly fetches surfaces as an error rather than a silent pass. - const noFetchURL = "http://127.0.0.1:0/unused" - t.Run("override returned when present, no fetch", func(t *testing.T) { + t.Run("override returned when present", func(t *testing.T) { t.Parallel() dir := t.TempDir() override := filepath.Join(dir, "override.bin") if err := os.WriteFile(override, []byte("x"), 0o644); err != nil { t.Fatal(err) } - got, err := resolveOrFetch(ctx, override, filepath.Join(dir, "local"), noFetchURL, 0o644) + got, err := resolveLocal(override, filepath.Join(dir, "local")) if err != nil || got != override { t.Fatalf("got %q, err %v; want %q", got, err, override) } @@ -45,62 +25,29 @@ func TestResolveOrFetch(t *testing.T) { t.Run("missing override errors", func(t *testing.T) { t.Parallel() dir := t.TempDir() - if _, err := resolveOrFetch(ctx, filepath.Join(dir, "nope"), filepath.Join(dir, "local"), noFetchURL, 0o644); err == nil { + if _, err := resolveLocal(filepath.Join(dir, "nope"), filepath.Join(dir, "local")); err == nil { t.Fatal("expected error for missing override") } }) - t.Run("local staged copy returned, no fetch", func(t *testing.T) { + t.Run("local copy returned when present", func(t *testing.T) { t.Parallel() dir := t.TempDir() local := filepath.Join(dir, "local.bin") if err := os.WriteFile(local, []byte("y"), 0o644); err != nil { t.Fatal(err) } - got, err := resolveOrFetch(ctx, "", local, noFetchURL, 0o644) + got, err := resolveLocal("", local) if err != nil || got != local { t.Fatalf("got %q, err %v; want %q", got, err, local) } }) - t.Run("fetches when absent and creates parent dir", func(t *testing.T) { + t.Run("absent errors (artifacts are not fetched)", func(t *testing.T) { t.Parallel() - const 
body = "fetched-artifact" - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.URL.Path != "/firecracker-debug" { - w.WriteHeader(http.StatusNotFound) - - return - } - _, _ = w.Write([]byte(body)) - })) - defer srv.Close() - - dir := t.TempDir() - local := filepath.Join(dir, "sub", "firecracker-debug") // parent does not exist yet - got, err := resolveOrFetch(ctx, "", local, srv.URL+"/firecracker-debug", 0o755) - if err != nil { - t.Fatalf("fetch failed: %v", err) - } - if got != local { - t.Fatalf("got %q, want %q", got, local) - } - b, err := os.ReadFile(local) - if err != nil || string(b) != body { - t.Fatalf("content %q err %v; want %q", b, err, body) - } - }) - - t.Run("404 errors", func(t *testing.T) { - t.Parallel() - srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - w.WriteHeader(http.StatusNotFound) - })) - defer srv.Close() - dir := t.TempDir() - if _, err := resolveOrFetch(ctx, "", filepath.Join(dir, "x"), srv.URL+"/missing", 0o644); err == nil { - t.Fatal("expected 404 error") + if _, err := resolveLocal("", filepath.Join(dir, "missing")); err == nil { + t.Fatal("expected error when artifact absent") } }) } diff --git a/packages/orchestrator/cmd/resume-build/main.go b/packages/orchestrator/cmd/resume-build/main.go index 34581af7ab..100daba751 100644 --- a/packages/orchestrator/cmd/resume-build/main.go +++ b/packages/orchestrator/cmd/resume-build/main.go @@ -83,8 +83,8 @@ func main() { fphBenchDelay := flag.Duration("fph-bench-delay", 0, "wait this long between workload completion and pause (lets FPR settle)") gdbDebug := flag.Bool("gdb", false, "resume under gdb: hold the guest at the kernel entry breakpoint with a gdb-enabled FC and hand over a ready gdb session for source-level guest-kernel debugging") - gdbFC := flag.String("gdb-fc", "", "path to a firecracker built --features gdb (default: fetch firecracker-debug by version; set E2B_GDB_ARTIFACTS_URL to override 
the source)") - gdbSymbols := flag.String("gdb-symbols", "", "path to the guest kernel's DWARF symbols, vmlinux.debug (default: fetch vmlinux.debug by version; set E2B_GDB_ARTIFACTS_URL to override the source)") + gdbFC := flag.String("gdb-fc", "", "path to a firecracker built --features gdb (default: firecracker-debug resolved next to the snapshot's firecracker)") + gdbSymbols := flag.String("gdb-symbols", "", "path to the guest kernel's DWARF symbols, vmlinux.debug (default: resolved next to the snapshot's vmlinux.bin)") gdbSocket := flag.String("gdb-socket", "", "gdb unix socket path (default: a temp path)") gdbExec := flag.String("gdb-exec", "", "scripted mode: run these gdb commands in batch (newline/';'-separated) instead of an interactive prompt") gdbScript := flag.String("gdb-script", "", "scripted mode: run this gdb command file in batch") @@ -356,6 +356,10 @@ type runner struct { reboot bool config cfg.BuilderConfig storage storage.StorageProvider + // gdbOrigVersionsDir preserves the original FirecrackerVersionsDir in gdb mode, where + // config.FirecrackerVersionsDir is redirected to a writable staging dir; the published + // firecracker-debug is resolved from this original (read-only) dir. + gdbOrigVersionsDir string } // startSandbox starts a sandbox from the build, either resuming from its memory @@ -1196,6 +1200,27 @@ func run(ctx context.Context, buildID string, iterations int, coldStart, noPrefe cache.Start(ctx) defer cache.Stop() + // In gdb mode the launch must run the gdb-enabled Firecracker, but the factory + // resolves the FC binary from FirecrackerVersionsDir, which on cluster nodes is a + // read-only gcsfuse mount. Redirect it to a writable temp dir (populated by gdbMode) + // before the factory captures the config; the original dir is preserved for resolving + // the published firecracker-debug. Only the FC dir is affected — the kernel dir stays. 
+ // + // INVARIANT: this must run before anything resolves the FC binary from + // FirecrackerVersionsDir. Today only the factory (created just below) does; the + // template cache above does not. gdbMode also verifies the binary it is about to + // launch is gdb-enabled, as a backstop against this assumption drifting. + gdbOrigVersionsDir := "" + if gdbOpts.enabled { + gdbOrigVersionsDir = config.BuilderConfig.FirecrackerVersionsDir + stageDir, mkErr := os.MkdirTemp("", "fc-gdb-versions-") + if mkErr != nil { + return fmt.Errorf("gdb fc staging dir: %w", mkErr) + } + defer os.RemoveAll(stageDir) + config.BuilderConfig.FirecrackerVersionsDir = stageDir + } + if verbose { fmt.Println("🔧 Creating sandbox factory...") } @@ -1245,6 +1270,8 @@ func run(ctx context.Context, buildID string, iterations int, coldStart, noPrefe config: config.BuilderConfig, storage: persistence, sbxConfig: sbxCfg, + + gdbOrigVersionsDir: gdbOrigVersionsDir, } if gdbOpts.enabled {