Skip to content

Commit e988d6e

Browse files
committed
Add OCI runtime surface and compat test fixtures
Make `oci run` work against real public images (alpine, busybox, python, ruby, debian) and lock the surface down with end-to-end fixtures.

Runtime surface:
- writable clone-rootfs DoD: the per-run rootfs is writable out of the box, so guests that mutate /tmp, /var, /run work unchanged
- runtime files injection: /etc/resolv.conf, /etc/hosts, /etc/hostname populated from the host into the clone-rootfs
- /dev/full and /dev/console emulation in the syscall layer
- /proc surface: cgroup, hostname, comm, statm entries that glibc startup and procps tooling read
- image-config `User` symbolic resolution: name and name:group forms looked up against the guest /etc/passwd and /etc/group before falling back to numeric
- `oci run` walks the image index to the linux/arm64 leaf manifest (Phase 3 fix; previously fed the top-level index to the config-loader and crashed on multi-arch images)

Bug fixes uncovered by cold-cache runs:
- layer apply no longer rejects the root tar entry "./"
- unpack stages files via copyfile(2) with COPYFILE_CLONE fallback so cross-volume unpack (store on internal SSD, sysroot on the APFS sparsebundle) succeeds
- tar reader handles PAX 'x' / 'g' extended-header `path` and `linkpath` records (busybox and python:alpine layers use them)

Compat tests:
- `tests/test-oci-compat.sh` shell smoke (in-tree fixtures)
- `OCI_COMPAT_TEST=1` heavy mode that provisions a scratch sparsebundle and drives three fixtures end-to-end: alpine-shaped, busybox-shaped hardlink dispatch, two-layer whiteout
- `OCI_FETCH_ONLINE=1` alpine:3 end-to-end smoke (opt-in; requires network)

`ELFUSE_OCI_PROGRESS=plain` env disables the pull progress in-place CSI redraw for terminals that don't honor cursor-up escapes (issue surfaced on legacy Terminal.app panes).

Documentation: `docs/oci.md` Phase 4 runtime surface and libc-adjacent envelope notes (what guests can / can't expect from the synthetic /etc, /dev, /proc).
1 parent 7dd19fb commit e988d6e

30 files changed

Lines changed: 3800 additions & 176 deletions

Makefile

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,9 @@ SRCS := \
9191
oci/unpack.c \
9292
oci/rebuild-cache.c \
9393
oci/runspec.c \
94+
oci/user-lookup.c \
9495
oci/path-resolve.c \
96+
oci/runtime-files.c \
9597
oci/run.c
9698

9799
SRCS := $(addprefix src/,$(SRCS))
@@ -303,11 +305,19 @@ $(BUILD_DIR)/test-oci-tar: $(BUILD_DIR)/test-oci-tar.o $(BUILD_DIR)/oci/tar.o |
303305
@echo " LD $@"
304306
$(Q)$(CC) $(CFLAGS) -o $@ $^
305307

306-
## Build the OCI runspec unit test (native macOS, no HVF). Pure-data
307-
## merge of image-config runtime block + CLI overrides; the test feeds
308-
## oci_image_runtime_t literals directly through oci_runspec_build with
309-
## no filesystem or libcurl dependency.
310-
$(BUILD_DIR)/test-oci-runspec: $(BUILD_DIR)/test-oci-runspec.o $(BUILD_DIR)/oci/runspec.o | $(BUILD_DIR)
308+
## Build the OCI runspec unit test (native macOS, no HVF). Merges
309+
## image-config runtime block + CLI overrides; the rootfs-driven
310+
## symbolic-User cases write /etc/passwd and /etc/group fixtures under
311+
## /tmp, so the link island pulls in oci/user-lookup.o.
312+
$(BUILD_DIR)/test-oci-runspec: $(BUILD_DIR)/test-oci-runspec.o $(BUILD_DIR)/oci/runspec.o $(BUILD_DIR)/oci/user-lookup.o | $(BUILD_DIR)
313+
@echo " LD $@"
314+
$(Q)$(CC) $(CFLAGS) -o $@ $^
315+
316+
## Build the OCI User-field resolver unit test (native macOS, no HVF).
317+
## Pure C; the test builds scratch /tmp rootfses with synthetic
318+
## /etc/passwd / /etc/group and drives oci_user_lookup across the seven
319+
## OCI image-spec User shapes plus the policy edges.
320+
$(BUILD_DIR)/test-oci-user: $(BUILD_DIR)/test-oci-user.o $(BUILD_DIR)/oci/user-lookup.o | $(BUILD_DIR)
311321
@echo " LD $@"
312322
$(Q)$(CC) $(CFLAGS) -o $@ $^
313323

@@ -325,10 +335,18 @@ $(BUILD_DIR)/test-oci-path-resolve: $(BUILD_DIR)/test-oci-path-resolve.o $(BUILD
325335
## the test ships an in-file elfuse_launch stub that aborts when called,
326336
## and every case installs a launch hook via oci_run_set_launch_for_testing
327337
## before invoking oci_run, so the real VM bring-up never runs from a test.
328-
$(BUILD_DIR)/test-oci-run: $(BUILD_DIR)/test-oci-run.o $(BUILD_DIR)/oci/run.o $(BUILD_DIR)/oci/runspec.o $(BUILD_DIR)/oci/path-resolve.o $(BUILD_DIR)/oci/unpack.o $(BUILD_DIR)/oci/volume.o $(BUILD_DIR)/oci/volume-list.o $(BUILD_DIR)/oci/clone-rootfs.o $(BUILD_DIR)/oci/layer-apply.o $(BUILD_DIR)/oci/layer-meta.o $(BUILD_DIR)/oci/origin-meta.o $(BUILD_DIR)/oci/decompress.o $(BUILD_DIR)/oci/tar.o $(BUILD_DIR)/oci/store.o $(BUILD_DIR)/oci/blob-store.o $(BUILD_DIR)/oci/digest.o $(BUILD_DIR)/oci/digest-set.o $(BUILD_DIR)/oci/manifest.o $(BUILD_DIR)/oci/media-type.o $(BUILD_DIR)/oci/ref.o $(BUILD_DIR)/core/sysroot.o $(BUILD_DIR)/debug/log.o $(CJSON_OBJ) $(ZSTD_OBJS) | $(BUILD_DIR)
338+
$(BUILD_DIR)/test-oci-run: $(BUILD_DIR)/test-oci-run.o $(BUILD_DIR)/oci/run.o $(BUILD_DIR)/oci/runspec.o $(BUILD_DIR)/oci/user-lookup.o $(BUILD_DIR)/oci/path-resolve.o $(BUILD_DIR)/oci/runtime-files.o $(BUILD_DIR)/oci/unpack.o $(BUILD_DIR)/oci/volume.o $(BUILD_DIR)/oci/volume-list.o $(BUILD_DIR)/oci/clone-rootfs.o $(BUILD_DIR)/oci/layer-apply.o $(BUILD_DIR)/oci/layer-meta.o $(BUILD_DIR)/oci/origin-meta.o $(BUILD_DIR)/oci/decompress.o $(BUILD_DIR)/oci/tar.o $(BUILD_DIR)/oci/store.o $(BUILD_DIR)/oci/blob-store.o $(BUILD_DIR)/oci/digest.o $(BUILD_DIR)/oci/digest-set.o $(BUILD_DIR)/oci/manifest.o $(BUILD_DIR)/oci/media-type.o $(BUILD_DIR)/oci/ref.o $(BUILD_DIR)/core/sysroot.o $(BUILD_DIR)/debug/log.o $(CJSON_OBJ) $(ZSTD_OBJS) | $(BUILD_DIR)
329339
@echo " LD $@"
330340
$(Q)$(CC) $(CFLAGS) -o $@ $^ -lz
331341

342+
## Build the OCI runtime-files injection unit test (native macOS, no HVF).
343+
## Pure C; the test drives oci_runtime_files_inject against scratch
344+
## /tmp/elfuse-rf-* run directories and verifies the synthesised
345+
## /etc/{resolv.conf,hosts,hostname} content.
346+
$(BUILD_DIR)/test-oci-runtime-files: $(BUILD_DIR)/test-oci-runtime-files.o $(BUILD_DIR)/oci/runtime-files.o | $(BUILD_DIR)
347+
@echo " LD $@"
348+
$(Q)$(CC) $(CFLAGS) -o $@ $^
349+
332350
## Build the OCI fixture builder (Phase 3 compat tests). Standalone tool
333351
## that synthesises a complete OCI store from uncompressed-tar layers
334352
## plus image-config flags. Used by tests/test-oci-compat.sh and

docs/usage.md

Lines changed: 93 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -166,11 +166,16 @@ Image-provided `DYLD_*` entries pass through (the guest ignores them).
166166

167167
### User and WorkingDir
168168

169-
`User` accepts numeric `UID` or `UID:GID` only. Symbolic users (`User
170-
nginx`) are rejected with a deterministic Phase 4 pointer message;
171-
static `/etc/passwd` parsing waits for Phase 4 along with the rest of
172-
the NSS resolution work. `--user UID` alone defaults GID to the same
173-
value.
169+
`User` accepts seven shapes: the empty string (no override), a numeric
170+
`UID`, `UID:GID`, a symbolic `name`, `name:group`, `uid:group`, or
171+
`name:gid`. Symbolic forms read `/etc/passwd` and `/etc/group` from
172+
the cloned rootfs. A token made entirely of ASCII digits is always
173+
parsed numerically, even when a same-named account ships in the image
174+
(this matches runc semantics, so an image that happens to carry a
175+
`1234` account does not capture `--user 1234`). When the symbolic
176+
form names an account the unpacked layers do not actually carry,
177+
lookup fails closed; `elfuse` never silently falls back to root.
178+
`--user UID` alone defaults GID to the same value.
174179

175180
`WorkingDir` must be absolute and free of `..` segments. If neither the
176181
image nor the CLI sets it, the guest starts in `/`. The directory is
@@ -180,12 +185,93 @@ selects credentials).
180185

181186
### Scope guardrails
182187

183-
- Symbolic `User` -> Phase 4 (NSS / static `/etc/passwd` resolution)
184-
- `/etc/resolv.conf`, `/etc/hosts`, `/dev/*`, `/proc/*` synthesis -> Phase 4
185188
- Auto-pull on `run` miss -> never; `elfuse oci pull` must run first
186189
- Network policy, `docker run -p`-style port mapping -> later phases
187190
- Live `docker exec`-style attach -> never
188191

192+
### Runtime host-truth surface
193+
194+
`elfuse oci run` runs the guest against a freshly cloned per-run
195+
rootfs and a small set of synthesized host-truth files. The rootfs
196+
is produced by APFS `clonefile(2)` against the unpacked image
197+
layers, so the first guest write to any path triggers copy-on-write
198+
in APFS without touching the original image. The clone is removed at
199+
guest exit unless `--keep` is set; nothing is ever pushed back to
200+
the on-disk image, and concurrent `oci run` invocations against the
201+
same image are isolated.
202+
203+
Three `/etc` files are overwritten in the clone before the guest
204+
starts. Any pre-existing symlink (the common case is
205+
`/etc/resolv.conf -> /run/systemd/resolve/stub-resolv.conf`) is
206+
unlinked first so it does not dangle inside the guest:
207+
208+
| File | Source |
209+
|--|--|
210+
| `/etc/resolv.conf` | `nameserver` lines harvested from `scutil --dns`; falls back to `8.8.8.8` and `1.1.1.1` on any scutil failure |
211+
| `/etc/hosts` | fixed 5-line block: `localhost`, the ip6-loopback aliases, ip6 link-local multicast, and `127.0.0.1 host.elfuse.internal` |
212+
| `/etc/hostname` | literal string `elfuse` |
213+
214+
The following pseudo-filesystem paths are synthesized by the host-side
215+
openat interceptor and do not need to exist inside the rootfs:
216+
217+
| Path | Behavior |
218+
|--|--|
219+
| `/dev/null`, `/dev/zero`, `/dev/random`, `/dev/urandom`, `/dev/tty` | redirected to the host device of the same name |
220+
| `/dev/full` | reads return zero-filled data; writes of any non-zero length fail with `ENOSPC` |
221+
| `/dev/console` | mirrored from the controlling tty when present (macOS reserves the real `/dev/console` for the kernel) |
222+
| other `/dev/*` | `ENOENT` |
223+
| `/proc/cpuinfo`, `/proc/meminfo`, `/proc/version` | derived from host sysctl |
224+
| `/proc/self/{maps,exe,status,stat,comm,statm,cgroup}` | synthesized; `cgroup` reports the canonical `0::/` (elfuse runs outside any cgroup hierarchy) |
225+
| `/proc/sys/kernel/{ostype,osrelease,hostname}` | each returns the matching cached `uname` field (`Linux`, `6.17.0-20-generic`, and `elfuse`, respectively) |
226+
227+
### Libc-adjacent compatibility
228+
229+
`elfuse` does not patch libc-adjacent payload (NSS modules, time-zone
230+
data, locale data, character-set converters, dynamic-linker cache)
231+
inside the guest. Each item below names the contract `elfuse` honors
232+
and the failure mode an image hits when it does not ship the
233+
matching files.
234+
235+
- **`/etc/nsswitch.conf`** is read by the guest's libc, not by
236+
`elfuse`. Only the `files` and `dns` backends actually function:
237+
`files` resolves through `/etc/{passwd,group,hosts}` in the cloned
238+
rootfs, and `dns` resolves through host `getaddrinfo` via the
239+
synthesized `/etc/resolv.conf`. Backends such as `systemd`, `sss`,
240+
or `ldap` need their NSS shared object plus a matching daemon,
241+
neither of which `elfuse` provides.
242+
- **NSS shared objects** (`libnss_systemd.so`, `libnss_sss.so`,
243+
`libnss_ldap.so`, ...) are `dlopen`'d by guest libc against its own
244+
loader. `elfuse` never injects NSS modules: they are aarch64-linux
245+
ELF objects against guest libc, so the macOS host has no way to
246+
load them, and the guest can only `dlopen` the modules its image
247+
already carries.
248+
- **tzdata** (`/usr/share/zoneinfo`, `/etc/localtime`, `/etc/timezone`)
249+
ships with the image. `elfuse` does not transcode macOS
250+
`/var/db/timezone/zoneinfo` into the tzdata format; if the image is
251+
missing the needed zone, glibc / musl fall back to UTC. The `TZ`
252+
environment variable is honored as-is and is not rewritten by the
253+
Env merge policy.
254+
- **`/usr/lib/locale/locale-archive`** is not regenerated. glibc
255+
images without a built archive (or the matching `<lang>.UTF-8/`
256+
directory) fall back to the `C` locale; locale-aware sort / printf
257+
/ strcoll outputs ASCII order. musl images do not use the archive
258+
and are unaffected.
259+
- **`/usr/lib/<triple>/gconv/`** modules and the `gconv-modules`
260+
index ship with the image. Missing modules surface as `EILSEQ` from
261+
`iconv` / glibc's character-set conversion; this most often shows
262+
up when an image ships a stripped glibc layer.
263+
- **`ld.so.cache`** is not rebuilt. The guest dynamic linker reads
264+
whatever cache the image carries; missing entries fall through to
265+
the linker's library-path search, which is the normal slow path.
266+
267+
Common workloads and the symptom-to-workaround mapping:
268+
269+
| Symptom | Trigger | Workaround |
270+
|--|--|--|
271+
| `getaddrinfo` returns `EAI_AGAIN` or an empty result | `/etc/nsswitch.conf` lists a backend (`systemd`, `sss`, ...) that needs a daemon | use a distro whose `nsswitch.conf` is `files dns` (alpine ships this by default; debian needs the file edited) |
272+
| `date`, `strftime` show UTC instead of the expected zone | the image does not contain `/usr/share/zoneinfo/<Zone>` | install tzdata in the image (`apk add tzdata` / `apt install tzdata`), or pass `-e TZ=UTC` to acknowledge UTC |
273+
| `sort`, `printf`, `strcoll` collate in ASCII order | the image is missing `/usr/lib/locale/locale-archive` or the matching `<lang>.UTF-8/` directory | accept the C-locale fallback, run `locale-gen` during the image build, or use a musl-based image (alpine), which does not depend on the archive |
274+
189275
## Guest Compatibility Model
190276

191277
`elfuse` is designed for Linux user-space workloads, not for booting a Linux

mk/tests.mk

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
test-oci-tar test-oci-decompress test-oci-meta \
1515
test-oci-origin \
1616
test-oci-layer-apply test-oci-volume test-oci-clone \
17-
test-oci-unpack test-oci-runspec test-oci-path-resolve \
17+
test-oci-unpack test-oci-runspec test-oci-user test-oci-path-resolve \
18+
test-oci-runtime-files \
1819
test-oci-run test-oci-compat oci-fixture-builder \
1920
test-sysroot-rename \
2021
test-case-collision test-case-collision-fallback test-sysroot-create-paths \
@@ -88,8 +89,12 @@ check: $(ELFUSE_BIN) $(TEST_DEPS) check-syscall-coverage
8889
@$(MAKE) --no-print-directory test-oci-unpack
8990
@printf "\n$(BLUE)━━━ OCI runspec resolver unit tests ━━━$(RESET)\n"
9091
@$(MAKE) --no-print-directory test-oci-runspec
92+
@printf "\n$(BLUE)━━━ OCI User-field resolver unit tests ━━━$(RESET)\n"
93+
@$(MAKE) --no-print-directory test-oci-user
9194
@printf "\n$(BLUE)━━━ OCI path-resolve unit tests ━━━$(RESET)\n"
9295
@$(MAKE) --no-print-directory test-oci-path-resolve
96+
@printf "\n$(BLUE)━━━ OCI runtime-files injection unit tests ━━━$(RESET)\n"
97+
@$(MAKE) --no-print-directory test-oci-runtime-files
9398
@printf "\n$(BLUE)━━━ OCI run orchestrator unit tests ━━━$(RESET)\n"
9499
@$(MAKE) --no-print-directory test-oci-run
95100
@printf "\n$(BLUE)━━━ OCI compat shell smoke ━━━$(RESET)\n"
@@ -200,19 +205,36 @@ test-oci-unpack: $(BUILD_DIR)/test-oci-unpack
200205
@$(BUILD_DIR)/test-oci-unpack
201206

202207
## Run the OCI runspec resolver unit tests (native, no HVF, no network).
203-
## Pure data: feeds hand-built oci_image_runtime_t literals plus synthetic
204-
## CLI flags through oci_runspec_build and asserts argv / envp / uid / cwd
205-
## outputs against the Phase 3 override matrix and Env policy.
208+
## Feeds hand-built oci_image_runtime_t literals plus synthetic CLI flags
209+
## through oci_runspec_build and asserts argv / envp / uid / cwd outputs
210+
## against the Phase 3 override matrix and Env policy. Phase 4 symbolic
211+
## User cases write scratch /tmp rootfses for /etc/passwd lookup.
206212
test-oci-runspec: $(BUILD_DIR)/test-oci-runspec
207213
@$(BUILD_DIR)/test-oci-runspec
208214

215+
## Run the OCI User-field resolver unit tests (native, no HVF, no network).
216+
## Phase 4 F4.7: validates oci_user_lookup against scratch rootfses
217+
## carrying synthetic /etc/passwd / /etc/group; covers the seven OCI
218+
## image-spec User shapes plus the policy edges (digit-name collision,
219+
## missing passwd, name-not-found, invalid characters).
220+
test-oci-user: $(BUILD_DIR)/test-oci-user
221+
@$(BUILD_DIR)/test-oci-user
222+
209223
## Run the OCI guest PATH resolver unit tests (native, no HVF, no network).
210224
## Builds a fake sysroot tree under /tmp and drives oci_path_resolve
211225
## against it: PATH search, symlink-follow, escape-symlink skip,
212226
## EACCES on noexec, ENOENT diagnostics with searched-dirs list.
213227
test-oci-path-resolve: $(BUILD_DIR)/test-oci-path-resolve
214228
@$(BUILD_DIR)/test-oci-path-resolve
215229

230+
## Run the OCI runtime-files injection unit tests (native, no HVF, no network).
231+
## Phase 4 F4.2 / F4.3: validates oci_runtime_files_inject against scratch
232+
## run directories, covering fresh-/etc creation, symlink overwrite,
233+
## regular-file overwrite, and the synthesised /etc/{resolv.conf,
234+
## hosts, hostname} content.
235+
test-oci-runtime-files: $(BUILD_DIR)/test-oci-runtime-files
236+
@$(BUILD_DIR)/test-oci-runtime-files
237+
216238
## Run the OCI run orchestrator unit tests (native, no HVF, no network).
217239
## Covers oci_cli_run argument parsing plus oci_run early-failure
218240
## paths against a case-insensitive volume; the launch backend is

src/oci/cli.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,10 @@ static int print_usage(FILE *out)
7070
" re-fetch missing layer blobs\n"
7171
" -q, --quiet Suppress per-blob progress output\n"
7272
"\n"
73+
"Env: ELFUSE_OCI_PROGRESS=plain disables the in-place TTY\n"
74+
" redraw (use when the terminal mis-handles CSI cursor-up;\n"
75+
" prints one summary line per blob on completion).\n"
76+
"\n"
7377
"Policy: optional policy.json controls per-registry insecure /\n"
7478
" ca_bundle / auth_file. Read from $ELFUSE_POLICY_FILE >\n"
7579
" $XDG_CONFIG_HOME/elfuse/policy.json >\n"

src/oci/layer-apply.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -527,6 +527,16 @@ static int layer_apply_impl(oci_tar_reader_t *r,
527527
while (gp[0] == '/')
528528
gp++;
529529

530+
/* Root-directory tar entry: docker/buildkit emit "./" as the
531+
* first entry of a layer; the DIR-type trailing-slash strip
532+
* upstream collapses it to ".". The unpack root already
533+
* exists by the time the assembler enters this loop, so the
534+
* root entry has no work to drive. Skip empty paths the same
535+
* way for archives that record a zero-length root name.
536+
*/
537+
if (gp[0] == '\0' || (gp[0] == '.' && gp[1] == '\0'))
538+
continue;
539+
530540
if (mode == APPLY_MODE_OVERLAY) {
531541
if (e.is_opaque_whiteout) {
532542
oci_tar_entry_t e2 = e;

src/oci/pull.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,20 @@ static int pull_progress_init(pull_progress_t *pp, FILE *fp,
150150
memset(pp, 0, sizeof(*pp));
151151
pp->fp = fp;
152152
pp->is_tty = fp != NULL && isatty(fileno(fp));
153+
/* ELFUSE_OCI_PROGRESS=plain (or =lines, =off) forces the
154+
* line-per-completion path even on a real TTY. Some terminal panes
155+
* (notably embedded ones that emulate a pty without honoring CSI
156+
* cursor-up) leave the in-place redraw stacking copies down the
157+
* screen instead of rewriting the active rows; the env override
158+
* gives the operator a stable opt-out without touching code.
159+
*/
160+
if (pp->is_tty) {
161+
const char *override = getenv("ELFUSE_OCI_PROGRESS");
162+
if (override
163+
&& (!strcmp(override, "plain") || !strcmp(override, "lines")
164+
|| !strcmp(override, "off")))
165+
pp->is_tty = false;
166+
}
153167

154168
size_t cap = 1 + n_layers;
155169
pp->slots = calloc(cap, sizeof(*pp->slots));

0 commit comments

Comments
 (0)