diff --git a/internal/shim/manager/manager_unix.go b/internal/shim/manager/manager_unix.go index 7f9848e..1512be0 100644 --- a/internal/shim/manager/manager_unix.go +++ b/internal/shim/manager/manager_unix.go @@ -186,10 +186,32 @@ func (manager) Start(ctx context.Context, bparams *bootapi.BootstrapParams) (_ * cmd.ExtraFiles = append(cmd.ExtraFiles, s.f) } - cloneMntNs(ctx, cmd) + userns := cloneMntNs(ctx, cmd) - if err := cmd.Start(); err != nil { - return nil, err + if startErr := cmd.Start(); startErr != nil { + if !userns { + return nil, startErr + } + // clone(CLONE_NEWUSER) can fail for reasons not covered by the + // proactive AppArmor check — e.g. seccomp filters, LSM policies, + // or EACCES from the child's capability recomputation when + // inherited socket fds cross the user namespace boundary after + // exec. Retry without namespace isolation rather than failing + // the container start. + // + // Note: we cannot log here — during "start" the logger output + // goes to stderr which containerd captures as part of the + // bootstrap response (CombinedOutput), corrupting the JSON. + cmd, err = newCommand(ctx, id, bparams.ContainerdGrpcAddress, bparams.ContainerdTtrpcAddress, debug) + if err != nil { + return nil, err + } + for _, s := range sockets { + cmd.ExtraFiles = append(cmd.ExtraFiles, s.f) + } + if err := cmd.Start(); err != nil { + return nil, fmt.Errorf("retry without userns failed: %w (original error: %v)", err, startErr) + } } defer func() { diff --git a/internal/shim/manager/mount_linux.go b/internal/shim/manager/mount_linux.go index c7f2191..d19b9b1 100644 --- a/internal/shim/manager/mount_linux.go +++ b/internal/shim/manager/mount_linux.go @@ -52,19 +52,18 @@ import ( // filesystem setup. // // If namespace creation is not possible (e.g. AppArmor restricts -// unprivileged user namespaces), the function logs a warning and the shim -// will run without mount isolation. -func cloneMntNs(_ context.Context, cmd *exec.Cmd) { +// unprivileged user namespaces), the shim runs without mount isolation +// and this function returns false. +// cloneMntNs returns true if user namespace clone flags were set. +func cloneMntNs(_ context.Context, cmd *exec.Cmd) bool { if restricted, err := apparmorRestrictsUserns(); err != nil { // Failed to check apparmor userns restriction, skipping mount namespace isolation") // We can't log anything here as it will break the TTRPC protocol! - // TODO(vvoland): Find a better way to surface this to the user. - return + return false } else if restricted { // apparmor_restrict_unprivileged_userns=1 prevents user namespace creation; shim will run without mount namespace isolation // We can't log anything here as it will break the TTRPC protocol! - // TODO(vvoland): Find a better way to surface this to the user. - return + return false } uid := os.Getuid() @@ -76,6 +75,7 @@ func cloneMntNs(_ context.Context, cmd *exec.Cmd) { cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ {ContainerID: gid, HostID: gid, Size: 1}, } + return true } // apparmorRestrictsUserns checks if the kernel sysctl diff --git a/internal/shim/manager/mount_other.go b/internal/shim/manager/mount_other.go index 8adaa1b..30eccfa 100644 --- a/internal/shim/manager/mount_other.go +++ b/internal/shim/manager/mount_other.go @@ -23,4 +23,4 @@ import ( "os/exec" ) -func cloneMntNs(_ context.Context, _ *exec.Cmd) {} +func cloneMntNs(_ context.Context, _ *exec.Cmd) bool { return false }