Skip to content

Commit f68d962

Browse files
committed
shim/manager: Retry shim start without userns on clone failure
cloneMntNs sets CLONE_NEWUSER|CLONE_NEWNS on the child, but clone can fail for reasons the proactive AppArmor sysctl check cannot detect — seccomp filters, other LSM policies, or EACCES when inherited socket fds cross the user namespace boundary after exec triggers capability recomputation. Return whether namespace flags were set so the caller can distinguish a namespace-related Start failure from an unrelated one. On failure, rebuild the command without clone flags and retry, degrading gracefully to no mount isolation rather than failing the container start entirely. Signed-off-by: Derek McGowan <derek@mcg.dev>
1 parent b7fc692 commit f68d962

3 files changed

Lines changed: 31 additions & 8 deletions

File tree

internal/shim/manager/manager_unix.go

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,10 +183,33 @@ func (manager) Start(ctx context.Context, id string, opts shim.StartOpts) (_ shi
183183
cmd.ExtraFiles = append(cmd.ExtraFiles, s.f)
184184
}
185185

186-
cloneMntNs(ctx, cmd)
186+
userns := cloneMntNs(ctx, cmd)
187187

188188
if err := cmd.Start(); err != nil {
189-
return params, err
189+
if !userns {
190+
return params, err
191+
}
192+
// clone(CLONE_NEWUSER) can fail for reasons not covered by the
193+
// proactive AppArmor check — e.g. seccomp filters, LSM policies,
194+
// or EACCES from the child's capability recomputation when
195+
// inherited socket fds cross the user namespace boundary after
196+
// exec. Retry without namespace isolation rather than failing
197+
// the container start.
198+
//
199+
// Note: we cannot log here — during "start" the logger output
200+
// goes to stderr which containerd captures as part of the
201+
// bootstrap response (CombinedOutput), corrupting the JSON.
202+
cmd, err = newCommand(ctx, id, opts.Address, opts.TTRPCAddress, opts.Debug)
203+
if err != nil {
204+
return params, err
205+
}
206+
cmd.ExtraFiles = append(cmd.ExtraFiles, sockets[0].f)
207+
if opts.Debug && len(sockets) > 1 {
208+
cmd.ExtraFiles = append(cmd.ExtraFiles, sockets[1].f)
209+
}
210+
if err := cmd.Start(); err != nil {
211+
return params, err
212+
}
190213
}
191214

192215
defer func() {

internal/shim/manager/mount_linux.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,16 @@ import (
5454
// If namespace creation is not possible (e.g. AppArmor restricts
5555
// unprivileged user namespaces), the function silently skips it and the shim
5656
// will run without mount isolation.
57-
func cloneMntNs(_ context.Context, cmd *exec.Cmd) {
57+
// cloneMntNs returns true if user namespace clone flags were set.
58+
func cloneMntNs(_ context.Context, cmd *exec.Cmd) bool {
5859
if restricted, err := apparmorRestrictsUserns(); err != nil {
5960
// Failed to check apparmor userns restriction, skipping mount namespace isolation.
6061
// We can't log anything here as it will break the TTRPC protocol!
61-
// TODO(vvoland): Find a better way to surface this to the user.
62-
return
62+
return false
6363
} else if restricted {
6464
// apparmor_restrict_unprivileged_userns=1 prevents user namespace creation; shim will run without mount namespace isolation
6565
// We can't log anything here as it will break the TTRPC protocol!
66-
// TODO(vvoland): Find a better way to surface this to the user.
67-
return
66+
return false
6867
}
6968

7069
uid := os.Getuid()
@@ -76,6 +75,7 @@ func cloneMntNs(_ context.Context, cmd *exec.Cmd) {
7675
cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{
7776
{ContainerID: gid, HostID: gid, Size: 1},
7877
}
78+
return true
7979
}
8080

8181
// apparmorRestrictsUserns checks if the kernel sysctl

internal/shim/manager/mount_other.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,4 @@ import (
2323
"os/exec"
2424
)
2525

26-
func cloneMntNs(_ context.Context, _ *exec.Cmd) {}
26+
// cloneMntNs is a stub: it ignores both arguments, leaves the command
// untouched, and always returns false, i.e. it never reports that namespace
// clone flags were set.
// NOTE(review): presumably the variant built on non-Linux platforms (the file
// is mount_other.go); the build constraint is not visible here — confirm.
func cloneMntNs(_ context.Context, _ *exec.Cmd) bool { return false }

0 commit comments

Comments
 (0)