Skip to content

Commit 46c46f8

Browse files
JAORMX and claude committed
Add Linux user namespace support for runner spawning
Spawn the propolis-runner inside a CLONE_NEWUSER namespace so it gains CAP_SETUID/CAP_SETGID within the namespace. This allows libkrun's virtiofs passthrough to call set_creds() without requiring host-level capabilities, fixing EPERM errors when host GID != guest GID.

- Add WithUserNamespaceUID(uid, gid) backend option
- Add UserNamespaceConfig type and applyUserNamespace() (Linux/no-op)
- Add preflight check for kernel.unprivileged_userns_clone sysctl
- Add EPERM hint in SpawnProcess suggesting sysctl check

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 70e7626 commit 46c46f8

11 files changed

Lines changed: 484 additions & 17 deletions

File tree

hypervisor/libkrun/backend.go

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,32 @@ func WithFirmware(src extract.Source) Option { return func(b *Backend) { b.firmw
4545
// Ignored when Sources are directory-based.
4646
func WithCacheDir(dir string) Option { return func(b *Backend) { b.cacheDir = dir } }
4747

48+
// WithUserNamespaceUID configures the runner to spawn inside a Linux user
49+
// namespace (CLONE_NEWUSER) with a single UID/GID mapping. The child
50+
// process gains CAP_SETUID and CAP_SETGID within the namespace, which
51+
// allows libkrun's virtiofs passthrough to call set_creds() without
52+
// requiring host-level capabilities.
53+
//
54+
// uid and gid specify the namespace-side IDs that map to the host
55+
// process's real UID/GID. For example, if the guest expects UID 1000
56+
// and the host runs as UID 1000, pass uid=1000, gid=1000.
57+
//
58+
// On non-Linux platforms, this option is accepted but has no effect.
59+
func WithUserNamespaceUID(uid, gid uint32) Option {
60+
return func(b *Backend) {
61+
b.userNamespace = &runner.UserNamespaceConfig{UID: uid, GID: gid}
62+
}
63+
}
64+
4865
// Backend implements hypervisor.Backend using libkrun.
4966
type Backend struct {
50-
runnerPath string
51-
libDir string
52-
spawner runner.Spawner
53-
runtime extract.Source
54-
firmware extract.Source
55-
cacheDir string
67+
runnerPath string
68+
libDir string
69+
spawner runner.Spawner
70+
runtime extract.Source
71+
firmware extract.Source
72+
cacheDir string
73+
userNamespace *runner.UserNamespaceConfig
5674
}
5775

5876
// NewBackend creates a libkrun backend with the given options.
@@ -131,17 +149,18 @@ func (b *Backend) Start(ctx context.Context, cfg hypervisor.VMConfig) (hyperviso
131149
}
132150

133151
runCfg := runner.Config{
134-
RootPath: cfg.RootFSPath,
135-
NumVCPUs: cfg.NumVCPUs,
136-
RAMMiB: cfg.RAMMiB,
137-
NetSocket: netSocket,
138-
PortForwards: toRunnerPortForwards(cfg.PortForwards),
139-
VirtioFS: toRunnerVirtioFS(cfg.FilesystemMounts),
140-
ConsoleLog: cfg.ConsoleLogPath,
141-
LogLevel: cfg.LogLevel,
142-
LibDir: libDir,
143-
RunnerPath: runnerPath,
144-
VMLogPath: filepath.Join(cfg.DataDir, "vm.log"),
152+
RootPath: cfg.RootFSPath,
153+
NumVCPUs: cfg.NumVCPUs,
154+
RAMMiB: cfg.RAMMiB,
155+
NetSocket: netSocket,
156+
PortForwards: toRunnerPortForwards(cfg.PortForwards),
157+
VirtioFS: toRunnerVirtioFS(cfg.FilesystemMounts),
158+
ConsoleLog: cfg.ConsoleLogPath,
159+
LogLevel: cfg.LogLevel,
160+
LibDir: libDir,
161+
RunnerPath: runnerPath,
162+
VMLogPath: filepath.Join(cfg.DataDir, "vm.log"),
163+
UserNamespace: b.userNamespace,
145164
}
146165

147166
spawner := b.spawner

hypervisor/libkrun/backend_test.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,5 +448,62 @@ func TestBackend_Start_DefaultUnchanged(t *testing.T) {
448448
assert.Equal(t, "/my/lib", spawner.captured.LibDir)
449449
}
450450

451+
func TestBackend_Options_UserNamespaceUID(t *testing.T) {
452+
t.Parallel()
453+
454+
b := NewBackend(WithUserNamespaceUID(1000, 1000))
455+
456+
require.NotNil(t, b.userNamespace)
457+
assert.Equal(t, uint32(1000), b.userNamespace.UID)
458+
assert.Equal(t, uint32(1000), b.userNamespace.GID)
459+
}
460+
461+
func TestBackend_Start_WithUserNamespaceUID(t *testing.T) {
462+
t.Parallel()
463+
464+
proc := &mockProcessHandle{pid: 77, alive: true}
465+
spawner := &captureSpawner{proc: proc}
466+
467+
b := NewBackend(
468+
WithUserNamespaceUID(1000, 1000),
469+
WithSpawner(spawner),
470+
)
471+
472+
cfg := hypervisor.VMConfig{
473+
RootFSPath: t.TempDir(),
474+
DataDir: t.TempDir(),
475+
}
476+
477+
handle, err := b.Start(context.Background(), cfg)
478+
require.NoError(t, err)
479+
require.NotNil(t, handle)
480+
481+
// Verify the user namespace config was threaded through.
482+
require.NotNil(t, spawner.captured.UserNamespace)
483+
assert.Equal(t, uint32(1000), spawner.captured.UserNamespace.UID)
484+
assert.Equal(t, uint32(1000), spawner.captured.UserNamespace.GID)
485+
}
486+
487+
func TestBackend_Start_WithoutUserNamespace(t *testing.T) {
488+
t.Parallel()
489+
490+
proc := &mockProcessHandle{pid: 78, alive: true}
491+
spawner := &captureSpawner{proc: proc}
492+
493+
b := NewBackend(WithSpawner(spawner))
494+
495+
cfg := hypervisor.VMConfig{
496+
RootFSPath: t.TempDir(),
497+
DataDir: t.TempDir(),
498+
}
499+
500+
handle, err := b.Start(context.Background(), cfg)
501+
require.NoError(t, err)
502+
require.NotNil(t, handle)
503+
504+
// UserNamespace should be nil when not configured.
505+
assert.Nil(t, spawner.captured.UserNamespace)
506+
}
507+
451508
// Verify mockSource implements extract.Source.
452509
var _ extract.Source = (*mockSource)(nil)

preflight/userns_linux.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//go:build linux
5+
6+
package preflight
7+
8+
import (
	"context"
	"errors"
	"fmt"
	"os"
	"strings"
)
14+
15+
// usernsCloneSysctl is the procfs path of the
// kernel.unprivileged_userns_clone sysctl inspected by the preflight check.
const usernsCloneSysctl = "/proc/sys/kernel/unprivileged_userns_clone"

// usernsChecker holds injectable dependencies for user namespace verification.
type usernsChecker struct {
	// getuid returns the caller's user ID; 0 is treated as root.
	getuid func() int
	// readFile returns the contents of the file at the given path.
	readFile func(string) ([]byte, error)
}

// newUsernsChecker returns a checker wired to os.Getuid and os.ReadFile.
func newUsernsChecker() *usernsChecker {
	return &usernsChecker{
		getuid:   os.Getuid,
		readFile: os.ReadFile,
	}
}

// check verifies that unprivileged user namespaces are not disabled through
// the kernel.unprivileged_userns_clone sysctl.
//
// The check is skipped for root (UID 0), which is assumed to always be able
// to create user namespaces. When the sysctl file does not exist the check
// passes: the knob is not present on every kernel, and its absence is taken
// to mean that it imposes no restriction.
// NOTE(review): other limits on user namespace creation (for example
// user.max_user_namespaces) are not inspected here.
//
// It returns an error when the sysctl reads "0", or when the file exists but
// cannot be read; the read error is wrapped so callers can inspect it.
func (c *usernsChecker) check(_ context.Context) error {
	// Root can always create user namespaces.
	if c.getuid() == 0 {
		return nil
	}

	data, err := c.readFile(usernsCloneSysctl)
	switch {
	case errors.Is(err, os.ErrNotExist):
		// errors.Is (unlike os.IsNotExist) also matches wrapped errors,
		// which matters because readFile is injectable.
		return nil
	case err != nil:
		return fmt.Errorf("cannot read %s: %w", usernsCloneSysctl, err)
	}

	if strings.TrimSpace(string(data)) == "0" {
		return errors.New("unprivileged user namespaces are disabled (kernel.unprivileged_userns_clone=0); " +
			"the runner requires CLONE_NEWUSER for virtiofs UID/GID mapping; " +
			"enable with: sudo sysctl -w kernel.unprivileged_userns_clone=1")
	}

	return nil
}
59+
60+
// UserNamespaceCheck returns a preflight Check that verifies unprivileged
61+
// user namespaces are available. This check should only be registered when
62+
// user namespace spawning is configured.
63+
func UserNamespaceCheck() Check {
64+
c := newUsernsChecker()
65+
return Check{
66+
Name: "userns",
67+
Description: "Verify unprivileged user namespaces are available",
68+
Run: c.check,
69+
Required: true,
70+
}
71+
}

preflight/userns_linux_test.go

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//go:build linux
5+
6+
package preflight
7+
8+
import (
9+
"context"
10+
"fmt"
11+
"os"
12+
"testing"
13+
14+
"github.com/stretchr/testify/assert"
15+
"github.com/stretchr/testify/require"
16+
)
17+
18+
func TestUsernsCheck_Root(t *testing.T) {
19+
t.Parallel()
20+
21+
c := &usernsChecker{
22+
getuid: func() int { return 0 },
23+
readFile: func(_ string) ([]byte, error) {
24+
t.Fatal("readFile should not be called for root")
25+
return nil, nil
26+
},
27+
}
28+
29+
err := c.check(context.Background())
30+
assert.NoError(t, err)
31+
}
32+
33+
func TestUsernsCheck_SysctlNotExist(t *testing.T) {
34+
t.Parallel()
35+
36+
c := &usernsChecker{
37+
getuid: func() int { return 1000 },
38+
readFile: func(_ string) ([]byte, error) {
39+
return nil, os.ErrNotExist
40+
},
41+
}
42+
43+
// If the sysctl doesn't exist, unprivileged userns is always enabled.
44+
err := c.check(context.Background())
45+
assert.NoError(t, err)
46+
}
47+
48+
func TestUsernsCheck_Enabled(t *testing.T) {
49+
t.Parallel()
50+
51+
c := &usernsChecker{
52+
getuid: func() int { return 1000 },
53+
readFile: func(_ string) ([]byte, error) {
54+
return []byte("1\n"), nil
55+
},
56+
}
57+
58+
err := c.check(context.Background())
59+
assert.NoError(t, err)
60+
}
61+
62+
func TestUsernsCheck_Disabled(t *testing.T) {
63+
t.Parallel()
64+
65+
c := &usernsChecker{
66+
getuid: func() int { return 1000 },
67+
readFile: func(_ string) ([]byte, error) {
68+
return []byte("0\n"), nil
69+
},
70+
}
71+
72+
err := c.check(context.Background())
73+
require.Error(t, err)
74+
assert.Contains(t, err.Error(), "unprivileged user namespaces are disabled")
75+
assert.Contains(t, err.Error(), "kernel.unprivileged_userns_clone=1")
76+
}
77+
78+
func TestUsernsCheck_ReadError(t *testing.T) {
79+
t.Parallel()
80+
81+
c := &usernsChecker{
82+
getuid: func() int { return 1000 },
83+
readFile: func(_ string) ([]byte, error) {
84+
return nil, fmt.Errorf("permission denied")
85+
},
86+
}
87+
88+
err := c.check(context.Background())
89+
require.Error(t, err)
90+
assert.Contains(t, err.Error(), "cannot read")
91+
}
92+
93+
func TestUserNamespaceCheck_ReturnsCheck(t *testing.T) {
94+
t.Parallel()
95+
96+
check := UserNamespaceCheck()
97+
assert.Equal(t, "userns", check.Name)
98+
assert.True(t, check.Required)
99+
assert.NotNil(t, check.Run)
100+
}

preflight/userns_other.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//go:build !linux
5+
6+
package preflight
7+
8+
import "context"
9+
10+
// UserNamespaceCheck returns a no-op preflight check on non-Linux platforms.
11+
// User namespaces are a Linux-specific feature.
12+
func UserNamespaceCheck() Check {
13+
return Check{
14+
Name: "userns",
15+
Description: "Verify unprivileged user namespaces are available (Linux only)",
16+
Run: func(_ context.Context) error { return nil },
17+
Required: false,
18+
}
19+
}

runner/config.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ type Config struct {
4444
// VMLogPath is the path to the file where runner stdout/stderr is written.
4545
// Not serialized to JSON; used by Spawn to redirect output.
4646
VMLogPath string `json:"-"`
47+
// UserNamespace configures a Linux user namespace for the runner subprocess.
48+
// When non-nil, the runner is spawned inside CLONE_NEWUSER so that
49+
// libkrun's virtiofs passthrough can call set_creds() without host caps.
50+
// Not serialized to JSON; applied by SpawnProcess before exec.
51+
UserNamespace *UserNamespaceConfig `json:"-"`
4752
}
4853

4954
// VirtioFSMount exposes a host directory to the guest via virtio-fs.

runner/spawn.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ func SpawnProcess(ctx context.Context, cfg Config) (*Process, error) {
116116
cmd.SysProcAttr = &syscall.SysProcAttr{
117117
Setsid: true, // Create new session (detach from terminal)
118118
}
119+
applyUserNamespace(cmd, cfg.UserNamespace)
119120

120121
// Set library search path for bundled libraries if available.
121122
if cfg.LibDir != "" {
@@ -135,6 +136,10 @@ func SpawnProcess(ctx context.Context, cfg Config) (*Process, error) {
135136
}
136137

137138
if err := cmd.Start(); err != nil {
139+
if cfg.UserNamespace != nil && isPermissionError(err) {
140+
return nil, fmt.Errorf("start runner process: %w; user namespaces may be disabled — "+
141+
"check that kernel.unprivileged_userns_clone=1 (sysctl kernel.unprivileged_userns_clone)", err)
142+
}
138143
return nil, fmt.Errorf("start runner process: %w", err)
139144
}
140145

@@ -282,3 +287,10 @@ func libPathEnvVar() string {
282287
func isNoSuchProcess(err error) bool {
283288
return errors.Is(err, syscall.ESRCH)
284289
}
290+
291+
// isPermissionError reports whether err is, or wraps, one of the errno
// values this package treats as a permission denial: EPERM, or EINVAL
// (which some kernels return when CLONE_NEWUSER is disabled).
func isPermissionError(err error) bool {
	for _, errno := range [...]syscall.Errno{syscall.EPERM, syscall.EINVAL} {
		if errors.Is(err, errno) {
			return true
		}
	}
	return false
}

runner/userns.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package runner
5+
6+
// UserNamespaceConfig configures a Linux user namespace for the runner
7+
// subprocess. When set, the runner is spawned inside a CLONE_NEWUSER
8+
// namespace so that it gains CAP_SETUID/CAP_SETGID within the namespace.
9+
// This allows libkrun's virtiofs passthrough to call set_creds() without
10+
// requiring host-level capabilities.
11+
//
12+
// The UID and GID fields specify the single mapping from IDs inside the
13+
// user namespace to the host process's real UID/GID. For example, if the
14+
// guest expects UID 1000 and the host process runs as UID 1000, set
15+
// UID=1000 and GID=1000 to create the mapping 1000→1000.
16+
type UserNamespaceConfig struct {
17+
// UID is the user ID inside the namespace that maps to the host UID.
18+
UID uint32
19+
// GID is the group ID inside the namespace that maps to the host GID.
20+
GID uint32
21+
}

0 commit comments

Comments
 (0)