Add guest/harden package for VM kernel and capability hardening

JAORMX · claude · JAORMX · commit 857a9f0e953f · 2026-02-17T10:24:05.000+02:00
Introduce a reusable guest-side hardening package that applies
kernel sysctl defaults (kptr_restrict, dmesg_restrict, BPF) and
drops unneeded capabilities from the bounding set via prctl.
Document the new package in docs/SECURITY.md.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/docs/SECURITY.md b/docs/SECURITY.md
@@ -10,6 +10,7 @@ hardening recommendations for propolis.
 - [Networking Trust Boundary](#networking-trust-boundary)
 - [Guest Escape Blast Radius](#guest-escape-blast-radius)
 - [Hardening Recommendations](#hardening-recommendations)
+- [Guest Hardening](#guest-hardening)
 - [Egress Policy Security Model](#egress-policy-security-model)
 - [Tar Extraction Defenses](#tar-extraction-defenses)
 - [Process Identity Verification](#process-identity-verification)
@@ -202,6 +203,59 @@ remain alive for networking to function. For the simplest deployments,
 the default runner-side networking ties the network stack to the VM's
 lifetime with no extra coordination.
 
+## Guest Hardening
+
+The `guest/harden` package provides reusable kernel and capability
+hardening for microVM init processes. It is guest-side code
+(`//go:build linux`) with no CGO or krun dependencies.
+
+### Recommended usage
+
+Call the hardening functions in your guest init boot sequence:
+
+1. Mount `/proc` and `/sys` first (sysctls need procfs).
+2. Call `harden.KernelDefaults(logger)` to apply sysctls.
+3. Perform all privileged operations (mounts, network config, chown).
+4. Call `harden.DropBoundingCaps(keep...)` as the last privileged step.
+
+### Kernel sysctls
+
+`KernelDefaults` applies the following sysctls. Each is set
+independently; individual failures are logged as warnings rather than
+aborting boot, because not all kernels support every sysctl.
+
+| Sysctl | Value | Purpose |
+|--------|-------|---------|
+| `kernel.kptr_restrict` | `2` | Hide kernel pointers from all users. Prevents information leaks that aid exploit development. |
+| `kernel.dmesg_restrict` | `1` | Restrict `dmesg` to privileged users. Prevents unprivileged processes from reading kernel log messages that may contain sensitive addresses or operations. |
+| `kernel.unprivileged_bpf_disabled` | `1` | Disable unprivileged BPF. Prevents unprivileged users from loading BPF programs, which have historically been a source of kernel privilege escalation vulnerabilities. |
+
+### Capability bounding set
+
+`DropBoundingCaps(keep...)` drops all Linux capabilities from the
+bounding set except those explicitly listed. This limits what
+capabilities child processes can acquire, even through setuid binaries
+or file capabilities.
+
+For a typical SSH-based guest, the minimal keep set is:
+
+| Capability | Number | Reason |
+|-----------|--------|--------|
+| `CAP_SETUID` | 7 | sshd credential switching to sandbox user |
+| `CAP_SETGID` | 6 | sshd group switching |
+| `CAP_NET_BIND_SERVICE` | 10 | Binding port 22 (privileged port) |
+
+### Threat model
+
+These hardening measures are defense-in-depth for the guest
+environment. An attacker who has compromised the guest workload would
+still need a hypervisor escape to reach the host; within the guest,
+hardening additionally:
+
+- Raises the bar for local privilege escalation within the guest
+- Reduces information available for exploit development (kernel pointers, dmesg)
+- Limits the attack surface of dangerous subsystems (BPF)
+- Constrains what a compromised process can do even with root inside the guest
+
 ## Egress Policy Security Model
 
 The DNS-based egress policy (`WithEgressPolicy()`) restricts VM outbound
diff --git a/guest/harden/capability.go b/guest/harden/capability.go
@@ -0,0 +1,103 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+//go:build linux
+
+package harden
+
+import (
+	"fmt"
+	"os"
+	"strconv"
+	"strings"
+	"syscall"
+)
+
+// Linux capability constants. Only the subset typically needed by guest
+// init processes is defined here. The values are capability numbers and
+// can be passed to DropBoundingCaps as entries of its keep list.
+const (
+	CapChown          uintptr = 0  // CAP_CHOWN
+	CapSetUID         uintptr = 7  // CAP_SETUID
+	CapSetGID         uintptr = 6  // CAP_SETGID
+	CapKill           uintptr = 5  // CAP_KILL
+	CapNetBindService uintptr = 10 // CAP_NET_BIND_SERVICE
+)
+
+// prctl constants for capability bounding set manipulation.
+const (
+	// prCapBSetDrop is the prctl option capBSetDrop passes to remove a
+	// single capability from the bounding set.
+	prCapBSetDrop = 24 // PR_CAPBSET_DROP
+)
+
+// capLastCap reads the highest valid capability number from
+// /proc/sys/kernel/cap_last_cap.
+//
+// If the file cannot be read or parsed it falls back to 40
+// (CAP_CHECKPOINT_RESTORE, the highest capability defined by current
+// Linux kernels). The fallback must not exceed the kernel's real last
+// capability: prctl(PR_CAPBSET_DROP) rejects unknown capability numbers
+// with EINVAL, which would make DropBoundingCaps return an error.
+func capLastCap() uintptr {
+	// fallbackLastCap is the number of CAP_CHECKPOINT_RESTORE.
+	const fallbackLastCap uintptr = 40
+
+	data, err := os.ReadFile("/proc/sys/kernel/cap_last_cap")
+	if err != nil {
+		return fallbackLastCap
+	}
+	n, err := parseCapLastCap(string(data))
+	if err != nil {
+		return fallbackLastCap
+	}
+	return n
+}
+
+// parseCapLastCap parses the content of /proc/sys/kernel/cap_last_cap.
+//
+// Surrounding whitespace (including the trailing newline the kernel
+// emits) is ignored. It returns an error when the content is empty, is
+// not a decimal integer, or is negative; a negative number would
+// otherwise wrap around to a huge capability number when converted to
+// uintptr.
+func parseCapLastCap(content string) (uintptr, error) {
+	n, err := strconv.Atoi(strings.TrimSpace(content))
+	if err != nil {
+		return 0, fmt.Errorf("parsing cap_last_cap: %w", err)
+	}
+	if n < 0 {
+		return 0, fmt.Errorf("parsing cap_last_cap: negative value %d", n)
+	}
+	return uintptr(n), nil
+}
+
+// DropBoundingCaps drops all capabilities from the bounding set except
+// those listed in keep. This limits what capabilities child processes
+// can acquire even through setuid binaries or file capabilities.
+//
+// Capabilities are visited in ascending order from 0 up to the value
+// reported by capLastCap. The first failing drop aborts the walk and is
+// returned wrapped with the capability number; capabilities dropped
+// before the failure are not restored.
+//
+// Call this as the last privileged operation before starting the
+// workload — all mounts, network config, and chown calls must be
+// complete before caps are dropped.
+func DropBoundingCaps(keep ...uintptr) error {
+	last := capLastCap()
+	// The loop variable is deliberately not named "cap" so the builtin
+	// of the same name stays usable inside the loop.
+	for c := uintptr(0); c <= last; c++ {
+		if keepSetContains(keep, c) {
+			continue
+		}
+		if err := capBSetDrop(c); err != nil {
+			return fmt.Errorf("dropping cap %d: %w", c, err)
+		}
+	}
+	return nil
+}
+
+// capBSetDrop calls prctl(PR_CAPBSET_DROP, cap) to remove a single
+// capability from the bounding set.
+//
+// The bounding set is a per-thread attribute and the Go runtime runs
+// goroutines on several OS threads, so the call is issued on every
+// runtime thread via syscall.AllThreadsSyscall. Otherwise a child
+// process started from a different thread could still inherit the
+// capability.
+//
+// AllThreadsSyscall reports ENOTSUP in binaries that link cgo. In that
+// case the drop falls back to the calling thread only, which is the
+// best that can be done without runtime support.
+//
+// It returns the prctl errno wrapped with the capability number, or nil
+// on success.
+func capBSetDrop(cap uintptr) error {
+	_, _, errno := syscall.AllThreadsSyscall(
+		syscall.SYS_PRCTL,
+		prCapBSetDrop,
+		cap,
+		0,
+	)
+	if errno == syscall.ENOTSUP {
+		// cgo binary: only the current thread can be changed.
+		_, _, errno = syscall.Syscall(
+			syscall.SYS_PRCTL,
+			prCapBSetDrop,
+			cap,
+			0,
+		)
+	}
+	if errno != 0 {
+		return fmt.Errorf("prctl(PR_CAPBSET_DROP, %d): %w", cap, errno)
+	}
+	return nil
+}
+
+// keepSetContains reports whether cap appears anywhere in keep. A nil
+// or empty keep list contains nothing.
+func keepSetContains(keep []uintptr, cap uintptr) bool {
+	for i := 0; i < len(keep); i++ {
+		if keep[i] != cap {
+			continue
+		}
+		return true
+	}
+	return false
+}
diff --git a/guest/harden/capability_test.go b/guest/harden/capability_test.go
@@ -0,0 +1,124 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+//go:build linux
+
+package harden
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestParseCapLastCap feeds representative cap_last_cap file contents to
+// the parser and checks either the parsed number or that an error is
+// reported.
+func TestParseCapLastCap(t *testing.T) {
+	t.Parallel()
+
+	type testCase struct {
+		name    string
+		content string
+		want    uintptr
+		wantErr bool
+	}
+	cases := []testCase{
+		{name: "typical value", content: "40\n", want: 40},
+		{name: "higher kernel", content: "41\n", want: 41},
+		{name: "no trailing newline", content: "40", want: 40},
+		{name: "whitespace padding", content: "  40  \n", want: 40},
+		{name: "non-numeric", content: "abc\n", wantErr: true},
+		{name: "empty", content: "", wantErr: true},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+			got, err := ParseCapLastCapForTest(tc.content)
+			if !tc.wantErr {
+				require.NoError(t, err)
+				assert.Equal(t, tc.want, got)
+				return
+			}
+			assert.Error(t, err)
+		})
+	}
+}
+
+// TestKeepSetContains verifies membership lookups against a keep list
+// holding CapSetUID, CapSetGID and CapNetBindService.
+func TestKeepSetContains(t *testing.T) {
+	t.Parallel()
+
+	keepList := []uintptr{CapSetUID, CapSetGID, CapNetBindService}
+
+	type lookup struct {
+		name     string
+		capNum   uintptr
+		expected bool
+	}
+	lookups := []lookup{
+		{"CAP_SETUID in set", CapSetUID, true},
+		{"CAP_SETGID in set", CapSetGID, true},
+		{"CAP_NET_BIND_SERVICE in set", CapNetBindService, true},
+		{"CAP_CHOWN not in set", CapChown, false},
+		{"CAP_KILL not in set", CapKill, false},
+		{"arbitrary cap not in set", 99, false},
+	}
+
+	for _, lk := range lookups {
+		t.Run(lk.name, func(t *testing.T) {
+			t.Parallel()
+			found := KeepSetContainsForTest(keepList, lk.capNum)
+			assert.Equal(t, lk.expected, found)
+		})
+	}
+}
+
+// TestKeepSetContains_EmptySet checks that neither a nil nor an empty
+// non-nil keep list reports any capability as kept.
+func TestKeepSetContains_EmptySet(t *testing.T) {
+	t.Parallel()
+
+	var nilKeep []uintptr
+	emptyKeep := []uintptr{}
+
+	inNil := KeepSetContainsForTest(nilKeep, CapSetUID)
+	assert.False(t, inNil)
+
+	inEmpty := KeepSetContainsForTest(emptyKeep, CapSetGID)
+	assert.False(t, inEmpty)
+}
+
+// TestCapConstants pins the exported capability constants to their
+// Linux kernel numbers.
+func TestCapConstants(t *testing.T) {
+	t.Parallel()
+
+	pinned := []struct {
+		want uintptr
+		got  uintptr
+	}{
+		{want: 0, got: CapChown},
+		{want: 5, got: CapKill},
+		{want: 6, got: CapSetGID},
+		{want: 7, got: CapSetUID},
+		{want: 10, got: CapNetBindService},
+	}
+	for _, p := range pinned {
+		assert.Equal(t, p.want, p.got)
+	}
+}
+
+// TestCapLastCap_ReadsProc sanity-checks the value capLastCap returns,
+// whether it came from /proc or from the built-in fallback.
+func TestCapLastCap_ReadsProc(t *testing.T) {
+	t.Parallel()
+
+	got := capLastCap()
+	// A ">= 0" check would be vacuous for an unsigned type, so bound the
+	// value from both sides instead. CAP_BLOCK_SUSPEND (36) has been the
+	// minimum last capability since Linux 3.5.
+	assert.GreaterOrEqual(t, got, uintptr(36))
+	// Capability sets are 64-bit masks, so a valid number is below 64.
+	assert.Less(t, got, uintptr(64))
+}
diff --git a/guest/harden/doc.go b/guest/harden/doc.go
@@ -0,0 +1,15 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+//go:build linux
+
+// Package harden provides guest-side kernel and capability hardening for
+// microVM init processes. It restricts kernel information leaks, limits
+// unprivileged access to dangerous subsystems, and drops unneeded
+// capabilities from the bounding set.
+//
+// Consumers (e.g. apiary-init) should call [KernelDefaults] early in the
+// boot sequence (after /proc is mounted) and [DropBoundingCaps] last,
+// just before starting the workload, so that all privileged operations
+// (mounts, network config, chown) are already complete.
+package harden
diff --git a/guest/harden/export_test.go b/guest/harden/export_test.go
@@ -0,0 +1,14 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+//go:build linux
+
+package harden
+
+// Test-only exports for verifying internal logic without root privileges.
+// Each variable is bound to the unexported identifier named on its
+// right-hand side; this file is only compiled for tests.
+var (
+	ParseCapLastCapForTest = parseCapLastCap
+	KeepSetContainsForTest = keepSetContains
+	SysctlPathForTest      = sysctlPath
+	DefaultsForTest        = defaults
+)
diff --git a/guest/harden/sysctl.go b/guest/harden/sysctl.go
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+//go:build linux
+
+package harden
+
+import (
+	"fmt"
+	"log/slog"
+	"os"
+	"strings"
+)
+
+// Set writes value to the sysctl identified by key. The key uses the
+// standard dotted notation (e.g. "kernel.kptr_restrict") which is
+// converted to the /proc/sys/ path (/proc/sys/kernel/kptr_restrict) by
+// sysctlPath, so the production path and the tested helper cannot drift
+// apart.
+//
+// The value is written verbatim, without a trailing newline. On failure
+// the returned error wraps the underlying I/O error and names both key
+// and value.
+func Set(key, value string) error {
+	// The permission bits only matter if WriteFile has to create the file.
+	if err := os.WriteFile(sysctlPath(key), []byte(value), 0o644); err != nil {
+		return fmt.Errorf("sysctl %s=%s: %w", key, value, err)
+	}
+	return nil
+}
+
+// kernelDefault describes one hardening sysctl: the key to set, the
+// value to write, and a short human-readable justification.
+type kernelDefault struct {
+	key, value string // dotted sysctl name and the value written to it
+	reason     string // why the sysctl is set; included in the log output
+}
+
+// defaults lists the recommended kernel sysctls for guest hardening.
+// KernelDefaults walks this slice in order and applies each entry
+// through Set.
+var defaults = []kernelDefault{
+	{
+		key:    "kernel.kptr_restrict",
+		value:  "2",
+		reason: "hide kernel pointers from all users",
+	},
+	{
+		key:    "kernel.dmesg_restrict",
+		value:  "1",
+		reason: "restrict dmesg to privileged users",
+	},
+	{
+		key:    "kernel.unprivileged_bpf_disabled",
+		value:  "1",
+		reason: "disable unprivileged BPF",
+	},
+}
+
+// KernelDefaults applies recommended kernel sysctl hardening. Each
+// setting is applied independently; failures are logged as warnings
+// rather than aborting boot, because individual sysctls may not be
+// available on all kernel versions.
+//
+// A nil logger is replaced by slog.Default() so that a missing logger
+// cannot panic the init process during boot.
+func KernelDefaults(logger *slog.Logger) {
+	if logger == nil {
+		logger = slog.Default()
+	}
+	for _, d := range defaults {
+		logger.Info("applying sysctl", "key", d.key, "value", d.value, "reason", d.reason)
+		if err := Set(d.key, d.value); err != nil {
+			// Best effort: record the failure and continue with the next sysctl.
+			logger.Warn("sysctl failed", "key", d.key, "error", err)
+		}
+	}
+}
+
+// sysctlPath maps a dotted sysctl key onto its file below /proc/sys/ by
+// turning every "." separator into a "/".
+func sysctlPath(key string) string {
+	segments := strings.Split(key, ".")
+	return "/proc/sys/" + strings.Join(segments, "/")
+}
diff --git a/guest/harden/sysctl_test.go b/guest/harden/sysctl_test.go