Skip to content

Commit ddbbc01

Browse files
authored
Merge pull request #73 from stacklok/jaosorior/stale-runner-and-provider-wiring
stale-runner: identity guard; options: symmetric provider wiring
2 parents b0eb92f + 747b3bb commit ddbbc01

File tree

8 files changed

+239
-16
lines changed

8 files changed

+239
-16
lines changed

internal/procutil/process_linux.go

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,18 +9,33 @@ import (
99
"fmt"
1010
"os"
1111
"path/filepath"
12+
"strings"
1213
)
1314

14-
// IsExpectedProcess checks if the process at pid is running the expected binary.
15-
// On Linux, reads /proc/<pid>/exe to verify the binary path. Returns false if
16-
// the process does not exist or is running a different binary, preventing
17-
// signals to recycled PIDs.
15+
// IsExpectedProcess checks if the process at pid is running the expected
16+
// binary. On Linux, reads /proc/<pid>/exe to verify the binary path. Returns
17+
// false if the process does not exist or is running a different binary,
18+
// preventing signals to recycled PIDs.
19+
//
20+
// When expectedBinary is an absolute path, the comparison is against the
21+
// full resolved exe path — this is the strong guarantee. When
22+
// expectedBinary is just a name, the comparison falls back to the base
23+
// name; two unrelated binaries with the same base name on the same host
24+
// would collide under the fallback, so callers should pass absolute paths
25+
// when possible.
26+
//
27+
// The "(deleted)" suffix that the kernel appends when the underlying
28+
// binary has been unlinked post-exec is stripped so that processes still
29+
// running from a since-removed extract directory are correctly identified.
1830
func IsExpectedProcess(pid int, expectedBinary string) bool {
1931
exePath, err := os.Readlink(fmt.Sprintf("/proc/%d/exe", pid))
2032
if err != nil {
2133
return false // Process gone or no permission
2234
}
23-
// Compare base names: the state may store just the binary name while
24-
// /proc/pid/exe returns the full resolved path.
35+
exePath = strings.TrimSuffix(exePath, " (deleted)")
36+
37+
if filepath.IsAbs(expectedBinary) {
38+
return filepath.Clean(exePath) == filepath.Clean(expectedBinary)
39+
}
2540
return filepath.Base(exePath) == filepath.Base(expectedBinary)
2641
}

internal/procutil/process_linux_test.go

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,34 @@ func TestIsExpectedProcess_Self(t *testing.T) {
2323
}
2424
}
2525

26-
func TestIsExpectedProcess_SelfBaseName(t *testing.T) {
26+
func TestIsExpectedProcess_SelfBaseNameFallback(t *testing.T) {
27+
// When expectedBinary is NOT absolute (just a bare name), the fallback
28+
// base-name comparison applies.
2729
pid := os.Getpid()
2830

2931
selfExe, err := os.Executable()
3032
if err != nil {
3133
t.Fatalf("failed to get self executable: %v", err)
3234
}
3335
baseName := filepath.Base(selfExe)
34-
if !IsExpectedProcess(pid, "/some/other/path/"+baseName) {
35-
t.Errorf("IsExpectedProcess with different dir but same base name should return true")
36+
if !IsExpectedProcess(pid, baseName) {
37+
t.Errorf("IsExpectedProcess with bare base name should match self")
38+
}
39+
}
40+
41+
func TestIsExpectedProcess_AbsolutePathMismatch(t *testing.T) {
42+
// When expectedBinary IS absolute, a different directory with the same
43+
// base name must NOT match. This is the strengthened guarantee against
44+
// unrelated binaries with colliding base names.
45+
pid := os.Getpid()
46+
47+
selfExe, err := os.Executable()
48+
if err != nil {
49+
t.Fatalf("failed to get self executable: %v", err)
50+
}
51+
baseName := filepath.Base(selfExe)
52+
if IsExpectedProcess(pid, "/some/other/path/"+baseName) {
53+
t.Errorf("absolute path mismatch must not match even with same base name")
3654
}
3755
}
3856

microvm.go

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,11 +98,10 @@ func Run(ctx context.Context, imageRef string, opts ...Option) (*VM, error) {
9898
slog.Warn("egress policy overrides firewall default action to Deny")
9999
}
100100
cfg.firewallDefaultAction = firewall.Deny
101-
if cfg.netProvider == nil {
102-
cfg.netProvider = hosted.NewProvider()
103-
}
104101
}
105102

103+
wireDefaultProvider(cfg)
104+
106105
// 1. Preflight checks.
107106
{
108107
ctx, span := tracer.Start(ctx, "microvm.Preflight")
@@ -312,6 +311,7 @@ func Run(ctx context.Context, imageRef string, opts ...Option) (*VM, error) {
312311
ls.State.Name = cfg.name
313312
if pid, pidErr := pidFromID(handle.ID()); pidErr == nil {
314313
ls.State.PID = pid
314+
ls.State.PIDStartTime = time.Now().UTC()
315315
} else {
316316
slog.Warn("could not persist VM PID", "id", handle.ID(), "error", pidErr)
317317
}
@@ -354,6 +354,21 @@ const (
354354
staleTermPoll = 250 * time.Millisecond
355355
)
356356

357+
// wireDefaultProvider auto-creates a hosted network provider when any
358+
// firewall configuration (egress policy, static rules, or a non-Allow
359+
// default action) is set but no provider was supplied explicitly. The
360+
// default runner-side networking path does not enforce firewall rules,
361+
// so without this the caller's deny-default would silently degrade to
362+
// allow-all. No-op when a provider is already set.
363+
func wireDefaultProvider(cfg *config) {
364+
firewallConfigured := cfg.egressPolicy != nil ||
365+
len(cfg.firewallRules) > 0 ||
366+
cfg.firewallDefaultAction != firewall.Allow
367+
if firewallConfigured && cfg.netProvider == nil {
368+
cfg.netProvider = hosted.NewProvider()
369+
}
370+
}
371+
357372
func cleanDataDir(cfg *config) error {
358373
if cfg.dataDir == "" {
359374
return nil
@@ -409,6 +424,13 @@ func terminateStaleRunner(cfg *config) {
409424
slog.Debug("stale runner already dead", "pid", st.PID)
410425
return
411426
}
427+
if !cfg.processIsExpected(st.PID) {
428+
// PID has been recycled onto an unrelated binary since we wrote
429+
// the state file. Signalling it would kill the wrong process
430+
// group (or fail silently if we lack permission). Bail out.
431+
slog.Warn("stale PID does not match expected runner binary, skipping termination", "pid", st.PID)
432+
return
433+
}
412434

413435
// Use negative PID to signal the entire process group (PGID == PID
414436
// because the runner starts with Setsid: true). This ensures any

microvm_test.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,20 @@ import (
2121
"github.com/stacklok/go-microvm/hypervisor"
2222
"github.com/stacklok/go-microvm/image"
2323
"github.com/stacklok/go-microvm/internal/testutil"
24+
propnet "github.com/stacklok/go-microvm/net"
2425
"github.com/stacklok/go-microvm/net/firewall"
2526
"github.com/stacklok/go-microvm/preflight"
2627
"github.com/stacklok/go-microvm/state"
2728
)
2829

30+
// sentinelProvider is a minimal net.Provider used by tests to assert that
31+
// a caller-supplied provider survives auto-wiring without being replaced.
32+
type sentinelProvider struct{}
33+
34+
func (*sentinelProvider) Start(_ context.Context, _ propnet.Config) error { return nil }
35+
func (*sentinelProvider) SocketPath() string { return "" }
36+
func (*sentinelProvider) Stop() {}
37+
2938
// --- Pure function tests ---
3039

3140
func TestBuildInitConfig_NilOCIConfig(t *testing.T) {
@@ -660,6 +669,53 @@ func TestBuildNetConfig_Empty(t *testing.T) {
660669

661670
// --- Egress validation tests ---
662671

672+
func TestWireDefaultProvider(t *testing.T) {
673+
t.Parallel()
674+
675+
t.Run("no firewall config leaves provider nil", func(t *testing.T) {
676+
t.Parallel()
677+
cfg := defaultConfig()
678+
wireDefaultProvider(cfg)
679+
assert.Nil(t, cfg.netProvider)
680+
})
681+
682+
t.Run("egress policy auto-wires provider", func(t *testing.T) {
683+
t.Parallel()
684+
cfg := defaultConfig()
685+
cfg.egressPolicy = &EgressPolicy{}
686+
wireDefaultProvider(cfg)
687+
assert.NotNil(t, cfg.netProvider)
688+
})
689+
690+
t.Run("firewall rules alone auto-wire provider", func(t *testing.T) {
691+
t.Parallel()
692+
cfg := defaultConfig()
693+
cfg.firewallRules = []firewall.Rule{{Direction: firewall.Egress, Action: firewall.Allow}}
694+
wireDefaultProvider(cfg)
695+
assert.NotNil(t, cfg.netProvider,
696+
"firewall-only config must auto-wire a provider; otherwise rules go unenforced")
697+
})
698+
699+
t.Run("deny default alone auto-wires provider", func(t *testing.T) {
700+
t.Parallel()
701+
cfg := defaultConfig()
702+
cfg.firewallDefaultAction = firewall.Deny
703+
wireDefaultProvider(cfg)
704+
assert.NotNil(t, cfg.netProvider,
705+
"deny-default config must auto-wire a provider to actually deny")
706+
})
707+
708+
t.Run("explicit provider is not overwritten", func(t *testing.T) {
709+
t.Parallel()
710+
existing := &sentinelProvider{}
711+
cfg := defaultConfig()
712+
cfg.netProvider = existing
713+
cfg.firewallDefaultAction = firewall.Deny
714+
wireDefaultProvider(cfg)
715+
assert.Same(t, existing, cfg.netProvider)
716+
})
717+
}
718+
663719
func TestRun_EgressPolicy_EmptyHosts_DenyAll(t *testing.T) {
664720
t.Parallel()
665721

@@ -862,6 +918,7 @@ func TestTerminateStaleRunner_AliveProcess_GracefulExit(t *testing.T) {
862918
// (after SIGTERM + first poll).
863919
return aliveCount <= 1
864920
}
921+
cfg.processIsExpected = func(_ int) bool { return true }
865922

866923
terminateStaleRunner(cfg)
867924

@@ -899,6 +956,7 @@ func TestTerminateStaleRunner_AliveProcess_RequiresKill(t *testing.T) {
899956
}
900957
// Process never exits on its own.
901958
cfg.processAlive = func(_ int) bool { return true }
959+
cfg.processIsExpected = func(_ int) bool { return true }
902960

903961
terminateStaleRunner(cfg)
904962

@@ -941,6 +999,7 @@ func TestTerminateStaleRunner_SendsToProcessGroup(t *testing.T) {
941999
aliveCount++
9421000
return aliveCount <= 1
9431001
}
1002+
cfg.processIsExpected = func(_ int) bool { return true }
9441003

9451004
terminateStaleRunner(cfg)
9461005

@@ -950,6 +1009,38 @@ func TestTerminateStaleRunner_SendsToProcessGroup(t *testing.T) {
9501009
assert.Equal(t, -55555, receivedPIDs[0], "killProcess should receive negative PID for process group")
9511010
}
9521011

1012+
func TestTerminateStaleRunner_RecycledPID_Skipped(t *testing.T) {
1013+
t.Parallel()
1014+
1015+
// The state file points at a live PID, but processIsExpected says the
1016+
// binary at that PID is not the runner (as if the kernel had recycled
1017+
// the PID onto an unrelated process since state was written). The
1018+
// function must refuse to signal it.
1019+
dataDir := t.TempDir()
1020+
1021+
mgr := state.NewManager(dataDir)
1022+
ls, err := mgr.LoadAndLock(context.Background())
1023+
require.NoError(t, err)
1024+
ls.State.Active = true
1025+
ls.State.PID = 77777
1026+
require.NoError(t, ls.Save())
1027+
ls.Release()
1028+
1029+
cfg := defaultConfig()
1030+
cfg.dataDir = dataDir
1031+
1032+
var killCalled bool
1033+
cfg.killProcess = func(_ int, _ syscall.Signal) error {
1034+
killCalled = true
1035+
return nil
1036+
}
1037+
cfg.processAlive = func(_ int) bool { return true }
1038+
cfg.processIsExpected = func(_ int) bool { return false }
1039+
1040+
terminateStaleRunner(cfg)
1041+
assert.False(t, killCalled, "must not signal a recycled PID belonging to an unrelated binary")
1042+
}
1043+
9531044
func TestTerminateStaleRunner_PID1_Skipped(t *testing.T) {
9541045
t.Parallel()
9551046

@@ -973,6 +1064,7 @@ func TestTerminateStaleRunner_PID1_Skipped(t *testing.T) {
9731064
return nil
9741065
}
9751066
cfg.processAlive = func(_ int) bool { return true }
1067+
cfg.processIsExpected = func(_ int) bool { return true }
9761068

9771069
terminateStaleRunner(cfg)
9781070
assert.False(t, killCalled, "should not attempt to kill PID 1")

options.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111

1212
"github.com/stacklok/go-microvm/hypervisor"
1313
"github.com/stacklok/go-microvm/image"
14+
"github.com/stacklok/go-microvm/internal/procutil"
1415
"github.com/stacklok/go-microvm/net"
1516
"github.com/stacklok/go-microvm/net/firewall"
1617
"github.com/stacklok/go-microvm/preflight"
@@ -103,6 +104,7 @@ type config struct {
103104
stat func(string) (os.FileInfo, error)
104105
killProcess func(pid int, sig syscall.Signal) error
105106
processAlive func(pid int) bool
107+
processIsExpected func(pid int) bool
106108
}
107109

108110
func defaultConfig() *config {
@@ -126,9 +128,17 @@ func defaultConfig() *config {
126128
}
127129
return proc.Signal(syscall.Signal(0)) == nil
128130
},
131+
processIsExpected: func(pid int) bool {
132+
return procutil.IsExpectedProcess(pid, runnerBinaryName)
133+
},
129134
}
130135
}
131136

137+
// runnerBinaryName is the base name of the runner executable — used by
138+
// the default processIsExpected check to distinguish the go-microvm
139+
// runner from an unrelated process that happens to be at the same PID.
140+
const runnerBinaryName = "go-microvm-runner"
141+
132142
func defaultDataDir() string {
133143
if dir := os.Getenv("GO_MICROVM_DATA_DIR"); dir != "" {
134144
return dir

runner/process_linux_test.go

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,31 @@ func TestIsExpectedProcess_Self(t *testing.T) {
2828
}
2929
}
3030

31-
func TestIsExpectedProcess_SelfBaseName(t *testing.T) {
32-
// Should match by base name even if full paths differ.
31+
func TestIsExpectedProcess_BaseNameFallbackForRelative(t *testing.T) {
32+
// A relative/bare binary name still matches by base name — that is the
33+
// documented fallback. Absolute mismatches must no longer pass.
3334
pid := os.Getpid()
3435

3536
selfExe, err := os.Executable()
3637
if err != nil {
3738
t.Fatalf("failed to get self executable: %v", err)
3839
}
3940
baseName := selfExe[len(selfExe)-len("runner.test"):] // last component
40-
if !isExpectedProcess(pid, "/some/other/path/"+baseName) {
41-
t.Errorf("isExpectedProcess with different dir but same base name should return true")
41+
if !isExpectedProcess(pid, baseName) {
42+
t.Errorf("isExpectedProcess with bare base name should match self")
43+
}
44+
}
45+
46+
func TestIsExpectedProcess_AbsolutePathMismatchFails(t *testing.T) {
47+
pid := os.Getpid()
48+
49+
selfExe, err := os.Executable()
50+
if err != nil {
51+
t.Fatalf("failed to get self executable: %v", err)
52+
}
53+
baseName := selfExe[len(selfExe)-len("runner.test"):]
54+
if isExpectedProcess(pid, "/some/other/path/"+baseName) {
55+
t.Errorf("absolute path with different dir must not match even when base name matches")
4256
}
4357
}
4458

state/state.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,13 @@ type State struct {
6161
// PID is the process ID of the VM runner, or 0 if not running.
6262
PID int `json:"pid,omitempty"`
6363

64+
// PIDStartTime records wall-clock time when PID was recorded. Used
65+
// to disambiguate a recycled PID from the original runner in contexts
66+
// where /proc/PID/exe comparison is unavailable (e.g. macOS) or as
67+
// belt-and-suspenders alongside the exe-path check on Linux.
68+
// Zero time on state files written before this field was introduced.
69+
PIDStartTime time.Time `json:"pid_start_time,omitempty"`
70+
6471
// CreatedAt is the time the VM state was first created.
6572
CreatedAt time.Time `json:"created_at"`
6673
}

0 commit comments

Comments
 (0)