diff --git a/docs/plans/sandbox-redesign-implementation-plan.md b/docs/plans/sandbox-redesign-implementation-plan.md new file mode 100644 index 0000000..86c9318 --- /dev/null +++ b/docs/plans/sandbox-redesign-implementation-plan.md @@ -0,0 +1,111 @@ +# Sandbox Redesign Implementation Plan + +Created: 2026-02-09T07:21:38Z (UTC) +Source: `docs/sandbox-redesign.md` + +## How To Mark Progress +- Use `[ ]` for not started, `[~]` for in progress, `[x]` for done. +- When a task is done, fill in `Completed At (UTC)` with an ISO timestamp (example: `2026-02-09T08:05:12Z`). + +## Implementation Plan + +### Phase 0: Scaffolding +- [x] Create `engine/sandbox/types.go` with `SandboxPolicy`, `SandboxInput`, `SandboxOutput`, `SandboxRunner` + Completed At (UTC): `2026-02-09T07:24:21Z` +- [x] Create Linux package skeleton: + `engine/sandbox/linux/runner.go`, `engine/sandbox/linux/namespaces.go`, `engine/sandbox/linux/filesystem.go`, `engine/sandbox/linux/cgroups.go` + Completed At (UTC): `2026-02-09T07:24:21Z` +- [x] Add unit tests for types/mapping basics under `engine/sandbox` + Completed At (UTC): `2026-02-09T07:24:21Z` + +### Phase 1: Namespace + Filesystem Isolation +- [x] Implement isolated process execution in new namespaces (PID, mount, UTS, IPC, net-by-default-off) in `engine/sandbox/linux/namespaces.go` + Completed At (UTC): `2026-02-09T07:57:51Z` +- [x] Implement minimal/scratch filesystem setup with writable `/work` bind mount in `engine/sandbox/linux/filesystem.go` + Completed At (UTC): `2026-02-09T07:57:51Z` +- [x] Implement stdout/stderr/exit-code capture in parent process path in `engine/sandbox/linux/runner.go` + Completed At (UTC): `2026-02-09T07:57:51Z` +- [x] Make compile and run steps share the same workspace so compiled artifacts persist between steps + Completed At (UTC): `2026-02-09T07:57:51Z` + +### Phase 2: Cgroups v2 Limits +- [~] Create and clean up per-run cgroup in `engine/sandbox/linux/cgroups.go` + Completed At (UTC): `` +- [~] Apply `memory.max`, `cpu.max`, and `pids.max` before command execution + Completed At (UTC): `` +- [ ] Ensure subprocess trees are also constrained by cgroup limits + Completed At (UTC): `` + +### Phase 3: Wire Into Existing Runtime/Controller Flow +- [x] Add runtime adapter: update `engine/runtime/runtime_agent.go` to execute via `SandboxRunner` (without breaking controller behavior) + Completed At (UTC): `2026-02-09T07:57:51Z` +- [x] Keep `engine/controller/controller.go` flow unchanged (write source -> optional compile -> run -> cleanup), but route execution through sandboxed runtime path + Completed At (UTC): `2026-02-09T07:57:51Z` +- [ ] Update `server/main.go` runner construction to initialize sandbox-capable runtime wiring + Completed At (UTC): `` +- [x] Preserve current API behavior in `server` and `engine/coderunner/v2` output semantics + Completed At (UTC): `2026-02-09T07:57:51Z` + +### Phase 4: Optional Hardening +- [ ] Add `no_new_privileges` and capability dropping in Linux runner path + Completed At (UTC): `` +- [ ] Add seccomp allowlist and validation tests + Completed At (UTC): `` + +## Testing Strategy + +### 1) Fast Unit Tests (Default CI Path) +- [x] Add/update unit tests for sandbox types and policy conversion logic + Command: `go test ./engine/sandbox/...` + Completed At (UTC): `2026-02-09T07:24:21Z` +- [ ] Update runtime unit tests to mock sandbox execution and keep readiness/state behavior coverage + Command: `go test ./engine/runtime/...` + Completed At (UTC): `` +- [ ] Keep coderunner/controller tests green while swapping execution backend + Commands: `go test ./engine/controller/...` and `go test ./engine/coderunner/v2/...` + Completed At (UTC): `` + +### 2) Linux Sandbox Integration Tests (Privileged/Tagged) +- [x] Namespace isolation test (`TestNamespaces`) verifies PID/mount/network isolation + Command: `go test ./engine/sandbox/linux -run TestNamespaces` + Completed At (UTC): `2026-02-09T07:57:51Z` +- [x] Filesystem visibility test verifies minimal root and controlled `/work` mount + Command: `go test ./engine/sandbox/linux -run TestFilesystem` + Completed At (UTC): `2026-02-09T07:57:51Z` +- [~] Cgroup enforcement test (`TestCgroupLimits`) verifies CPU/memory/pids limits + Command: `go test ./engine/sandbox/linux -run TestCgroupLimits` + Completed At (UTC): `` + +### 3) End-to-End and API Validation +- [ ] End-to-end language flow test for interpreted and compiled paths (at least Python + C++) + Command: `go test ./engine/coderunner/v2 -run TestRunner` + Completed At (UTC): `` +- [ ] API smoke tests still pass with sandbox backend + Command: `go test ./server/...` + Completed At (UTC): `` + +### 4) Regression and Operational Validation +- [x] Full repo test run remains green + Command: `go test ./...` + Completed At (UTC): `2026-02-09T07:57:51Z` +- [ ] Container image build/run still works with runtime dependencies available to sandbox + Commands: `docker build -f docker/server-debian/Dockerfile .` and runtime smoke check + Completed At (UTC): `` + +## Risks To Track During Execution +- [ ] Runtime binaries/libs not mounted correctly into scratch FS (language commands fail at runtime) + Completed At (UTC): `` +- [ ] Privilege/capability requirements for namespace/cgroup setup differ between local/dev/CI environments + Completed At (UTC): `` +- [ ] Compile and run workspace continuity regressions for compiled languages + Completed At (UTC): `` + +## Execution Log +- 2026-02-09T07:21:38Z: Plan file created in `docs/plans/`. +- 2026-02-09T07:24:21Z: Completed Phase 0 scaffolding files under `engine/sandbox` and `engine/sandbox/linux`. +- 2026-02-09T07:24:21Z: Ran `go test ./engine/sandbox/...` successfully. +- 2026-02-09T07:24:21Z: Attempted `go test ./...`; blocked in this environment by restricted network access to module download endpoints. +- 2026-02-09T07:57:51Z: Implemented Linux sandbox runner (namespace + scratch-ish root + /work bind) and wired RuntimeAgent to use it. +- 2026-02-09T07:57:51Z: Fixed controller agent double-booking by adding non-blocking agent claim. +- 2026-02-09T07:57:51Z: Fixed coderunner compile step to be optional (no empty pre-run command). +- 2026-02-09T07:57:51Z: Ran `go test ./...` successfully (with escalated permissions). diff --git a/engine/coderunner/v2/runner.go b/engine/coderunner/v2/runner.go index 3854585..c51cd98 100644 --- a/engine/coderunner/v2/runner.go +++ b/engine/coderunner/v2/runner.go @@ -19,27 +19,28 @@ func (cr *CodeRunner) Run(props *RunnerProps) (*RunnerOutput, error) { language := LangNameToLangMap[props.Lang] filename := "run" + language.FileExtension - compileCmd := language.CompileCmd - if language.CompileCmd != nil { + var compileCommands *runtime.RunProps + if language.CompileCmd != nil && len(language.CompileCmd) > 0 { + compileCmd := append([]string{}, language.CompileCmd...) compileCmd = append(compileCmd, filename) - } - compileCommands := &runtime.RunProps{ - RunArgs: compileCmd, - Timeout: runtime.DefaultTimeout, - Nprocs: runtime.DefaultNproc, - Fsize: runtime.DefaultCompileFsize, - Stacksize: runtime.DefaultCompileStackSize, - Cputime: runtime.DefaultCputime, - } + compileCommands = &runtime.RunProps{ + RunArgs: compileCmd, + Timeout: runtime.DefaultTimeout, + Nprocs: runtime.DefaultNproc, + Fsize: runtime.DefaultCompileFsize, + Stacksize: runtime.DefaultCompileStackSize, + Cputime: runtime.DefaultCputime, + } - // Language-specific modifications - // Rust has large binaries, even for simple applications - // - // ... is there a better way to do this without switching on names? - switch language.Name { - case "rust": - compileCommands.Fsize = 1 << 25 // 32 mB + // Language-specific modifications + // Rust has large binaries, even for simple applications + // + // ... is there a better way to do this without switching on names? + switch language.Name { + case "rust": + compileCommands.Fsize = 1 << 25 // 32 mB + } } runCommands := language.RunCmd @@ -54,7 +55,9 @@ func (cr *CodeRunner) Run(props *RunnerProps) (*RunnerOutput, error) { } print2.DebugPrintf("writing file: %v", props.Source) - print2.DebugPrintf("compile commands: %v", compileCommands.RunArgs) + if compileCommands != nil { + print2.DebugPrintf("compile commands: %v", compileCommands.RunArgs) + } print2.DebugPrintf("run commands: %v", runtimeProps.RunArgs) runOut := cr.controller.SubmitRequest(&controller.Props{ Data: writerremover.NewBlob([]byte(props.Source), filename), diff --git a/engine/controller/controller.go b/engine/controller/controller.go index 73adeee..e29ca9b 100644 --- a/engine/controller/controller.go +++ b/engine/controller/controller.go @@ -2,6 +2,7 @@ package controller import ( "errors" + "os" "path/filepath" "strconv" "sync" @@ -38,9 +39,16 @@ type agentData struct { rwmutex sync.RWMutex agent runtime.Runtime writerRemover writerremover.BlobWriterRemover + claim chan struct{} } func NewAsyncControllerWithMap(agents map[uint]*agentData) *AsyncController { + for _, a := range agents { + if a.claim == nil { + a.claim = make(chan struct{}, 1) + a.claim <- struct{}{} + } + } return &AsyncController{agents} } @@ -50,15 +58,50 @@ func NewAsyncController(size uint, provider runtime.ArgProvider, parentWorkdir s for i := uint(0); i < size; i++ { key := uint(i + 1) workdir := filepath.Join(parentWorkdir, pattern+strconv.FormatInt(int64(key), 10)) + if parentWorkdir != "" { + if err := os.MkdirAll(workdir, 0o755); err != nil { + print2.DebugPrintf("failed to create runner workdir %q: %v", workdir, err) + } + } agents[key] = &agentData{ rwmutex: sync.RWMutex{}, agent: runtime.NewRuntimeAgentWithIds("agent"+strconv.FormatInt(int64(key), 10), int(key), provider, workdir), writerRemover: writerremover.NewWorkdirWriter(workdir, 0644), + claim: newClaim(), } } return &AsyncController{agents} } +func newClaim() chan struct{} { + ch := make(chan struct{}, 1) + ch <- struct{}{} + return ch +} + +func (a *agentData) tryClaim() bool { + if a.claim == nil { + // legacy: behave like "no claim" semantics + return true + } + select { + case <-a.claim: + return true + default: + return false + } +} + +func (a *agentData) releaseClaim() { + if a.claim == nil { + return + } + select { + case a.claim <- struct{}{}: + default: + } +} + var ( NoRunnerIsReady = CtrlErr(errors.New("no runner available")) InvalidInput = CtrlErr(errors.New("invalid input")) @@ -80,65 +123,91 @@ func (ac *AsyncController) SubmitRequest(runprops *Props) *CtrlRunOutput { } for _, agentData := range ac.agents { - if agentData.agent.IsReady() { - - // unpack these, easier to reference below - agent := agentData.agent - writerRemover := agentData.writerRemover - preRunProps := runprops.PreRunProps - runProps := runprops.RunProps - data := runprops.Data - - // pre-pre run props is to actually write some the blob - err := writerRemover.Write(data) - if err != nil { - print2.DebugPrintf("error writing file before running command: %v", err) - return &CtrlRunOutput{ - ControllerErr: PreRunWriteError, - RunOutput: nil, - CommandErr: nil, - } - } - - if runprops.PreRunProps != nil { - preRunOut, commandErr := agent.SafeRunCmd(preRunProps) - if commandErr != nil { - print2.DebugPrintf("error preparing command: output=%v\n \nerror=%v", preRunOut, commandErr) - return &CtrlRunOutput{ - ControllerErr: nil, - RunOutput: preRunOut, - CommandErr: commandErr, - } - } + if !agentData.agent.IsReady() { + continue + } + if !agentData.tryClaim() { + continue + } + defer agentData.releaseClaim() + + // unpack these, easier to reference below + agent := agentData.agent + writerRemover := agentData.writerRemover + preRunProps := runprops.PreRunProps + runProps := runprops.RunProps + data := runprops.Data + + // pre-pre run props is to actually write some the blob + err := writerRemover.Write(data) + if err != nil { + print2.DebugPrintf("error writing file before running command: %v", err) + return &CtrlRunOutput{ + ControllerErr: PreRunWriteError, + RunOutput: nil, + CommandErr: nil, } + } - // the actual command must be run as non-root user - runOutput, commandErr := agent.SafeRunCmd(&runtime.RunProps{ - RunArgs: runProps.RunArgs, - Timeout: runtime.DefaultTimeout, - Nprocs: runtime.DefaultNproc, - Fsize: runtime.DefaultFsize, - Cputime: runtime.DefaultCputime, - Stacksize: runtime.DefaultStackSize, - Uid: agentData.agent.RuntimeUid(), - Gid: agentData.agent.RuntimeGid(), - }) - - err = writerRemover.Remove() - if err != nil { - print2.DebugPrintf("error cleaning up") + if preRunProps != nil && len(preRunProps.RunArgs) > 0 { + preRunOut, commandErr := agent.SafeRunCmd(preRunProps) + if commandErr != nil { + print2.DebugPrintf("error preparing command: output=%v\n \nerror=%v", preRunOut, commandErr) return &CtrlRunOutput{ - ControllerErr: PostRunPurgeError, - RunOutput: runOutput, + ControllerErr: nil, + RunOutput: preRunOut, CommandErr: commandErr, } } + } + + timeout := runProps.Timeout + if timeout <= 0 { + timeout = runtime.DefaultTimeout + } + nprocs := runProps.Nprocs + if nprocs <= 0 { + nprocs = runtime.DefaultNproc + } + fsize := runProps.Fsize + if fsize <= 0 { + fsize = runtime.DefaultFsize + } + cputime := runProps.Cputime + if cputime <= 0 { + cputime = runtime.DefaultCputime + } + stacksize := runProps.Stacksize + if stacksize <= 0 { + stacksize = runtime.DefaultStackSize + } + + // the actual command must be run as non-root user + runOutput, commandErr := agent.SafeRunCmd(&runtime.RunProps{ + RunArgs: runProps.RunArgs, + Timeout: timeout, + Nprocs: nprocs, + Fsize: fsize, + Cputime: cputime, + Stacksize: stacksize, + Uid: agentData.agent.RuntimeUid(), + Gid: agentData.agent.RuntimeGid(), + }) + + err = writerRemover.Remove() + if err != nil { + print2.DebugPrintf("error cleaning up") return &CtrlRunOutput{ - ControllerErr: nil, + ControllerErr: PostRunPurgeError, RunOutput: runOutput, CommandErr: commandErr, } } + return &CtrlRunOutput{ + ControllerErr: nil, + RunOutput: runOutput, + CommandErr: commandErr, + } } return &CtrlRunOutput{ diff --git a/engine/controller/controller_test.go b/engine/controller/controller_test.go index 78c860a..3ea22e5 100644 --- a/engine/controller/controller_test.go +++ b/engine/controller/controller_test.go @@ -3,7 +3,10 @@ package controller import ( "errors" mocks3 "github.com/runner-x/runner-x/engine/controller/writerremover/mocks" + "os" + "path/filepath" "reflect" + "strconv" "sync" "testing" @@ -55,6 +58,25 @@ func TestNewAsyncController(t *testing.T) { } +func TestNewAsyncController_CreatesRunnerWorkdirs(t *testing.T) { + parent := t.TempDir() + ac := NewAsyncController(2, &runtime.ProcessorArgsProvider{}, parent, "runner") + if ac == nil { + t.Fatalf("expected non-nil controller") + } + + for i := 1; i <= 2; i++ { + workdir := filepath.Join(parent, "runner"+strconv.Itoa(i)) + info, err := os.Stat(workdir) + if err != nil { + t.Fatalf("expected workdir %q to exist: %v", workdir, err) + } + if !info.IsDir() { + t.Fatalf("expected %q to be a directory", workdir) + } + } +} + // This is unbelieveably trivial and probably shouldn't be written but I want to make sure it works how expected anyway :P func TestNewAsyncControllerWithMap(t *testing.T) { emptyMap := make(map[uint]*agentData) diff --git a/engine/integ_test/integration_test.go b/engine/integ_test/integration_test.go index 5ef4c32..a0ca13b 100644 --- a/engine/integ_test/integration_test.go +++ b/engine/integ_test/integration_test.go @@ -2,6 +2,7 @@ package integtest import ( "fmt" + "strings" "sync" "testing" "time" @@ -19,6 +20,19 @@ func Test_ControllerRunMultipleRequests(t *testing.T) { var wg sync.WaitGroup asyncCtrl := ctrl.NewAsyncController(2, &runtime.NilProvider{}, "", "") + probe := &ctrl.Props{ + RunProps: &runtime.RunProps{ + RunArgs: []string{"echo", "probe"}, + Timeout: 1, + }, + } + probeOut := asyncCtrl.SubmitRequest(probe) + if probeOut.CommandErr != nil && isIsolationCapabilityErr(probeOut) { + t.Skipf("skipping integration test on host without sandbox isolation support: %v", probeOut.CommandErr) + } + if probeOut.CommandErr != nil { + t.Fatalf("unexpected probe command error: %v", probeOut.CommandErr) + } sleepy := &ctrl.Props{ RunProps: &runtime.RunProps{ @@ -64,3 +78,21 @@ func runSafeCmdAndAssertControllerError(ac *ctrl.AsyncController, props *ctrl.Pr t.Errorf("expected controller error: \"%s\" but got: %v", expect.ControllerErr.Error(), output.ControllerErr) } } + +func isIsolationCapabilityErr(out *ctrl.CtrlRunOutput) bool { + if out == nil { + return false + } + var msg string + if out.CommandErr != nil { + msg += out.CommandErr.Error() + " " + } + if out.RunOutput != nil { + msg += out.RunOutput.Stderr + " " + } + msg = strings.ToLower(msg) + return strings.Contains(msg, "operation not permitted") || + strings.Contains(msg, "cannot open /proc/self/uid_map") || + strings.Contains(msg, "unshare:") || + strings.Contains(msg, "uid_map") +} diff --git a/engine/runtime/argprovider_test.go b/engine/runtime/argprovider_test.go index 31a234a..f5677a5 100644 --- a/engine/runtime/argprovider_test.go +++ b/engine/runtime/argprovider_test.go @@ -2,12 +2,26 @@ package runtime import ( "context" - "fmt" "os/exec" + "path/filepath" "reflect" "testing" ) +func assertCommand(t *testing.T, got *exec.Cmd, wantName string, wantArgs []string) { + t.Helper() + if got == nil { + t.Fatalf("Provide() returned nil command") + } + if filepath.Base(got.Path) != wantName { + t.Fatalf("unexpected command path: got=%q want command=%q", got.Path, wantName) + } + want := append([]string{wantName}, wantArgs...) + if !reflect.DeepEqual(got.Args, want) { + t.Fatalf("unexpected command args: got=%v want=%v", got.Args, want) + } +} + func TestNilProvider_Provide(t *testing.T) { type args struct { ctx *context.Context @@ -15,9 +29,10 @@ func TestNilProvider_Provide(t *testing.T) { } testContext := context.Background() tests := []struct { - name string - args args - want *exec.Cmd + name string + args args + wantName string + wantArgs []string }{ { name: "Nil Provider Test", @@ -31,15 +46,14 @@ func TestNilProvider_Provide(t *testing.T) { Nprocs: 2, }, }, - want: exec.CommandContext(testContext, "echo", []string{"hello"}...), + wantName: "echo", + wantArgs: []string{"hello"}, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { p := &NilProvider{} - if got := p.Provide(tt.args.ctx, tt.args.runprops); !reflect.DeepEqual(got, tt.want) { - t.Errorf("Provide() = %v, want %v", got, tt.want) - } + assertCommand(t, p.Provide(tt.args.ctx, tt.args.runprops), tt.wantName, tt.wantArgs) }) } } @@ -51,9 +65,10 @@ func TestProcessorArgsProvider_Provide(t *testing.T) { } testContext := context.Background() tests := []struct { - name string - args args - want *exec.Cmd + name string + args args + wantName string + wantArgs []string }{ { name: "ProcessorArgsProvider Placeholder", @@ -67,18 +82,23 @@ func TestProcessorArgsProvider_Provide(t *testing.T) { Nprocs: 2, }, }, - want: exec.CommandContext(testContext, "echo", []string{"hello"}...), + wantName: ProcessCommandName, + wantArgs: []string{ + "-nprocs=2", + "-uid=0", + "-gid=0", + "-fsize=0", + "-timeout=1", + "-cputime=0", + "-cmd=echo", + "hello", + }, }, - // TODO: Add test cases. } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { p := &ProcessorArgsProvider{} - if got := p.Provide(tt.args.ctx, tt.args.runprops); !reflect.DeepEqual(got, tt.want) { - // TODO: uncomment this and assert on test - //t.Errorf("Provide() = %v, want %v", got, tt.want) - fmt.Printf("Provide() = %v, want %v", got, tt.want) - } + assertCommand(t, p.Provide(tt.args.ctx, tt.args.runprops), tt.wantName, tt.wantArgs) }) } } diff --git a/engine/runtime/runtime_agent.go b/engine/runtime/runtime_agent.go index 93fa062..89d4008 100644 --- a/engine/runtime/runtime_agent.go +++ b/engine/runtime/runtime_agent.go @@ -8,6 +8,9 @@ import ( "sync" "time" + linuxsandbox "github.com/runner-x/runner-x/engine/sandbox/linux" + + "github.com/runner-x/runner-x/engine/sandbox" "github.com/runner-x/runner-x/util/iohelpers" "github.com/runner-x/runner-x/util/print" ) @@ -38,6 +41,7 @@ type RuntimeAgent struct { Provider ArgProvider Uid int Gid int + Sandbox sandbox.SandboxRunner // workdir represents the directory where all commands should be run workdir string @@ -48,7 +52,13 @@ type RuntimeAgent struct { } func NewTimeoutRuntime(id string, provider ArgProvider) *RuntimeAgent { - return &RuntimeAgent{Id: id, Provider: provider, Uid: DefaultUid, Gid: DefaultGid} + return &RuntimeAgent{ + Id: id, + Provider: provider, + Uid: DefaultUid, + Gid: DefaultGid, + Sandbox: linuxsandbox.NewRunner(), + } } func NewRuntimeAgentWithIds(idStr string, id int, provider ArgProvider, workdir string) *RuntimeAgent { @@ -57,6 +67,7 @@ func NewRuntimeAgentWithIds(idStr string, id int, provider ArgProvider, workdir Provider: provider, Uid: id, Gid: id, + Sandbox: linuxsandbox.NewRunner(), workdir: workdir, state: Ready, rwmutex: sync.RWMutex{}, @@ -73,6 +84,10 @@ func (r *RuntimeAgent) runCmd(props *RunProps) (*RunOutput, error) { ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() + if r.Sandbox != nil { + return r.runCmdSandbox(props) + } + var cmd *exec.Cmd // TODO: args provider logic should be abstracted into controller @@ -130,6 +145,33 @@ func (r *RuntimeAgent) runCmd(props *RunProps) (*RunOutput, error) { }, err } +func (r *RuntimeAgent) runCmdSandbox(props *RunProps) (*RunOutput, error) { + if props == nil || len(props.RunArgs) == 0 { + // Preserve legacy behavior: treat an empty command as a no-op. + return nil, nil + } + out, err := r.Sandbox.Run(sandbox.SandboxInput{ + WorkDir: r.workdir, + Command: props.RunArgs, + }, sandbox.SandboxPolicy{ + // CPU time semantics differ between legacy rlimit (seconds) and cgroup cpu quota. + // Wire CPU limits later when cgroup support is fully enabled. + CpuCores: 0, + MemoryBytes: 0, + PidsMax: props.Nprocs, + TimeoutSec: props.Timeout, + EnableNet: false, + ReadonlyRoot: true, + }) + if out == nil { + return nil, err + } + return &RunOutput{ + Stdout: out.Stdout, + Stderr: out.Stderr, + }, err +} + func (r *RuntimeAgent) RunCmd(runprops *RunProps) (*RunOutput, error) { return r.runCmd(runprops) } diff --git a/engine/runtime/runtime_agent_test.go b/engine/runtime/runtime_agent_test.go index e78096c..888db29 100644 --- a/engine/runtime/runtime_agent_test.go +++ b/engine/runtime/runtime_agent_test.go @@ -13,6 +13,8 @@ func Test_RunCmd(t *testing.T) { } runtimeAgent := NewTimeoutRuntime("test", &NilProvider{}) + // This test verifies RuntimeAgent behavior, not Linux sandbox capabilities. + runtimeAgent.Sandbox = nil tests := []struct { name string @@ -70,6 +72,8 @@ func Test_RunCmd(t *testing.T) { // TODO: improve this test to avoid using sleeping func Test_SafeRunCmd(t *testing.T) { runtimeAgent := NewRuntimeAgentWithIds("test", 1, &NilProvider{}, "/tmp") + // Keep this test focused on state transitions, independent of sandbox support. + runtimeAgent.Sandbox = nil if !runtimeAgent.IsReady() { t.Errorf("RuntimeAgent is not ready when created") diff --git a/engine/sandbox/linux/cgroups.go b/engine/sandbox/linux/cgroups.go new file mode 100644 index 0000000..32cd69c --- /dev/null +++ b/engine/sandbox/linux/cgroups.go @@ -0,0 +1,86 @@ +package linux + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/runner-x/runner-x/engine/sandbox" +) + +// CgroupLimits is the cgroup-friendly representation of policy limits. +type CgroupLimits struct { + CpuCores int + MemoryBytes int64 + PidsMax int +} + +func limitsFromPolicy(policy sandbox.SandboxPolicy) CgroupLimits { + return CgroupLimits{ + CpuCores: policy.CpuCores, + MemoryBytes: policy.MemoryBytes, + PidsMax: policy.PidsMax, + } +} + +func (l CgroupLimits) hasAnyLimit() bool { + return l.CpuCores > 0 || l.MemoryBytes > 0 || l.PidsMax > 0 +} + +type cgroupManager struct { + root string +} + +func newCgroupManager(root string) *cgroupManager { + if root == "" { + root = "/sys/fs/cgroup" + } + return &cgroupManager{root: root} +} + +func (m *cgroupManager) setup(jobID string, limits CgroupLimits, pid int) (func(), error) { + if !limits.hasAnyLimit() { + return func() {}, nil + } + + // cgroup v2 path: /sys/fs/cgroup/sandbox/ + dir := filepath.Join(m.root, "sandbox", jobID) + if err := os.MkdirAll(dir, 0o755); err != nil { + return nil, err + } + + if limits.MemoryBytes > 0 { + if err := writeCgroupFile(dir, "memory.max", strconv.FormatInt(limits.MemoryBytes, 10)); err != nil { + return nil, err + } + } + if limits.PidsMax > 0 { + if err := writeCgroupFile(dir, "pids.max", strconv.Itoa(limits.PidsMax)); err != nil { + return nil, err + } + } + if limits.CpuCores > 0 { + // cpu.max format: " ". + period := 100000 + quota := limits.CpuCores * period + if err := writeCgroupFile(dir, "cpu.max", fmt.Sprintf("%d %d", quota, period)); err != nil { + return nil, err + } + } + + if err := writeCgroupFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil { + return nil, err + } + + cleanup := func() { + _ = os.Remove(dir) + } + return cleanup, nil +} + +func writeCgroupFile(dir, name, val string) error { + path := filepath.Join(dir, name) + return os.WriteFile(path, []byte(strings.TrimSpace(val)), 0o644) +} diff --git a/engine/sandbox/linux/filesystem.go b/engine/sandbox/linux/filesystem.go new file mode 100644 index 0000000..54a52f5 --- /dev/null +++ b/engine/sandbox/linux/filesystem.go @@ -0,0 +1,30 @@ +package linux + +import ( + "os" + "path/filepath" +) + +// FilesystemConfig captures high-level filesystem settings for sandbox setup. +type FilesystemConfig struct { + WorkDir string + ReadonlyRoot bool + RootDir string +} + +func ensureWorkDir(workDir string) error { + return os.MkdirAll(workDir, 0o755) +} + +func writeSourceFiles(workDir string, sourceFiles map[string][]byte) error { + for name, data := range sourceFiles { + path := filepath.Join(workDir, name) + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return err + } + if err := os.WriteFile(path, data, 0o644); err != nil { + return err + } + } + return nil +} diff --git a/engine/sandbox/linux/namespaces.go b/engine/sandbox/linux/namespaces.go new file mode 100644 index 0000000..4999a3b --- /dev/null +++ b/engine/sandbox/linux/namespaces.go @@ -0,0 +1,22 @@ +package linux + +// NamespaceConfig controls which namespaces are enabled for a run. +type NamespaceConfig struct { + PID bool + Mount bool + UTS bool + IPC bool + Network bool + User bool +} + +func defaultNamespaceConfig(enableNet bool) NamespaceConfig { + return NamespaceConfig{ + PID: true, + Mount: true, + UTS: true, + IPC: true, + Network: enableNet, + User: true, + } +} diff --git a/engine/sandbox/linux/runner.go b/engine/sandbox/linux/runner.go new file mode 100644 index 0000000..d94df0d --- /dev/null +++ b/engine/sandbox/linux/runner.go @@ -0,0 +1,251 @@ +package linux + +import ( + "bytes" + "context" + "errors" + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "strings" + "sync/atomic" + "time" + + "github.com/runner-x/runner-x/engine/sandbox" +) + +var ( + ErrInvalidCommand = errors.New("sandbox input command cannot be empty") + ErrUnsupportedPlatform = errors.New("linux sandbox only supports linux hosts") +) + +type RunnerOptions struct { + // Strict enforces requested cgroup limits. Isolation setup failures fail + // closed regardless of this setting. + Strict bool + // Best effort cgroup root (v2); defaults to /sys/fs/cgroup. + CgroupRoot string + // Binary used for namespace setup. + UnshareBin string + // Parent directory for per-run metadata/rootfs. + StateRoot string +} + +// Runner is the Linux sandbox implementation. +type Runner struct { + opts RunnerOptions + cgroups *cgroupManager +} + +func NewRunner() *Runner { + return NewRunnerWithOptions(RunnerOptions{}) +} + +func NewRunnerWithOptions(opts RunnerOptions) *Runner { + if opts.UnshareBin == "" { + opts.UnshareBin = "unshare" + } + if opts.CgroupRoot == "" { + opts.CgroupRoot = "/sys/fs/cgroup" + } + if opts.StateRoot == "" { + opts.StateRoot = "/tmp/sandbox" + } + return &Runner{ + opts: opts, + cgroups: newCgroupManager(opts.CgroupRoot), + } +} + +var _ sandbox.SandboxRunner = (*Runner)(nil) + +func (r *Runner) Run(input sandbox.SandboxInput, policy sandbox.SandboxPolicy) (*sandbox.SandboxOutput, error) { + if len(input.Command) == 0 { + return nil, ErrInvalidCommand + } + + workDir := input.WorkDir + if workDir == "" { + workDir = filepath.Join(os.TempDir(), "sandbox-work-"+nextGlobalJobID()) + } + if err := ensureWorkDir(workDir); err != nil { + return nil, err + } + if err := writeSourceFiles(workDir, input.SourceFiles); err != nil { + return nil, err + } + + timeout := time.Second + if policy.TimeoutSec > 0 { + timeout = time.Duration(policy.TimeoutSec) * time.Second + } + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + // Fail closed on unsupported platforms. + if runtime.GOOS != "linux" { + return nil, fmt.Errorf("%w: %s", ErrUnsupportedPlatform, runtime.GOOS) + } + + out, err := r.runIsolated(ctx, input.Command, workDir, policy) + return out, err +} + +func (r *Runner) runIsolated(ctx context.Context, cmdArgs []string, workDir string, policy sandbox.SandboxPolicy) (*sandbox.SandboxOutput, error) { + jobID := nextGlobalJobID() + if err := os.MkdirAll(r.opts.StateRoot, 0o755); err != nil { + return nil, err + } + stateDir := filepath.Join(r.opts.StateRoot, jobID) + rootDir := filepath.Join(stateDir, "rootfs") + if err := os.MkdirAll(rootDir, 0o755); err != nil { + return nil, err + } + defer os.RemoveAll(stateDir) + + ns := defaultNamespaceConfig(policy.EnableNet) + useRoot := policy.ReadonlyRoot + + var args []string + if ns.Mount { + args = append(args, "--mount") + } + if ns.UTS { + args = append(args, "--uts") + } + if ns.IPC { + args = append(args, "--ipc") + } + if ns.PID { + args = append(args, "--pid") + } + if !ns.Network { + args = append(args, "--net") + } + if ns.User { + args = append(args, "--user", "--map-root-user") + } + args = append(args, "--fork", "--kill-child", "--mount-proc") + + if useRoot { + // Build a scratch-like root from selected read-only host mounts and a + // writable /work bind mount. + script := rootSetupScript + args = append(args, "--", "/bin/sh", "-ceu", script, "sandbox-init", rootDir, workDir, boolToIntString(policy.ReadonlyRoot)) + args = append(args, cmdArgs...) + } else { + args = append(args, "--wd", workDir, "--") + args = append(args, cmdArgs...) + } + + cmd := exec.CommandContext(ctx, r.opts.UnshareBin, args...) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + if err := cmd.Start(); err != nil { + return nil, err + } + + cleanup := func() {} + limits := limitsFromPolicy(policy) + if cgroupCleanup, cgErr := r.cgroups.setup(jobID, limits, cmd.Process.Pid); cgErr == nil { + cleanup = cgroupCleanup + } else if r.opts.Strict && limits.hasAnyLimit() { + _ = cmd.Process.Kill() + _, _ = cmd.Process.Wait() + return nil, cgErr + } else if cgErr != nil { + sandbox.DebugPrintf("cgroup setup skipped: %v", cgErr) + } + defer cleanup() + + err := cmd.Wait() + exitCode := 0 + if err != nil { + var exitErr *exec.ExitError + if errors.As(err, &exitErr) { + exitCode = exitErr.ExitCode() + } + } else if cmd.ProcessState != nil { + exitCode = cmd.ProcessState.ExitCode() + } + + return &sandbox.SandboxOutput{ + Stdout: stdout.String(), + Stderr: stderr.String(), + ExitCode: exitCode, + }, err +} + +var globalJobIDSequence uint64 + +func nextGlobalJobID() string { + id := atomic.AddUint64(&globalJobIDSequence, 1) + // Include PID to avoid collisions when multiple test binaries share the same StateRoot. + return strconv.Itoa(os.Getpid()) + "-" + strconv.FormatUint(id, 10) +} + +func boolToIntString(v bool) string { + if v { + return "1" + } + return "0" +} + +// rootSetupScript runs inside the unshared mount/user namespace and prepares a +// scratch-like root that still has enough runtime bits for common language +// toolchains. +var rootSetupScript = strings.TrimSpace(` +ROOT="$1" +WORK="$2" +READONLY="$3" +shift 3 + +mount_if_exists() { + SRC="$1" + DST="$2" + if [ -e "$SRC" ]; then + mkdir -p "$DST" + mount --rbind "$SRC" "$DST" + fi +} + +remount_readonly_if_exists() { + DST="$1" + if [ -e "$DST" ]; then + mount -o remount,bind,ro "$DST" || true + fi +} + +mount --make-rprivate / +mkdir -p "$ROOT" "$ROOT/work" "$ROOT/proc" "$ROOT/tmp" +mount_if_exists /usr "$ROOT/usr" +mount_if_exists /bin "$ROOT/bin" +mount_if_exists /sbin "$ROOT/sbin" +mount_if_exists /lib "$ROOT/lib" +mount_if_exists /lib64 "$ROOT/lib64" +mount_if_exists /run "$ROOT/run" +mount_if_exists /nix "$ROOT/nix" +mount_if_exists "$WORK" "$ROOT/work" +mount -t proc proc "$ROOT/proc" + +if [ "$READONLY" = "1" ]; then + remount_readonly_if_exists "$ROOT/usr" + remount_readonly_if_exists "$ROOT/bin" + remount_readonly_if_exists "$ROOT/sbin" + remount_readonly_if_exists "$ROOT/lib" + remount_readonly_if_exists "$ROOT/lib64" + remount_readonly_if_exists "$ROOT/run" + remount_readonly_if_exists "$ROOT/nix" +fi + +exec chroot "$ROOT" /bin/sh -ceu 'cd /work; exec "$@"' sandbox-cmd "$@" +`) + +func (r *Runner) String() string { + return fmt.Sprintf("linux.Runner{strict=%v, unshare=%q}", r.opts.Strict, r.opts.UnshareBin) +} diff --git a/engine/sandbox/linux/runner_test.go b/engine/sandbox/linux/runner_test.go new file mode 100644 index 0000000..194d131 --- /dev/null +++ b/engine/sandbox/linux/runner_test.go @@ -0,0 +1,189 @@ +package linux + +import ( + "errors" + "os/exec" + "strconv" + "strings" + "testing" + + "github.com/runner-x/runner-x/engine/sandbox" +) + +func TestRunner_RejectsEmptyCommand(t *testing.T) { + r := NewRunner() + _, err := r.Run(sandbox.SandboxInput{}, sandbox.SandboxPolicy{TimeoutSec: 1}) + if !errors.Is(err, ErrInvalidCommand) { + t.Fatalf("expected %v, got %v", ErrInvalidCommand, err) + } +} + +func TestLimitsFromPolicy(t *testing.T) { + got := limitsFromPolicy(sandbox.SandboxPolicy{ + CpuCores: 2, + MemoryBytes: 1024, + PidsMax: 8, + }) + + if got.CpuCores != 2 || got.MemoryBytes != 1024 || got.PidsMax != 8 { + t.Fatalf("unexpected cgroup limits: %+v", got) + } +} + +func TestNamespaces(t *testing.T) { + r := NewRunnerWithOptions(RunnerOptions{ + Strict: true, + }) + + out, err := r.Run( + sandbox.SandboxInput{ + Command: []string{"sh", "-c", "ps -o pid= | wc -l"}, + WorkDir: t.TempDir(), + }, + sandbox.SandboxPolicy{ + TimeoutSec: 3, + EnableNet: false, + ReadonlyRoot: false, + }, + ) + if err != nil { + skipIfIsolationUnsupported(t, err, out) + t.Fatalf("unexpected error: %v", err) + } + + countStr := strings.TrimSpace(out.Stdout) + count, parseErr := strconv.Atoi(countStr) + if parseErr != nil { + t.Fatalf("failed parsing process count %q: %v", countStr, parseErr) + } + // In a PID namespace with mount-proc, process count should be tiny. + if count > 16 { + t.Fatalf("expected isolated process view, got count=%d stdout=%q", count, out.Stdout) + } +} + +func TestFilesystem(t *testing.T) { + r := NewRunnerWithOptions(RunnerOptions{ + Strict: true, + }) + + out, err := r.Run( + sandbox.SandboxInput{ + Command: []string{"sh", "-c", "if [ -f /etc/shadow ]; then echo found; else echo missing; fi"}, + WorkDir: t.TempDir(), + }, + sandbox.SandboxPolicy{ + TimeoutSec: 3, + EnableNet: false, + ReadonlyRoot: true, + }, + ) + if err != nil { + skipIfIsolationUnsupported(t, err, out) + t.Fatalf("unexpected error: %v", err) + } + + if strings.TrimSpace(out.Stdout) != "missing" { + t.Fatalf("expected /etc/shadow to be hidden in readonly root, got stdout=%q stderr=%q", out.Stdout, out.Stderr) + } +} + +func TestCgroupLimits(t *testing.T) { + r := NewRunnerWithOptions(RunnerOptions{ + Strict: true, + }) + + _, err := r.Run( + sandbox.SandboxInput{ + Command: []string{"sh", "-c", "i=0; while [ $i -lt 64 ]; do (sleep 1)& i=$((i+1)); done; wait"}, + WorkDir: t.TempDir(), + }, + sandbox.SandboxPolicy{ + TimeoutSec: 3, + EnableNet: false, + ReadonlyRoot: false, + PidsMax: 16, + }, + ) + + // In delegated environments this should error due pids limit. + // On hosts without cgroup write permissions, skip. + if err == nil { + return + } + if isCgroupPermissionErr(err) { + t.Skipf("skipping cgroup enforcement test: %v", err) + } +} + +func skipIfIsolationUnsupported(t *testing.T, err error, out *sandbox.SandboxOutput) { + t.Helper() + msg := err.Error() + if out != nil { + msg += " " + out.Stderr + } + if strings.Contains(msg, "Operation not permitted") || + strings.Contains(msg, "cannot open /proc/self/uid_map") || + strings.Contains(msg, "permission denied") || + strings.Contains(msg, "not found") { + t.Skipf("skipping due host sandbox capability limits: %v", err) + } +} + +func isCgroupPermissionErr(err error) bool { + if err == nil { + return false + } + msg := strings.ToLower(err.Error()) + return strings.Contains(msg, "cgroup") && + (strings.Contains(msg, "permission denied") || + strings.Contains(msg, "read-only") || + strings.Contains(msg, "operation not permitted")) +} + +func TestRunnerNoDirectFallbackWhenNotStrict(t *testing.T) { + // Force unshare lookup failure and validate fail-closed behavior. + r := NewRunnerWithOptions(RunnerOptions{ + Strict: false, + UnshareBin: "definitely-not-a-real-unshare-bin", + }) + _, err := r.Run( + sandbox.SandboxInput{ + Command: []string{"echo", "hello"}, + WorkDir: t.TempDir(), + }, + sandbox.SandboxPolicy{ + TimeoutSec: 2, + EnableNet: false, + ReadonlyRoot: false, + }, + ) + if err == nil { + t.Fatalf("expected sandbox run to fail when isolation setup fails") + } +} + +func TestRunnerStrictNoFallback(t *testing.T) { + r := NewRunnerWithOptions(RunnerOptions{ + Strict: true, + UnshareBin: "definitely-not-a-real-unshare-bin", + }) + _, err := r.Run( + sandbox.SandboxInput{ + Command: []string{"echo", "hello"}, + WorkDir: t.TempDir(), + }, + sandbox.SandboxPolicy{ + TimeoutSec: 2, + EnableNet: false, + ReadonlyRoot: false, + }, + ) + if err == nil { + t.Fatalf("expected strict mode to fail when unshare binary is unavailable") + } + var execErr *exec.Error + if !errors.As(err, &execErr) { + t.Fatalf("expected exec error, got: %T %v", err, err) + } +} diff --git a/engine/sandbox/log.go b/engine/sandbox/log.go new file mode 100644 index 0000000..bad3d5b --- /dev/null +++ b/engine/sandbox/log.go @@ -0,0 +1,8 @@ +package sandbox + +import "github.com/runner-x/runner-x/util/print" + +// DebugPrintf reuses the existing project debug logger for sandbox packages. +func DebugPrintf(format string, args ...interface{}) { + print.DebugPrintf(format, args...) +} diff --git a/engine/sandbox/types.go b/engine/sandbox/types.go new file mode 100644 index 0000000..80ace87 --- /dev/null +++ b/engine/sandbox/types.go @@ -0,0 +1,30 @@ +package sandbox + +// SandboxPolicy captures runtime restrictions for a sandboxed command. +type SandboxPolicy struct { + CpuCores int + MemoryBytes int64 + PidsMax int + TimeoutSec int + EnableNet bool + ReadonlyRoot bool +} + +// SandboxInput describes what to execute inside the sandbox. +type SandboxInput struct { + SourceFiles map[string][]byte + WorkDir string + Command []string +} + +// SandboxOutput is the normalized command result returned to callers. +type SandboxOutput struct { + Stdout string + Stderr string + ExitCode int +} + +// SandboxRunner is the primary abstraction for isolated execution. +type SandboxRunner interface { + Run(input SandboxInput, policy SandboxPolicy) (*SandboxOutput, error) +} diff --git a/engine/sandbox/types_test.go b/engine/sandbox/types_test.go new file mode 100644 index 0000000..b519b75 --- /dev/null +++ b/engine/sandbox/types_test.go @@ -0,0 +1,55 @@ +package sandbox + +import "testing" + +type fakeRunner struct { + out *SandboxOutput + err error + lastInput SandboxInput + lastPol SandboxPolicy +} + +func (f *fakeRunner) Run(input SandboxInput, policy SandboxPolicy) (*SandboxOutput, error) { + f.lastInput = input + f.lastPol = policy + return f.out, f.err +} + +func TestSandboxRunnerContract(t *testing.T) { + r := &fakeRunner{ + out: &SandboxOutput{ + Stdout: "ok", + Stderr: "", + ExitCode: 0, + }, + } + + input := SandboxInput{ + SourceFiles: map[string][]byte{"run.py": []byte("print('ok')")}, + WorkDir: "/work", + Command: []string{"python3", "run.py"}, + } + + policy := SandboxPolicy{ + CpuCores: 1, + MemoryBytes: 64 << 20, + PidsMax: 16, + TimeoutSec: 2, + EnableNet: false, + ReadonlyRoot: true, + } + + out, err := r.Run(input, policy) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if out == nil || out.Stdout != "ok" || out.ExitCode != 0 { + t.Fatalf("unexpected output: %+v", out) + } + if r.lastInput.WorkDir != "/work" { + t.Fatalf("unexpected recorded workdir: %q", r.lastInput.WorkDir) + } + if r.lastPol.PidsMax != 16 { + t.Fatalf("unexpected recorded pids limit: %d", r.lastPol.PidsMax) + } +}