From b6c69ab9941fe9353764af15d74db439bd6046e2 Mon Sep 17 00:00:00 2001 From: Juan Antonio Osorio Date: Mon, 30 Mar 2026 11:30:59 +0300 Subject: [PATCH 1/5] Track runtime ownership to prevent cross-runtime status corruption When multiple runtimes coexist (e.g., podman and go-microvm), running CLI commands with one runtime active permanently corrupts the status files of workloads managed by the other runtime, marking them as "unhealthy." This happens because the reconciliation logic assumes all workloads belong to the single active runtime. Add a RuntimeName field to RunConfig that records which runtime created a workload, and a Name() method to the Runtime interface so the active runtime can identify itself. During reconciliation, skip workloads whose owning runtime differs from the active one. Legacy workloads without a RuntimeName are conservatively treated as owned by the active runtime. Fixes #4432 Co-Authored-By: Claude Opus 4.6 (1M context) --- cmd/thv/app/run.go | 5 + pkg/api/v1/workload_service.go | 3 + pkg/api/v1/workloads_test.go | 3 + pkg/container/docker/client.go | 5 + pkg/container/kubernetes/client.go | 5 + pkg/container/runtime/mocks/mock_runtime.go | 14 +++ pkg/container/runtime/types.go | 5 + pkg/mcp/server/run_server.go | 9 +- pkg/runner/config.go | 5 + pkg/workloads/statuses/file_status.go | 44 +++++++ pkg/workloads/statuses/file_status_test.go | 124 ++++++++++++++++++++ 11 files changed, 221 insertions(+), 1 deletion(-) diff --git a/cmd/thv/app/run.go b/cmd/thv/app/run.go index 04bcc1598d..a46ba70fb1 100644 --- a/cmd/thv/app/run.go +++ b/cmd/thv/app/run.go @@ -243,6 +243,10 @@ func runSingleServer(ctx context.Context, runFlags *RunFlags, serverOrImage stri return err } + // Record which runtime owns this workload so that reconciliation logic + // does not corrupt its status file when a different runtime is active. + runnerConfig.RuntimeName = rt.Name() + // Enforce policy in the main process before saving state or spawning a // detached worker, so violations surface synchronously with a non-zero // exit code rather than silently failing in the background log. @@ -469,6 +473,7 @@ func runFromConfigFile(ctx context.Context) error { // Set the runtime in the config runConfig.Deployer = rt + runConfig.RuntimeName = rt.Name() // Create workload manager workloadManager, err := workloads.NewManagerFromRuntime(rt) diff --git a/pkg/api/v1/workload_service.go b/pkg/api/v1/workload_service.go index e0d899f8ec..117dc098d3 100644 --- a/pkg/api/v1/workload_service.go +++ b/pkg/api/v1/workload_service.go @@ -100,6 +100,9 @@ func (s *WorkloadService) CreateWorkloadFromRequest(ctx context.Context, req *cr return nil, err } + // Record which runtime owns this workload for cross-runtime reconciliation. + runConfig.RuntimeName = s.containerRuntime.Name() + // Enforce policy before saving state or starting the workload, so // violations are returned as API errors rather than creating the server // in a broken state. diff --git a/pkg/api/v1/workloads_test.go b/pkg/api/v1/workloads_test.go index 6091879908..b647218d7d 100644 --- a/pkg/api/v1/workloads_test.go +++ b/pkg/api/v1/workloads_test.go @@ -261,6 +261,7 @@ func TestCreateWorkload(t *testing.T) { mockRuntime := runtimemocks.NewMockRuntime(ctrl) mockGroupManager := groupsmocks.NewMockManager(ctrl) + mockRuntime.EXPECT().Name().Return("docker").AnyTimes() tt.setupMock(t, mockWorkloadManager, mockRuntime, mockGroupManager) expectedServerOrImage := tt.expectedServerOrImage if expectedServerOrImage == "" { @@ -281,6 +282,7 @@ func TestCreateWorkload(t *testing.T) { workloadService: &WorkloadService{ groupManager: mockGroupManager, workloadManager: mockWorkloadManager, + containerRuntime: mockRuntime, imageRetriever: mockRetriever, imagePuller: func(_ context.Context, _ string) error { return nil }, configProvider: config.NewDefaultProvider(), @@ -502,6 +504,7 @@ func TestUpdateWorkload(t *testing.T) { workloadService: &WorkloadService{ groupManager: mockGroupManager, workloadManager: mockWorkloadManager, + containerRuntime: mockRuntime, imageRetriever: mockRetriever, imagePuller: func(_ context.Context, _ string) error { return nil }, configProvider: config.NewDefaultProvider(), diff --git a/pkg/container/docker/client.go b/pkg/container/docker/client.go index ea5ef8fe88..b4b02605fd 100644 --- a/pkg/container/docker/client.go +++ b/pkg/container/docker/client.go @@ -651,6 +651,11 @@ func (c *Client) AttachToWorkload(ctx context.Context, workloadName string) (io. return resp.Conn, stdoutReader, nil } +// Name returns the registered name of this runtime. +func (*Client) Name() string { + return RuntimeName +} + // IsRunning checks the health of the container runtime. // This is used to verify that the runtime is operational and can manage workloads. func (c *Client) IsRunning(ctx context.Context) error { diff --git a/pkg/container/kubernetes/client.go b/pkg/container/kubernetes/client.go index 0b6d72cd09..405e58493e 100644 --- a/pkg/container/kubernetes/client.go +++ b/pkg/container/kubernetes/client.go @@ -787,6 +787,11 @@ func (*Client) StopWorkload(_ context.Context, _ string) error { return nil } +// Name returns the registered name of this runtime. +func (*Client) Name() string { + return RuntimeName +} + // IsRunning checks the health of the container runtime. // This is used to verify that the runtime is operational and can manage workloads. func (c *Client) IsRunning(ctx context.Context) error { diff --git a/pkg/container/runtime/mocks/mock_runtime.go b/pkg/container/runtime/mocks/mock_runtime.go index b890ea1f3f..1141c4b467 100644 --- a/pkg/container/runtime/mocks/mock_runtime.go +++ b/pkg/container/runtime/mocks/mock_runtime.go @@ -232,6 +232,20 @@ func (mr *MockRuntimeMockRecorder) ListWorkloads(ctx any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ListWorkloads", reflect.TypeOf((*MockRuntime)(nil).ListWorkloads), ctx) } +// Name mocks base method. +func (m *MockRuntime) Name() string { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Name") + ret0, _ := ret[0].(string) + return ret0 +} + +// Name indicates an expected call of Name. +func (mr *MockRuntimeMockRecorder) Name() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Name", reflect.TypeOf((*MockRuntime)(nil).Name)) +} + // RemoveWorkload mocks base method. func (m *MockRuntime) RemoveWorkload(ctx context.Context, workloadName string) error { m.ctrl.T.Helper() diff --git a/pkg/container/runtime/types.go b/pkg/container/runtime/types.go index b65c518820..1f96ec0fba 100644 --- a/pkg/container/runtime/types.go +++ b/pkg/container/runtime/types.go @@ -180,6 +180,11 @@ type Runtime interface { // IsRunning checks the health of the container runtime. // This is used to verify that the runtime is operational and can manage workloads. IsRunning(ctx context.Context) error + + // Name returns the registered name of this runtime (e.g., "docker", "kubernetes"). + // Used to track which runtime owns a workload so that reconciliation logic + // does not corrupt status files of workloads managed by a different runtime. + Name() string } // Monitor defines the interface for container monitoring diff --git a/pkg/mcp/server/run_server.go b/pkg/mcp/server/run_server.go index cc1f542fd8..2789cc1238 100644 --- a/pkg/mcp/server/run_server.go +++ b/pkg/mcp/server/run_server.go @@ -169,7 +169,14 @@ func buildServerConfig( // Build the configuration envVarValidator := &runner.DetachedEnvVarValidator{} - return runner.NewRunConfigBuilder(ctx, imageMetadata, envVars, envVarValidator, opts...) + cfg, err := runner.NewRunConfigBuilder(ctx, imageMetadata, envVars, envVarValidator, opts...) + if err != nil { + return nil, err + } + + // Record which runtime owns this workload for cross-runtime reconciliation. + cfg.RuntimeName = rt.Name() + return cfg, nil } // configureTransport sets up transport configuration from metadata diff --git a/pkg/runner/config.go b/pkg/runner/config.go index 399cff3729..0265282470 100644 --- a/pkg/runner/config.go +++ b/pkg/runner/config.go @@ -55,6 +55,11 @@ type RunConfig struct { // means unversioned (backward-compat with older operators, or non-operator callers). MCPServerGeneration int64 `json:"mcpserver_generation,omitempty" yaml:"mcpserver_generation,omitempty"` + // RuntimeName is the registered name of the container runtime that owns this + // workload (e.g., "docker", "kubernetes"). Used during reconciliation to avoid + // corrupting status files of workloads managed by a different runtime. + RuntimeName string `json:"runtime_name,omitempty" yaml:"runtime_name,omitempty"` + // Image is the Docker image to run Image string `json:"image" yaml:"image"` diff --git a/pkg/workloads/statuses/file_status.go b/pkg/workloads/statuses/file_status.go index 92350fd6a8..c42d172c2f 100644 --- a/pkg/workloads/statuses/file_status.go +++ b/pkg/workloads/statuses/file_status.go @@ -74,6 +74,38 @@ type fileStatusManager struct { runConfigStore state.Store } +// isOwnedByActiveRuntime checks whether the named workload was created by the +// currently active runtime. It loads the runtime_name field from the persisted +// RunConfig and compares it against f.runtime.Name(). Legacy workloads that +// predate the runtime_name field (empty string) are conservatively treated as +// owned by the active runtime so that existing validation behaviour is preserved. +func (f *fileStatusManager) isOwnedByActiveRuntime(ctx context.Context, workloadName string) bool { + reader, err := f.runConfigStore.GetReader(ctx, workloadName) + if err != nil { + // RunConfig missing or unreadable -- assume ours so we don't silently + // skip validation for workloads we should be checking. + return true + } + defer func() { + if err := reader.Close(); err != nil { + slog.Warn("failed to close reader", "error", err) + } + }() + + var config struct { + RuntimeName string `json:"runtime_name"` + } + if err := json.NewDecoder(reader).Decode(&config); err != nil { + return true // can't determine ownership -- assume ours + } + + // Empty means legacy workload; treat as owned by active runtime. + if config.RuntimeName == "" { + return true + } + return config.RuntimeName == f.runtime.Name() +} + // isRemoteWorkload checks if a workload is remote by attempting to load its run configuration // and checking if it has a RemoteURL field set. // TODO: This is a temporary solution to check if a workload is remote @@ -910,6 +942,12 @@ func (f *fileStatusManager) validateRunningWorkload( return result, nil } + // Skip validation for workloads owned by a different runtime to avoid + // corrupting their status files (see #4432). + if !f.isOwnedByActiveRuntime(ctx, workloadName) { + return result, nil + } + // Get raw container info from runtime (before label filtering) containerInfo, err := f.runtime.GetWorkloadInfo(ctx, workloadName) if err != nil { @@ -964,6 +1002,12 @@ func (f *fileStatusManager) handleRuntimeMissing( return fileWorkload, nil } + // Skip reconciliation for workloads owned by a different runtime to avoid + // corrupting their status files (see #4432). + if !f.isOwnedByActiveRuntime(ctx, workloadName) { + return fileWorkload, nil + } + if fileWorkload.Status == rt.WorkloadStatusRunning || fileWorkload.Status == rt.WorkloadStatusStopped { // The workload cannot be running or stopped if the runtime container is not found contextMsg := fmt.Sprintf("workload %s not found in runtime, marking as unhealthy", workloadName) diff --git a/pkg/workloads/statuses/file_status_test.go b/pkg/workloads/statuses/file_status_test.go index de2c85bc85..cc2277bfd8 100644 --- a/pkg/workloads/statuses/file_status_test.go +++ b/pkg/workloads/statuses/file_status_test.go @@ -2229,3 +2229,127 @@ func TestJSONRecovery_MultipleExtraClosingBraces(t *testing.T) { assert.Equal(t, rt.WorkloadStatusRunning, statusFile.Status) assert.Equal(t, 12345, statusFile.ProcessID) } + +func TestFileStatusManager_CrossRuntimeProtection(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + // runtimeName is what the active runtime reports via Name() + runtimeName string + // runConfigJSON is the JSON returned by GetReader for the workload's RunConfig. + // It controls what runtime_name the workload was created with. + runConfigJSON string + // operation selects which code path to exercise: "list" or "get" + operation string + // expectUnhealthy indicates whether we expect the workload to be marked unhealthy + expectUnhealthy bool + // expectRuntimeGetWorkloadInfo indicates whether GetWorkloadInfo should be called (for "get" operation) + expectRuntimeGetWorkloadInfo bool + }{ + { + name: "workload owned by different runtime is not marked unhealthy during list", + runtimeName: "docker", + runConfigJSON: `{"runtime_name": "go-microvm"}`, + operation: "list", + expectUnhealthy: false, + }, + { + name: "workload owned by active runtime IS marked unhealthy when missing from runtime", + runtimeName: "docker", + runConfigJSON: `{"runtime_name": "docker"}`, + operation: "list", + expectUnhealthy: true, + }, + { + name: "legacy workload without runtime_name IS validated normally", + runtimeName: "docker", + runConfigJSON: `{"name": "test-workload"}`, + operation: "list", + expectUnhealthy: true, + }, + { + name: "workload owned by different runtime is not validated during GetWorkload", + runtimeName: "docker", + runConfigJSON: `{"runtime_name": "go-microvm"}`, + operation: "get", + expectUnhealthy: false, + expectRuntimeGetWorkloadInfo: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + manager, mockRuntime, mockRunConfigStore := newTestFileStatusManager(t, ctrl) + ctx := t.Context() + + workloadName := "test-workload" + + // Create a running workload status file + err := manager.SetWorkloadStatus(ctx, workloadName, rt.WorkloadStatusRunning, "container started") + require.NoError(t, err) + + // Mock runtime name + mockRuntime.EXPECT().Name().Return(tt.runtimeName).AnyTimes() + + // Mock run config store -- returns the test-specific RunConfig JSON. + // Both isRemoteWorkload (via getWorkloadsFromFiles) and isOwnedByActiveRuntime + // read from GetReader, so we use AnyTimes with a fresh reader per call. + mockRunConfigStore.EXPECT().Exists(gomock.Any(), workloadName).Return(true, nil).AnyTimes() + mockRunConfigStore.EXPECT().GetReader(gomock.Any(), workloadName).DoAndReturn(func(context.Context, string) (io.ReadCloser, error) { + return io.NopCloser(strings.NewReader(tt.runConfigJSON)), nil + }).AnyTimes() + + switch tt.operation { + case "list": + // Runtime returns no containers -- workload exists only in file + mockRuntime.EXPECT().ListWorkloads(gomock.Any()).Return([]rt.ContainerInfo{}, nil) + + workloads, listErr := manager.ListWorkloads(ctx, true, nil) + require.NoError(t, listErr) + require.Len(t, workloads, 1, "expected exactly one workload in list") + + if tt.expectUnhealthy { + assert.Equal(t, rt.WorkloadStatusUnhealthy, workloads[0].Status, + "workload should be marked unhealthy") + } else { + assert.Equal(t, rt.WorkloadStatusRunning, workloads[0].Status, + "workload should remain running (cross-runtime protection)") + + // Verify the status file on disk was NOT modified + statusFilePath := filepath.Join(manager.baseDir, workloadName+".json") + data, readErr := os.ReadFile(statusFilePath) + require.NoError(t, readErr) + + var statusFile workloadStatusFile + require.NoError(t, json.Unmarshal(data, &statusFile)) + assert.Equal(t, rt.WorkloadStatusRunning, statusFile.Status, + "status file on disk should still show running") + } + + case "get": + if tt.expectRuntimeGetWorkloadInfo { + mockRuntime.EXPECT().GetWorkloadInfo(gomock.Any(), workloadName).Return(rt.ContainerInfo{}, errors.New("not found")) + } + // If we do NOT expect GetWorkloadInfo to be called, the mock controller + // will fail the test if it is called unexpectedly. + + workload, getErr := manager.GetWorkload(ctx, workloadName) + require.NoError(t, getErr) + + if tt.expectUnhealthy { + assert.Equal(t, rt.WorkloadStatusUnhealthy, workload.Status, + "workload should be marked unhealthy") + } else { + assert.Equal(t, rt.WorkloadStatusRunning, workload.Status, + "workload should remain running (cross-runtime protection)") + } + } + }) + } +} From 85babb60bf0391789369e9ef50dd65dafe0c28bd Mon Sep 17 00:00:00 2001 From: Juan Antonio Osorio Date: Mon, 30 Mar 2026 11:45:04 +0300 Subject: [PATCH 2/5] Regenerate swagger docs for RuntimeName field Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/server/docs.go | 4 ++++ docs/server/swagger.json | 4 ++++ docs/server/swagger.yaml | 6 ++++++ 3 files changed, 14 insertions(+) diff --git a/docs/server/docs.go b/docs/server/docs.go index 9ca280736f..5ae175529b 100644 --- a/docs/server/docs.go +++ b/docs/server/docs.go @@ -1404,6 +1404,10 @@ const docTemplate = `{ "runtime_config": { "$ref": "#/components/schemas/github_com_stacklok_toolhive_pkg_container_templates.RuntimeConfig" }, + "runtime_name": { + "description": "RuntimeName is the registered name of the container runtime that owns this\nworkload (e.g., \"docker\", \"kubernetes\"). Used during reconciliation to avoid\ncorrupting status files of workloads managed by a different runtime.", + "type": "string" + }, "scaling_config": { "$ref": "#/components/schemas/github_com_stacklok_toolhive_pkg_runner.ScalingConfig" }, diff --git a/docs/server/swagger.json b/docs/server/swagger.json index 8127a52c45..a3967d0be1 100644 --- a/docs/server/swagger.json +++ b/docs/server/swagger.json @@ -1397,6 +1397,10 @@ "runtime_config": { "$ref": "#/components/schemas/github_com_stacklok_toolhive_pkg_container_templates.RuntimeConfig" }, + "runtime_name": { + "description": "RuntimeName is the registered name of the container runtime that owns this\nworkload (e.g., \"docker\", \"kubernetes\"). Used during reconciliation to avoid\ncorrupting status files of workloads managed by a different runtime.", + "type": "string" + }, "scaling_config": { "$ref": "#/components/schemas/github_com_stacklok_toolhive_pkg_runner.ScalingConfig" }, diff --git a/docs/server/swagger.yaml b/docs/server/swagger.yaml index 6bbded4bce..e166553ba9 100644 --- a/docs/server/swagger.yaml +++ b/docs/server/swagger.yaml @@ -1406,6 +1406,12 @@ components: type: string runtime_config: $ref: '#/components/schemas/github_com_stacklok_toolhive_pkg_container_templates.RuntimeConfig' + runtime_name: + description: |- + RuntimeName is the registered name of the container runtime that owns this + workload (e.g., "docker", "kubernetes"). Used during reconciliation to avoid + corrupting status files of workloads managed by a different runtime. + type: string scaling_config: $ref: '#/components/schemas/github_com_stacklok_toolhive_pkg_runner.ScalingConfig' schema_version: From 094fb1167deb0e7c3e16f7ac9d133d714e8d38ff Mon Sep 17 00:00:00 2001 From: Juan Antonio Osorio Date: Tue, 31 Mar 2026 07:17:22 +0000 Subject: [PATCH 3/5] Migrate legacy workloads to stamp runtime ownership on healthy reconciliation Legacy workloads created before runtime ownership tracking have an empty RuntimeName field. Rather than leaving them permanently in the legacy state, opportunistically stamp them with the active runtime's name when reconciliation confirms the workload is healthy (container running, proxy OK). This ensures the cross-runtime protection from the previous commit covers existing workloads progressively. Also adds a clarifying comment on the Remote guard in handleRuntimeMissing per reviewer feedback, and improves diagnostics with a DEBUG log when GetReader fails during migration. Co-Authored-By: Claude Opus 4.6 (1M context) --- pkg/workloads/statuses/file_status.go | 68 ++++++++- pkg/workloads/statuses/file_status_test.go | 160 +++++++++++++++++++++ 2 files changed, 224 insertions(+), 4 deletions(-) diff --git a/pkg/workloads/statuses/file_status.go b/pkg/workloads/statuses/file_status.go index c42d172c2f..b12a86de79 100644 --- a/pkg/workloads/statuses/file_status.go +++ b/pkg/workloads/statuses/file_status.go @@ -106,6 +106,61 @@ func (f *fileStatusManager) isOwnedByActiveRuntime(ctx context.Context, workload return config.RuntimeName == f.runtime.Name() } +// migrateRuntimeName stamps a legacy workload (RuntimeName == "") with the +// active runtime's name. This is called only after the runtime has confirmed the +// workload is healthy, so we know with certainty it belongs to the active +// runtime. Save failures are non-fatal — the migration will be retried on the +// next reconciliation cycle. +func (f *fileStatusManager) migrateRuntimeName(ctx context.Context, workloadName string) { + reader, err := f.runConfigStore.GetReader(ctx, workloadName) + if err != nil { + slog.Debug("skipping runtime name migration: cannot read config", "workload", workloadName, "error", err) + return + } + + var raw map[string]json.RawMessage + decodeErr := json.NewDecoder(reader).Decode(&raw) + // Always close the reader before acting on the decode result. + if closeErr := reader.Close(); closeErr != nil { + slog.Warn("failed to close reader for runtime name migration", "workload", workloadName, "error", closeErr) + } + if decodeErr != nil { + return + } + + // Check if runtime_name is already set + if rn, ok := raw["runtime_name"]; ok { + var name string + if err := json.Unmarshal(rn, &name); err == nil && name != "" { + return // Already migrated + } + } + + // Stamp with the active runtime name + runtimeBytes, err := json.Marshal(f.runtime.Name()) + if err != nil { + return + } + raw["runtime_name"] = runtimeBytes + + writer, err := f.runConfigStore.GetWriter(ctx, workloadName) + if err != nil { + slog.Warn("failed to open writer for runtime name migration", "workload", workloadName, "error", err) + return + } + defer func() { + if err := writer.Close(); err != nil { + slog.Warn("failed to close writer for runtime name migration", "workload", workloadName, "error", err) + } + }() + + encoder := json.NewEncoder(writer) + encoder.SetIndent("", " ") + if err := encoder.Encode(raw); err != nil { + slog.Warn("failed to write migrated RunConfig", "workload", workloadName, "error", err) + } +} + // isRemoteWorkload checks if a workload is remote by attempting to load its run configuration // and checking if it has a RemoteURL field set. // TODO: This is a temporary solution to check if a workload is remote @@ -964,7 +1019,12 @@ func (f *fileStatusManager) validateRunningWorkload( return unhealthyWorkload, nil } - // Runtime and proxy confirm workload is healthy - merge runtime data with file status + // Runtime and proxy confirm workload is healthy — opportunistically migrate + // legacy workloads that predate the runtime_name field so they are stamped + // with the owning runtime for future reconciliation cycles. + f.migrateRuntimeName(ctx, workloadName) + + // Merge runtime data with file status return f.mergeHealthyWorkloadData(containerInfo, result) } @@ -995,10 +1055,10 @@ func (f *fileStatusManager) handleRuntimeMismatch( func (f *fileStatusManager) handleRuntimeMissing( ctx context.Context, workloadName string, fileWorkload core.Workload, ) (core.Workload, error) { - // Check if this is a remote workload using the Remote field + // Remote workloads don't exist in the container runtime, so it's normal + // for them to be missing. This also means they bypass the runtime-ownership + // check below — remote workloads have no owning runtime. if fileWorkload.Remote { - // Remote workloads don't exist in the container runtime, so it's normal for them to be missing - // Don't mark them as unhealthy return fileWorkload, nil } diff --git a/pkg/workloads/statuses/file_status_test.go b/pkg/workloads/statuses/file_status_test.go index cc2277bfd8..d5012fd9bf 100644 --- a/pkg/workloads/statuses/file_status_test.go +++ b/pkg/workloads/statuses/file_status_test.go @@ -4,6 +4,7 @@ package statuses import ( + "bytes" "context" "encoding/json" "errors" @@ -233,6 +234,14 @@ func TestFileStatusManager_GetWorkload_FileAndRuntimeCombination(t *testing.T) { return io.NopCloser(strings.NewReader(`{"name": "running-workload", "transport": "sse"}`)), nil }).AnyTimes() + // Mock runtime name for isOwnedByActiveRuntime and migrateRuntimeName + mockRuntime.EXPECT().Name().Return("docker").AnyTimes() + + // Mock GetWriter for migrateRuntimeName (legacy workload with no runtime_name gets migrated) + mockRunConfigStore.EXPECT().GetWriter(gomock.Any(), "running-workload").DoAndReturn(func(context.Context, string) (io.WriteCloser, error) { + return nopWriteCloser{&bytes.Buffer{}}, nil + }).AnyTimes() + // Create a workload status file and set it to running err := manager.SetWorkloadStatus(ctx, "running-workload", rt.WorkloadStatusStarting, "") require.NoError(t, err) @@ -915,6 +924,14 @@ func TestFileStatusManager_GetWorkload_HealthyRunningWorkload(t *testing.T) { return io.NopCloser(strings.NewReader(`{"name": "healthy-workload", "transport": "sse"}`)), nil }).AnyTimes() + // Mock runtime name for isOwnedByActiveRuntime and migrateRuntimeName + mockRuntime.EXPECT().Name().Return("docker").AnyTimes() + + // Mock GetWriter for migrateRuntimeName (legacy workload with no runtime_name gets migrated) + mockRunConfigStore.EXPECT().GetWriter(gomock.Any(), "healthy-workload").DoAndReturn(func(context.Context, string) (io.WriteCloser, error) { + return nopWriteCloser{&bytes.Buffer{}}, nil + }).AnyTimes() + // Set the workload status to running in the file err := manager.SetWorkloadStatus(ctx, "healthy-workload", rt.WorkloadStatusRunning, "container started") require.NoError(t, err) @@ -964,6 +981,9 @@ func TestFileStatusManager_GetWorkload_ProxyNotRunning(t *testing.T) { return io.NopCloser(strings.NewReader(`{"name": "proxy-down-workload", "transport": "sse"}`)), nil }).AnyTimes() + // Mock runtime name for isOwnedByActiveRuntime + mockRuntime.EXPECT().Name().Return("docker").AnyTimes() + // First, create a status file manually to ensure file is found statusFile := workloadStatusFile{ Status: rt.WorkloadStatusRunning, @@ -1041,6 +1061,14 @@ func TestFileStatusManager_GetWorkload_HealthyWithProxy(t *testing.T) { return io.NopCloser(strings.NewReader(`{"name": "healthy-with-proxy", "transport": "sse"}`)), nil }).AnyTimes() + // Mock runtime name for isOwnedByActiveRuntime and migrateRuntimeName + mockRuntime.EXPECT().Name().Return("docker").AnyTimes() + + // Mock GetWriter for migrateRuntimeName (legacy workload with no runtime_name gets migrated) + mockRunConfigStore.EXPECT().GetWriter(gomock.Any(), "healthy-with-proxy").DoAndReturn(func(context.Context, string) (io.WriteCloser, error) { + return nopWriteCloser{&bytes.Buffer{}}, nil + }).AnyTimes() + // Set the workload status to running in the file err := manager.SetWorkloadStatus(ctx, "healthy-with-proxy", rt.WorkloadStatusRunning, "container started") require.NoError(t, err) @@ -2353,3 +2381,135 @@ func TestFileStatusManager_CrossRuntimeProtection(t *testing.T) { }) } } + +// nopWriteCloser wraps a bytes.Buffer with a no-op Close. +type nopWriteCloser struct { + *bytes.Buffer +} + +func (nopWriteCloser) Close() error { return nil } + +func TestMigrateRuntimeName(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + runtimeName string + runConfigJSON string + expectWrite bool + expectedRuntime string + getReaderErr bool + getWriterErr bool + }{ + { + name: "migrates empty runtime_name", + runtimeName: "docker", + runConfigJSON: `{"name": "my-workload", "image": "test:latest"}`, + expectWrite: true, + expectedRuntime: "docker", + }, + { + name: "migrates explicit empty runtime_name", + runtimeName: "docker", + runConfigJSON: `{"name": "my-workload", "runtime_name": "", "image": "test:latest"}`, + expectWrite: true, + expectedRuntime: "docker", + }, + { + name: "no-op when runtime_name already set", + runtimeName: "docker", + runConfigJSON: `{"name": "my-workload", "runtime_name": "docker", "image": "test:latest"}`, + expectWrite: false, + }, + { + name: "no-op when runtime_name set to different runtime", + runtimeName: "docker", + runConfigJSON: `{"name": "my-workload", "runtime_name": "go-microvm", "image": "test:latest"}`, + expectWrite: false, + }, + { + name: "graceful on GetReader error", + runtimeName: "docker", + getReaderErr: true, + expectWrite: false, + }, + { + name: "graceful on GetWriter error", + runtimeName: "docker", + runConfigJSON: `{"name": "my-workload"}`, + getWriterErr: true, + expectWrite: false, + }, + { + name: "graceful on malformed JSON", + runtimeName: "docker", + runConfigJSON: `{bad json`, + expectWrite: false, + }, + { + name: "migrates non-string runtime_name", + runtimeName: "docker", + runConfigJSON: `{"name": "my-workload", "runtime_name": 42}`, + expectWrite: true, + expectedRuntime: "docker", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + manager, mockRuntime, mockRunConfigStore := newTestFileStatusManager(t, ctrl) + ctx := t.Context() + + workloadName := "test-workload" + mockRuntime.EXPECT().Name().Return(tt.runtimeName).AnyTimes() + + if tt.getReaderErr { + mockRunConfigStore.EXPECT().GetReader(gomock.Any(), workloadName). + Return(nil, errors.New("store error")) + } else { + mockRunConfigStore.EXPECT().GetReader(gomock.Any(), workloadName).DoAndReturn( + func(context.Context, string) (io.ReadCloser, error) { + return io.NopCloser(strings.NewReader(tt.runConfigJSON)), nil + }, + ) + } + + var writeBuf nopWriteCloser + if tt.expectWrite { + if tt.getWriterErr { + mockRunConfigStore.EXPECT().GetWriter(gomock.Any(), workloadName). + Return(nil, errors.New("write error")) + } else { + writeBuf = nopWriteCloser{&bytes.Buffer{}} + mockRunConfigStore.EXPECT().GetWriter(gomock.Any(), workloadName). + Return(writeBuf, nil) + } + } else if tt.getWriterErr { + mockRunConfigStore.EXPECT().GetWriter(gomock.Any(), workloadName). + Return(nil, errors.New("write error")) + } + + manager.migrateRuntimeName(ctx, workloadName) + + if tt.expectWrite && !tt.getWriterErr { + var written map[string]json.RawMessage + require.NoError(t, json.Unmarshal(writeBuf.Bytes(), &written)) + + var runtimeName string + require.NoError(t, json.Unmarshal(written["runtime_name"], &runtimeName)) + assert.Equal(t, tt.expectedRuntime, runtimeName, + "migrated runtime_name should match active runtime") + + // Verify other fields are preserved + var name string + require.NoError(t, json.Unmarshal(written["name"], &name)) + assert.Equal(t, "my-workload", name, "other fields should be preserved") + } + }) + } +} From d30f994811b73de294661aba241a225e244e86d2 Mon Sep 17 00:00:00 2001 From: Juan Antonio Osorio Date: Fri, 29 May 2026 11:17:37 +0000 Subject: [PATCH 4/5] Harden runtime-ownership check against ambiguity isOwnedByActiveRuntime could only ever deny ownership when it read a RunConfig whose runtime_name positively named a different runtime. Every ambiguous outcome must default to "owned" so we never silently suppress reconciliation for a workload we are responsible for. Check Exists() before reading so a genuinely-missing RunConfig is handled distinctly from a present-but-unparseable one, trim whitespace-only runtime_name to the legacy path, and document why the bias toward ownership is required. Run the eager policy gate before stamping RuntimeName so a policy denial never dereferences the runtime. Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/thv/app/run.go | 8 +- pkg/api/v1/workload_service.go | 6 +- pkg/workloads/statuses/file_status.go | 32 +++++-- pkg/workloads/statuses/file_status_test.go | 102 +++++++++++++++++++++ 4 files changed, 134 insertions(+), 14 deletions(-) diff --git a/cmd/thv/app/run.go b/cmd/thv/app/run.go index a46ba70fb1..a10f12949a 100644 --- a/cmd/thv/app/run.go +++ b/cmd/thv/app/run.go @@ -243,10 +243,6 @@ func runSingleServer(ctx context.Context, runFlags *RunFlags, serverOrImage stri return err } - // Record which runtime owns this workload so that reconciliation logic - // does not corrupt its status file when a different runtime is active. - runnerConfig.RuntimeName = rt.Name() - // Enforce policy in the main process before saving state or spawning a // detached worker, so violations surface synchronously with a non-zero // exit code rather than silently failing in the background log. @@ -254,6 +250,10 @@ func runSingleServer(ctx context.Context, runFlags *RunFlags, serverOrImage stri return fmt.Errorf("server creation blocked by policy: %w", err) } + // Record which runtime owns this workload so that reconciliation logic + // does not corrupt its status file when a different runtime is active. + runnerConfig.RuntimeName = rt.Name() + // Always save the run config to disk before starting (both foreground and detached modes) // NOTE: Save before secrets processing to avoid storing secrets in the state store if err := runnerConfig.SaveState(ctx); err != nil { diff --git a/pkg/api/v1/workload_service.go b/pkg/api/v1/workload_service.go index 117dc098d3..6ef324c9f1 100644 --- a/pkg/api/v1/workload_service.go +++ b/pkg/api/v1/workload_service.go @@ -100,9 +100,6 @@ func (s *WorkloadService) CreateWorkloadFromRequest(ctx context.Context, req *cr return nil, err } - // Record which runtime owns this workload for cross-runtime reconciliation. - runConfig.RuntimeName = s.containerRuntime.Name() - // Enforce policy before saving state or starting the workload, so // violations are returned as API errors rather than creating the server // in a broken state. @@ -110,6 +107,9 @@ func (s *WorkloadService) CreateWorkloadFromRequest(ctx context.Context, req *cr return nil, fmt.Errorf("server creation blocked by policy: %w", err) } + // Record which runtime owns this workload for cross-runtime reconciliation. + runConfig.RuntimeName = s.containerRuntime.Name() + // Save the workload state if err := runConfig.SaveState(ctx); err != nil { slog.Error("failed to save workload config", "error", err) diff --git a/pkg/workloads/statuses/file_status.go b/pkg/workloads/statuses/file_status.go index b12a86de79..0c61d81996 100644 --- a/pkg/workloads/statuses/file_status.go +++ b/pkg/workloads/statuses/file_status.go @@ -76,14 +76,30 @@ type fileStatusManager struct { // isOwnedByActiveRuntime checks whether the named workload was created by the // currently active runtime. It loads the runtime_name field from the persisted -// RunConfig and compares it against f.runtime.Name(). Legacy workloads that -// predate the runtime_name field (empty string) are conservatively treated as -// owned by the active runtime so that existing validation behaviour is preserved. +// RunConfig and compares it against f.runtime.Name(). +// +// Ownership can only be denied when we positively identify a *different* owning +// runtime: the RunConfig must exist, decode cleanly, and carry a non-empty +// runtime_name that differs from the active runtime. Every other case -- +// missing/unreadable config, decode failure, or a legacy workload that predates +// the runtime_name field (empty string) -- is conservatively treated as owned by +// the active runtime, preserving the pre-feature validation behaviour. This bias +// matters: a false "not ours" would silently suppress reconciliation for a +// workload we are responsible for, leaving it stuck in a stale status. func (f *fileStatusManager) isOwnedByActiveRuntime(ctx context.Context, workloadName string) bool { + // A workload with no RunConfig cannot have been tagged with another + // runtime's name, so it can only be ours (or a legacy workload). Checking + // existence explicitly lets us distinguish "no config" from "config present + // but cannot be parsed" and mirrors isRemoteWorkload's handling. + exists, err := f.runConfigStore.Exists(ctx, workloadName) + if err != nil || !exists { + return true + } + reader, err := f.runConfigStore.GetReader(ctx, workloadName) if err != nil { - // RunConfig missing or unreadable -- assume ours so we don't silently - // skip validation for workloads we should be checking. + // RunConfig unreadable -- assume ours so we don't silently skip + // validation for workloads we should be checking. return true } defer func() { @@ -99,10 +115,12 @@ func (f *fileStatusManager) isOwnedByActiveRuntime(ctx context.Context, workload return true // can't determine ownership -- assume ours } - // Empty means legacy workload; treat as owned by active runtime. - if config.RuntimeName == "" { + // Empty means legacy workload (predates runtime_name); treat as ours. + if strings.TrimSpace(config.RuntimeName) == "" { return true } + + // Only deny ownership when another runtime is positively identified. return config.RuntimeName == f.runtime.Name() } diff --git a/pkg/workloads/statuses/file_status_test.go b/pkg/workloads/statuses/file_status_test.go index d5012fd9bf..3823fdfc1b 100644 --- a/pkg/workloads/statuses/file_status_test.go +++ b/pkg/workloads/statuses/file_status_test.go @@ -2389,6 +2389,108 @@ type nopWriteCloser struct { func (nopWriteCloser) Close() error { return nil } +// TestIsOwnedByActiveRuntime exercises every branch of the ownership check. +// Ownership may only be denied when a *different* runtime is positively +// identified; every ambiguous case must default to "owned" so we never silently +// suppress reconciliation for a workload we are responsible for. +func TestIsOwnedByActiveRuntime(t *testing.T) { + t.Parallel() + + const workloadName = "test-workload" + const activeRuntime = "docker" + + tests := []struct { + name string + // existsResult/existsErr control the runConfigStore.Exists mock. + existsResult bool + existsErr error + // when the config exists, getReaderErr or runConfigJSON drive GetReader. + getReaderErr error + runConfigJSON string + expectOwned bool + }{ + { + name: "owned by active runtime", + existsResult: true, + runConfigJSON: `{"runtime_name": "docker"}`, + expectOwned: true, + }, + { + name: "owned by a different runtime is not ours", + existsResult: true, + runConfigJSON: `{"runtime_name": "go-microvm"}`, + expectOwned: false, + }, + { + name: "legacy workload with empty runtime_name is assumed ours", + existsResult: true, + runConfigJSON: `{"name": "test-workload"}`, + expectOwned: true, + }, + { + name: "whitespace-only runtime_name is treated as legacy", + existsResult: true, + runConfigJSON: `{"runtime_name": " "}`, + expectOwned: true, + }, + { + name: "missing run config is assumed ours", + existsResult: false, + expectOwned: true, + }, + { + name: "exists error is assumed ours", + existsErr: errors.New("store unavailable"), + expectOwned: true, + }, + { + name: "unreadable run config is assumed ours", + existsResult: true, + getReaderErr: errors.New("read error"), + expectOwned: true, + }, + { + name: "undecodable run config is assumed ours", + existsResult: true, + runConfigJSON: `{bad json`, + expectOwned: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + manager, mockRuntime, mockRunConfigStore := newTestFileStatusManager(t, ctrl) + ctx := t.Context() + + // Name() is only consulted when a non-empty runtime_name is compared. + mockRuntime.EXPECT().Name().Return(activeRuntime).AnyTimes() + + mockRunConfigStore.EXPECT().Exists(gomock.Any(), workloadName). + Return(tt.existsResult, tt.existsErr) + + // GetReader is only reached when the config exists and Exists succeeds. + if tt.existsErr == nil && tt.existsResult { + if tt.getReaderErr != nil { + mockRunConfigStore.EXPECT().GetReader(gomock.Any(), workloadName). + Return(nil, tt.getReaderErr) + } else { + mockRunConfigStore.EXPECT().GetReader(gomock.Any(), workloadName).DoAndReturn( + func(context.Context, string) (io.ReadCloser, error) { + return io.NopCloser(strings.NewReader(tt.runConfigJSON)), nil + }) + } + } + + assert.Equal(t, tt.expectOwned, manager.isOwnedByActiveRuntime(ctx, workloadName)) + }) + } +} + func TestMigrateRuntimeName(t *testing.T) { t.Parallel() From 8d7851a51c880e986e82c82b2b2db1a3cae57a97 Mon Sep 17 00:00:00 2001 From: Juan Antonio Osorio Date: Fri, 29 May 2026 11:26:00 +0000 Subject: [PATCH 5/5] Report concrete runtime type as ownership identity Client.Name() returned the static RuntimeName constant ("docker"), which the factory uses to register all Docker-API-compatible runtimes under one entry. Using it as the per-workload ownership identity collapsed Docker, Podman, and Colima into "docker", so a workload created under one could be claimed by another during status reconciliation -- the exact cross-runtime corruption this PR set out to prevent, still reachable between Docker and Podman. Return the concrete runtimeType detected at connection time so each runtime identifies itself distinctly. The stamp written at create time and the value compared at reconcile time both come from this method on the same runtime, so the ownership round-trip holds (a Podman workload stamps "podman" and is later recognized as Podman-owned). Released builds never persisted runtime_name, so all existing workloads take the empty/legacy "assume ours" path and are opportunistically migrated to their true runtime name on the next healthy reconcile -- no regression. Co-Authored-By: Claude Opus 4.8 (1M context) --- pkg/container/docker/client.go | 16 ++++-- pkg/container/docker/client_name_test.go | 62 ++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 pkg/container/docker/client_name_test.go diff --git a/pkg/container/docker/client.go b/pkg/container/docker/client.go index b4b02605fd..087832bd60 100644 --- a/pkg/container/docker/client.go +++ b/pkg/container/docker/client.go @@ -651,9 +651,19 @@ func (c *Client) AttachToWorkload(ctx context.Context, workloadName string) (io. return resp.Conn, stdoutReader, nil } -// Name returns the registered name of this runtime. -func (*Client) Name() string { - return RuntimeName +// Name returns the name of the concrete runtime this client is connected to +// (e.g. "docker", "podman", "colima"), as detected at connection time. +// +// This intentionally returns the detected runtimeType rather than the static +// RuntimeName constant used for factory registration. RuntimeName collapses all +// Docker-API-compatible runtimes under a single "docker" registration, but for +// workload-ownership tracking each concrete runtime must identify itself +// distinctly: Docker and Podman are separate daemons with separate containers, +// so a workload created under one must not be claimed by the other during +// status reconciliation. Returning the constant would let Podman-owned and +// Docker-owned workloads share the identity "docker" and corrupt each other. +func (c *Client) Name() string { + return string(c.runtimeType) } // IsRunning checks the health of the container runtime. diff --git a/pkg/container/docker/client_name_test.go b/pkg/container/docker/client_name_test.go new file mode 100644 index 0000000000..a2cac57468 --- /dev/null +++ b/pkg/container/docker/client_name_test.go @@ -0,0 +1,62 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package docker + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + rt "github.com/stacklok/toolhive/pkg/container/runtime" +) + +// TestClientName verifies that Name reports the concrete detected runtime type +// rather than the static factory-registration constant. Distinct identities are +// required so that workload-ownership tracking does not let Docker- and +// Podman-owned workloads (separate daemons) corrupt each other's status. +func TestClientName(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + runtimeType rt.Type + want string + }{ + {name: "docker", runtimeType: rt.TypeDocker, want: "docker"}, + {name: "podman", runtimeType: rt.TypePodman, want: "podman"}, + {name: "colima", runtimeType: rt.TypeColima, want: "colima"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + c := &Client{runtimeType: tt.runtimeType} + assert.Equal(t, tt.want, c.Name()) + }) + } +} + +// TestClientNameOwnershipRoundTrip asserts the property the cross-runtime +// protection relies on: the identity stamped at create time and the identity +// compared at reconcile time come from the same Name call, so a workload created +// under Podman is recognized as Podman-owned (and not as Docker-owned) later. +func TestClientNameOwnershipRoundTrip(t *testing.T) { + t.Parallel() + + podman := &Client{runtimeType: rt.TypePodman} + docker := &Client{runtimeType: rt.TypeDocker} + + // Identity recorded on the RunConfig when the workload is created. + stampedAtCreate := podman.Name() + + // Same Podman runtime later reconciling: must recognize ownership. + assert.Equal(t, stampedAtCreate, podman.Name(), + "podman must recognize its own workloads across calls") + + // A different runtime (Docker) reconciling the same workload: must NOT + // claim ownership, otherwise it would corrupt the Podman workload's status. + assert.NotEqual(t, stampedAtCreate, docker.Name(), + "docker must not claim ownership of a podman-created workload") +}