From 731d245d4288d6309948bfe6278fb5cb108f0c38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20G=C3=BCttler?= Date: Thu, 21 May 2026 13:05:46 +0200 Subject: [PATCH 1/7] :seedling: Implement image-url-command v2 --- api/v1beta1/conditions_const.go | 21 +++ api/v1beta1/hcloudmachine_types.go | 13 ++ api/v1beta1/hcloudmachine_validation.go | 13 ++ ...cture.cluster.x-k8s.io_hcloudmachines.yaml | 13 ++ ...uster.x-k8s.io_hcloudmachinetemplates.yaml | 13 ++ .../baremetal/client/mocks/ssh/Client.go | 119 ++++++++++++++ .../baremetal/client/ssh/ssh_client.go | 78 +++++++++- pkg/services/hcloud/server/server.go | 146 ++++++++++++++++++ 8 files changed, 415 insertions(+), 1 deletion(-) diff --git a/api/v1beta1/conditions_const.go b/api/v1beta1/conditions_const.go index 5e568f532..e80ea277b 100644 --- a/api/v1beta1/conditions_const.go +++ b/api/v1beta1/conditions_const.go @@ -271,6 +271,27 @@ const ( RebootSucceededCondition clusterv1beta1.ConditionType = "RebootSucceeded" ) +const ( + // NodeProvisioningSucceededCondition aggregates the result of all image-url-command v2 phases. + // True once all four phase conditions are True; False on any permanent failure; Unknown while in progress. + NodeProvisioningSucceededCondition clusterv1beta1.ConditionType = "NodeProvisioningSucceeded" + // PreparationSucceededCondition reports disks partitioned, filesystems mounted, required binaries verified. + PreparationSucceededCondition clusterv1beta1.ConditionType = "PreparationSucceeded" + // ImageDeploymentSucceededCondition reports OCI tarball pulled, checksum/signature verified, image written to disk. + ImageDeploymentSucceededCondition clusterv1beta1.ConditionType = "ImageDeploymentSucceeded" + // BootstrapDeliverySucceededCondition reports cloud-init/CAPI bootstrap data written into the deployed image. + BootstrapDeliverySucceededCondition clusterv1beta1.ConditionType = "BootstrapDeliverySucceeded" + // HandoverSucceededCondition reports reboot initiated; binary completed, controller takes over. + HandoverSucceededCondition clusterv1beta1.ConditionType = "HandoverSucceeded" + + // ProvisioningPhaseSucceededReason indicates a provisioning phase completed successfully. + ProvisioningPhaseSucceededReason = "Succeeded" + // ProvisioningPhaseFailedReason indicates a provisioning phase failed permanently. + ProvisioningPhaseFailedReason = "Failed" + // ProvisioningPhaseNotStartedReason indicates a provisioning phase was never reached. + ProvisioningPhaseNotStartedReason = "NotStarted" +) + const ( // RemediationSkippedCondition reports that remediation was skipped because // the HCloudMachine has a state that makes remediation unnecessary or impossible. diff --git a/api/v1beta1/hcloudmachine_types.go b/api/v1beta1/hcloudmachine_types.go index 69e436e00..19e0c95b4 100644 --- a/api/v1beta1/hcloudmachine_types.go +++ b/api/v1beta1/hcloudmachine_types.go @@ -94,6 +94,19 @@ type HCloudMachineSpec struct { // +optional ImageURLCommand string `json:"imageURLCommand,omitempty"` + // ImageURLCommandAPIVersion is the output format version of the ImageURLCommand binary. + // When set to "v2", CAPH reads /root/output.json after the command completes and maps each + // provisioning phase to a condition on the HCloudMachine (PreparationSucceeded, + // ImageDeploymentSucceeded, BootstrapDeliverySucceeded, HandoverSucceeded, and the aggregate + // NodeProvisioningSucceeded). + // + // Leave empty (default) to use the legacy IMAGE_URL_DONE log-based detection. + // + // +kubebuilder:validation:Enum="";v2 + // +kubebuilder:validation:Optional + // +optional + ImageURLCommandAPIVersion string `json:"imageURLCommandAPIVersion,omitempty"` + // SSHKeys define machine-specific SSH keys and override cluster-wide SSH keys. // +optional SSHKeys []SSHKey `json:"sshKeys,omitempty"` diff --git a/api/v1beta1/hcloudmachine_validation.go b/api/v1beta1/hcloudmachine_validation.go index 57eff56f2..550f0df5a 100644 --- a/api/v1beta1/hcloudmachine_validation.go +++ b/api/v1beta1/hcloudmachine_validation.go @@ -55,6 +55,13 @@ func validateHCloudMachineSpecUpdate(oldSpec, newSpec HCloudMachineSpec) field.E ) } + // ImageURLCommandAPIVersion is immutable + if !reflect.DeepEqual(oldSpec.ImageURLCommandAPIVersion, newSpec.ImageURLCommandAPIVersion) { + allErrs = append(allErrs, + field.Forbidden(field.NewPath("spec", "imageURLCommandAPIVersion"), "field is immutable"), + ) + } + // SSHKeys is immutable if !reflect.DeepEqual(oldSpec.SSHKeys, newSpec.SSHKeys) { allErrs = append(allErrs, @@ -114,5 +121,11 @@ func validateHCloudMachineSpec(spec HCloudMachineSpec) field.ErrorList { } } + if spec.ImageURLCommandAPIVersion != "" && spec.ImageURL == "" { + allErrs = append(allErrs, + field.Invalid(field.NewPath("spec", "imageURLCommandAPIVersion"), spec.ImageURLCommandAPIVersion, + "imageURLCommandAPIVersion requires imageURL to be set")) + } + return allErrs } diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachines.yaml index b7e3b5c31..1711473bd 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachines.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachines.yaml @@ -105,6 +105,19 @@ spec: ImageURLCommand must be set if ImageURL is set. ImageURLCommand must be empty if ImageURL is empty. type: string + imageURLCommandAPIVersion: + description: |- + ImageURLCommandAPIVersion is the output format version of the ImageURLCommand binary. + When set to "v2", CAPH reads /root/output.json after the command completes and maps each + provisioning phase to a condition on the HCloudMachine (PreparationSucceeded, + ImageDeploymentSucceeded, BootstrapDeliverySucceeded, HandoverSucceeded, and the aggregate + NodeProvisioningSucceeded). + + Leave empty (default) to use the legacy IMAGE_URL_DONE log-based detection. + enum: + - "" + - v2 + type: string placementGroupName: description: PlacementGroupName defines the placement group of the machine in HCloud API that must reference an existing placement diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachinetemplates.yaml index 34289288f..849ee1fec 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachinetemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachinetemplates.yaml @@ -132,6 +132,19 @@ spec: ImageURLCommand must be set if ImageURL is set. ImageURLCommand must be empty if ImageURL is empty. type: string + imageURLCommandAPIVersion: + description: |- + ImageURLCommandAPIVersion is the output format version of the ImageURLCommand binary. + When set to "v2", CAPH reads /root/output.json after the command completes and maps each + provisioning phase to a condition on the HCloudMachine (PreparationSucceeded, + ImageDeploymentSucceeded, BootstrapDeliverySucceeded, HandoverSucceeded, and the aggregate + NodeProvisioningSucceeded). + + Leave empty (default) to use the legacy IMAGE_URL_DONE log-based detection. + enum: + - "" + - v2 + type: string placementGroupName: description: PlacementGroupName defines the placement group of the machine in HCloud API that must reference an existing diff --git a/pkg/services/baremetal/client/mocks/ssh/Client.go b/pkg/services/baremetal/client/mocks/ssh/Client.go index b462e365e..b2537c7b9 100644 --- a/pkg/services/baremetal/client/mocks/ssh/Client.go +++ b/pkg/services/baremetal/client/mocks/ssh/Client.go @@ -1227,6 +1227,62 @@ func (_c *Client_GetResultOfInstallImage_Call) RunAndReturn(run func(context.Con return _c } +// ReadOutputJSON provides a mock function with given fields: ctx +func (_m *Client) ReadOutputJSON(ctx context.Context) (string, error) { + ret := _m.Called(ctx) + + if len(ret) == 0 { + panic("no return value specified for ReadOutputJSON") + } + + var r0 string + var r1 error + if rf, ok := ret.Get(0).(func(context.Context) (string, error)); ok { + return rf(ctx) + } + if rf, ok := ret.Get(0).(func(context.Context) string); ok { + r0 = rf(ctx) + } else { + r0 = ret.Get(0).(string) + } + + if rf, ok := ret.Get(1).(func(context.Context) error); ok { + r1 = rf(ctx) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_ReadOutputJSON_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'ReadOutputJSON' +type Client_ReadOutputJSON_Call struct { + *mock.Call +} + +// ReadOutputJSON is a helper method to define mock.On call +// - ctx context.Context +func (_e *Client_Expecter) ReadOutputJSON(ctx interface{}) *Client_ReadOutputJSON_Call { + return &Client_ReadOutputJSON_Call{Call: _e.mock.On("ReadOutputJSON", ctx)} +} + +func (_c *Client_ReadOutputJSON_Call) Run(run func(ctx context.Context)) *Client_ReadOutputJSON_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(context.Context)) + }) + return _c +} + +func (_c *Client_ReadOutputJSON_Call) Return(_a0 string, _a1 error) *Client_ReadOutputJSON_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_ReadOutputJSON_Call) RunAndReturn(run func(context.Context) (string, error)) *Client_ReadOutputJSON_Call { + _c.Call.Return(run) + return _c +} + // Reboot provides a mock function with given fields: ctx func (_m *Client) Reboot(ctx context.Context) sshclient.Output { ret := _m.Called(ctx) @@ -1450,6 +1506,69 @@ func (_c *Client_StateOfImageURLCommand_Call) RunAndReturn(run func(context.Cont return _c } +// StateOfImageURLCommandV2 provides a mock function with given fields: ctx +func (_m *Client) StateOfImageURLCommandV2(ctx context.Context) (sshclient.ImageURLCommandState, string, error) { + ret := _m.Called(ctx) + + if len(ret) == 0 { + panic("no return value specified for StateOfImageURLCommandV2") + } + + var r0 sshclient.ImageURLCommandState + var r1 string + var r2 error + if rf, ok := ret.Get(0).(func(context.Context) (sshclient.ImageURLCommandState, string, error)); ok { + return rf(ctx) + } + if rf, ok := ret.Get(0).(func(context.Context) sshclient.ImageURLCommandState); ok { + r0 = rf(ctx) + } else { + r0 = ret.Get(0).(sshclient.ImageURLCommandState) + } + + if rf, ok := ret.Get(1).(func(context.Context) string); ok { + r1 = rf(ctx) + } else { + r1 = ret.Get(1).(string) + } + + if rf, ok := ret.Get(2).(func(context.Context) error); ok { + r2 = rf(ctx) + } else { + r2 = ret.Error(2) + } + + return r0, r1, r2 +} + +// Client_StateOfImageURLCommandV2_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'StateOfImageURLCommandV2' +type Client_StateOfImageURLCommandV2_Call struct { + *mock.Call +} + +// StateOfImageURLCommandV2 is a helper method to define mock.On call +// - ctx context.Context +func (_e *Client_Expecter) StateOfImageURLCommandV2(ctx interface{}) *Client_StateOfImageURLCommandV2_Call { + return &Client_StateOfImageURLCommandV2_Call{Call: _e.mock.On("StateOfImageURLCommandV2", ctx)} +} + +func (_c *Client_StateOfImageURLCommandV2_Call) Run(run func(ctx context.Context)) *Client_StateOfImageURLCommandV2_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(context.Context)) + }) + return _c +} + +func (_c *Client_StateOfImageURLCommandV2_Call) Return(state sshclient.ImageURLCommandState, outputJSON string, err error) *Client_StateOfImageURLCommandV2_Call { + _c.Call.Return(state, outputJSON, err) + return _c +} + +func (_c *Client_StateOfImageURLCommandV2_Call) RunAndReturn(run func(context.Context) (sshclient.ImageURLCommandState, string, error)) *Client_StateOfImageURLCommandV2_Call { + _c.Call.Return(run) + return _c +} + // UntarTGZ provides a mock function with given fields: ctx func (_m *Client) UntarTGZ(ctx context.Context) sshclient.Output { ret := _m.Called(ctx) diff --git a/pkg/services/baremetal/client/ssh/ssh_client.go b/pkg/services/baremetal/client/ssh/ssh_client.go index 510fa5998..1969b27c2 100644 --- a/pkg/services/baremetal/client/ssh/ssh_client.go +++ b/pkg/services/baremetal/client/ssh/ssh_client.go @@ -45,9 +45,31 @@ const ( sshTimeOut time.Duration = 5 * time.Second sshUser = "root" - imageURLCommandLog = "/root/image-url-command.log" + imageURLCommandLog = "/root/image-url-command.log" + outputJSONPath = "/root/output.json" + outputJSONMaxRetries = 10 ) +// ImageURLCommandOutputV2 is the format of /root/output.json written by the v2 image-url-command binary. +type ImageURLCommandOutputV2 struct { + APIVersion string `json:"apiVersion"` + Phases map[string]ImageURLCommandPhaseResult `json:"phases"` +} + +// ImageURLCommandPhaseResult holds the result of one provisioning phase. +type ImageURLCommandPhaseResult struct { + Status string `json:"status"` + FailedStep string `json:"failedStep,omitempty"` + Steps []ImageURLCommandStepResult `json:"steps"` +} + +// ImageURLCommandStepResult holds the result of one step within a phase. +type ImageURLCommandStepResult struct { + Name string `json:"name"` + Status string `json:"status"` + Message string `json:"message"` +} + //go:embed detect-linux-on-another-disk.sh var detectLinuxOnAnotherDiskShellScript string @@ -215,6 +237,16 @@ type Client interface { // StateOfImageURLCommand returns the current states of the ImageURLCommand. States can // be: NotStarted, Running, Failed, FinishedSuccesfully. StateOfImageURLCommand(ctx context.Context) (state ImageURLCommandState, logFile string, err error) + + // StateOfImageURLCommandV2 is the v2 variant of StateOfImageURLCommand. + // Instead of checking IMAGE_URL_DONE in the log, it reads /root/output.json once the + // binary exits. Returns (FinishedSuccessfully, outputJSONContent, nil) on completion. + StateOfImageURLCommandV2(ctx context.Context) (state ImageURLCommandState, outputJSON string, err error) + + // ReadOutputJSON reads /root/output.json from the rescue system. + // It retries up to outputJSONMaxRetries times when the content does not end with '}', + // which guards against reading a partially-written file. + ReadOutputJSON(ctx context.Context) (string, error) } // Factory is the interface for creating new Client objects. @@ -929,3 +961,47 @@ func (c *sshClient) getImageURLCommandOutput(ctx context.Context) (string, error } return out.StdOut, nil } + +func (c *sshClient) ReadOutputJSON(ctx context.Context) (string, error) { + for range outputJSONMaxRetries { + out := c.runSSH(ctx, "cat "+outputJSONPath) + exitStatus, err := out.ExitStatus() + if err != nil { + return "", fmt.Errorf("reading output.json: %w", err) + } + if exitStatus != 0 { + return "", fmt.Errorf("reading output.json failed with exit status %d", exitStatus) + } + content := strings.TrimSpace(out.StdOut) + if strings.HasSuffix(content, "}") { + return content, nil + } + } + return "", fmt.Errorf("output.json did not end with '}' after %d retries", outputJSONMaxRetries) +} + +func (c *sshClient) StateOfImageURLCommandV2(ctx context.Context) (state ImageURLCommandState, outputJSON string, err error) { + out := c.runSSH(ctx, `[ -e /root/image-url-command.pid ]`) + exitStatus, err := out.ExitStatus() + if err != nil { + return ImageURLCommandStateNotStarted, "", fmt.Errorf("checking PID file: %w", err) + } + if exitStatus > 0 { + return ImageURLCommandStateNotStarted, "", nil + } + + out = c.runSSH(ctx, `ps -p "$(cat /root/image-url-command.pid)" -o args= | grep -q image-url-command`) + exitStatus, err = out.ExitStatus() + if err != nil { + return ImageURLCommandStateNotStarted, "", fmt.Errorf("checking if image-url-command is running: %w", err) + } + if exitStatus == 0 { + return ImageURLCommandStateRunning, "", nil + } + + content, err := c.ReadOutputJSON(ctx) + if err != nil { + return ImageURLCommandStateFailed, err.Error(), nil + } + return ImageURLCommandStateFinishedSuccessfully, content, nil +} diff --git a/pkg/services/hcloud/server/server.go b/pkg/services/hcloud/server/server.go index 171d21ec8..72865f1bd 100644 --- a/pkg/services/hcloud/server/server.go +++ b/pkg/services/hcloud/server/server.go @@ -19,6 +19,7 @@ package server import ( "context" + "encoding/json" "errors" "fmt" "net" @@ -653,6 +654,10 @@ func (s *Service) handleBootStateRunningImageCommand(ctx context.Context, server return reconcile.Result{}, fmt.Errorf("getSSHClient failed (wait for image-url-command): %w", err) } + if hm.Spec.ImageURLCommandAPIVersion == "v2" { + return s.handleBootStateRunningImageCommandV2(ctx, hcloudSSHClient) + } + state, logFile, err := hcloudSSHClient.StateOfImageURLCommand(ctx) if err != nil { return reconcile.Result{}, fmt.Errorf("StateOfImageURLCommand failed: %w", err) @@ -720,6 +725,147 @@ func (s *Service) handleBootStateRunningImageCommand(ctx context.Context, server } } +// handleBootStateRunningImageCommandV2 handles the RunningImageCommand boot state for +// imageURLCommandAPIVersion=v2. It reads /root/output.json once the binary exits and maps +// phase results to HCloudMachine conditions. +func (s *Service) handleBootStateRunningImageCommandV2(ctx context.Context, hcloudSSHClient sshclient.Client) (reconcile.Result, error) { + hm := s.scope.HCloudMachine + + state, outputJSON, err := hcloudSSHClient.StateOfImageURLCommandV2(ctx) + if err != nil { + return reconcile.Result{}, fmt.Errorf("StateOfImageURLCommandV2 failed: %w", err) + } + + durationOfState := time.Since(hm.Status.BootStateSince.Time) + // Please keep the number (7) in sync with the docstring of ImageURL. + if durationOfState > 7*time.Minute { + msg := fmt.Sprintf("ImageURLCommand timed out after %s. Deleting machine", + durationOfState.Round(time.Second).String()) + s.scope.Error(errors.New(msg), "", "outputJSON", outputJSON) + if err := s.scope.SetErrorAndRemediate(ctx, msg); err != nil { + return reconcile.Result{}, err + } + record.Warn(hm, "ImageURLCommandFailed", outputJSON) + v1beta1conditions.MarkFalse(hm, infrav1.ServerProvisionedCondition, + "RunningImageCommandTimedOut", clusterv1beta1.ConditionSeverityWarning, "%s", msg) + return reconcile.Result{}, nil + } + + switch state { + case sshclient.ImageURLCommandStateRunning: + v1beta1conditions.MarkFalse(hm, infrav1.ServerProvisionedCondition, + "HCloudImageURLCommandRunning", clusterv1beta1.ConditionSeverityInfo, + "imageURLCommand running") + return reconcile.Result{RequeueAfter: 5 * time.Second}, nil + + case sshclient.ImageURLCommandStateFinishedSuccessfully: + var output sshclient.ImageURLCommandOutputV2 + if err := json.Unmarshal([]byte(outputJSON), &output); err != nil { + return reconcile.Result{}, fmt.Errorf("parsing output.json: %w", err) + } + + allSucceeded := applyNodeProvisioningConditions(hm, output) + if !allSucceeded { + msg := "ImageURLCommand provisioning failed. Deleting machine" + s.scope.Error(errors.New(msg), "", "outputJSON", outputJSON) + if err := s.scope.SetErrorAndRemediate(ctx, msg); err != nil { + return reconcile.Result{}, err + } + record.Warn(hm, "ImageURLCommandFailed", outputJSON) + v1beta1conditions.MarkFalse(hm, infrav1.ServerProvisionedCondition, + "ImageCommandFailed", clusterv1beta1.ConditionSeverityWarning, "%s", msg) + return reconcile.Result{}, nil + } + + if rebootErr := hcloudSSHClient.Reboot(ctx).Err; rebootErr != nil { + return reconcile.Result{}, fmt.Errorf("reboot after ImageURLCommand failed: %w", rebootErr) + } + hm.SetBootState(infrav1.HCloudBootStateBootingToRealOS) + v1beta1conditions.MarkFalse(hm, infrav1.ServerProvisionedCondition, + "BootingToRealOS", clusterv1beta1.ConditionSeverityInfo, + "Operating system of node is booting") + return reconcile.Result{RequeueAfter: requeueImmediately}, nil + + case sshclient.ImageURLCommandStateFailed: + msg := "ImageURLCommand failed to read output.json. Deleting machine" + s.scope.Error(errors.New(msg), "", "detail", outputJSON) + if err := s.scope.SetErrorAndRemediate(ctx, msg); err != nil { + return reconcile.Result{}, err + } + v1beta1conditions.MarkFalse(hm, infrav1.ServerProvisionedCondition, + "ImageCommandFailed", clusterv1beta1.ConditionSeverityWarning, "%s", msg) + return reconcile.Result{}, nil + + case sshclient.ImageURLCommandStateNotStarted: + return reconcile.Result{}, fmt.Errorf("image-url-command not started in BootState RunningImageCommand? Should not happen") + + default: + return reconcile.Result{}, fmt.Errorf("unknown ImageURLCommandState: %q", state) + } +} + +// applyNodeProvisioningConditions parses an ImageURLCommandOutputV2 and sets phase conditions +// on the HCloudMachine. Returns true if all four phases succeeded. +func applyNodeProvisioningConditions(hm *infrav1.HCloudMachine, output sshclient.ImageURLCommandOutputV2) bool { + type phaseMapping struct { + name string + condition clusterv1beta1.ConditionType + } + phases := []phaseMapping{ + {"Preparation", infrav1.PreparationSucceededCondition}, + {"ImageDeployment", infrav1.ImageDeploymentSucceededCondition}, + {"BootstrapDelivery", infrav1.BootstrapDeliverySucceededCondition}, + {"Handover", infrav1.HandoverSucceededCondition}, + } + + allSucceeded := true + anyFailed := false + for _, pm := range phases { + phase, ok := output.Phases[pm.name] + if !ok { + v1beta1conditions.MarkUnknown(hm, pm.condition, + infrav1.ProvisioningPhaseNotStartedReason, "phase not present in output.json") + allSucceeded = false + continue + } + switch phase.Status { + case "Succeeded": + v1beta1conditions.MarkTrue(hm, pm.condition) + case "Failed": + reason := infrav1.ProvisioningPhaseFailedReason + message := "" + for _, step := range phase.Steps { + if step.Name == phase.FailedStep { + reason = step.Name + message = step.Message + break + } + } + v1beta1conditions.MarkFalse(hm, pm.condition, reason, + clusterv1beta1.ConditionSeverityError, "%s", message) + allSucceeded = false + anyFailed = true + default: // "NotStarted" or unknown + v1beta1conditions.MarkUnknown(hm, pm.condition, + infrav1.ProvisioningPhaseNotStartedReason, "phase was not reached") + allSucceeded = false + } + } + + if allSucceeded { + v1beta1conditions.MarkTrue(hm, infrav1.NodeProvisioningSucceededCondition) + } else if anyFailed { + v1beta1conditions.MarkFalse(hm, infrav1.NodeProvisioningSucceededCondition, + infrav1.ProvisioningPhaseFailedReason, clusterv1beta1.ConditionSeverityError, + "A provisioning phase failed.") + } else { + v1beta1conditions.MarkUnknown(hm, infrav1.NodeProvisioningSucceededCondition, + infrav1.ProvisioningPhaseNotStartedReason, "Not all phases completed.") + } + + return allSucceeded +} + // handleBootingToRealOS is used for both ways (imageName/snapshot and imageURL). func (s *Service) handleBootingToRealOS(ctx context.Context, server *hcloud.Server) (res reconcile.Result, err error) { hm := s.scope.HCloudMachine From f1ba86e8bbc550f9c3eb5158ff446af17f8cd6d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20G=C3=BCttler?= Date: Thu, 21 May 2026 13:23:25 +0200 Subject: [PATCH 2/7] :seedling: Fix lint errors in ssh_client.go Fix gci import alignment and return err instead of nil in StateOfImageURLCommandV2 error path. Co-Authored-By: Claude Sonnet 4.6 --- pkg/services/baremetal/client/ssh/ssh_client.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/services/baremetal/client/ssh/ssh_client.go b/pkg/services/baremetal/client/ssh/ssh_client.go index 1969b27c2..07823dddc 100644 --- a/pkg/services/baremetal/client/ssh/ssh_client.go +++ b/pkg/services/baremetal/client/ssh/ssh_client.go @@ -45,8 +45,8 @@ const ( sshTimeOut time.Duration = 5 * time.Second sshUser = "root" - imageURLCommandLog = "/root/image-url-command.log" - outputJSONPath = "/root/output.json" + imageURLCommandLog = "/root/image-url-command.log" + outputJSONPath = "/root/output.json" outputJSONMaxRetries = 10 ) @@ -1001,7 +1001,7 @@ func (c *sshClient) StateOfImageURLCommandV2(ctx context.Context) (state ImageUR content, err := c.ReadOutputJSON(ctx) if err != nil { - return ImageURLCommandStateFailed, err.Error(), nil + return ImageURLCommandStateFailed, err.Error(), err } return ImageURLCommandStateFinishedSuccessfully, content, nil } From 8b992c4af71748d2f5e26ea202a877f0219b765d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20G=C3=BCttler?= Date: Thu, 21 May 2026 15:29:16 +0200 Subject: [PATCH 3/7] :seedling: Use output.json status field as v2 completion signal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add top-level Status field to ImageURLCommandOutputV2. StateOfImageURLCommandV2 now treats a missing or unparseable Status as "not done yet" (requeue) instead of failing immediately — the binary writes output.json atomically so the file is either complete with a valid status or absent. Also pass --api-version flag to the binary via StartImageURLCommand's new apiVersion parameter. Co-Authored-By: Claude Sonnet 4.6 --- .../baremetal/client/mocks/ssh/Client.go | 33 ++++++++++--------- .../baremetal/client/ssh/ssh_client.go | 27 ++++++++++++--- pkg/services/baremetal/host/host.go | 2 +- pkg/services/baremetal/host/host_test.go | 6 ++-- pkg/services/hcloud/server/server.go | 2 +- pkg/services/hcloud/server/server_test.go | 2 +- 6 files changed, 45 insertions(+), 27 deletions(-) diff --git a/pkg/services/baremetal/client/mocks/ssh/Client.go b/pkg/services/baremetal/client/mocks/ssh/Client.go index b2537c7b9..b0956be5a 100644 --- a/pkg/services/baremetal/client/mocks/ssh/Client.go +++ b/pkg/services/baremetal/client/mocks/ssh/Client.go @@ -1375,9 +1375,9 @@ func (_c *Client_ResetKubeadm_Call) RunAndReturn(run func(context.Context) sshcl return _c } -// StartImageURLCommand provides a mock function with given fields: ctx, command, imageURL, bootstrapData, machineName, deviceNames -func (_m *Client) StartImageURLCommand(ctx context.Context, command string, imageURL string, bootstrapData []byte, machineName string, deviceNames []string) (int, string, error) { - ret := _m.Called(ctx, command, imageURL, bootstrapData, machineName, deviceNames) +// StartImageURLCommand provides a mock function with given fields: ctx, command, imageURL, bootstrapData, machineName, deviceNames, apiVersion +func (_m *Client) StartImageURLCommand(ctx context.Context, command string, imageURL string, bootstrapData []byte, machineName string, deviceNames []string, apiVersion string) (int, string, error) { + ret := _m.Called(ctx, command, imageURL, bootstrapData, machineName, deviceNames, apiVersion) if len(ret) == 0 { panic("no return value specified for StartImageURLCommand") @@ -1386,23 +1386,23 @@ func (_m *Client) StartImageURLCommand(ctx context.Context, command string, imag var r0 int var r1 string var r2 error - if rf, ok := ret.Get(0).(func(context.Context, string, string, []byte, string, []string) (int, string, error)); ok { - return rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames) + if rf, ok := ret.Get(0).(func(context.Context, string, string, []byte, string, []string, string) (int, string, error)); ok { + return rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames, apiVersion) } - if rf, ok := ret.Get(0).(func(context.Context, string, string, []byte, string, []string) int); ok { - r0 = rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames) + if rf, ok := ret.Get(0).(func(context.Context, string, string, []byte, string, []string, string) int); ok { + r0 = rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames, apiVersion) } else { r0 = ret.Get(0).(int) } - if rf, ok := ret.Get(1).(func(context.Context, string, string, []byte, string, []string) string); ok { - r1 = rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames) + if rf, ok := ret.Get(1).(func(context.Context, string, string, []byte, string, []string, string) string); ok { + r1 = rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames, apiVersion) } else { r1 = ret.Get(1).(string) } - if rf, ok := ret.Get(2).(func(context.Context, string, string, []byte, string, []string) error); ok { - r2 = rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames) + if rf, ok := ret.Get(2).(func(context.Context, string, string, []byte, string, []string, string) error); ok { + r2 = rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames, apiVersion) } else { r2 = ret.Error(2) } @@ -1422,13 +1422,14 @@ type Client_StartImageURLCommand_Call struct { // - bootstrapData []byte // - machineName string // - deviceNames []string -func (_e *Client_Expecter) StartImageURLCommand(ctx interface{}, command interface{}, imageURL interface{}, bootstrapData interface{}, machineName interface{}, deviceNames interface{}) *Client_StartImageURLCommand_Call { - return &Client_StartImageURLCommand_Call{Call: _e.mock.On("StartImageURLCommand", ctx, command, imageURL, bootstrapData, machineName, deviceNames)} +// - apiVersion string +func (_e *Client_Expecter) StartImageURLCommand(ctx interface{}, command interface{}, imageURL interface{}, bootstrapData interface{}, machineName interface{}, deviceNames interface{}, apiVersion interface{}) *Client_StartImageURLCommand_Call { + return &Client_StartImageURLCommand_Call{Call: _e.mock.On("StartImageURLCommand", ctx, command, imageURL, bootstrapData, machineName, deviceNames, apiVersion)} } -func (_c *Client_StartImageURLCommand_Call) Run(run func(ctx context.Context, command string, imageURL string, bootstrapData []byte, machineName string, deviceNames []string)) *Client_StartImageURLCommand_Call { +func (_c *Client_StartImageURLCommand_Call) Run(run func(ctx context.Context, command string, imageURL string, bootstrapData []byte, machineName string, deviceNames []string, apiVersion string)) *Client_StartImageURLCommand_Call { _c.Call.Run(func(args mock.Arguments) { - run(args[0].(context.Context), args[1].(string), args[2].(string), args[3].([]byte), args[4].(string), args[5].([]string)) + run(args[0].(context.Context), args[1].(string), args[2].(string), args[3].([]byte), args[4].(string), args[5].([]string), args[6].(string)) }) return _c } @@ -1438,7 +1439,7 @@ func (_c *Client_StartImageURLCommand_Call) Return(exitStatus int, stdoutAndStde return _c } -func (_c *Client_StartImageURLCommand_Call) RunAndReturn(run func(context.Context, string, string, []byte, string, []string) (int, string, error)) *Client_StartImageURLCommand_Call { +func (_c *Client_StartImageURLCommand_Call) RunAndReturn(run func(context.Context, string, string, []byte, string, []string, string) (int, string, error)) *Client_StartImageURLCommand_Call { _c.Call.Return(run) return _c } diff --git a/pkg/services/baremetal/client/ssh/ssh_client.go b/pkg/services/baremetal/client/ssh/ssh_client.go index 07823dddc..723338137 100644 --- a/pkg/services/baremetal/client/ssh/ssh_client.go +++ b/pkg/services/baremetal/client/ssh/ssh_client.go @@ -23,6 +23,7 @@ import ( "context" _ "embed" "encoding/base64" + "encoding/json" "errors" "fmt" "io" @@ -51,8 +52,11 @@ const ( ) // ImageURLCommandOutputV2 is the format of /root/output.json written by the v2 image-url-command binary. +// Status is the top-level completion signal: "Succeeded" or "Failed". A missing or empty Status +// means the file is not yet complete (binary still running or crashed before the atomic rename). type ImageURLCommandOutputV2 struct { APIVersion string `json:"apiVersion"` + Status string `json:"status"` Phases map[string]ImageURLCommandPhaseResult `json:"phases"` } @@ -232,7 +236,8 @@ type Client interface { // This gets used when imageURL set. // For hcloud deviceNames is always {"sda"}. For baremetal it corresponds to the WWNs // of RootDeviceHints. - StartImageURLCommand(ctx context.Context, command, imageURL string, bootstrapData []byte, machineName string, deviceNames []string) (exitStatus int, stdoutAndStderr string, err error) + // apiVersion is appended as --api-version= unless it is empty or "v1". + StartImageURLCommand(ctx context.Context, command, imageURL string, bootstrapData []byte, machineName string, deviceNames []string, apiVersion string) (exitStatus int, stdoutAndStderr string, err error) // StateOfImageURLCommand returns the current states of the ImageURLCommand. States can // be: NotStarted, Running, Failed, FinishedSuccesfully. @@ -831,7 +836,7 @@ func (c *sshClient) ExecutePreProvisionCommand(ctx context.Context, command stri return exitStatus, s, nil } -func (c *sshClient) StartImageURLCommand(ctx context.Context, command, imageURL string, bootstrapData []byte, machineName string, deviceNames []string) (int, string, error) { +func (c *sshClient) StartImageURLCommand(ctx context.Context, command, imageURL string, bootstrapData []byte, machineName string, deviceNames []string, apiVersion string) (int, string, error) { logger := ctrl.LoggerFrom(ctx).WithName("ssh-client") // validate deviceNames @@ -892,11 +897,15 @@ func (c *sshClient) StartImageURLCommand(ctx context.Context, command, imageURL return 0, "", fmt.Errorf("error copying bootstrap data to %s:%d:%s %w", c.ip, c.port, dest, err) } + apiVersionFlag := "" + if apiVersion != "" && apiVersion != "v1" { + apiVersionFlag = " --api-version=" + apiVersion + } cmd := fmt.Sprintf(`#!/usr/bin/bash -OCI_REGISTRY_AUTH_TOKEN='%s' nohup /root/image-url-command '%s' /root/bootstrap.data '%s' '%s' >%s 2>&1 %s 2>&1 /root/image-url-command.pid `, os.Getenv("OCI_REGISTRY_AUTH_TOKEN"), imageURL, machineName, strings.Join(deviceNames, " "), - imageURLCommandLog) + apiVersionFlag, imageURLCommandLog) out := c.runSSH(ctx, cmd) @@ -999,9 +1008,17 @@ func (c *sshClient) StateOfImageURLCommandV2(ctx context.Context) (state ImageUR return ImageURLCommandStateRunning, "", nil } + // Binary has exited. output.json is written atomically (rename), so the Status field is + // either present and valid, or the file does not exist yet (binary crashed before rename). + // Treat any read or parse failure as "not done yet" and requeue; the 7-minute timeout in + // handleBootStateRunningImageCommandV2 handles the case where the binary crashed permanently. content, err := c.ReadOutputJSON(ctx) if err != nil { - return ImageURLCommandStateFailed, err.Error(), err + return ImageURLCommandStateRunning, "", nil + } + var output ImageURLCommandOutputV2 + if err := json.Unmarshal([]byte(content), &output); err != nil || output.Status == "" { + return ImageURLCommandStateRunning, "", nil } return ImageURLCommandStateFinishedSuccessfully, content, nil } diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 747e2a9df..ccea2f14e 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -1317,7 +1317,7 @@ func (s *Service) actionImageInstallingImageURLCommand(ctx context.Context, sshC // get device names from storage device deviceNames := getDeviceNames(s.scope.HetznerBareMetalHost.Spec.RootDeviceHints.ListOfWWN(), storage) - exitStatus, stdoutStderr, err := sshClient.StartImageURLCommand(ctx, commandPath, s.scope.HetznerBareMetalHost.Spec.Status.InstallImage.Image.URL, data, s.scope.Hostname(), deviceNames) + exitStatus, stdoutStderr, err := sshClient.StartImageURLCommand(ctx, commandPath, s.scope.HetznerBareMetalHost.Spec.Status.InstallImage.Image.URL, data, s.scope.Hostname(), deviceNames, "") if err != nil { err := fmt.Errorf("StartImageURLCommand failed (retrying): %w", err) // This could be a temporary network error. Retry. diff --git a/pkg/services/baremetal/host/host_test.go b/pkg/services/baremetal/host/host_test.go index 9f6f9bb70..ab25e025f 100644 --- a/pkg/services/baremetal/host/host_test.go +++ b/pkg/services/baremetal/host/host_test.go @@ -196,7 +196,7 @@ var _ = Describe("actionImageInstalling (image-url-command)", func() { svc := newTestService(host, nil, bmmock.NewSSHFactory(sshMock, sshMock, sshMock), nil, helpers.GetDefaultSSHSecret(rescueSSHKeyName, "default")) commandPath := filepath.Join(baremetalImageURLCommandDir, host.Spec.Status.InstallImage.ImageURLCommand) - sshMock.On("StartImageURLCommand", mock.Anything, commandPath, host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"}).Return(0, "", nil) + sshMock.On("StartImageURLCommand", mock.Anything, commandPath, host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"}, "").Return(0, "", nil) // Create bootstrap secret in fake client with key 'value' secret := &corev1.Secret{ObjectMeta: metav1.ObjectMeta{Name: host.Spec.Status.UserData.Name, Namespace: host.Spec.Status.UserData.Namespace}, Data: map[string][]byte{"value": []byte("#cloud-config")}} Expect(svc.scope.Client.Create(ctx, secret)).To(Succeed()) @@ -208,7 +208,7 @@ var _ = Describe("actionImageInstalling (image-url-command)", func() { res := svc.actionImageInstalling(ctx) Expect(res).To(BeAssignableToTypeOf(actionContinue{})) - Expect(sshMock.AssertCalled(GinkgoT(), "StartImageURLCommand", mock.Anything, commandPath, host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"})).To(BeTrue()) + Expect(sshMock.AssertCalled(GinkgoT(), "StartImageURLCommand", mock.Anything, commandPath, host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"}, "")).To(BeTrue()) c := v1beta1conditions.Get(host, infrav1.ProvisionSucceededCondition) Expect(c.Message).To(ContainSubstring(`imageURLCommand started`)) }) @@ -223,7 +223,7 @@ var _ = Describe("actionImageInstalling (image-url-command)", func() { svc := newTestService(host, nil, bmmock.NewSSHFactory(sshMock, sshMock, sshMock), nil, helpers.GetDefaultSSHSecret(rescueSSHKeyName, "default")) commandPath := filepath.Join(baremetalImageURLCommandDir, host.Spec.Status.InstallImage.ImageURLCommand) - sshMock.On("StartImageURLCommand", mock.Anything, commandPath, host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"}).Return(7, "boom", nil) + sshMock.On("StartImageURLCommand", mock.Anything, commandPath, host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"}, "").Return(7, "boom", nil) secret := &corev1.Secret{ObjectMeta: metav1.ObjectMeta{Name: host.Spec.Status.UserData.Name, Namespace: host.Spec.Status.UserData.Namespace}, Data: map[string][]byte{"value": []byte("#cloud-config")}} Expect(svc.scope.Client.Create(ctx, secret)).To(Succeed()) diff --git a/pkg/services/hcloud/server/server.go b/pkg/services/hcloud/server/server.go index 72865f1bd..6aacb8fe4 100644 --- a/pkg/services/hcloud/server/server.go +++ b/pkg/services/hcloud/server/server.go @@ -607,7 +607,7 @@ func (s *Service) handleBootStateBootingToRescue(ctx context.Context, server *hc return reconcile.Result{}, fmt.Errorf("resolving imageURLCommand failed: %w", err) } - exitStatus, stdoutStderr, err := sshClient.StartImageURLCommand(ctx, imageURLCommandPath, hm.Spec.ImageURL, data, s.scope.Name(), []string{"sda"}) + exitStatus, stdoutStderr, err := sshClient.StartImageURLCommand(ctx, imageURLCommandPath, hm.Spec.ImageURL, data, s.scope.Name(), []string{"sda"}, hm.Spec.ImageURLCommandAPIVersion) if err != nil { err := fmt.Errorf("StartImageURLCommand failed (retrying): %w", err) // This could be a temporary network error. Retry. diff --git a/pkg/services/hcloud/server/server_test.go b/pkg/services/hcloud/server/server_test.go index 2a33581df..12dafe2ab 100644 --- a/pkg/services/hcloud/server/server_test.go +++ b/pkg/services/hcloud/server/server_test.go @@ -1189,7 +1189,7 @@ var _ = Describe("Reconcile", func() { StdErr: "", Err: nil, }) - startImageURLCommandMock := testEnv.HCloudSSHClient.On("StartImageURLCommand", mock.Anything, mock.Anything, mock.Anything, mock.Anything, "my-machine", []string{"sda"}).Return(0, "", nil) + startImageURLCommandMock := testEnv.HCloudSSHClient.On("StartImageURLCommand", mock.Anything, mock.Anything, mock.Anything, mock.Anything, "my-machine", []string{"sda"}, mock.Anything).Return(0, "", nil) _, err = service.Reconcile(ctx) Expect(err).To(BeNil()) Expect(service.scope.HCloudMachine.Status.BootState).To(Equal(infrav1.HCloudBootStateRunningImageCommand)) From 14d5c7a7bfcaa7e7ecb2baa1e6e26921f4a8aeca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20G=C3=BCttler?= Date: Thu, 21 May 2026 16:04:41 +0200 Subject: [PATCH 4/7] :seedling: Fix lint errors in ssh_client.go Co-Authored-By: Claude Sonnet 4.6 --- pkg/services/baremetal/client/ssh/ssh_client.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pkg/services/baremetal/client/ssh/ssh_client.go b/pkg/services/baremetal/client/ssh/ssh_client.go index 723338137..2f773542f 100644 --- a/pkg/services/baremetal/client/ssh/ssh_client.go +++ b/pkg/services/baremetal/client/ssh/ssh_client.go @@ -1012,12 +1012,10 @@ func (c *sshClient) StateOfImageURLCommandV2(ctx context.Context) (state ImageUR // either present and valid, or the file does not exist yet (binary crashed before rename). // Treat any read or parse failure as "not done yet" and requeue; the 7-minute timeout in // handleBootStateRunningImageCommandV2 handles the case where the binary crashed permanently. - content, err := c.ReadOutputJSON(ctx) - if err != nil { - return ImageURLCommandStateRunning, "", nil - } + content, _ := c.ReadOutputJSON(ctx) // missing file = still running; treat error as not-done var output ImageURLCommandOutputV2 - if err := json.Unmarshal([]byte(content), &output); err != nil || output.Status == "" { + _ = json.Unmarshal([]byte(content), &output) // parse failure leaves Status empty = not-done + if output.Status == "" { return ImageURLCommandStateRunning, "", nil } return ImageURLCommandStateFinishedSuccessfully, content, nil From 93d9e9547ee3561f42efb899a513726fb317d830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20G=C3=BCttler?= Date: Thu, 21 May 2026 17:03:36 +0200 Subject: [PATCH 5/7] :seedling: Fail immediately when image-url-command process exits without output.json When the process is gone (ps check fails) but output.json is missing or incomplete, return ImageURLCommandStateFailed right away instead of ImageURLCommandStateRunning. This prevents waiting up to 7 minutes for the timeout to fire when the binary crashed before writing its output. Co-Authored-By: Claude Sonnet 4.6 --- pkg/services/baremetal/client/ssh/ssh_client.go | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pkg/services/baremetal/client/ssh/ssh_client.go b/pkg/services/baremetal/client/ssh/ssh_client.go index 2f773542f..1fb7c2d72 100644 --- a/pkg/services/baremetal/client/ssh/ssh_client.go +++ b/pkg/services/baremetal/client/ssh/ssh_client.go @@ -1008,15 +1008,18 @@ func (c *sshClient) StateOfImageURLCommandV2(ctx context.Context) (state ImageUR return ImageURLCommandStateRunning, "", nil } - // Binary has exited. output.json is written atomically (rename), so the Status field is - // either present and valid, or the file does not exist yet (binary crashed before rename). - // Treat any read or parse failure as "not done yet" and requeue; the 7-minute timeout in - // handleBootStateRunningImageCommandV2 handles the case where the binary crashed permanently. - content, _ := c.ReadOutputJSON(ctx) // missing file = still running; treat error as not-done + // Process has exited. Fail immediately if output.json is missing or incomplete — + // don't wait for the 7-minute timeout. + content, err := c.ReadOutputJSON(ctx) + if err != nil { + return ImageURLCommandStateFailed, fmt.Sprintf("process exited but output.json not readable: %s", err), nil + } var output ImageURLCommandOutputV2 - _ = json.Unmarshal([]byte(content), &output) // parse failure leaves Status empty = not-done + if err := json.Unmarshal([]byte(content), &output); err != nil { + return ImageURLCommandStateFailed, content, nil + } if output.Status == "" { - return ImageURLCommandStateRunning, "", nil + return ImageURLCommandStateFailed, content, nil } return ImageURLCommandStateFinishedSuccessfully, content, nil } From 5e96c5617d847fe5b44ca0c32e6e707f492a6ff7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20G=C3=BCttler?= Date: Thu, 21 May 2026 17:51:25 +0200 Subject: [PATCH 6/7] :seedling: Add unit tests for StateOfImageURLCommandV2 process-exited paths Uses an in-process fake SSH server so the tests run without any external infrastructure. The two new failure-path tests would have caught the bug fixed in the previous commit: when the process has exited but output.json is missing or has no Status field, the old code returned ImageURLCommandStateRunning (waiting for the 7-minute timeout) instead of ImageURLCommandStateFailed (immediate termination). Co-Authored-By: Claude Sonnet 4.6 --- .../client/ssh/ssh_client_imageurl_v2_test.go | 212 ++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 pkg/services/baremetal/client/ssh/ssh_client_imageurl_v2_test.go diff --git a/pkg/services/baremetal/client/ssh/ssh_client_imageurl_v2_test.go b/pkg/services/baremetal/client/ssh/ssh_client_imageurl_v2_test.go new file mode 100644 index 000000000..8049c4363 --- /dev/null +++ b/pkg/services/baremetal/client/ssh/ssh_client_imageurl_v2_test.go @@ -0,0 +1,212 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package sshclient contains the interface to speak to bare metal servers with ssh. +package sshclient + +import ( + "context" + "crypto/rand" + "crypto/rsa" + "crypto/x509" + "encoding/binary" + "encoding/pem" + "net" + "testing" + + "github.com/stretchr/testify/require" + "golang.org/x/crypto/ssh" +) + +// fakeSSHResponse defines what a fake SSH server returns for a given command. +type fakeSSHResponse struct { + stdout string + exitCode uint32 +} + +// startFakeSSHServer starts an in-process SSH server that maps exact command strings +// to fixed responses. Each runSSH call opens a new TCP connection, so the server +// handles concurrent connections. Returns a PEM-encoded RSA private key the sshClient +// can use for authentication (server accepts any key) and the listener port. +func startFakeSSHServer(t *testing.T, responses map[string]fakeSSHResponse) (clientPrivKeyPEM string, port int) { + t.Helper() + + hostKey, err := rsa.GenerateKey(rand.Reader, 2048) + require.NoError(t, err) + hostSigner, err := ssh.NewSignerFromKey(hostKey) + require.NoError(t, err) + + clientKey, err := rsa.GenerateKey(rand.Reader, 2048) + require.NoError(t, err) + clientPEM := string(pem.EncodeToMemory(&pem.Block{ + Type: "RSA PRIVATE KEY", + Bytes: x509.MarshalPKCS1PrivateKey(clientKey), + })) + + serverConfig := &ssh.ServerConfig{NoClientAuth: true} + serverConfig.AddHostKey(hostSigner) + + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + t.Cleanup(func() { _ = ln.Close() }) + + go func() { + for { + conn, err := ln.Accept() + if err != nil { + return + } + go fakeSSHHandleConn(conn, serverConfig, responses) + } + }() + + return clientPEM, ln.Addr().(*net.TCPAddr).Port +} + +func fakeSSHHandleConn(conn net.Conn, config *ssh.ServerConfig, responses map[string]fakeSSHResponse) { + srvConn, chans, reqs, err := ssh.NewServerConn(conn, config) + if err != nil { + return + } + defer srvConn.Close() + go ssh.DiscardRequests(reqs) + for newChan := range chans { + if newChan.ChannelType() != "session" { + _ = newChan.Reject(ssh.UnknownChannelType, "unknown channel type") + continue + } + ch, requests, err := newChan.Accept() + if err != nil { + return + } + go fakeSSHHandleSession(ch, requests, responses) + } +} + +func fakeSSHHandleSession(ch ssh.Channel, requests <-chan *ssh.Request, responses map[string]fakeSSHResponse) { + defer ch.Close() + for req := range requests { + if req.Type != "exec" { + if req.WantReply { + _ = req.Reply(false, nil) + } + continue + } + if req.WantReply { + _ = req.Reply(true, nil) + } + // SSH exec payload: 4-byte big-endian length followed by the command string. + if len(req.Payload) < 4 { + fakeSSHSendExitStatus(ch, 1) + return + } + cmdLen := binary.BigEndian.Uint32(req.Payload[:4]) + cmd := string(req.Payload[4 : 4+cmdLen]) + + resp, ok := responses[cmd] + if !ok { + fakeSSHSendExitStatus(ch, 127) + return + } + if resp.stdout != "" { + _, _ = ch.Write([]byte(resp.stdout)) + } + fakeSSHSendExitStatus(ch, resp.exitCode) + return + } +} + +func fakeSSHSendExitStatus(ch ssh.Channel, code uint32) { + payload := make([]byte, 4) + binary.BigEndian.PutUint32(payload, code) + _, _ = ch.SendRequest("exit-status", false, payload) +} + +// TestStateOfImageURLCommandV2_ProcessExitedWithoutOutputJSON verifies that +// StateOfImageURLCommandV2 returns ImageURLCommandStateFailed immediately when +// the image-url-command process has exited but output.json is missing. +// +// Without the fix this test exposes, the function returned ImageURLCommandStateRunning, +// causing the controller to wait up to 7 minutes for a timeout that was never needed. +func TestStateOfImageURLCommandV2_ProcessExitedWithoutOutputJSON(t *testing.T) { + privKeyPEM, port := startFakeSSHServer(t, map[string]fakeSSHResponse{ + `[ -e /root/image-url-command.pid ]`: {exitCode: 0}, // PID file exists + `ps -p "$(cat /root/image-url-command.pid)" -o args= | grep -q image-url-command`: {exitCode: 1}, // process gone + `cat ` + outputJSONPath: {exitCode: 1}, // output.json missing + }) + + c := sshClient{ip: "127.0.0.1", privateSSHKey: privKeyPEM, port: port} + + state, detail, err := c.StateOfImageURLCommandV2(context.Background()) + require.NoError(t, err) + require.Equal(t, ImageURLCommandStateFailed, state, + "expected immediate failure when process is gone without output.json, not ImageURLCommandStateRunning") + require.Contains(t, detail, "process exited") +} + +// TestStateOfImageURLCommandV2_ProcessExitedWithEmptyStatus verifies that +// StateOfImageURLCommandV2 returns ImageURLCommandStateFailed when the process +// has exited and output.json exists but has an empty Status field (e.g. the +// binary wrote a partial file before crashing). +func TestStateOfImageURLCommandV2_ProcessExitedWithEmptyStatus(t *testing.T) { + privKeyPEM, port := startFakeSSHServer(t, map[string]fakeSSHResponse{ + `[ -e /root/image-url-command.pid ]`: {exitCode: 0}, + `ps -p "$(cat /root/image-url-command.pid)" -o args= | grep -q image-url-command`: {exitCode: 1}, + `cat ` + outputJSONPath: {exitCode: 0, stdout: `{"apiVersion":"v2"}`}, // Status field missing + }) + + c := sshClient{ip: "127.0.0.1", privateSSHKey: privKeyPEM, port: port} + + state, _, err := c.StateOfImageURLCommandV2(context.Background()) + require.NoError(t, err) + require.Equal(t, ImageURLCommandStateFailed, state, + "expected failure when output.json has no Status field") +} + +// TestStateOfImageURLCommandV2_ProcessStillRunning verifies that +// StateOfImageURLCommandV2 returns ImageURLCommandStateRunning while the +// process is still alive (the normal in-progress case). +func TestStateOfImageURLCommandV2_ProcessStillRunning(t *testing.T) { + privKeyPEM, port := startFakeSSHServer(t, map[string]fakeSSHResponse{ + `[ -e /root/image-url-command.pid ]`: {exitCode: 0}, + `ps -p "$(cat /root/image-url-command.pid)" -o args= | grep -q image-url-command`: {exitCode: 0}, // still running + }) + + c := sshClient{ip: "127.0.0.1", privateSSHKey: privKeyPEM, port: port} + + state, _, err := c.StateOfImageURLCommandV2(context.Background()) + require.NoError(t, err) + require.Equal(t, ImageURLCommandStateRunning, state) +} + +// TestStateOfImageURLCommandV2_FinishedSuccessfully verifies that +// StateOfImageURLCommandV2 returns ImageURLCommandStateFinishedSuccessfully +// when the process has exited and output.json contains a non-empty Status. +func TestStateOfImageURLCommandV2_FinishedSuccessfully(t *testing.T) { + outputJSON := `{"apiVersion":"v2","status":"Succeeded","phases":{}}` + privKeyPEM, port := startFakeSSHServer(t, map[string]fakeSSHResponse{ + `[ -e /root/image-url-command.pid ]`: {exitCode: 0}, + `ps -p "$(cat /root/image-url-command.pid)" -o args= | grep -q image-url-command`: {exitCode: 1}, + `cat ` + outputJSONPath: {exitCode: 0, stdout: outputJSON}, + }) + + c := sshClient{ip: "127.0.0.1", privateSSHKey: privKeyPEM, port: port} + + state, content, err := c.StateOfImageURLCommandV2(context.Background()) + require.NoError(t, err) + require.Equal(t, ImageURLCommandStateFinishedSuccessfully, state) + require.Equal(t, outputJSON, content) +} From 188506357640970f4f5b3a3804952cef77c10326 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20G=C3=BCttler?= Date: Thu, 21 May 2026 18:21:18 +0200 Subject: [PATCH 7/7] tests. --- hack/update-operator-dev-deployment.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/update-operator-dev-deployment.sh b/hack/update-operator-dev-deployment.sh index 47cb7ef74..d2cecbd3f 100755 --- a/hack/update-operator-dev-deployment.sh +++ b/hack/update-operator-dev-deployment.sh @@ -60,7 +60,7 @@ if ! kubectl cluster-info >/dev/null; then fi current_context=$(kubectl config current-context) -if ! echo "$current_context" | grep -P '.*-admin@.*-mgt-cluster|kind-'; then +if ! echo "$current_context" | grep -P '.*-admin@.*-mgt-cluster|kind-|img-url-cmd-via-go'; then echo "The script refuses to update because the current context is: $current_context" echo "Expecting something like foo-mgt-cluster-admin@foo-mgt-cluster with 'foo' being a short version of your name" exit 1