diff --git a/api/v1beta1/conditions_const.go b/api/v1beta1/conditions_const.go index 5e568f532..e80ea277b 100644 --- a/api/v1beta1/conditions_const.go +++ b/api/v1beta1/conditions_const.go @@ -271,6 +271,27 @@ const ( RebootSucceededCondition clusterv1beta1.ConditionType = "RebootSucceeded" ) +const ( + // NodeProvisioningSucceededCondition aggregates the result of all image-url-command v2 phases. + // True once all four phase conditions are True; False on any permanent failure; Unknown while in progress. + NodeProvisioningSucceededCondition clusterv1beta1.ConditionType = "NodeProvisioningSucceeded" + // PreparationSucceededCondition reports disks partitioned, filesystems mounted, required binaries verified. + PreparationSucceededCondition clusterv1beta1.ConditionType = "PreparationSucceeded" + // ImageDeploymentSucceededCondition reports OCI tarball pulled, checksum/signature verified, image written to disk. + ImageDeploymentSucceededCondition clusterv1beta1.ConditionType = "ImageDeploymentSucceeded" + // BootstrapDeliverySucceededCondition reports cloud-init/CAPI bootstrap data written into the deployed image. + BootstrapDeliverySucceededCondition clusterv1beta1.ConditionType = "BootstrapDeliverySucceeded" + // HandoverSucceededCondition reports reboot initiated; binary completed, controller takes over. + HandoverSucceededCondition clusterv1beta1.ConditionType = "HandoverSucceeded" + + // ProvisioningPhaseSucceededReason indicates a provisioning phase completed successfully. + ProvisioningPhaseSucceededReason = "Succeeded" + // ProvisioningPhaseFailedReason indicates a provisioning phase failed permanently. + ProvisioningPhaseFailedReason = "Failed" + // ProvisioningPhaseNotStartedReason indicates a provisioning phase was never reached. + ProvisioningPhaseNotStartedReason = "NotStarted" +) + const ( // RemediationSkippedCondition reports that remediation was skipped because // the HCloudMachine has a state that makes remediation unnecessary or impossible. diff --git a/api/v1beta1/hcloudmachine_types.go b/api/v1beta1/hcloudmachine_types.go index 69e436e00..19e0c95b4 100644 --- a/api/v1beta1/hcloudmachine_types.go +++ b/api/v1beta1/hcloudmachine_types.go @@ -94,6 +94,19 @@ type HCloudMachineSpec struct { // +optional ImageURLCommand string `json:"imageURLCommand,omitempty"` + // ImageURLCommandAPIVersion is the output format version of the ImageURLCommand binary. + // When set to "v2", CAPH reads /root/output.json after the command completes and maps each + // provisioning phase to a condition on the HCloudMachine (PreparationSucceeded, + // ImageDeploymentSucceeded, BootstrapDeliverySucceeded, HandoverSucceeded, and the aggregate + // NodeProvisioningSucceeded). + // + // Leave empty (default) to use the legacy IMAGE_URL_DONE log-based detection. + // + // +kubebuilder:validation:Enum="";v2 + // +kubebuilder:validation:Optional + // +optional + ImageURLCommandAPIVersion string `json:"imageURLCommandAPIVersion,omitempty"` + // SSHKeys define machine-specific SSH keys and override cluster-wide SSH keys. // +optional SSHKeys []SSHKey `json:"sshKeys,omitempty"` diff --git a/api/v1beta1/hcloudmachine_validation.go b/api/v1beta1/hcloudmachine_validation.go index 57eff56f2..550f0df5a 100644 --- a/api/v1beta1/hcloudmachine_validation.go +++ b/api/v1beta1/hcloudmachine_validation.go @@ -55,6 +55,13 @@ func validateHCloudMachineSpecUpdate(oldSpec, newSpec HCloudMachineSpec) field.E ) } + // ImageURLCommandAPIVersion is immutable + if !reflect.DeepEqual(oldSpec.ImageURLCommandAPIVersion, newSpec.ImageURLCommandAPIVersion) { + allErrs = append(allErrs, + field.Forbidden(field.NewPath("spec", "imageURLCommandAPIVersion"), "field is immutable"), + ) + } + // SSHKeys is immutable if !reflect.DeepEqual(oldSpec.SSHKeys, newSpec.SSHKeys) { allErrs = append(allErrs, @@ -114,5 +121,11 @@ func validateHCloudMachineSpec(spec HCloudMachineSpec) field.ErrorList { } } + if spec.ImageURLCommandAPIVersion != "" && spec.ImageURL == "" { + allErrs = append(allErrs, + field.Invalid(field.NewPath("spec", "imageURLCommandAPIVersion"), spec.ImageURLCommandAPIVersion, + "imageURLCommandAPIVersion requires imageURL to be set")) + } + return allErrs } diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachines.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachines.yaml index b7e3b5c31..1711473bd 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachines.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachines.yaml @@ -105,6 +105,19 @@ spec: ImageURLCommand must be set if ImageURL is set. ImageURLCommand must be empty if ImageURL is empty. type: string + imageURLCommandAPIVersion: + description: |- + ImageURLCommandAPIVersion is the output format version of the ImageURLCommand binary. + When set to "v2", CAPH reads /root/output.json after the command completes and maps each + provisioning phase to a condition on the HCloudMachine (PreparationSucceeded, + ImageDeploymentSucceeded, BootstrapDeliverySucceeded, HandoverSucceeded, and the aggregate + NodeProvisioningSucceeded). + + Leave empty (default) to use the legacy IMAGE_URL_DONE log-based detection. + enum: + - "" + - v2 + type: string placementGroupName: description: PlacementGroupName defines the placement group of the machine in HCloud API that must reference an existing placement diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachinetemplates.yaml index 34289288f..849ee1fec 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachinetemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachinetemplates.yaml @@ -132,6 +132,19 @@ spec: ImageURLCommand must be set if ImageURL is set. ImageURLCommand must be empty if ImageURL is empty. type: string + imageURLCommandAPIVersion: + description: |- + ImageURLCommandAPIVersion is the output format version of the ImageURLCommand binary. + When set to "v2", CAPH reads /root/output.json after the command completes and maps each + provisioning phase to a condition on the HCloudMachine (PreparationSucceeded, + ImageDeploymentSucceeded, BootstrapDeliverySucceeded, HandoverSucceeded, and the aggregate + NodeProvisioningSucceeded). + + Leave empty (default) to use the legacy IMAGE_URL_DONE log-based detection. + enum: + - "" + - v2 + type: string placementGroupName: description: PlacementGroupName defines the placement group of the machine in HCloud API that must reference an existing diff --git a/hack/update-operator-dev-deployment.sh b/hack/update-operator-dev-deployment.sh index 47cb7ef74..d2cecbd3f 100755 --- a/hack/update-operator-dev-deployment.sh +++ b/hack/update-operator-dev-deployment.sh @@ -60,7 +60,7 @@ if ! kubectl cluster-info >/dev/null; then fi current_context=$(kubectl config current-context) -if ! echo "$current_context" | grep -P '.*-admin@.*-mgt-cluster|kind-'; then +if ! echo "$current_context" | grep -P '.*-admin@.*-mgt-cluster|kind-|img-url-cmd-via-go'; then echo "The script refuses to update because the current context is: $current_context" echo "Expecting something like foo-mgt-cluster-admin@foo-mgt-cluster with 'foo' being a short version of your name" exit 1 diff --git a/pkg/services/baremetal/client/mocks/ssh/Client.go b/pkg/services/baremetal/client/mocks/ssh/Client.go index b462e365e..b0956be5a 100644 --- a/pkg/services/baremetal/client/mocks/ssh/Client.go +++ b/pkg/services/baremetal/client/mocks/ssh/Client.go @@ -1227,6 +1227,62 @@ func (_c *Client_GetResultOfInstallImage_Call) RunAndReturn(run func(context.Con return _c } +// ReadOutputJSON provides a mock function with given fields: ctx +func (_m *Client) ReadOutputJSON(ctx context.Context) (string, error) { + ret := _m.Called(ctx) + + if len(ret) == 0 { + panic("no return value specified for ReadOutputJSON") + } + + var r0 string + var r1 error + if rf, ok := ret.Get(0).(func(context.Context) (string, error)); ok { + return rf(ctx) + } + if rf, ok := ret.Get(0).(func(context.Context) string); ok { + r0 = rf(ctx) + } else { + r0 = ret.Get(0).(string) + } + + if rf, ok := ret.Get(1).(func(context.Context) error); ok { + r1 = rf(ctx) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Client_ReadOutputJSON_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'ReadOutputJSON' +type Client_ReadOutputJSON_Call struct { + *mock.Call +} + +// ReadOutputJSON is a helper method to define mock.On call +// - ctx context.Context +func (_e *Client_Expecter) ReadOutputJSON(ctx interface{}) *Client_ReadOutputJSON_Call { + return &Client_ReadOutputJSON_Call{Call: _e.mock.On("ReadOutputJSON", ctx)} +} + +func (_c *Client_ReadOutputJSON_Call) Run(run func(ctx context.Context)) *Client_ReadOutputJSON_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(context.Context)) + }) + return _c +} + +func (_c *Client_ReadOutputJSON_Call) Return(_a0 string, _a1 error) *Client_ReadOutputJSON_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Client_ReadOutputJSON_Call) RunAndReturn(run func(context.Context) (string, error)) *Client_ReadOutputJSON_Call { + _c.Call.Return(run) + return _c +} + // Reboot provides a mock function with given fields: ctx func (_m *Client) Reboot(ctx context.Context) sshclient.Output { ret := _m.Called(ctx) @@ -1319,9 +1375,9 @@ func (_c *Client_ResetKubeadm_Call) RunAndReturn(run func(context.Context) sshcl return _c } -// StartImageURLCommand provides a mock function with given fields: ctx, command, imageURL, bootstrapData, machineName, deviceNames -func (_m *Client) StartImageURLCommand(ctx context.Context, command string, imageURL string, bootstrapData []byte, machineName string, deviceNames []string) (int, string, error) { - ret := _m.Called(ctx, command, imageURL, bootstrapData, machineName, deviceNames) +// StartImageURLCommand provides a mock function with given fields: ctx, command, imageURL, bootstrapData, machineName, deviceNames, apiVersion +func (_m *Client) StartImageURLCommand(ctx context.Context, command string, imageURL string, bootstrapData []byte, machineName string, deviceNames []string, apiVersion string) (int, string, error) { + ret := _m.Called(ctx, command, imageURL, bootstrapData, machineName, deviceNames, apiVersion) if len(ret) == 0 { panic("no return value specified for StartImageURLCommand") @@ -1330,23 +1386,23 @@ func (_m *Client) StartImageURLCommand(ctx context.Context, command string, imag var r0 int var r1 string var r2 error - if rf, ok := ret.Get(0).(func(context.Context, string, string, []byte, string, []string) (int, string, error)); ok { - return rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames) + if rf, ok := ret.Get(0).(func(context.Context, string, string, []byte, string, []string, string) (int, string, error)); ok { + return rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames, apiVersion) } - if rf, ok := ret.Get(0).(func(context.Context, string, string, []byte, string, []string) int); ok { - r0 = rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames) + if rf, ok := ret.Get(0).(func(context.Context, string, string, []byte, string, []string, string) int); ok { + r0 = rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames, apiVersion) } else { r0 = ret.Get(0).(int) } - if rf, ok := ret.Get(1).(func(context.Context, string, string, []byte, string, []string) string); ok { - r1 = rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames) + if rf, ok := ret.Get(1).(func(context.Context, string, string, []byte, string, []string, string) string); ok { + r1 = rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames, apiVersion) } else { r1 = ret.Get(1).(string) } - if rf, ok := ret.Get(2).(func(context.Context, string, string, []byte, string, []string) error); ok { - r2 = rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames) + if rf, ok := ret.Get(2).(func(context.Context, string, string, []byte, string, []string, string) error); ok { + r2 = rf(ctx, command, imageURL, bootstrapData, machineName, deviceNames, apiVersion) } else { r2 = ret.Error(2) } @@ -1366,13 +1422,14 @@ type Client_StartImageURLCommand_Call struct { // - bootstrapData []byte // - machineName string // - deviceNames []string -func (_e *Client_Expecter) StartImageURLCommand(ctx interface{}, command interface{}, imageURL interface{}, bootstrapData interface{}, machineName interface{}, deviceNames interface{}) *Client_StartImageURLCommand_Call { - return &Client_StartImageURLCommand_Call{Call: _e.mock.On("StartImageURLCommand", ctx, command, imageURL, bootstrapData, machineName, deviceNames)} +// - apiVersion string +func (_e *Client_Expecter) StartImageURLCommand(ctx interface{}, command interface{}, imageURL interface{}, bootstrapData interface{}, machineName interface{}, deviceNames interface{}, apiVersion interface{}) *Client_StartImageURLCommand_Call { + return &Client_StartImageURLCommand_Call{Call: _e.mock.On("StartImageURLCommand", ctx, command, imageURL, bootstrapData, machineName, deviceNames, apiVersion)} } -func (_c *Client_StartImageURLCommand_Call) Run(run func(ctx context.Context, command string, imageURL string, bootstrapData []byte, machineName string, deviceNames []string)) *Client_StartImageURLCommand_Call { +func (_c *Client_StartImageURLCommand_Call) Run(run func(ctx context.Context, command string, imageURL string, bootstrapData []byte, machineName string, deviceNames []string, apiVersion string)) *Client_StartImageURLCommand_Call { _c.Call.Run(func(args mock.Arguments) { - run(args[0].(context.Context), args[1].(string), args[2].(string), args[3].([]byte), args[4].(string), args[5].([]string)) + run(args[0].(context.Context), args[1].(string), args[2].(string), args[3].([]byte), args[4].(string), args[5].([]string), args[6].(string)) }) return _c } @@ -1382,7 +1439,7 @@ func (_c *Client_StartImageURLCommand_Call) Return(exitStatus int, stdoutAndStde return _c } -func (_c *Client_StartImageURLCommand_Call) RunAndReturn(run func(context.Context, string, string, []byte, string, []string) (int, string, error)) *Client_StartImageURLCommand_Call { +func (_c *Client_StartImageURLCommand_Call) RunAndReturn(run func(context.Context, string, string, []byte, string, []string, string) (int, string, error)) *Client_StartImageURLCommand_Call { _c.Call.Return(run) return _c } @@ -1450,6 +1507,69 @@ func (_c *Client_StateOfImageURLCommand_Call) RunAndReturn(run func(context.Cont return _c } +// StateOfImageURLCommandV2 provides a mock function with given fields: ctx +func (_m *Client) StateOfImageURLCommandV2(ctx context.Context) (sshclient.ImageURLCommandState, string, error) { + ret := _m.Called(ctx) + + if len(ret) == 0 { + panic("no return value specified for StateOfImageURLCommandV2") + } + + var r0 sshclient.ImageURLCommandState + var r1 string + var r2 error + if rf, ok := ret.Get(0).(func(context.Context) (sshclient.ImageURLCommandState, string, error)); ok { + return rf(ctx) + } + if rf, ok := ret.Get(0).(func(context.Context) sshclient.ImageURLCommandState); ok { + r0 = rf(ctx) + } else { + r0 = ret.Get(0).(sshclient.ImageURLCommandState) + } + + if rf, ok := ret.Get(1).(func(context.Context) string); ok { + r1 = rf(ctx) + } else { + r1 = ret.Get(1).(string) + } + + if rf, ok := ret.Get(2).(func(context.Context) error); ok { + r2 = rf(ctx) + } else { + r2 = ret.Error(2) + } + + return r0, r1, r2 +} + +// Client_StateOfImageURLCommandV2_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'StateOfImageURLCommandV2' +type Client_StateOfImageURLCommandV2_Call struct { + *mock.Call +} + +// StateOfImageURLCommandV2 is a helper method to define mock.On call +// - ctx context.Context +func (_e *Client_Expecter) StateOfImageURLCommandV2(ctx interface{}) *Client_StateOfImageURLCommandV2_Call { + return &Client_StateOfImageURLCommandV2_Call{Call: _e.mock.On("StateOfImageURLCommandV2", ctx)} +} + +func (_c *Client_StateOfImageURLCommandV2_Call) Run(run func(ctx context.Context)) *Client_StateOfImageURLCommandV2_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(context.Context)) + }) + return _c +} + +func (_c *Client_StateOfImageURLCommandV2_Call) Return(state sshclient.ImageURLCommandState, outputJSON string, err error) *Client_StateOfImageURLCommandV2_Call { + _c.Call.Return(state, outputJSON, err) + return _c +} + +func (_c *Client_StateOfImageURLCommandV2_Call) RunAndReturn(run func(context.Context) (sshclient.ImageURLCommandState, string, error)) *Client_StateOfImageURLCommandV2_Call { + _c.Call.Return(run) + return _c +} + // UntarTGZ provides a mock function with given fields: ctx func (_m *Client) UntarTGZ(ctx context.Context) sshclient.Output { ret := _m.Called(ctx) diff --git a/pkg/services/baremetal/client/ssh/ssh_client.go b/pkg/services/baremetal/client/ssh/ssh_client.go index 510fa5998..1fb7c2d72 100644 --- a/pkg/services/baremetal/client/ssh/ssh_client.go +++ b/pkg/services/baremetal/client/ssh/ssh_client.go @@ -23,6 +23,7 @@ import ( "context" _ "embed" "encoding/base64" + "encoding/json" "errors" "fmt" "io" @@ -45,9 +46,34 @@ const ( sshTimeOut time.Duration = 5 * time.Second sshUser = "root" - imageURLCommandLog = "/root/image-url-command.log" + imageURLCommandLog = "/root/image-url-command.log" + outputJSONPath = "/root/output.json" + outputJSONMaxRetries = 10 ) +// ImageURLCommandOutputV2 is the format of /root/output.json written by the v2 image-url-command binary. +// Status is the top-level completion signal: "Succeeded" or "Failed". A missing or empty Status +// means the file is not yet complete (binary still running or crashed before the atomic rename). +type ImageURLCommandOutputV2 struct { + APIVersion string `json:"apiVersion"` + Status string `json:"status"` + Phases map[string]ImageURLCommandPhaseResult `json:"phases"` +} + +// ImageURLCommandPhaseResult holds the result of one provisioning phase. +type ImageURLCommandPhaseResult struct { + Status string `json:"status"` + FailedStep string `json:"failedStep,omitempty"` + Steps []ImageURLCommandStepResult `json:"steps"` +} + +// ImageURLCommandStepResult holds the result of one step within a phase. +type ImageURLCommandStepResult struct { + Name string `json:"name"` + Status string `json:"status"` + Message string `json:"message"` +} + //go:embed detect-linux-on-another-disk.sh var detectLinuxOnAnotherDiskShellScript string @@ -210,11 +236,22 @@ type Client interface { // This gets used when imageURL set. // For hcloud deviceNames is always {"sda"}. For baremetal it corresponds to the WWNs // of RootDeviceHints. - StartImageURLCommand(ctx context.Context, command, imageURL string, bootstrapData []byte, machineName string, deviceNames []string) (exitStatus int, stdoutAndStderr string, err error) + // apiVersion is appended as --api-version= unless it is empty or "v1". + StartImageURLCommand(ctx context.Context, command, imageURL string, bootstrapData []byte, machineName string, deviceNames []string, apiVersion string) (exitStatus int, stdoutAndStderr string, err error) // StateOfImageURLCommand returns the current states of the ImageURLCommand. States can // be: NotStarted, Running, Failed, FinishedSuccesfully. StateOfImageURLCommand(ctx context.Context) (state ImageURLCommandState, logFile string, err error) + + // StateOfImageURLCommandV2 is the v2 variant of StateOfImageURLCommand. + // Instead of checking IMAGE_URL_DONE in the log, it reads /root/output.json once the + // binary exits. Returns (FinishedSuccessfully, outputJSONContent, nil) on completion. + StateOfImageURLCommandV2(ctx context.Context) (state ImageURLCommandState, outputJSON string, err error) + + // ReadOutputJSON reads /root/output.json from the rescue system. + // It retries up to outputJSONMaxRetries times when the content does not end with '}', + // which guards against reading a partially-written file. + ReadOutputJSON(ctx context.Context) (string, error) } // Factory is the interface for creating new Client objects. @@ -799,7 +836,7 @@ func (c *sshClient) ExecutePreProvisionCommand(ctx context.Context, command stri return exitStatus, s, nil } -func (c *sshClient) StartImageURLCommand(ctx context.Context, command, imageURL string, bootstrapData []byte, machineName string, deviceNames []string) (int, string, error) { +func (c *sshClient) StartImageURLCommand(ctx context.Context, command, imageURL string, bootstrapData []byte, machineName string, deviceNames []string, apiVersion string) (int, string, error) { logger := ctrl.LoggerFrom(ctx).WithName("ssh-client") // validate deviceNames @@ -860,11 +897,15 @@ func (c *sshClient) StartImageURLCommand(ctx context.Context, command, imageURL return 0, "", fmt.Errorf("error copying bootstrap data to %s:%d:%s %w", c.ip, c.port, dest, err) } + apiVersionFlag := "" + if apiVersion != "" && apiVersion != "v1" { + apiVersionFlag = " --api-version=" + apiVersion + } cmd := fmt.Sprintf(`#!/usr/bin/bash -OCI_REGISTRY_AUTH_TOKEN='%s' nohup /root/image-url-command '%s' /root/bootstrap.data '%s' '%s' >%s 2>&1 %s 2>&1 /root/image-url-command.pid `, os.Getenv("OCI_REGISTRY_AUTH_TOKEN"), imageURL, machineName, strings.Join(deviceNames, " "), - imageURLCommandLog) + apiVersionFlag, imageURLCommandLog) out := c.runSSH(ctx, cmd) @@ -929,3 +970,56 @@ func (c *sshClient) getImageURLCommandOutput(ctx context.Context) (string, error } return out.StdOut, nil } + +func (c *sshClient) ReadOutputJSON(ctx context.Context) (string, error) { + for range outputJSONMaxRetries { + out := c.runSSH(ctx, "cat "+outputJSONPath) + exitStatus, err := out.ExitStatus() + if err != nil { + return "", fmt.Errorf("reading output.json: %w", err) + } + if exitStatus != 0 { + return "", fmt.Errorf("reading output.json failed with exit status %d", exitStatus) + } + content := strings.TrimSpace(out.StdOut) + if strings.HasSuffix(content, "}") { + return content, nil + } + } + return "", fmt.Errorf("output.json did not end with '}' after %d retries", outputJSONMaxRetries) +} + +func (c *sshClient) StateOfImageURLCommandV2(ctx context.Context) (state ImageURLCommandState, outputJSON string, err error) { + out := c.runSSH(ctx, `[ -e /root/image-url-command.pid ]`) + exitStatus, err := out.ExitStatus() + if err != nil { + return ImageURLCommandStateNotStarted, "", fmt.Errorf("checking PID file: %w", err) + } + if exitStatus > 0 { + return ImageURLCommandStateNotStarted, "", nil + } + + out = c.runSSH(ctx, `ps -p "$(cat /root/image-url-command.pid)" -o args= | grep -q image-url-command`) + exitStatus, err = out.ExitStatus() + if err != nil { + return ImageURLCommandStateNotStarted, "", fmt.Errorf("checking if image-url-command is running: %w", err) + } + if exitStatus == 0 { + return ImageURLCommandStateRunning, "", nil + } + + // Process has exited. Fail immediately if output.json is missing or incomplete — + // don't wait for the 7-minute timeout. + content, err := c.ReadOutputJSON(ctx) + if err != nil { + return ImageURLCommandStateFailed, fmt.Sprintf("process exited but output.json not readable: %s", err), nil + } + var output ImageURLCommandOutputV2 + if err := json.Unmarshal([]byte(content), &output); err != nil { + return ImageURLCommandStateFailed, content, nil + } + if output.Status == "" { + return ImageURLCommandStateFailed, content, nil + } + return ImageURLCommandStateFinishedSuccessfully, content, nil +} diff --git a/pkg/services/baremetal/client/ssh/ssh_client_imageurl_v2_test.go b/pkg/services/baremetal/client/ssh/ssh_client_imageurl_v2_test.go new file mode 100644 index 000000000..8049c4363 --- /dev/null +++ b/pkg/services/baremetal/client/ssh/ssh_client_imageurl_v2_test.go @@ -0,0 +1,212 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package sshclient contains the interface to speak to bare metal servers with ssh. +package sshclient + +import ( + "context" + "crypto/rand" + "crypto/rsa" + "crypto/x509" + "encoding/binary" + "encoding/pem" + "net" + "testing" + + "github.com/stretchr/testify/require" + "golang.org/x/crypto/ssh" +) + +// fakeSSHResponse defines what a fake SSH server returns for a given command. +type fakeSSHResponse struct { + stdout string + exitCode uint32 +} + +// startFakeSSHServer starts an in-process SSH server that maps exact command strings +// to fixed responses. Each runSSH call opens a new TCP connection, so the server +// handles concurrent connections. Returns a PEM-encoded RSA private key the sshClient +// can use for authentication (server accepts any key) and the listener port. +func startFakeSSHServer(t *testing.T, responses map[string]fakeSSHResponse) (clientPrivKeyPEM string, port int) { + t.Helper() + + hostKey, err := rsa.GenerateKey(rand.Reader, 2048) + require.NoError(t, err) + hostSigner, err := ssh.NewSignerFromKey(hostKey) + require.NoError(t, err) + + clientKey, err := rsa.GenerateKey(rand.Reader, 2048) + require.NoError(t, err) + clientPEM := string(pem.EncodeToMemory(&pem.Block{ + Type: "RSA PRIVATE KEY", + Bytes: x509.MarshalPKCS1PrivateKey(clientKey), + })) + + serverConfig := &ssh.ServerConfig{NoClientAuth: true} + serverConfig.AddHostKey(hostSigner) + + ln, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + t.Cleanup(func() { _ = ln.Close() }) + + go func() { + for { + conn, err := ln.Accept() + if err != nil { + return + } + go fakeSSHHandleConn(conn, serverConfig, responses) + } + }() + + return clientPEM, ln.Addr().(*net.TCPAddr).Port +} + +func fakeSSHHandleConn(conn net.Conn, config *ssh.ServerConfig, responses map[string]fakeSSHResponse) { + srvConn, chans, reqs, err := ssh.NewServerConn(conn, config) + if err != nil { + return + } + defer srvConn.Close() + go ssh.DiscardRequests(reqs) + for newChan := range chans { + if newChan.ChannelType() != "session" { + _ = newChan.Reject(ssh.UnknownChannelType, "unknown channel type") + continue + } + ch, requests, err := newChan.Accept() + if err != nil { + return + } + go fakeSSHHandleSession(ch, requests, responses) + } +} + +func fakeSSHHandleSession(ch ssh.Channel, requests <-chan *ssh.Request, responses map[string]fakeSSHResponse) { + defer ch.Close() + for req := range requests { + if req.Type != "exec" { + if req.WantReply { + _ = req.Reply(false, nil) + } + continue + } + if req.WantReply { + _ = req.Reply(true, nil) + } + // SSH exec payload: 4-byte big-endian length followed by the command string. + if len(req.Payload) < 4 { + fakeSSHSendExitStatus(ch, 1) + return + } + cmdLen := binary.BigEndian.Uint32(req.Payload[:4]) + cmd := string(req.Payload[4 : 4+cmdLen]) + + resp, ok := responses[cmd] + if !ok { + fakeSSHSendExitStatus(ch, 127) + return + } + if resp.stdout != "" { + _, _ = ch.Write([]byte(resp.stdout)) + } + fakeSSHSendExitStatus(ch, resp.exitCode) + return + } +} + +func fakeSSHSendExitStatus(ch ssh.Channel, code uint32) { + payload := make([]byte, 4) + binary.BigEndian.PutUint32(payload, code) + _, _ = ch.SendRequest("exit-status", false, payload) +} + +// TestStateOfImageURLCommandV2_ProcessExitedWithoutOutputJSON verifies that +// StateOfImageURLCommandV2 returns ImageURLCommandStateFailed immediately when +// the image-url-command process has exited but output.json is missing. +// +// Without the fix this test exposes, the function returned ImageURLCommandStateRunning, +// causing the controller to wait up to 7 minutes for a timeout that was never needed. +func TestStateOfImageURLCommandV2_ProcessExitedWithoutOutputJSON(t *testing.T) { + privKeyPEM, port := startFakeSSHServer(t, map[string]fakeSSHResponse{ + `[ -e /root/image-url-command.pid ]`: {exitCode: 0}, // PID file exists + `ps -p "$(cat /root/image-url-command.pid)" -o args= | grep -q image-url-command`: {exitCode: 1}, // process gone + `cat ` + outputJSONPath: {exitCode: 1}, // output.json missing + }) + + c := sshClient{ip: "127.0.0.1", privateSSHKey: privKeyPEM, port: port} + + state, detail, err := c.StateOfImageURLCommandV2(context.Background()) + require.NoError(t, err) + require.Equal(t, ImageURLCommandStateFailed, state, + "expected immediate failure when process is gone without output.json, not ImageURLCommandStateRunning") + require.Contains(t, detail, "process exited") +} + +// TestStateOfImageURLCommandV2_ProcessExitedWithEmptyStatus verifies that +// StateOfImageURLCommandV2 returns ImageURLCommandStateFailed when the process +// has exited and output.json exists but has an empty Status field (e.g. the +// binary wrote a partial file before crashing). +func TestStateOfImageURLCommandV2_ProcessExitedWithEmptyStatus(t *testing.T) { + privKeyPEM, port := startFakeSSHServer(t, map[string]fakeSSHResponse{ + `[ -e /root/image-url-command.pid ]`: {exitCode: 0}, + `ps -p "$(cat /root/image-url-command.pid)" -o args= | grep -q image-url-command`: {exitCode: 1}, + `cat ` + outputJSONPath: {exitCode: 0, stdout: `{"apiVersion":"v2"}`}, // Status field missing + }) + + c := sshClient{ip: "127.0.0.1", privateSSHKey: privKeyPEM, port: port} + + state, _, err := c.StateOfImageURLCommandV2(context.Background()) + require.NoError(t, err) + require.Equal(t, ImageURLCommandStateFailed, state, + "expected failure when output.json has no Status field") +} + +// TestStateOfImageURLCommandV2_ProcessStillRunning verifies that +// StateOfImageURLCommandV2 returns ImageURLCommandStateRunning while the +// process is still alive (the normal in-progress case). +func TestStateOfImageURLCommandV2_ProcessStillRunning(t *testing.T) { + privKeyPEM, port := startFakeSSHServer(t, map[string]fakeSSHResponse{ + `[ -e /root/image-url-command.pid ]`: {exitCode: 0}, + `ps -p "$(cat /root/image-url-command.pid)" -o args= | grep -q image-url-command`: {exitCode: 0}, // still running + }) + + c := sshClient{ip: "127.0.0.1", privateSSHKey: privKeyPEM, port: port} + + state, _, err := c.StateOfImageURLCommandV2(context.Background()) + require.NoError(t, err) + require.Equal(t, ImageURLCommandStateRunning, state) +} + +// TestStateOfImageURLCommandV2_FinishedSuccessfully verifies that +// StateOfImageURLCommandV2 returns ImageURLCommandStateFinishedSuccessfully +// when the process has exited and output.json contains a non-empty Status. +func TestStateOfImageURLCommandV2_FinishedSuccessfully(t *testing.T) { + outputJSON := `{"apiVersion":"v2","status":"Succeeded","phases":{}}` + privKeyPEM, port := startFakeSSHServer(t, map[string]fakeSSHResponse{ + `[ -e /root/image-url-command.pid ]`: {exitCode: 0}, + `ps -p "$(cat /root/image-url-command.pid)" -o args= | grep -q image-url-command`: {exitCode: 1}, + `cat ` + outputJSONPath: {exitCode: 0, stdout: outputJSON}, + }) + + c := sshClient{ip: "127.0.0.1", privateSSHKey: privKeyPEM, port: port} + + state, content, err := c.StateOfImageURLCommandV2(context.Background()) + require.NoError(t, err) + require.Equal(t, ImageURLCommandStateFinishedSuccessfully, state) + require.Equal(t, outputJSON, content) +} diff --git a/pkg/services/baremetal/host/host.go b/pkg/services/baremetal/host/host.go index 747e2a9df..ccea2f14e 100644 --- a/pkg/services/baremetal/host/host.go +++ b/pkg/services/baremetal/host/host.go @@ -1317,7 +1317,7 @@ func (s *Service) actionImageInstallingImageURLCommand(ctx context.Context, sshC // get device names from storage device deviceNames := getDeviceNames(s.scope.HetznerBareMetalHost.Spec.RootDeviceHints.ListOfWWN(), storage) - exitStatus, stdoutStderr, err := sshClient.StartImageURLCommand(ctx, commandPath, s.scope.HetznerBareMetalHost.Spec.Status.InstallImage.Image.URL, data, s.scope.Hostname(), deviceNames) + exitStatus, stdoutStderr, err := sshClient.StartImageURLCommand(ctx, commandPath, s.scope.HetznerBareMetalHost.Spec.Status.InstallImage.Image.URL, data, s.scope.Hostname(), deviceNames, "") if err != nil { err := fmt.Errorf("StartImageURLCommand failed (retrying): %w", err) // This could be a temporary network error. Retry. diff --git a/pkg/services/baremetal/host/host_test.go b/pkg/services/baremetal/host/host_test.go index 9f6f9bb70..ab25e025f 100644 --- a/pkg/services/baremetal/host/host_test.go +++ b/pkg/services/baremetal/host/host_test.go @@ -196,7 +196,7 @@ var _ = Describe("actionImageInstalling (image-url-command)", func() { svc := newTestService(host, nil, bmmock.NewSSHFactory(sshMock, sshMock, sshMock), nil, helpers.GetDefaultSSHSecret(rescueSSHKeyName, "default")) commandPath := filepath.Join(baremetalImageURLCommandDir, host.Spec.Status.InstallImage.ImageURLCommand) - sshMock.On("StartImageURLCommand", mock.Anything, commandPath, host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"}).Return(0, "", nil) + sshMock.On("StartImageURLCommand", mock.Anything, commandPath, host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"}, "").Return(0, "", nil) // Create bootstrap secret in fake client with key 'value' secret := &corev1.Secret{ObjectMeta: metav1.ObjectMeta{Name: host.Spec.Status.UserData.Name, Namespace: host.Spec.Status.UserData.Namespace}, Data: map[string][]byte{"value": []byte("#cloud-config")}} Expect(svc.scope.Client.Create(ctx, secret)).To(Succeed()) @@ -208,7 +208,7 @@ var _ = Describe("actionImageInstalling (image-url-command)", func() { res := svc.actionImageInstalling(ctx) Expect(res).To(BeAssignableToTypeOf(actionContinue{})) - Expect(sshMock.AssertCalled(GinkgoT(), "StartImageURLCommand", mock.Anything, commandPath, host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"})).To(BeTrue()) + Expect(sshMock.AssertCalled(GinkgoT(), "StartImageURLCommand", mock.Anything, commandPath, host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"}, "")).To(BeTrue()) c := v1beta1conditions.Get(host, infrav1.ProvisionSucceededCondition) Expect(c.Message).To(ContainSubstring(`imageURLCommand started`)) }) @@ -223,7 +223,7 @@ var _ = Describe("actionImageInstalling (image-url-command)", func() { svc := newTestService(host, nil, bmmock.NewSSHFactory(sshMock, sshMock, sshMock), nil, helpers.GetDefaultSSHSecret(rescueSSHKeyName, "default")) commandPath := filepath.Join(baremetalImageURLCommandDir, host.Spec.Status.InstallImage.ImageURLCommand) - sshMock.On("StartImageURLCommand", mock.Anything, commandPath, host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"}).Return(7, "boom", nil) + sshMock.On("StartImageURLCommand", mock.Anything, commandPath, host.Spec.Status.InstallImage.Image.URL, mock.Anything, svc.scope.Hostname(), []string{"nvme1n1"}, "").Return(7, "boom", nil) secret := &corev1.Secret{ObjectMeta: metav1.ObjectMeta{Name: host.Spec.Status.UserData.Name, Namespace: host.Spec.Status.UserData.Namespace}, Data: map[string][]byte{"value": []byte("#cloud-config")}} Expect(svc.scope.Client.Create(ctx, secret)).To(Succeed()) diff --git a/pkg/services/hcloud/server/server.go b/pkg/services/hcloud/server/server.go index 171d21ec8..6aacb8fe4 100644 --- a/pkg/services/hcloud/server/server.go +++ b/pkg/services/hcloud/server/server.go @@ -19,6 +19,7 @@ package server import ( "context" + "encoding/json" "errors" "fmt" "net" @@ -606,7 +607,7 @@ func (s *Service) handleBootStateBootingToRescue(ctx context.Context, server *hc return reconcile.Result{}, fmt.Errorf("resolving imageURLCommand failed: %w", err) } - exitStatus, stdoutStderr, err := sshClient.StartImageURLCommand(ctx, imageURLCommandPath, hm.Spec.ImageURL, data, s.scope.Name(), []string{"sda"}) + exitStatus, stdoutStderr, err := sshClient.StartImageURLCommand(ctx, imageURLCommandPath, hm.Spec.ImageURL, data, s.scope.Name(), []string{"sda"}, hm.Spec.ImageURLCommandAPIVersion) if err != nil { err := fmt.Errorf("StartImageURLCommand failed (retrying): %w", err) // This could be a temporary network error. Retry. @@ -653,6 +654,10 @@ func (s *Service) handleBootStateRunningImageCommand(ctx context.Context, server return reconcile.Result{}, fmt.Errorf("getSSHClient failed (wait for image-url-command): %w", err) } + if hm.Spec.ImageURLCommandAPIVersion == "v2" { + return s.handleBootStateRunningImageCommandV2(ctx, hcloudSSHClient) + } + state, logFile, err := hcloudSSHClient.StateOfImageURLCommand(ctx) if err != nil { return reconcile.Result{}, fmt.Errorf("StateOfImageURLCommand failed: %w", err) @@ -720,6 +725,147 @@ func (s *Service) handleBootStateRunningImageCommand(ctx context.Context, server } } +// handleBootStateRunningImageCommandV2 handles the RunningImageCommand boot state for +// imageURLCommandAPIVersion=v2. It reads /root/output.json once the binary exits and maps +// phase results to HCloudMachine conditions. +func (s *Service) handleBootStateRunningImageCommandV2(ctx context.Context, hcloudSSHClient sshclient.Client) (reconcile.Result, error) { + hm := s.scope.HCloudMachine + + state, outputJSON, err := hcloudSSHClient.StateOfImageURLCommandV2(ctx) + if err != nil { + return reconcile.Result{}, fmt.Errorf("StateOfImageURLCommandV2 failed: %w", err) + } + + durationOfState := time.Since(hm.Status.BootStateSince.Time) + // Please keep the number (7) in sync with the docstring of ImageURL. + if durationOfState > 7*time.Minute { + msg := fmt.Sprintf("ImageURLCommand timed out after %s. Deleting machine", + durationOfState.Round(time.Second).String()) + s.scope.Error(errors.New(msg), "", "outputJSON", outputJSON) + if err := s.scope.SetErrorAndRemediate(ctx, msg); err != nil { + return reconcile.Result{}, err + } + record.Warn(hm, "ImageURLCommandFailed", outputJSON) + v1beta1conditions.MarkFalse(hm, infrav1.ServerProvisionedCondition, + "RunningImageCommandTimedOut", clusterv1beta1.ConditionSeverityWarning, "%s", msg) + return reconcile.Result{}, nil + } + + switch state { + case sshclient.ImageURLCommandStateRunning: + v1beta1conditions.MarkFalse(hm, infrav1.ServerProvisionedCondition, + "HCloudImageURLCommandRunning", clusterv1beta1.ConditionSeverityInfo, + "imageURLCommand running") + return reconcile.Result{RequeueAfter: 5 * time.Second}, nil + + case sshclient.ImageURLCommandStateFinishedSuccessfully: + var output sshclient.ImageURLCommandOutputV2 + if err := json.Unmarshal([]byte(outputJSON), &output); err != nil { + return reconcile.Result{}, fmt.Errorf("parsing output.json: %w", err) + } + + allSucceeded := applyNodeProvisioningConditions(hm, output) + if !allSucceeded { + msg := "ImageURLCommand provisioning failed. Deleting machine" + s.scope.Error(errors.New(msg), "", "outputJSON", outputJSON) + if err := s.scope.SetErrorAndRemediate(ctx, msg); err != nil { + return reconcile.Result{}, err + } + record.Warn(hm, "ImageURLCommandFailed", outputJSON) + v1beta1conditions.MarkFalse(hm, infrav1.ServerProvisionedCondition, + "ImageCommandFailed", clusterv1beta1.ConditionSeverityWarning, "%s", msg) + return reconcile.Result{}, nil + } + + if rebootErr := hcloudSSHClient.Reboot(ctx).Err; rebootErr != nil { + return reconcile.Result{}, fmt.Errorf("reboot after ImageURLCommand failed: %w", rebootErr) + } + hm.SetBootState(infrav1.HCloudBootStateBootingToRealOS) + v1beta1conditions.MarkFalse(hm, infrav1.ServerProvisionedCondition, + "BootingToRealOS", clusterv1beta1.ConditionSeverityInfo, + "Operating system of node is booting") + return reconcile.Result{RequeueAfter: requeueImmediately}, nil + + case sshclient.ImageURLCommandStateFailed: + msg := "ImageURLCommand failed to read output.json. Deleting machine" + s.scope.Error(errors.New(msg), "", "detail", outputJSON) + if err := s.scope.SetErrorAndRemediate(ctx, msg); err != nil { + return reconcile.Result{}, err + } + v1beta1conditions.MarkFalse(hm, infrav1.ServerProvisionedCondition, + "ImageCommandFailed", clusterv1beta1.ConditionSeverityWarning, "%s", msg) + return reconcile.Result{}, nil + + case sshclient.ImageURLCommandStateNotStarted: + return reconcile.Result{}, fmt.Errorf("image-url-command not started in BootState RunningImageCommand? Should not happen") + + default: + return reconcile.Result{}, fmt.Errorf("unknown ImageURLCommandState: %q", state) + } +} + +// applyNodeProvisioningConditions parses an ImageURLCommandOutputV2 and sets phase conditions +// on the HCloudMachine. Returns true if all four phases succeeded. +func applyNodeProvisioningConditions(hm *infrav1.HCloudMachine, output sshclient.ImageURLCommandOutputV2) bool { + type phaseMapping struct { + name string + condition clusterv1beta1.ConditionType + } + phases := []phaseMapping{ + {"Preparation", infrav1.PreparationSucceededCondition}, + {"ImageDeployment", infrav1.ImageDeploymentSucceededCondition}, + {"BootstrapDelivery", infrav1.BootstrapDeliverySucceededCondition}, + {"Handover", infrav1.HandoverSucceededCondition}, + } + + allSucceeded := true + anyFailed := false + for _, pm := range phases { + phase, ok := output.Phases[pm.name] + if !ok { + v1beta1conditions.MarkUnknown(hm, pm.condition, + infrav1.ProvisioningPhaseNotStartedReason, "phase not present in output.json") + allSucceeded = false + continue + } + switch phase.Status { + case "Succeeded": + v1beta1conditions.MarkTrue(hm, pm.condition) + case "Failed": + reason := infrav1.ProvisioningPhaseFailedReason + message := "" + for _, step := range phase.Steps { + if step.Name == phase.FailedStep { + reason = step.Name + message = step.Message + break + } + } + v1beta1conditions.MarkFalse(hm, pm.condition, reason, + clusterv1beta1.ConditionSeverityError, "%s", message) + allSucceeded = false + anyFailed = true + default: // "NotStarted" or unknown + v1beta1conditions.MarkUnknown(hm, pm.condition, + infrav1.ProvisioningPhaseNotStartedReason, "phase was not reached") + allSucceeded = false + } + } + + if allSucceeded { + v1beta1conditions.MarkTrue(hm, infrav1.NodeProvisioningSucceededCondition) + } else if anyFailed { + v1beta1conditions.MarkFalse(hm, infrav1.NodeProvisioningSucceededCondition, + infrav1.ProvisioningPhaseFailedReason, clusterv1beta1.ConditionSeverityError, + "A provisioning phase failed.") + } else { + v1beta1conditions.MarkUnknown(hm, infrav1.NodeProvisioningSucceededCondition, + infrav1.ProvisioningPhaseNotStartedReason, "Not all phases completed.") + } + + return allSucceeded +} + // handleBootingToRealOS is used for both ways (imageName/snapshot and imageURL). func (s *Service) handleBootingToRealOS(ctx context.Context, server *hcloud.Server) (res reconcile.Result, err error) { hm := s.scope.HCloudMachine diff --git a/pkg/services/hcloud/server/server_test.go b/pkg/services/hcloud/server/server_test.go index 2a33581df..12dafe2ab 100644 --- a/pkg/services/hcloud/server/server_test.go +++ b/pkg/services/hcloud/server/server_test.go @@ -1189,7 +1189,7 @@ var _ = Describe("Reconcile", func() { StdErr: "", Err: nil, }) - startImageURLCommandMock := testEnv.HCloudSSHClient.On("StartImageURLCommand", mock.Anything, mock.Anything, mock.Anything, mock.Anything, "my-machine", []string{"sda"}).Return(0, "", nil) + startImageURLCommandMock := testEnv.HCloudSSHClient.On("StartImageURLCommand", mock.Anything, mock.Anything, mock.Anything, mock.Anything, "my-machine", []string{"sda"}, mock.Anything).Return(0, "", nil) _, err = service.Reconcile(ctx) Expect(err).To(BeNil()) Expect(service.scope.HCloudMachine.Status.BootState).To(Equal(infrav1.HCloudBootStateRunningImageCommand))