Skip to content

Commit 52090eb

Browse files
authored
🌱 Remove usage of FailureReason and FailureMessage (hcloud, PR2) (#1770)
* 🌱 Remove usage of FailureReason and FailureMessage (baremetal) > Deprecated: This field is deprecated and is > going to be removed when support for v1beta1 > will be dropped. Please see > https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md > for more details.
1 parent bce32f9 commit 52090eb

13 files changed

Lines changed: 399 additions & 174 deletions

File tree

api/v1beta1/hcloudmachine_types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,12 +120,18 @@ type HCloudMachineStatus struct {
120120
// FailureReason will be set in the event that there is a terminal problem
121121
// reconciling the Machine and will contain a succinct value suitable
122122
// for machine interpretation.
123+
//
124+
// Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details.
125+
//
123126
// +optional
124127
FailureReason *capierrors.MachineStatusError `json:"failureReason,omitempty"`
125128

126129
// FailureMessage will be set in the event that there is a terminal problem
127130
// reconciling the Machine and will contain a more verbose string suitable
128131
// for logging and human consumption.
132+
//
133+
// Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details.
134+
//
129135
// +optional
130136
FailureMessage *string `json:"failureMessage,omitempty"`
131137

api/v1beta1/types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,4 +292,8 @@ const (
292292

293293
// HCloudBootStateOperatingSystemRunning indicates that the server is successfully running.
294294
HCloudBootStateOperatingSystemRunning HCloudBootState = "OperatingSystemRunning"
295+
296+
// HCloudBootStateProvisioningFailed indicates that provisioning failed. The capi machine, and
297+
// the hcloud machine will get deleted.
298+
HCloudBootStateProvisioningFailed HCloudBootState = "ProvisioningFailed"
295299
)

config/crd/bases/infrastructure.cluster.x-k8s.io_hcloudmachines.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -262,12 +262,16 @@ spec:
262262
FailureMessage will be set in the event that there is a terminal problem
263263
reconciling the Machine and will contain a more verbose string suitable
264264
for logging and human consumption.
265+
266+
Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details.
265267
type: string
266268
failureReason:
267269
description: |-
268270
FailureReason will be set in the event that there is a terminal problem
269271
reconciling the Machine and will contain a succinct value suitable
270272
for machine interpretation.
273+
274+
Deprecated: This field is deprecated and is going to be removed when support for v1beta1 will be dropped. Please see https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md for more details.
271275
type: string
272276
instanceState:
273277
description: InstanceState is the state of the server for this machine.

controllers/hcloudmachine_controller.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,12 @@ func (r *HCloudMachineReconciler) Reconcile(ctx context.Context, req reconcile.R
241241
return r.reconcileDelete(ctx, machineScope)
242242
}
243243

244-
if hcloudMachine.Status.FailureReason != nil {
245-
// This machine will be removed.
244+
if hcloudMachine.Status.BootState == infrav1.HCloudBootStateProvisioningFailed {
245+
// This hcloud machine will be removed soon.
246+
log.Info("hcloudmachine: ProvisioningFailed. Not reconciling this machine.")
246247
return reconcile.Result{}, nil
247248
}
249+
248250
return r.reconcileNormal(ctx, machineScope)
249251
}
250252

controllers/hcloudremediation_controller_test.go

Lines changed: 102 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,13 @@ import (
2727
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2828
"k8s.io/utils/ptr"
2929
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
30+
"sigs.k8s.io/cluster-api/util/conditions"
3031
"sigs.k8s.io/cluster-api/util/patch"
3132
"sigs.k8s.io/controller-runtime/pkg/client"
33+
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
3234

3335
infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
36+
"github.com/syself/cluster-api-provider-hetzner/pkg/scope"
3437
hcloudutil "github.com/syself/cluster-api-provider-hetzner/pkg/services/hcloud/util"
3538
"github.com/syself/cluster-api-provider-hetzner/pkg/utils"
3639
)
@@ -147,9 +150,8 @@ var _ = Describe("HCloudRemediationReconciler", func() {
147150
},
148151
},
149152
Spec: infrav1.HCloudMachineSpec{
150-
ImageName: "my-control-plane",
151-
Type: "cpx31",
152-
PlacementGroupName: &defaultPlacementGroupName,
153+
ImageName: "my-control-plane",
154+
Type: "cpx31",
153155
},
154156
}
155157
Expect(testEnv.Create(ctx, hcloudMachine)).To(Succeed())
@@ -227,14 +229,18 @@ var _ = Describe("HCloudRemediationReconciler", func() {
227229
Expect(testEnv.Create(ctx, hcloudRemediation)).To(Succeed())
228230

229231
By("checking if hcloudRemediation is in deleting phase and capiMachine has the MachineOwnerRemediatedCondition")
230-
Eventually(func() bool {
232+
Eventually(func() error {
231233
if err := testEnv.Get(ctx, hcloudRemediationkey, hcloudRemediation); err != nil {
232-
return false
234+
return err
233235
}
234-
235-
return hcloudRemediation.Status.Phase == infrav1.PhaseDeleting &&
236-
isPresentAndFalseWithReason(capiMachineKey, capiMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason)
237-
}, timeout).Should(BeTrue())
236+
if hcloudRemediation.Status.Phase != infrav1.PhaseDeleting {
237+
return fmt.Errorf("hcloudRemediation.Status.Phase is not infrav1.PhaseDeleting")
238+
}
239+
if !isPresentAndFalseWithReason(capiMachineKey, capiMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason) {
240+
return fmt.Errorf("MachineOwnerRemediatedCondition not set")
241+
}
242+
return nil
243+
}, timeout).Should(Succeed())
238244
})
239245

240246
It("checks that, under normal conditions, a reboot is carried out and retryCount and lastRemediated are set", func() {
@@ -318,5 +324,92 @@ var _ = Describe("HCloudRemediationReconciler", func() {
318324
isPresentAndFalseWithReason(capiMachineKey, capiMachine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason)
319325
}, timeout).Should(BeTrue())
320326
})
327+
It("should delete machine if SetErrorAndRemediate() was called", func() {
328+
By("Creating Server")
329+
330+
hcloudClient := testEnv.HCloudClientFactory.NewClient("dummy-token")
331+
332+
server, err := hcloudClient.CreateServer(ctx, hcloud.ServerCreateOpts{
333+
Name: "myserver",
334+
})
335+
Expect(err).ShouldNot(HaveOccurred())
336+
providerID := hcloudutil.ProviderIDFromServerID(int(server.ID))
337+
hcloudMachine.Spec.ProviderID = &providerID
338+
err = testEnv.Update(ctx, hcloudMachine)
339+
Expect(err).ShouldNot(HaveOccurred())
340+
341+
By("Call SetRemediateMachineAnnotationToDeleteMachine")
342+
Eventually(func() error {
343+
err = testEnv.Get(ctx, client.ObjectKeyFromObject(hcloudMachine), hcloudMachine)
344+
if err != nil {
345+
return err
346+
}
347+
err = scope.SetRemediateMachineAnnotationToDeleteMachine(ctx, testEnv, capiMachine, hcloudMachine, "test-of-set-error-and-remediate")
348+
if err != nil {
349+
return err
350+
}
351+
err = testEnv.Status().Update(ctx, hcloudMachine)
352+
if err != nil {
353+
return err
354+
}
355+
return nil
356+
}).Should(Succeed())
357+
358+
By("Wait until HCloudBootStateProvisioningFailed is set.")
359+
Eventually(func() error {
360+
err := testEnv.Get(ctx, client.ObjectKeyFromObject(hcloudMachine), hcloudMachine)
361+
if err != nil {
362+
return err
363+
}
364+
if hcloudMachine.Status.BootState != infrav1.HCloudBootStateProvisioningFailed {
365+
return fmt.Errorf("BootState is not HCloudBootStateProvisioningFailed, but %q",
366+
hcloudMachine.Status.BootState)
367+
}
368+
return nil
369+
}).Should(Succeed())
370+
371+
By("Do the job of CAPI: Create a HCloudRemediation")
372+
rem := &infrav1.HCloudRemediation{
373+
ObjectMeta: metav1.ObjectMeta{
374+
Name: hcloudMachine.Name,
375+
Namespace: hcloudMachine.Namespace,
376+
},
377+
Spec: infrav1.HCloudRemediationSpec{
378+
Strategy: &infrav1.RemediationStrategy{
379+
Type: infrav1.RemediationTypeReboot,
380+
RetryLimit: 5,
381+
Timeout: &metav1.Duration{
382+
Duration: time.Minute,
383+
},
384+
},
385+
},
386+
}
387+
388+
err = controllerutil.SetOwnerReference(capiMachine, rem, testEnv.GetScheme())
389+
Expect(err).Should(Succeed())
390+
391+
err = testEnv.Create(ctx, rem)
392+
Expect(err).ShouldNot(HaveOccurred())
393+
394+
By("Wait until our remediation controller has set condition on capi machine")
395+
Eventually(func() error {
396+
err := testEnv.Get(ctx, client.ObjectKeyFromObject(capiMachine), capiMachine)
397+
if err != nil {
398+
return err
399+
}
400+
401+
c := conditions.Get(capiMachine, clusterv1.MachineOwnerRemediatedCondition)
402+
if c == nil {
403+
return fmt.Errorf("not set: MachineOwnerRemediatedCondition")
404+
}
405+
if c.Status != corev1.ConditionFalse {
406+
return fmt.Errorf("status not set yet")
407+
}
408+
if c.Message != "Remediation finished (machine will be deleted): exit remediation because infra machine is in BootState ProvisioningFailed (no need to try a reboot)" {
409+
return fmt.Errorf("Message is not as expected: %q", c.Message)
410+
}
411+
return nil
412+
}).Should(Succeed())
413+
})
321414
})
322415
})

pkg/scope/hcloudremediation.go

Lines changed: 13 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -72,33 +72,26 @@ func NewHCloudRemediationScope(params HCloudRemediationScopeParams) (*HCloudReme
7272
return nil, fmt.Errorf("failed to init patch helper: %w", err)
7373
}
7474

75-
machinePatchHelper, err := patch.NewHelper(params.Machine, params.Client)
76-
if err != nil {
77-
return nil, fmt.Errorf("failed to init machine patch helper: %w", err)
78-
}
79-
8075
return &HCloudRemediationScope{
81-
Logger: params.Logger,
82-
Client: params.Client,
83-
HCloudClient: params.HCloudClient,
84-
patchHelper: patchHelper,
85-
machinePatchHelper: machinePatchHelper,
86-
Machine: params.Machine,
87-
HCloudMachine: params.HCloudMachine,
88-
HCloudRemediation: params.HCloudRemediation,
76+
Logger: params.Logger,
77+
Client: params.Client,
78+
HCloudClient: params.HCloudClient,
79+
patchHelper: patchHelper,
80+
Machine: params.Machine,
81+
HCloudMachine: params.HCloudMachine,
82+
HCloudRemediation: params.HCloudRemediation,
8983
}, nil
9084
}
9185

9286
// HCloudRemediationScope defines the basic context for an actuator to operate upon.
9387
type HCloudRemediationScope struct {
9488
logr.Logger
95-
Client client.Client
96-
patchHelper *patch.Helper
97-
machinePatchHelper *patch.Helper
98-
HCloudClient hcloudclient.Client
99-
Machine *clusterv1.Machine
100-
HCloudMachine *infrav1.HCloudMachine
101-
HCloudRemediation *infrav1.HCloudRemediation
89+
Client client.Client
90+
patchHelper *patch.Helper
91+
HCloudClient hcloudclient.Client
92+
Machine *clusterv1.Machine
93+
HCloudMachine *infrav1.HCloudMachine
94+
HCloudRemediation *infrav1.HCloudRemediation
10295
}
10396

10497
// Close closes the current scope persisting the cluster configuration and status.
@@ -126,8 +119,3 @@ func (m *HCloudRemediationScope) ServerIDFromProviderID() (int64, error) {
126119
func (m *HCloudRemediationScope) PatchObject(ctx context.Context, opts ...patch.Option) error {
127120
return m.patchHelper.Patch(ctx, m.HCloudRemediation, opts...)
128121
}
129-
130-
// PatchMachine persists the machine spec and status.
131-
func (m *HCloudRemediationScope) PatchMachine(ctx context.Context, opts ...patch.Option) error {
132-
return m.machinePatchHelper.Patch(ctx, m.Machine, opts...)
133-
}

pkg/scope/machine.go

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,11 @@ import (
2828

2929
"k8s.io/apimachinery/pkg/types"
3030
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
31-
capierrors "sigs.k8s.io/cluster-api/errors" //nolint:staticcheck // we will handle that, when we update to capi v1.11
3231
"sigs.k8s.io/cluster-api/util"
3332
"sigs.k8s.io/cluster-api/util/conditions"
3433
"sigs.k8s.io/cluster-api/util/patch"
34+
"sigs.k8s.io/cluster-api/util/record"
35+
"sigs.k8s.io/controller-runtime/pkg/client"
3536

3637
infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
3738
secretutil "github.com/syself/cluster-api-provider-hetzner/pkg/secrets"
@@ -126,13 +127,44 @@ func (m *MachineScope) PatchObject(ctx context.Context) error {
126127
return m.patchHelper.Patch(ctx, m.HCloudMachine)
127128
}
128129

129-
// SetError sets the ErrorMessage and ErrorReason fields on the machine and logs
130-
// the message. It assumes the reason is invalid configuration, since that is
131-
// currently the only relevant MachineStatusError choice.
132-
// CAPI will delete the machine and create a new one.
133-
func (m *MachineScope) SetError(message string, reason capierrors.MachineStatusError) {
134-
m.HCloudMachine.Status.FailureMessage = &message
135-
m.HCloudMachine.Status.FailureReason = &reason
130+
// SetErrorAndRemediate sets "cluster.x-k8s.io/remediate-machine" annotation on the corresponding
131+
// CAPI machine. CAPI will remediate that machine. Additionally, an event of type Warning will be
132+
// created, and the DeleteMachineSucceededCondition will be set to False on the hcloud-machine. It
133+
// gets used, when a not-recoverable error happens. Example: hcloud server was deleted by hand in
134+
// the hcloud UI.
135+
func (m *MachineScope) SetErrorAndRemediate(ctx context.Context, message string) error {
136+
return SetRemediateMachineAnnotationToDeleteMachine(ctx, m.Client, m.Machine, m.HCloudMachine, message)
137+
}
138+
139+
// SetRemediateMachineAnnotationToDeleteMachine sets "cluster.x-k8s.io/remediate-machine" annotation
140+
// on the corresponding CAPI machine. This will trigger CAPI to start remediation. Our remediation
141+
// contoller will inspect BootState to differentiate between a remediate (with reboot) and delete
142+
// (no reboot gets tried). Finally the capi machine and the infra machine will be deleted.
143+
//
144+
// Background: the hcloudmachine controller has no permission to delete a capi machine. That's why
145+
// this extra step (via remediate-machine annotation) is needed.
146+
func SetRemediateMachineAnnotationToDeleteMachine(ctx context.Context, crClient client.Client, capiMachine *clusterv1.Machine, hcloudMachine *infrav1.HCloudMachine, message string) error {
147+
// Create a patch base
148+
patch := client.MergeFrom(capiMachine.DeepCopy())
149+
150+
// Modify only annotations on the in-memory copy
151+
if capiMachine.Annotations == nil {
152+
capiMachine.Annotations = map[string]string{}
153+
}
154+
capiMachine.Annotations[clusterv1.RemediateMachineAnnotation] = ""
155+
156+
// Apply patch – only the diff (annotations) is sent to the API server
157+
if err := crClient.Patch(ctx, capiMachine, patch); err != nil {
158+
return fmt.Errorf("patch failed in SetErrorAndRemediate: %w", err)
159+
}
160+
161+
record.Warnf(hcloudMachine,
162+
"HCloudMachineWillBeRemediated",
163+
"HCloudMachine will be remediated: %s", message)
164+
165+
hcloudMachine.SetBootState(infrav1.HCloudBootStateProvisioningFailed)
166+
167+
return nil
136168
}
137169

138170
// SetRegion sets the region field on the machine.

pkg/services/baremetal/remediation/remediation.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ func (s *Service) timeUntilNextRemediation(now time.Time) time.Duration {
194194
}
195195

196196
// setOwnerRemediatedConditionToFailed sets MachineOwnerRemediatedCondition on CAPI machine object
197-
// that have failed a healthcheck.
197+
// that have failed a healthcheck. This will make capi delete the capi and baremetal machine.
198198
func (s *Service) setOwnerRemediatedConditionToFailed(ctx context.Context, msg string) error {
199199
capiMachine, err := util.GetOwnerMachine(ctx, s.scope.Client, s.scope.BareMetalRemediation.ObjectMeta)
200200
if err != nil {

0 commit comments

Comments
 (0)