Skip to content

Commit bce32f9

Browse files
authored
🌱 Remove usage of FailureReason and FailureMessage (baremetal) (#1716)
* 🌱 Remove usage of FailureReason and FailureMessage (baremetal) > Deprecated: This field is deprecated and is > going to be removed when support for v1beta1 > will be dropped. Please see > https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20240916-improve-status-in-CAPI-resources.md > for more details.
1 parent 9174672 commit bce32f9

9 files changed

Lines changed: 182 additions & 71 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ test/e2e/data/infrastructure-hetzner/v1beta1/env*
3636
/tilt.d
3737
tilt_config.json
3838
baremetalhosts.yaml
39+
bm-*.yaml
3940

4041
# kubeconfigs
4142
./kind.kubeconfig

api/v1beta1/hetznerbaremetalhost_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,11 @@ func (sts ControllerGeneratedStatus) GetIPAddress() string {
342342
return sts.IPv4
343343
}
344344

345+
// HasFatalError returns true, if the corresponding capi machine should get deleted.
346+
func (sts ControllerGeneratedStatus) HasFatalError() bool {
347+
return sts.ErrorType == FatalError || sts.ErrorType == PermanentError
348+
}
349+
345350
// GetConditions returns the observations of the operational state of the HetznerBareMetalHost resource.
346351
func (host *HetznerBareMetalHost) GetConditions() clusterv1.Conditions {
347352
return host.Spec.Status.Conditions

api/v1beta1/hetznerbaremetalmachine_types.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -359,12 +359,6 @@ func (hbmm *HetznerBareMetalMachine) SetConditions(conditions clusterv1.Conditio
359359
hbmm.Status.Conditions = conditions
360360
}
361361

362-
// SetFailure sets a failure reason and message.
363-
func (hbmm *HetznerBareMetalMachine) SetFailure(reason string, message string) {
364-
hbmm.Status.FailureReason = &reason
365-
hbmm.Status.FailureMessage = &message
366-
}
367-
368362
// GetImageSuffix tests whether the suffix is known and outputs it if yes. Otherwise it returns an error.
369363
func GetImageSuffix(url string) (string, error) {
370364
if strings.HasPrefix(url, "oci://") {

api/v1beta1/hetznerbaremetalmachine_types_test.go

Lines changed: 0 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -176,35 +176,6 @@ var _ = Describe("Test GetImageSuffix", func() {
176176
)
177177
})
178178

179-
var _ = Describe("Test SetFailure", func() {
180-
bmMachine := HetznerBareMetalMachine{}
181-
newFailureMessage := "bad failure message"
182-
newFailureReason := "bad failure reason"
183-
184-
It("sets new failure on the machine with existing failure", func() {
185-
failureMessage := "first message"
186-
failureReason := "first error"
187-
bmMachine.Status.FailureMessage = &failureMessage
188-
bmMachine.Status.FailureReason = &failureReason
189-
190-
bmMachine.SetFailure(newFailureReason, newFailureMessage)
191-
192-
Expect(bmMachine.Status.FailureMessage).ToNot(BeNil())
193-
Expect(bmMachine.Status.FailureReason).ToNot(BeNil())
194-
Expect(*bmMachine.Status.FailureMessage).To(Equal(newFailureMessage))
195-
Expect(*bmMachine.Status.FailureReason).To(Equal(newFailureReason))
196-
})
197-
198-
It("sets new failure on the machine without existing failure", func() {
199-
bmMachine.SetFailure(newFailureReason, newFailureMessage)
200-
201-
Expect(bmMachine.Status.FailureMessage).ToNot(BeNil())
202-
Expect(bmMachine.Status.FailureReason).ToNot(BeNil())
203-
Expect(*bmMachine.Status.FailureMessage).To(Equal(newFailureMessage))
204-
Expect(*bmMachine.Status.FailureReason).To(Equal(newFailureReason))
205-
})
206-
})
207-
208179
var _ = Describe("Test HasHostAnnotation", func() {
209180
type testCaseHasHostAnnotation struct {
210181
annotations map[string]string

controllers/controllers_suite_test.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@ limitations under the License.
1717
package controllers
1818

1919
import (
20+
"context"
2021
"fmt"
2122
"os"
23+
"strings"
2224
"sync"
2325
"testing"
2426
"time"
@@ -34,6 +36,7 @@ import (
3436
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
3537
"sigs.k8s.io/cluster-api/util/conditions"
3638
ctrl "sigs.k8s.io/controller-runtime"
39+
"sigs.k8s.io/controller-runtime/pkg/client"
3740
"sigs.k8s.io/controller-runtime/pkg/controller"
3841
"sigs.k8s.io/controller-runtime/pkg/metrics"
3942

@@ -415,3 +418,21 @@ func isPresentAndTrue(key types.NamespacedName, getter conditions.Getter, condit
415418
objectCondition := conditions.Get(getter, condition)
416419
return objectCondition.Status == corev1.ConditionTrue
417420
}
421+
422+
func hasEvent(ctx context.Context, c client.Client, namespace, involvedObjectName, reason, message string) bool {
423+
eventList := &corev1.EventList{}
424+
if err := c.List(ctx, eventList, client.InNamespace(namespace)); err != nil {
425+
return false
426+
}
427+
428+
for i := range eventList.Items {
429+
event := eventList.Items[i]
430+
if event.Reason == reason &&
431+
event.InvolvedObject.Name == involvedObjectName &&
432+
strings.Contains(event.Message, message) {
433+
return true
434+
}
435+
}
436+
437+
return false
438+
}

controllers/hetznerbaremetalmachine_controller_test.go

Lines changed: 80 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,12 @@ import (
3737
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
3838
"k8s.io/utils/ptr"
3939
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
40+
"sigs.k8s.io/cluster-api/util"
41+
"sigs.k8s.io/cluster-api/util/conditions"
4042
"sigs.k8s.io/cluster-api/util/patch"
4143
"sigs.k8s.io/controller-runtime/pkg/client"
4244
fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake"
45+
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
4346
"sigs.k8s.io/controller-runtime/pkg/reconcile"
4447

4548
infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
@@ -480,7 +483,7 @@ var _ = Describe("HetznerBareMetalMachineReconciler", func() {
480483
}, timeout, time.Second).Should(BeTrue())
481484
})
482485

483-
It("sets a failure reason when maintenance mode is set on the host", func() {
486+
It("sets RemediateMachineAnnotation when maintenance mode is set on the host", func() {
484487
By("making sure that machine is ready")
485488

486489
Eventually(func() bool {
@@ -500,14 +503,85 @@ var _ = Describe("HetznerBareMetalMachineReconciler", func() {
500503

501504
Expect(ph.Patch(ctx, host, patch.WithStatusObservedGeneration{})).To(Succeed())
502505

503-
By("checking that failure message is set on machine")
506+
By("checking that RemediateMachineAnnotation is set on machine")
504507

505-
Eventually(func() bool {
508+
Eventually(func() error {
506509
if err := testEnv.Get(ctx, key, bmMachine); err != nil {
507-
return false
510+
return err
508511
}
509-
return bmMachine.Status.FailureMessage != nil && *bmMachine.Status.FailureMessage == baremetal.FailureMessageMaintenanceMode
510-
}, timeout).Should(BeTrue())
512+
513+
capiMachine, err := util.GetOwnerMachine(ctx, testEnv, bmMachine.ObjectMeta)
514+
if err != nil {
515+
return err
516+
}
517+
518+
_, exists := capiMachine.Annotations[clusterv1.RemediateMachineAnnotation]
519+
if !exists {
520+
return fmt.Errorf("RemediateMachineAnnotation not set on capi machine")
521+
}
522+
523+
if !hasEvent(ctx, testEnv, testNs.Name, bmMachine.Name, "MachineWillBeDeleted",
524+
baremetal.FailureMessageMaintenanceMode) {
525+
return fmt.Errorf("Event not found")
526+
}
527+
528+
return nil
529+
}, timeout).Should(Succeed())
530+
531+
By("Do the job of CAPI: Create a HetznerBareMetalRemediation")
532+
rem := &infrav1.HetznerBareMetalRemediation{
533+
ObjectMeta: metav1.ObjectMeta{
534+
Name: bmMachine.Name,
535+
Namespace: bmMachine.Namespace,
536+
},
537+
}
538+
err = controllerutil.SetOwnerReference(capiMachine, rem, testEnv.Scheme())
539+
Expect(err).To(BeNil())
540+
err = testEnv.Create(ctx, rem)
541+
Expect(err).To(BeNil())
542+
543+
By("Wait for MachineOwnerRemediatedCondition to be set on capiMachine")
544+
Eventually(func() error {
545+
capiMachine, err = util.GetOwnerMachine(ctx, testEnv, bmMachine.ObjectMeta)
546+
if err != nil {
547+
return err
548+
}
549+
if capiMachine == nil {
550+
return fmt.Errorf("capiMachine is nil")
551+
}
552+
553+
err = testEnv.Get(ctx, client.ObjectKeyFromObject(bmMachine), bmMachine)
554+
Expect(err).ShouldNot(HaveOccurred())
555+
556+
Expect(capiMachine.Name).To(Equal(bmMachine.Name))
557+
Expect(capiMachine.Spec.InfrastructureRef.Name).To(Equal(bmMachine.Name))
558+
559+
c := conditions.Get(capiMachine, clusterv1.MachineOwnerRemediatedCondition)
560+
if c == nil {
561+
return fmt.Errorf("condition MachineOwnerRemediatedCondition does not exist %+v", capiMachine.Status)
562+
}
563+
564+
if c.Status != corev1.ConditionFalse {
565+
return fmt.Errorf("condition MachineOwnerRemediatedCondition should be False")
566+
}
567+
568+
if c.Message != "Remediation finished (machine will be deleted): exit remediation because host is in maintenance mode" {
569+
return fmt.Errorf("Unexpected message: %q", c.Message)
570+
}
571+
572+
return nil
573+
}, timeout).Should(Succeed())
574+
575+
By("Play role of CAPI: Delete capi and bm machine")
576+
err = testEnv.Delete(ctx, capiMachine)
577+
Expect(err).Should(Succeed())
578+
err = testEnv.Delete(ctx, bmMachine)
579+
Expect(err).Should(Succeed())
580+
581+
By("waiting for the successful kubeadm reset event")
582+
Eventually(func() bool {
583+
return hasEvent(ctx, testEnv, testNs.Name, host.Name, "SuccessfulResetKubeAdm", "Reset was successful.")
584+
}, timeout, time.Second).Should(BeTrue())
511585
})
512586

513587
It("checks the hetznerBareMetalMachine status running phase", func() {

pkg/scope/baremetalmachine.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"sigs.k8s.io/cluster-api/util"
2626
"sigs.k8s.io/cluster-api/util/conditions"
2727
"sigs.k8s.io/cluster-api/util/patch"
28+
"sigs.k8s.io/cluster-api/util/record"
2829
"sigs.k8s.io/controller-runtime/pkg/client"
2930

3031
infrav1 "github.com/syself/cluster-api-provider-hetzner/api/v1beta1"
@@ -123,3 +124,31 @@ func (m *BareMetalMachineScope) IsControlPlane() bool {
123124
func (m *BareMetalMachineScope) IsBootstrapReady() bool {
124125
return m.Machine.Spec.Bootstrap.DataSecretName != nil
125126
}
127+
128+
// SetRemediateMachineAnnotationToDeleteMachine sets "cluster.x-k8s.io/remediate-machine" annotation
129+
// on the corresponding CAPI machine. This will trigger CAPI to start remediation. Our remediation
130+
// contoller will use HasFatalError() to differentiate between a remediate (with reboot) and delete
131+
// (no reboot gets tried). Finally the capi machine and the infra machine will be deleted.
132+
//
133+
// Background: the hbmm/hbmh controller has no permission to delete a capi machine. That's why this
134+
// extra step (via remediate-machine annotation) is needed.
135+
func (m *BareMetalMachineScope) SetRemediateMachineAnnotationToDeleteMachine(ctx context.Context, message string) error {
136+
capiMachine := m.Machine
137+
138+
// Create a patch base
139+
patch := client.MergeFrom(capiMachine.DeepCopy())
140+
141+
// Modify only annotations on the in-memory copy
142+
if capiMachine.Annotations == nil {
143+
capiMachine.Annotations = map[string]string{}
144+
}
145+
capiMachine.Annotations[clusterv1.RemediateMachineAnnotation] = ""
146+
147+
// Apply patch – only the diff (annotations) is sent to the API server
148+
if err := m.Client.Patch(ctx, capiMachine, patch); err != nil {
149+
return err
150+
}
151+
152+
record.Warnf(m.BareMetalMachine, "MachineWillBeDeleted", "Machine will be deleted: %s", message)
153+
return nil
154+
}

pkg/services/baremetal/baremetal/baremetal.go

Lines changed: 39 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,18 @@ func (s *Service) Reconcile(ctx context.Context) (res reconcile.Result, err erro
113113
conditions.MarkTrue(s.scope.BareMetalMachine, infrav1.HostAssociateSucceededCondition)
114114

115115
// update the machine
116-
if err := s.update(ctx); err != nil {
116+
host, err := s.update(ctx)
117+
if err != nil {
117118
return checkForRequeueError(err, "failed to update machine")
118119
}
119120

121+
if host.Spec.Status.HasFatalError() {
122+
// hbmm will be deleted soon.
123+
s.scope.BareMetalMachine.Status.Phase = clusterv1.MachinePhaseDeleting
124+
s.scope.BareMetalMachine.Status.Ready = false
125+
return reconcile.Result{}, nil
126+
}
127+
120128
// set providerID if necessary
121129
if err := s.setProviderID(ctx); err != nil {
122130
return reconcile.Result{}, fmt.Errorf("failed to set providerID: %w", err)
@@ -204,14 +212,19 @@ func (s *Service) Delete(ctx context.Context) (reconcile.Result, error) {
204212
}
205213

206214
// update updates a machine and is invoked by the Machine Controller.
207-
func (s *Service) update(ctx context.Context) error {
215+
func (s *Service) update(ctx context.Context) (*infrav1.HetznerBareMetalHost, error) {
208216
host, helper, err := s.getAssociatedHostAndPatchHelper(ctx)
209217
if err != nil {
210-
return fmt.Errorf("failed to get host: %w", err)
218+
if apierrors.IsNotFound(err) {
219+
msg := fmt.Sprintf("host not found for machine %q. Setting error and deleting machine", s.scope.Machine.Name)
220+
err = s.scope.SetRemediateMachineAnnotationToDeleteMachine(ctx, msg)
221+
return nil, err
222+
}
223+
return nil, fmt.Errorf("failed to get host: %w", err)
211224
}
212225
if host == nil {
213-
s.scope.BareMetalMachine.SetFailure("UpdateError", "host not found")
214-
return fmt.Errorf("host not found for machine %s: %w", s.scope.Machine.Name, err)
226+
err = errors.Join(s.scope.SetRemediateMachineAnnotationToDeleteMachine(ctx, "Reconcile of hbmm: host not found"))
227+
return nil, fmt.Errorf("host not found for machine %s: %w", s.scope.Machine.Name, err)
215228
}
216229

217230
readyCondition := conditions.Get(host, clusterv1.ReadyCondition)
@@ -232,28 +245,21 @@ func (s *Service) update(ctx context.Context) error {
232245
}
233246

234247
// maintenance mode on the host is a fatal error for the machine object
235-
if host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode && s.scope.BareMetalMachine.Status.FailureReason == nil {
236-
s.scope.BareMetalMachine.SetFailure("UpdateError", FailureMessageMaintenanceMode)
237-
record.Eventf(
238-
s.scope.BareMetalMachine,
239-
"BareMetalMachineSetFailure",
240-
"set failure reason due to maintenance mode of underlying host",
241-
)
242-
return nil
243-
}
244-
245-
// if host has a fatal error, then it should be set on the machine object as well
246-
if (host.Spec.Status.ErrorType == infrav1.FatalError || host.Spec.Status.ErrorType == infrav1.PermanentError) &&
247-
s.scope.BareMetalMachine.Status.FailureReason == nil {
248-
s.scope.BareMetalMachine.SetFailure("UpdateError", host.Spec.Status.ErrorMessage)
249-
record.Eventf(s.scope.BareMetalMachine, "BareMetalMachineSetFailure", host.Spec.Status.ErrorMessage)
250-
return nil
248+
if host.Spec.MaintenanceMode != nil && *host.Spec.MaintenanceMode {
249+
err := s.scope.SetRemediateMachineAnnotationToDeleteMachine(ctx, FailureMessageMaintenanceMode)
250+
if err != nil {
251+
return nil, err
252+
}
253+
return host, nil
251254
}
252255

253-
// if host is healthy, the machine is healthy as well
254-
if host.Spec.Status.ErrorType == infrav1.ErrorType("") {
255-
s.scope.BareMetalMachine.Status.FailureMessage = nil
256-
s.scope.BareMetalMachine.Status.FailureReason = nil
256+
// if host has a fatal error, then it should be set on the hbmm object as well
257+
if host.Spec.Status.HasFatalError() {
258+
err := s.scope.SetRemediateMachineAnnotationToDeleteMachine(ctx, host.Spec.Status.ErrorMessage)
259+
if err != nil {
260+
return nil, err
261+
}
262+
return host, nil
257263
}
258264

259265
// ensure that the references are correctly set on host
@@ -266,13 +272,13 @@ func (s *Service) update(ctx context.Context) error {
266272
ensureClusterLabel(host, s.scope.Machine.Spec.ClusterName)
267273

268274
if err := analyzePatchError(helper.Patch(ctx, host), false); err != nil {
269-
return fmt.Errorf("failed to patch host: %w", err)
275+
return nil, fmt.Errorf("failed to patch host: %w", err)
270276
}
271277

272278
// if machine is a control plane, the host should be set as target of load balancer
273279
if s.scope.IsControlPlane() {
274280
if err := s.reconcileLoadBalancerAttachment(ctx, host); err != nil {
275-
return fmt.Errorf("failed to reconcile load balancer attachment: %w", err)
281+
return nil, fmt.Errorf("failed to reconcile load balancer attachment: %w", err)
276282
}
277283
}
278284

@@ -281,7 +287,7 @@ func (s *Service) update(ctx context.Context) error {
281287

282288
// update status of HetznerBareMetalMachine with infos from host
283289
s.updateMachineAddresses(host)
284-
return nil
290+
return host, nil
285291
}
286292

287293
func (s *Service) associate(ctx context.Context) error {
@@ -661,8 +667,11 @@ func (s *Service) setProviderID(ctx context.Context) error {
661667
}
662668

663669
if host == nil {
664-
s.scope.BareMetalMachine.SetFailure("UpdateError", "host not found")
665-
return fmt.Errorf("host not found for machine %s: %w", s.scope.Machine.Name, err)
670+
err := s.scope.SetRemediateMachineAnnotationToDeleteMachine(ctx, "setProviderID failed: host not found")
671+
if err != nil {
672+
return err
673+
}
674+
return fmt.Errorf("host not found for machine %q", s.scope.Machine.Name)
666675
}
667676

668677
if host.Spec.Status.ProvisioningState != infrav1.StateProvisioned {

0 commit comments

Comments
 (0)