Skip to content

Commit a71c41e

Browse files
committed
MGMT-23738: Add RetryOnConflict to CreateAgentCR to fix flaky agent move between infraenvs
1 parent babc11d commit a71c41e

2 files changed

Lines changed: 107 additions & 32 deletions

File tree

internal/controller/controllers/crd_utils.go

Lines changed: 41 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
k8serrors "k8s.io/apimachinery/pkg/api/errors"
1414
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1515
"k8s.io/apimachinery/pkg/types"
16+
"k8s.io/client-go/util/retry"
1617
"sigs.k8s.io/controller-runtime/pkg/client"
1718
)
1819

@@ -137,44 +138,52 @@ func (u *CRDUtils) CreateAgentCR(ctx context.Context, log logrus.FieldLogger, ho
137138
}
138139
}
139140

140-
updatedhost := &aiv1beta1.Agent{
141-
Spec: aiv1beta1.AgentSpec{
142-
Approved: false,
143-
Hostname: "",
144-
MachineConfigPool: "",
145-
Role: "",
146-
InstallationDiskID: "",
147-
InstallerArgs: "",
148-
IgnitionConfigOverrides: "",
149-
},
150-
ObjectMeta: metav1.ObjectMeta{
151-
Name: hostId,
152-
Namespace: infraEnvCR.Namespace,
153-
Labels: labels,
154-
ResourceVersion: host.ResourceVersion,
155-
OwnerReferences: []metav1.OwnerReference{
156-
{
157-
APIVersion: infraEnvCR.APIVersion,
158-
Kind: infraEnvCR.Kind,
159-
Name: infraEnvCR.Name,
160-
UID: infraEnvCR.UID,
141+
if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
142+
current := &aiv1beta1.Agent{}
143+
if err := u.client.Get(ctx, namespacedName, current); err != nil {
144+
return err
145+
}
146+
147+
updatedhost := &aiv1beta1.Agent{
148+
Spec: aiv1beta1.AgentSpec{
149+
Approved: false,
150+
Hostname: "",
151+
MachineConfigPool: "",
152+
Role: "",
153+
InstallationDiskID: "",
154+
InstallerArgs: "",
155+
IgnitionConfigOverrides: "",
156+
},
157+
ObjectMeta: metav1.ObjectMeta{
158+
Name: hostId,
159+
Namespace: infraEnvCR.Namespace,
160+
Labels: labels,
161+
ResourceVersion: current.ResourceVersion,
162+
OwnerReferences: []metav1.OwnerReference{
163+
{
164+
APIVersion: infraEnvCR.APIVersion,
165+
Kind: infraEnvCR.Kind,
166+
Name: infraEnvCR.Name,
167+
UID: infraEnvCR.UID,
168+
},
161169
},
162170
},
163-
},
171+
}
172+
if cluster != nil && cluster.KubeKeyNamespace != "" {
173+
updatedhost.Spec.ClusterDeploymentName = &aiv1beta1.ClusterReference{
174+
Name: cluster.KubeKeyName,
175+
Namespace: cluster.KubeKeyNamespace,
176+
}
177+
}
178+
return u.client.Update(ctx, updatedhost)
179+
}); err != nil {
180+
return err
164181
}
165-
166182
// Update 'kube_key_namespace' in Host
167-
if err1 := u.hostApi.UpdateKubeKeyNS(ctx, hostId, infraEnvCR.Namespace); err1 != nil {
183+
if err := u.hostApi.UpdateKubeKeyNS(ctx, hostId, infraEnvCR.Namespace); err != nil {
168184
return errors.Wrapf(err, "Failed to update 'kube_key_namespace' in host %s", hostId)
169185
}
170-
if cluster != nil && cluster.KubeKeyNamespace != "" {
171-
updatedhost.Spec.ClusterDeploymentName = &aiv1beta1.ClusterReference{
172-
Name: cluster.KubeKeyName,
173-
Namespace: cluster.KubeKeyNamespace,
174-
}
175-
}
176-
log.Infof("Updating Agent CR. Namespace: %s, Cluster: %s, HostID: %s", infraEnvCR.Namespace, clusterName, hostId)
177-
return u.client.Update(ctx, updatedhost)
186+
log.Infof("Updated Agent CR. Namespace: %s, Cluster: %s, HostID: %s", infraEnvCR.Namespace, clusterName, hostId)
178187
}
179188

180189
return nil

internal/controller/controllers/crd_utils_test.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package controllers
22

33
import (
44
"context"
5+
"fmt"
56

67
strfmt "github.com/go-openapi/strfmt"
78
"github.com/google/uuid"
@@ -13,10 +14,13 @@ import (
1314
"github.com/openshift/assisted-service/models"
1415
hivev1 "github.com/openshift/hive/apis/hive/v1"
1516
"go.uber.org/mock/gomock"
17+
k8serrors "k8s.io/apimachinery/pkg/api/errors"
18+
"k8s.io/apimachinery/pkg/runtime/schema"
1619
"k8s.io/apimachinery/pkg/types"
1720
"k8s.io/client-go/kubernetes/scheme"
1821
"sigs.k8s.io/controller-runtime/pkg/client"
1922
fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake"
23+
"sigs.k8s.io/controller-runtime/pkg/client/interceptor"
2024
)
2125

2226
var _ = Describe("create agent CR", func() {
@@ -176,6 +180,68 @@ var _ = Describe("create agent CR", func() {
176180
Expect(err).NotTo(HaveOccurred())
177181
})
178182

183+
It("Already existing agent update retries on conflict", func() {
184+
clusterDeployment := newClusterDeployment(clusterName, testNamespace, defaultClusterSpec)
185+
Expect(c.Create(ctx, clusterDeployment)).ShouldNot(HaveOccurred())
186+
infraEnvImage := newInfraEnvImage(infraEnvName, infraEnvNamespace, v1beta1.InfraEnvSpec{
187+
ClusterRef: &v1beta1.ClusterReference{
188+
Name: clusterDeployment.Name,
189+
Namespace: clusterDeployment.Namespace,
190+
},
191+
})
192+
Expect(c.Create(ctx, infraEnvImage)).ShouldNot(HaveOccurred())
193+
194+
hostId := uuid.New().String()
195+
agent := newAgent(hostId, infraEnvNamespace, v1beta1.AgentSpec{})
196+
Expect(c.Create(ctx, agent)).ShouldNot(HaveOccurred())
197+
198+
id := strfmt.UUID(hostId)
199+
otherClusterId := strfmt.UUID(uuid.New().String())
200+
infraEnvId2 := strfmt.UUID(uuid.New().String())
201+
h := common.Host{
202+
Host: models.Host{
203+
ID: &id,
204+
ClusterID: &otherClusterId,
205+
InfraEnvID: infraEnvId,
206+
},
207+
}
208+
infraEnv2 := &common.InfraEnv{
209+
KubeKeyNamespace: infraEnvNamespace,
210+
InfraEnv: models.InfraEnv{
211+
ID: &infraEnvId2,
212+
Name: &infraEnvName,
213+
},
214+
}
215+
216+
mockHostApi.EXPECT().GetHostByKubeKey(gomock.Any()).Return(&h, nil).Times(1)
217+
mockHostApi.EXPECT().UnRegisterHost(ctx, gomock.Any()).Return(nil).Times(1)
218+
mockHostApi.EXPECT().UpdateKubeKeyNS(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).Times(1)
219+
220+
updateCount := 0
221+
conflictClient := fakeclient.NewClientBuilder().
222+
WithScheme(scheme.Scheme).
223+
WithObjects(clusterDeployment, infraEnvImage, agent).
224+
WithInterceptorFuncs(interceptor.Funcs{
225+
Update: func(ctx context.Context, cl client.WithWatch, obj client.Object, opts ...client.UpdateOption) error {
226+
updateCount++
227+
if updateCount == 1 {
228+
return k8serrors.NewConflict(
229+
schema.GroupResource{Group: "agent-install.openshift.io", Resource: "agents"},
230+
obj.GetName(),
231+
fmt.Errorf("the object has been modified"),
232+
)
233+
}
234+
return cl.Update(ctx, obj, opts...)
235+
},
236+
}).
237+
Build()
238+
239+
conflictCrdUtils := NewCRDUtils(conflictClient, mockHostApi)
240+
err := conflictCrdUtils.CreateAgentCR(ctx, log, hostId, infraEnv2, cluster)
241+
Expect(err).NotTo(HaveOccurred())
242+
Expect(updateCount).To(Equal(2))
243+
})
244+
179245
It("Already existing agent different infraenv same namespace", func() {
180246
clusterDeployment := newClusterDeployment(clusterName, testNamespace, defaultClusterSpec)
181247
Expect(c.Create(ctx, clusterDeployment)).ShouldNot(HaveOccurred())

0 commit comments

Comments
 (0)