Skip to content

Commit 10ae3a8

Browse files
committed
pkg/cluster: extend retryable coverage to deployments and private endpoint delete
1 parent 7bd9c67 commit 10ae3a8

27 files changed

Lines changed: 600 additions & 292 deletions

pkg/cluster/clusterserviceprincipal.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,10 @@ func (m *manager) createOrUpdateClusterServicePrincipalRBAC(ctx context.Context)
5454

5555
for _, assignment := range toDelete {
5656
m.log.Infof("deleting role assignment %s", *assignment.Name)
57-
err := m.retryableDelete("deleting role assignment "+*assignment.Name, func() error {
57+
err := arm.RetryableDelete(func() error {
5858
_, e := m.roleAssignments.Delete(ctx, *assignment.Scope, *assignment.Name)
5959
return e
60-
})
60+
}, m.log, "deleting role assignment "+*assignment.Name)
6161
if err != nil {
6262
return err
6363
}
@@ -75,7 +75,9 @@ func (m *manager) createOrUpdateClusterServicePrincipalRBAC(ctx context.Context)
7575
ContentVersion: "1.0.0.0",
7676
Resources: []*arm.Resource{m.clusterServicePrincipalRBAC()},
7777
}
78-
err = arm.DeployTemplate(ctx, m.log, m.deployments, resourceGroup, "clustersp", t, nil)
78+
err = arm.Retryable(func() error {
79+
return arm.DeployTemplate(ctx, m.log, m.deployments, resourceGroup, "clustersp", t, nil)
80+
}, m.log, "deploying cluster service principal RBAC")
7981
if err != nil {
8082
return err
8183
}

pkg/cluster/delete.go

Lines changed: 33 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ import (
1414

1515
"k8s.io/apimachinery/pkg/util/wait"
1616

17-
"github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
17+
armsdk "github.com/Azure/azure-sdk-for-go/sdk/azcore/arm"
1818
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/msi/armmsi"
1919
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v6"
2020
mgmtfeatures "github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2019-07-01/features"
@@ -25,6 +25,7 @@ import (
2525
"github.com/Azure/ARO-RP/pkg/api"
2626
"github.com/Azure/ARO-RP/pkg/database/cosmosdb"
2727
"github.com/Azure/ARO-RP/pkg/env"
28+
"github.com/Azure/ARO-RP/pkg/util/arm"
2829
"github.com/Azure/ARO-RP/pkg/util/azureclient"
2930
"github.com/Azure/ARO-RP/pkg/util/azureclient/azuresdk/azcertificates"
3031
"github.com/Azure/ARO-RP/pkg/util/azureerrors"
@@ -58,16 +59,16 @@ func (m *manager) deleteNic(ctx context.Context, nicName string) error {
5859

5960
if *nic.Properties.ProvisioningState == armnetwork.ProvisioningStateFailed {
6061
m.log.Printf("NIC '%s' is in a Failed provisioning state, attempting to reconcile prior to deletion.", *nic.ID)
61-
err := m.armInterfaces.CreateOrUpdateAndWait(ctx, resourceGroup, *nic.Name, nic.Interface, nil)
62+
err := arm.Retryable(func() error {
63+
return m.armInterfaces.CreateOrUpdateAndWait(ctx, resourceGroup, *nic.Name, nic.Interface, nil)
64+
}, m.log, "reconciling NIC "+nicName)
6265
if err != nil {
6366
return err
6467
}
6568
}
66-
err = m.armInterfaces.DeleteAndWait(ctx, resourceGroup, *nic.Name, nil)
67-
if azureerrors.IsNotFoundError(err) {
68-
return nil
69-
}
70-
return err
69+
return arm.RetryableDelete(func() error {
70+
return m.armInterfaces.DeleteAndWait(ctx, resourceGroup, *nic.Name, nil)
71+
}, m.log, "deleting NIC "+nicName)
7172
}
7273

7374
func (m *manager) disconnectSecurityGroup(ctx context.Context, resourceID string) error {
@@ -90,7 +91,7 @@ func (m *manager) disconnectSecurityGroup(ctx context.Context, resourceID string
9091
// Note: subnet only has value in the ID field,
9192
// so we have to make another API request to get full subnet struct
9293
// TODO: there is probably an undesirable race condition here - check if etags can help.
93-
r, err := arm.ParseResourceID(*subnet.ID)
94+
r, err := armsdk.ParseResourceID(*subnet.ID)
9495
if err != nil {
9596
return &api.CloudError{
9697
StatusCode: http.StatusBadRequest,
@@ -130,7 +131,9 @@ func (m *manager) disconnectSecurityGroup(ctx context.Context, resourceID string
130131
s.Properties.NetworkSecurityGroup = nil
131132

132133
m.log.Printf("disconnecting network security group from subnet %s", *s.ID)
133-
err = m.armSubnets.CreateOrUpdateAndWait(ctx, r.ResourceGroupName, r.Parent.Name, r.Name, s, nil)
134+
err = arm.Retryable(func() error {
135+
return m.armSubnets.CreateOrUpdateAndWait(ctx, r.ResourceGroupName, r.Parent.Name, r.Name, s, nil)
136+
}, m.log, "disconnecting network security group from subnet "+*s.ID)
134137
if err != nil {
135138
b, _ := json.Marshal(err)
136139

@@ -238,11 +241,11 @@ func (m *manager) deleteResources(ctx context.Context) error {
238241
}
239242

240243
var future mgmtfeatures.ResourcesDeleteByIDFuture
241-
err = m.retryable("deleting "+*resource.ID, func() error {
244+
err = arm.Retryable(func() error {
242245
var deleteErr error
243246
future, deleteErr = m.resources.DeleteByID(ctx, *resource.ID, apiVersion)
244247
return deleteErr
245-
})
248+
}, m.log, "deleting "+*resource.ID)
246249
if err != nil {
247250
return deleteByIdCloudError(err)
248251
}
@@ -309,10 +312,10 @@ func (m *manager) deleteRoleAssignments(ctx context.Context) error {
309312
}
310313

311314
m.log.Infof("deleting role assignment %s", *assignment.Name)
312-
err := m.retryableDelete("deleting role assignment "+*assignment.Name, func() error {
315+
err := arm.RetryableDelete(func() error {
313316
_, e := m.roleAssignments.Delete(ctx, *assignment.Scope, *assignment.Name)
314317
return e
315-
})
318+
}, m.log, "deleting role assignment "+*assignment.Name)
316319
if err != nil {
317320
return err
318321
}
@@ -337,10 +340,10 @@ func (m *manager) deleteRoleDefinition(ctx context.Context) error {
337340
}
338341

339342
m.log.Infof("deleting role definition %s", *definition.Name)
340-
err := m.retryableDelete("deleting role definition "+*definition.Name, func() error {
343+
err := arm.RetryableDelete(func() error {
341344
_, e := m.roleDefinitions.Delete(ctx, (*definition.AssignableScopes)[0], *definition.Name)
342345
return e
343-
})
346+
}, m.log, "deleting role definition "+*definition.Name)
344347
if err != nil {
345348
return err
346349
}
@@ -462,19 +465,16 @@ func (m *manager) deleteFederatedCredentials(ctx context.Context) error {
462465
*federatedCredential.Properties.Issuer != string(*m.doc.OpenShiftCluster.Properties.ClusterProfile.OIDCIssuer):
463466
continue
464467
default:
465-
_, err = m.clusterMsiFederatedIdentityCredentials.Delete(
466-
ctx,
467-
identityResourceId.ResourceGroup,
468-
identityResourceId.ResourceName,
469-
*federatedCredential.Name,
470-
&armmsi.FederatedIdentityCredentialsClientDeleteOptions{},
471-
)
472-
<<<<<<< HEAD
473-
=======
474-
if azureerrors.IsNotFoundError(err) {
475-
err = nil
476-
}
477-
>>>>>>> dbd78aace (fixup! pkg/cluster: wrap autorest ARM write call sites with transient retry)
468+
err = arm.RetryableDelete(func() error {
469+
_, e := m.clusterMsiFederatedIdentityCredentials.Delete(
470+
ctx,
471+
identityResourceId.ResourceGroup,
472+
identityResourceId.ResourceName,
473+
*federatedCredential.Name,
474+
&armmsi.FederatedIdentityCredentialsClientDeleteOptions{},
475+
)
476+
return e
477+
}, m.log, "deleting federated identity credential "+*federatedCredential.Name)
478478
if err != nil {
479479
if azureerrors.IsStatusNotFoundError(err) {
480480
m.log.Infof("federated identity credentials not found for %s: %v", identity.ResourceID, err.Error())
@@ -528,9 +528,9 @@ func (m *manager) shouldDeleteResourceGroup(ctx context.Context, name string) (b
528528
}
529529

530530
func (m *manager) deleteResourceGroup(ctx context.Context, name string) error {
531-
err := m.retryable("deleting resource group "+name, func() error {
531+
err := arm.Retryable(func() error {
532532
return m.resourceGroups.DeleteAndWait(ctx, name)
533-
})
533+
}, m.log, "deleting resource group "+name)
534534

535535
detailedErr, isDetailedErr := err.(autorest.DetailedError)
536536
if azureerrors.ResourceGroupNotFound(err) || (isDetailedErr && (detailedErr.StatusCode == http.StatusNotFound)) {
@@ -553,15 +553,10 @@ func (m *manager) Delete(ctx context.Context) error {
553553
}
554554

555555
m.log.Print("deleting private endpoint")
556-
err = m.armFPPrivateEndpoints.DeleteAndWait(ctx, m.env.ResourceGroup(), env.RPPrivateEndpointPrefix+m.doc.ID, nil)
557-
<<<<<<< HEAD
558-
if err != nil && !azureerrors.IsNotFoundError(err) {
559-
=======
560-
if azureerrors.IsNotFoundError(err) {
561-
err = nil
562-
}
556+
err = arm.RetryableDelete(func() error {
557+
return m.armFPPrivateEndpoints.DeleteAndWait(ctx, m.env.ResourceGroup(), env.RPPrivateEndpointPrefix+m.doc.ID, nil)
558+
}, m.log, "deleting private endpoint "+env.RPPrivateEndpointPrefix+m.doc.ID)
563559
if err != nil {
564-
>>>>>>> dbd78aace (fixup! pkg/cluster: wrap autorest ARM write call sites with transient retry)
565560
return err
566561
}
567562

pkg/cluster/delete_test.go

Lines changed: 27 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ import (
1313
"time"
1414

1515
"github.com/sirupsen/logrus"
16-
logrustest "github.com/sirupsen/logrus/hooks/test"
1716
"github.com/stretchr/testify/assert"
1817
"github.com/stretchr/testify/require"
1918
"go.uber.org/mock/gomock"
@@ -30,6 +29,7 @@ import (
3029
"github.com/Azure/msi-dataplane/pkg/dataplane"
3130

3231
"github.com/Azure/ARO-RP/pkg/api"
32+
"github.com/Azure/ARO-RP/pkg/util/arm"
3333
mock_armmsi "github.com/Azure/ARO-RP/pkg/util/mocks/azureclient/azuresdk/armmsi"
3434
mock_armnetwork "github.com/Azure/ARO-RP/pkg/util/mocks/azureclient/azuresdk/armnetwork"
3535
mock_azsecrets "github.com/Azure/ARO-RP/pkg/util/mocks/azureclient/azuresdk/azsecrets"
@@ -407,135 +407,61 @@ func TestRetryableDelete(t *testing.T) {
407407
for _, tt := range []struct {
408408
name string
409409
err error
410-
wantErr bool
410+
wantErr error
411411
}{
412412
{
413413
name: "404 from inner f() is swallowed and nil returned",
414414
err: autorest.DetailedError{StatusCode: http.StatusNotFound},
415-
wantErr: false,
415+
wantErr: nil,
416416
},
417417
{
418418
name: "non-404 error propagates unchanged",
419419
err: autorest.DetailedError{StatusCode: http.StatusConflict},
420-
wantErr: true,
420+
wantErr: autorest.DetailedError{StatusCode: http.StatusConflict},
421421
},
422422
{
423423
name: "nil error propagates unchanged",
424424
err: nil,
425-
wantErr: false,
425+
wantErr: nil,
426426
},
427427
} {
428428
t.Run(tt.name, func(t *testing.T) {
429-
origBackoff := transientRetryBackoff
430-
transientRetryBackoff = wait.Backoff{Steps: 1, Duration: time.Millisecond}
431-
defer func() { transientRetryBackoff = origBackoff }()
429+
origBackoff := arm.TransientBackoff
430+
arm.TransientBackoff = wait.Backoff{Steps: 1, Duration: time.Millisecond}
431+
defer func() { arm.TransientBackoff = origBackoff }()
432432

433-
m := &manager{log: logrus.NewEntry(logrus.StandardLogger())}
434-
err := m.retryableDelete("test-op", func() error {
433+
log := logrus.NewEntry(logrus.StandardLogger())
434+
err := arm.RetryableDelete(func() error {
435435
return tt.err
436-
})
437-
if tt.wantErr {
438-
assert.Error(t, err)
439-
} else {
440-
assert.NoError(t, err)
441-
}
436+
}, log, "test-op")
437+
assert.Equal(t, tt.wantErr, err)
442438
})
443439
}
444440
}
445441

446442
func TestRetryableDeleteRetryPath(t *testing.T) {
447-
origBackoff := transientRetryBackoff
448-
transientRetryBackoff = wait.Backoff{Steps: 2, Duration: time.Millisecond, Factor: 2.0}
449-
defer func() { transientRetryBackoff = origBackoff }()
443+
origBackoff := arm.TransientBackoff
444+
arm.TransientBackoff = wait.Backoff{Steps: 2, Duration: time.Millisecond, Factor: 2.0}
445+
defer func() { arm.TransientBackoff = origBackoff }()
450446

451447
calls := 0
452-
m := &manager{log: logrus.NewEntry(logrus.StandardLogger())}
453-
err := m.retryableDelete("test-retry-op", func() error {
448+
log := logrus.NewEntry(logrus.StandardLogger())
449+
err := arm.RetryableDelete(func() error {
454450
calls++
455451
if calls == 1 {
456452
return autorest.DetailedError{StatusCode: http.StatusTooManyRequests}
457453
}
458454
return nil
459-
})
455+
}, log, "test-retry-op")
460456
require.NoError(t, err)
461457
assert.Equal(t, 2, calls, "expected inner function to be called twice: once for transient error, once for success")
462458
}
463459

464-
func TestIsRetryable(t *testing.T) {
465-
for _, tt := range []struct {
466-
name string
467-
err error
468-
wantRetry bool
469-
wantLogMsg string
470-
}{
471-
{
472-
name: "retryable 429 returns true and logs",
473-
err: autorest.DetailedError{StatusCode: http.StatusTooManyRequests},
474-
wantRetry: true,
475-
wantLogMsg: "error on test-op, retrying:",
476-
},
477-
{
478-
name: "non-retryable error returns false",
479-
err: errors.New("permanent failure"),
480-
wantRetry: false,
481-
},
482-
{
483-
name: "retryable azcore 429 returns true and logs",
484-
err: &azcore.ResponseError{StatusCode: http.StatusTooManyRequests},
485-
wantRetry: true,
486-
wantLogMsg: "error on test-op, retrying:",
487-
},
488-
{
489-
name: "retryable autorest 409+Retry-After returns true and logs",
490-
err: autorest.DetailedError{
491-
StatusCode: http.StatusConflict,
492-
Response: &http.Response{
493-
StatusCode: http.StatusConflict,
494-
Header: http.Header{"Retry-After": []string{"1"}},
495-
},
496-
},
497-
wantRetry: true,
498-
wantLogMsg: "error on test-op, retrying:",
499-
},
500-
{
501-
name: "retryable azcore 409+Retry-After returns true and logs",
502-
err: &azcore.ResponseError{
503-
StatusCode: http.StatusConflict,
504-
RawResponse: &http.Response{
505-
StatusCode: http.StatusConflict,
506-
Header: http.Header{"Retry-After": []string{"1"}},
507-
},
508-
},
509-
wantRetry: true,
510-
wantLogMsg: "error on test-op, retrying:",
511-
},
512-
} {
513-
t.Run(tt.name, func(t *testing.T) {
514-
logger, hook := logrustest.NewNullLogger()
515-
logger.SetLevel(logrus.WarnLevel)
516-
517-
m := &manager{log: logrus.NewEntry(logger)}
518-
predicate := m.isRetryable("test-op")
519-
520-
got := predicate(tt.err)
521-
assert.Equal(t, tt.wantRetry, got)
522-
523-
if tt.wantLogMsg != "" {
524-
require.Len(t, hook.Entries, 1)
525-
assert.Contains(t, hook.LastEntry().Message, tt.wantLogMsg)
526-
assert.Equal(t, logrus.WarnLevel, hook.LastEntry().Level)
527-
} else {
528-
assert.Empty(t, hook.Entries)
529-
}
530-
})
531-
}
532-
}
533-
534460
// TestDisconnectSecurityGroupRetryExhausted verifies that when the azcore subnet call fails, the error is propagated.
535461
func TestDisconnectSecurityGroupRetryExhausted(t *testing.T) {
536-
origBackoff := transientRetryBackoff
537-
transientRetryBackoff = wait.Backoff{Steps: 1, Duration: time.Millisecond, Factor: 2.0} // Steps: 1 = 1 attempt, 0 retries
538-
defer func() { transientRetryBackoff = origBackoff }()
462+
origBackoff := arm.TransientBackoff
463+
arm.TransientBackoff = wait.Backoff{Steps: 1, Duration: time.Millisecond, Factor: 2.0} // Steps: 1 = 1 attempt, 0 retries
464+
defer func() { arm.TransientBackoff = origBackoff }()
539465

540466
subscription := "00000000-0000-0000-0000-000000000000"
541467
resourceGroup := "test-rg"
@@ -1224,9 +1150,9 @@ func TestDeleteResourcesRetry(t *testing.T) {
12241150
},
12251151
} {
12261152
t.Run(tt.name, func(t *testing.T) {
1227-
origBackoff := transientRetryBackoff
1228-
transientRetryBackoff = wait.Backoff{Steps: 2, Duration: time.Millisecond, Factor: 2.0}
1229-
defer func() { transientRetryBackoff = origBackoff }()
1153+
origBackoff := arm.TransientBackoff
1154+
arm.TransientBackoff = wait.Backoff{Steps: 2, Duration: time.Millisecond, Factor: 2.0}
1155+
defer func() { arm.TransientBackoff = origBackoff }()
12301156

12311157
controller := gomock.NewController(t)
12321158
defer controller.Finish()
@@ -1270,9 +1196,9 @@ func TestDeleteResourcesRetry(t *testing.T) {
12701196
// TestDeleteResourcesRetryExhausted verifies that retry exhaustion propagates the error.
12711197
// Uses a single representative error (autorest 429); the exhaustion path is the same for all retryable errors.
12721198
func TestDeleteResourcesRetryExhausted(t *testing.T) {
1273-
origBackoff := transientRetryBackoff
1274-
transientRetryBackoff = wait.Backoff{Steps: 1, Duration: time.Millisecond, Factor: 2.0} // Steps: 1 = 1 attempt, 0 retries
1275-
defer func() { transientRetryBackoff = origBackoff }()
1199+
origBackoff := arm.TransientBackoff
1200+
arm.TransientBackoff = wait.Backoff{Steps: 1, Duration: time.Millisecond, Factor: 2.0} // Steps: 1 = 1 attempt, 0 retries
1201+
defer func() { arm.TransientBackoff = origBackoff }()
12761202

12771203
subscription := "00000000-0000-0000-0000-000000000000"
12781204
resourceGroup := "test-rg"

pkg/cluster/denyassignment.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,5 +52,7 @@ func (m *manager) createOrUpdateDenyAssignment(ctx context.Context) error {
5252
},
5353
}
5454

55-
return arm.DeployTemplate(ctx, m.log, m.deployments, resourceGroup, "storage", t, nil)
55+
return arm.Retryable(func() error {
56+
return arm.DeployTemplate(ctx, m.log, m.deployments, resourceGroup, "storage", t, nil)
57+
}, m.log, "deploying deny assignment")
5658
}

0 commit comments

Comments
 (0)