Skip to content

Commit 663980b

Browse files
authored
Fix export policy race conditions in concurrent NAS and NAS-Economy drivers
1 parent fcbf12c commit 663980b

File tree

6 files changed

+922
-117
lines changed

6 files changed

+922
-117
lines changed

storage_drivers/ontap/ontap_common.go

Lines changed: 128 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -318,9 +318,22 @@ func ensureExportPolicyExists(ctx context.Context, policyName string, clientAPI
318318
return clientAPI.ExportPolicyCreate(ctx, policyName)
319319
}
320320

321+
// destroyExportPolicy destroys an export policy.
322+
// NOTE: Caller MUST hold the exportPolicyMutex for policyName before calling this function.
323+
// This function is idempotent - if the export policy doesn't exist, it returns success.
321324
func destroyExportPolicy(ctx context.Context, policyName string, clientAPI api.OntapAPI) error {
322-
exportPolicyMutex.Lock(policyName)
323-
defer exportPolicyMutex.Unlock(policyName)
325+
// Check if the policy exists before attempting to delete
326+
exists, err := clientAPI.ExportPolicyExists(ctx, policyName)
327+
if err != nil {
328+
Logc(ctx).WithField("exportPolicy", policyName).WithError(err).Error(
329+
"Could not check if export policy exists before deletion.")
330+
return err
331+
}
332+
if !exists {
333+
Logc(ctx).WithField("exportPolicy", policyName).Debug(
334+
"Export policy does not exist, nothing to delete (idempotent).")
335+
return nil
336+
}
324337

325338
return clientAPI.ExportPolicyDestroy(ctx, policyName)
326339
}
@@ -480,6 +493,118 @@ func ensureNodeAccessForPolicy(
480493
return nil
481494
}
482495

496+
// ensureNodeAccessForPolicyAndApply ensures an export policy exists with the correct rules for the target node,
497+
// then calls the provided applyPolicy function while still holding the export policy lock.
498+
// This prevents race conditions where a concurrent unpublish could delete the policy between
499+
// rule creation and policy assignment to a volume/qtree.
500+
//
501+
// The applyPolicy function should perform the actual assignment (e.g., VolumeModifyExportPolicy or
502+
// QtreeModifyExportPolicy) and will be called while the export policy mutex is held.
503+
func ensureNodeAccessForPolicyAndApply(
504+
ctx context.Context, targetNode *tridentmodels.Node, clientAPI api.OntapAPI,
505+
config *drivers.OntapStorageDriverConfig, policyName string,
506+
applyPolicy func() error,
507+
) error {
508+
fields := LogFields{
509+
"Method": "ensureNodeAccessForPolicyAndApply",
510+
"Type": "ontap_common",
511+
"policyName": policyName,
512+
"targetNodeIPs": targetNode.IPs,
513+
}
514+
515+
Logc(ctx).WithFields(fields).Debug(">>>> ensureNodeAccessForPolicyAndApply")
516+
defer Logc(ctx).WithFields(fields).Debug("<<<< ensureNodeAccessForPolicyAndApply")
517+
518+
exportPolicyMutex.Lock(policyName)
519+
defer exportPolicyMutex.Unlock(policyName)
520+
521+
if exists, err := clientAPI.ExportPolicyExists(ctx, policyName); err != nil {
522+
return err
523+
} else if !exists {
524+
Logc(ctx).WithField("exportPolicy", policyName).Debug("Export policy missing, will create it.")
525+
526+
if err = clientAPI.ExportPolicyCreate(ctx, policyName); err != nil {
527+
return err
528+
}
529+
}
530+
531+
desiredRules, err := network.FilterIPs(ctx, targetNode.IPs, config.AutoExportCIDRs)
532+
if err != nil {
533+
err = fmt.Errorf("unable to determine desired export policy rules; %v", err)
534+
Logc(ctx).Error(err)
535+
return err
536+
}
537+
Logc(ctx).WithField("desiredRules", desiredRules).Debug("Desired export policy rules.")
538+
539+
// first grab all existing rules
540+
existingRules, err := clientAPI.ExportRuleList(ctx, policyName)
541+
if err != nil {
542+
// Could not list rules, just log it, no action required.
543+
Logc(ctx).WithField("error", err).Debug("Export policy rules could not be listed.")
544+
}
545+
Logc(ctx).WithField("existingRules", existingRules).Debug("Existing export policy rules.")
546+
547+
for _, desiredRule := range desiredRules {
548+
desiredRule = strings.TrimSpace(desiredRule)
549+
550+
desiredIP := net.ParseIP(desiredRule)
551+
if desiredIP == nil {
552+
Logc(ctx).WithField("desiredRule", desiredRule).Debug("Invalid desired rule IP")
553+
continue
554+
}
555+
556+
// Loop through the existing rules one by one and compare to make sure we cover the scenario where the
557+
// existing rule is of format "1.1.1.1, 2.2.2.2" and the desired rule is format "1.1.1.1".
558+
// This can happen because of the difference in how ONTAP ZAPI and ONTAP REST creates export rule.
559+
560+
ruleFound := false
561+
for _, existingRule := range existingRules {
562+
existingIPs := strings.Split(existingRule, ",")
563+
564+
for _, ip := range existingIPs {
565+
ip = strings.TrimSpace(ip)
566+
567+
existingIP := net.ParseIP(ip)
568+
if existingIP == nil {
569+
Logc(ctx).WithField("existingRule", existingRule).Debug("Invalid existing rule IP")
570+
continue
571+
}
572+
573+
if existingIP.Equal(desiredIP) {
574+
ruleFound = true
575+
break
576+
}
577+
}
578+
579+
if ruleFound {
580+
break
581+
}
582+
}
583+
584+
// Rule does not exist, so create it
585+
if !ruleFound {
586+
if err = clientAPI.ExportRuleCreate(ctx, policyName, desiredRule, config.NASType); err != nil {
587+
// Check if error is that the export policy rule already exist error
588+
if errors.IsAlreadyExistsError(err) {
589+
Logc(ctx).WithField("desiredRule", desiredRule).WithError(err).Debug(
590+
"Export policy rule already exists")
591+
continue
592+
}
593+
return err
594+
}
595+
}
596+
}
597+
598+
// Apply the policy while still holding the lock
599+
if applyPolicy != nil {
600+
if err = applyPolicy(); err != nil {
601+
return err
602+
}
603+
}
604+
605+
return nil
606+
}
607+
483608
func getDesiredExportPolicyRules(
484609
ctx context.Context, nodes []*tridentmodels.Node, config *drivers.OntapStorageDriverConfig,
485610
) ([]string, error) {
@@ -4968,6 +5093,7 @@ func deleteAutomaticSnapshot(
49685093
// removeExportPolicyRules takes an export policy name,
49695094
// retrieves its rules and matches the rules that exist to the IP addresses from the node.
49705095
// Any matched IP addresses will be removed from the export policy.
5096+
// NOTE: Caller MUST hold the exportPolicyMutex for exportPolicy before calling this function.
49715097
func removeExportPolicyRules(
49725098
ctx context.Context, exportPolicy string, publishInfo *tridentmodels.VolumePublishInfo, clientAPI api.OntapAPI,
49735099
) error {
@@ -4980,9 +5106,6 @@ func removeExportPolicyRules(
49805106
Logc(ctx).WithFields(fields).Debug(">>>> removeExportPolicyRules")
49815107
defer Logc(ctx).WithFields(fields).Debug("<<<< removeExportPolicyRules")
49825108

4983-
exportPolicyMutex.Lock(exportPolicy)
4984-
defer exportPolicyMutex.Unlock(exportPolicy)
4985-
49865109
// CNVA-AWARE EXPORT POLICY MANAGEMENT:
49875110
// Instead of blindly removing rules for the departing node, we implement "keep only active nodes" logic.
49885111
// This is critical for CNVA where multiple PVCs (primary + subordinates) share the same export policy.

storage_drivers/ontap/ontap_common_test.go

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3512,6 +3512,72 @@ func TestEnsureExportPolicyExists(t *testing.T) {
35123512
assert.NoError(t, err)
35133513
}
35143514

3515+
// Tests for destroyExportPolicy - idempotent export policy deletion
3516+
3517+
func TestDestroyExportPolicy_PolicyExists_Success(t *testing.T) {
3518+
ctx := context.Background()
3519+
mockCtrl := gomock.NewController(t)
3520+
mockAPI := mockapi.NewMockOntapAPI(mockCtrl)
3521+
policyName := "testPolicy"
3522+
3523+
// Policy exists and delete succeeds
3524+
mockAPI.EXPECT().ExportPolicyExists(ctx, policyName).Return(true, nil)
3525+
mockAPI.EXPECT().ExportPolicyDestroy(ctx, policyName).Return(nil)
3526+
3527+
err := destroyExportPolicy(ctx, policyName, mockAPI)
3528+
3529+
assert.NoError(t, err)
3530+
}
3531+
3532+
func TestDestroyExportPolicy_PolicyDoesNotExist_Idempotent(t *testing.T) {
3533+
ctx := context.Background()
3534+
mockCtrl := gomock.NewController(t)
3535+
mockAPI := mockapi.NewMockOntapAPI(mockCtrl)
3536+
policyName := "testPolicy"
3537+
3538+
// Policy doesn't exist - should return success (idempotent)
3539+
mockAPI.EXPECT().ExportPolicyExists(ctx, policyName).Return(false, nil)
3540+
// ExportPolicyDestroy should NOT be called
3541+
3542+
err := destroyExportPolicy(ctx, policyName, mockAPI)
3543+
3544+
assert.NoError(t, err)
3545+
}
3546+
3547+
func TestDestroyExportPolicy_ExistenceCheckError(t *testing.T) {
3548+
ctx := context.Background()
3549+
mockCtrl := gomock.NewController(t)
3550+
mockAPI := mockapi.NewMockOntapAPI(mockCtrl)
3551+
policyName := "testPolicy"
3552+
3553+
// Error checking existence - should return error
3554+
expectedErr := errors.New("API error checking policy existence")
3555+
mockAPI.EXPECT().ExportPolicyExists(ctx, policyName).Return(false, expectedErr)
3556+
// ExportPolicyDestroy should NOT be called
3557+
3558+
err := destroyExportPolicy(ctx, policyName, mockAPI)
3559+
3560+
assert.Error(t, err)
3561+
assert.Equal(t, expectedErr, err)
3562+
}
3563+
3564+
func TestDestroyExportPolicy_DestroyError(t *testing.T) {
3565+
ctx := context.Background()
3566+
mockCtrl := gomock.NewController(t)
3567+
mockAPI := mockapi.NewMockOntapAPI(mockCtrl)
3568+
policyName := "testPolicy"
3569+
3570+
// Policy exists but delete fails
3571+
expectedErr := errors.New("error destroying export policy")
3572+
mockAPI.EXPECT().ExportPolicyExists(ctx, policyName).Return(true, nil)
3573+
mockAPI.EXPECT().ExportPolicyDestroy(ctx, policyName).Return(expectedErr)
3574+
3575+
err := destroyExportPolicy(ctx, policyName, mockAPI)
3576+
3577+
assert.Error(t, err)
3578+
assert.Equal(t, expectedErr, err)
3579+
}
3580+
35153581
func TestReconcileExportPolicyRules_AllRulesExist(t *testing.T) {
35163582
ctx := context.TODO()
35173583
policyName := "testPolicy"
@@ -4507,6 +4573,149 @@ func TestEnsureNodeAccessForPolicy_OverlappingStringInIPs(t *testing.T) {
45074573
assert.NoError(t, err, "expected no error")
45084574
}
45094575

4576+
func TestEnsureNodeAccessForPolicyAndApply_Success(t *testing.T) {
4577+
ctx := context.Background()
4578+
ontapConfig := newOntapStorageDriverConfig()
4579+
policyName := "trident-fakeUUID"
4580+
mockCtrl := gomock.NewController(t)
4581+
mockAPI := mockapi.NewMockOntapAPI(mockCtrl)
4582+
4583+
nodeIP := "1.1.1.1"
4584+
nodes := make([]*tridentmodels.Node, 0)
4585+
nodes = append(nodes, &tridentmodels.Node{Name: "node1", IPs: []string{nodeIP}})
4586+
4587+
mockAPI.EXPECT().ExportPolicyExists(ctx, policyName).Times(1).Return(true, nil)
4588+
ruleListCall := mockAPI.EXPECT().ExportRuleList(gomock.Any(), policyName).Return(make(map[int]string), nil)
4589+
mockAPI.EXPECT().ExportRuleCreate(gomock.Any(), gomock.Any(), nodeIP, gomock.Any()).After(ruleListCall).Return(nil)
4590+
4591+
ontapConfig.AutoExportPolicy = true
4592+
ontapConfig.AutoExportCIDRs = []string{"0.0.0.0/0"}
4593+
4594+
// applyPolicy callback should be called and return success
4595+
applyPolicyCalled := false
4596+
applyPolicy := func() error {
4597+
applyPolicyCalled = true
4598+
return nil
4599+
}
4600+
4601+
err := ensureNodeAccessForPolicyAndApply(ctx, nodes[0], mockAPI, ontapConfig, policyName, applyPolicy)
4602+
assert.NoError(t, err)
4603+
assert.True(t, applyPolicyCalled, "applyPolicy callback should have been called")
4604+
}
4605+
4606+
func TestEnsureNodeAccessForPolicyAndApply_ApplyPolicyError(t *testing.T) {
4607+
ctx := context.Background()
4608+
ontapConfig := newOntapStorageDriverConfig()
4609+
policyName := "trident-fakeUUID"
4610+
mockCtrl := gomock.NewController(t)
4611+
mockAPI := mockapi.NewMockOntapAPI(mockCtrl)
4612+
4613+
nodeIP := "1.1.1.1"
4614+
nodes := make([]*tridentmodels.Node, 0)
4615+
nodes = append(nodes, &tridentmodels.Node{Name: "node1", IPs: []string{nodeIP}})
4616+
4617+
mockAPI.EXPECT().ExportPolicyExists(ctx, policyName).Times(1).Return(true, nil)
4618+
ruleListCall := mockAPI.EXPECT().ExportRuleList(gomock.Any(), policyName).Return(make(map[int]string), nil)
4619+
mockAPI.EXPECT().ExportRuleCreate(gomock.Any(), gomock.Any(), nodeIP, gomock.Any()).After(ruleListCall).Return(nil)
4620+
4621+
ontapConfig.AutoExportPolicy = true
4622+
ontapConfig.AutoExportCIDRs = []string{"0.0.0.0/0"}
4623+
4624+
// applyPolicy callback returns an error
4625+
applyPolicyErr := errors.New("error setting export policy on volume")
4626+
applyPolicy := func() error {
4627+
return applyPolicyErr
4628+
}
4629+
4630+
err := ensureNodeAccessForPolicyAndApply(ctx, nodes[0], mockAPI, ontapConfig, policyName, applyPolicy)
4631+
assert.Error(t, err)
4632+
assert.Equal(t, applyPolicyErr, err, "error from applyPolicy should be propagated")
4633+
}
4634+
4635+
func TestEnsureNodeAccessForPolicyAndApply_NilCallback(t *testing.T) {
4636+
ctx := context.Background()
4637+
ontapConfig := newOntapStorageDriverConfig()
4638+
policyName := "trident-fakeUUID"
4639+
mockCtrl := gomock.NewController(t)
4640+
mockAPI := mockapi.NewMockOntapAPI(mockCtrl)
4641+
4642+
nodeIP := "1.1.1.1"
4643+
nodes := make([]*tridentmodels.Node, 0)
4644+
nodes = append(nodes, &tridentmodels.Node{Name: "node1", IPs: []string{nodeIP}})
4645+
4646+
mockAPI.EXPECT().ExportPolicyExists(ctx, policyName).Times(1).Return(true, nil)
4647+
ruleListCall := mockAPI.EXPECT().ExportRuleList(gomock.Any(), policyName).Return(make(map[int]string), nil)
4648+
mockAPI.EXPECT().ExportRuleCreate(gomock.Any(), gomock.Any(), nodeIP, gomock.Any()).After(ruleListCall).Return(nil)
4649+
4650+
ontapConfig.AutoExportPolicy = true
4651+
ontapConfig.AutoExportCIDRs = []string{"0.0.0.0/0"}
4652+
4653+
// nil applyPolicy callback should work without error
4654+
err := ensureNodeAccessForPolicyAndApply(ctx, nodes[0], mockAPI, ontapConfig, policyName, nil)
4655+
assert.NoError(t, err)
4656+
}
4657+
4658+
func TestEnsureNodeAccessForPolicyAndApply_EarlyError_ApplyPolicyNotCalled(t *testing.T) {
4659+
ctx := context.Background()
4660+
ontapConfig := newOntapStorageDriverConfig()
4661+
policyName := "trident-fakeUUID"
4662+
mockCtrl := gomock.NewController(t)
4663+
mockAPI := mockapi.NewMockOntapAPI(mockCtrl)
4664+
4665+
nodeIP := "1.1.1.1"
4666+
nodes := make([]*tridentmodels.Node, 0)
4667+
nodes = append(nodes, &tridentmodels.Node{Name: "node1", IPs: []string{nodeIP}})
4668+
4669+
// ExportPolicyExists returns an error
4670+
expectedErr := errors.New("API error checking policy")
4671+
mockAPI.EXPECT().ExportPolicyExists(ctx, policyName).Times(1).Return(false, expectedErr)
4672+
4673+
ontapConfig.AutoExportPolicy = true
4674+
ontapConfig.AutoExportCIDRs = []string{"0.0.0.0/0"}
4675+
4676+
// applyPolicy callback should NOT be called when an earlier step fails
4677+
applyPolicyCalled := false
4678+
applyPolicy := func() error {
4679+
applyPolicyCalled = true
4680+
return nil
4681+
}
4682+
4683+
err := ensureNodeAccessForPolicyAndApply(ctx, nodes[0], mockAPI, ontapConfig, policyName, applyPolicy)
4684+
assert.Error(t, err)
4685+
assert.False(t, applyPolicyCalled, "applyPolicy callback should NOT have been called on early error")
4686+
}
4687+
4688+
func TestEnsureNodeAccessForPolicyAndApply_PolicyCreatedAndApplied(t *testing.T) {
4689+
ctx := context.Background()
4690+
ontapConfig := newOntapStorageDriverConfig()
4691+
policyName := "trident-fakeUUID"
4692+
mockCtrl := gomock.NewController(t)
4693+
mockAPI := mockapi.NewMockOntapAPI(mockCtrl)
4694+
4695+
nodeIP := "1.1.1.1"
4696+
nodes := make([]*tridentmodels.Node, 0)
4697+
nodes = append(nodes, &tridentmodels.Node{Name: "node1", IPs: []string{nodeIP}})
4698+
4699+
// Policy doesn't exist, should be created
4700+
mockAPI.EXPECT().ExportPolicyExists(ctx, policyName).Times(1).Return(false, nil)
4701+
mockAPI.EXPECT().ExportPolicyCreate(ctx, policyName).Times(1).Return(nil)
4702+
ruleListCall := mockAPI.EXPECT().ExportRuleList(gomock.Any(), policyName).Return(make(map[int]string), nil)
4703+
mockAPI.EXPECT().ExportRuleCreate(gomock.Any(), gomock.Any(), nodeIP, gomock.Any()).After(ruleListCall).Return(nil)
4704+
4705+
ontapConfig.AutoExportPolicy = true
4706+
ontapConfig.AutoExportCIDRs = []string{"0.0.0.0/0"}
4707+
4708+
applyPolicyCalled := false
4709+
applyPolicy := func() error {
4710+
applyPolicyCalled = true
4711+
return nil
4712+
}
4713+
4714+
err := ensureNodeAccessForPolicyAndApply(ctx, nodes[0], mockAPI, ontapConfig, policyName, applyPolicy)
4715+
assert.NoError(t, err)
4716+
assert.True(t, applyPolicyCalled, "applyPolicy callback should have been called after policy creation")
4717+
}
4718+
45104719
func TestIsDefaultAuthTypeOfType(t *testing.T) {
45114720
response := api.IscsiInitiatorAuth{
45124721
AuthType: "fakeAuthType",

0 commit comments

Comments
 (0)