Skip to content

Commit cdae514

Browse files
authored
Retry LunGetByName for SAN Economy volume import (REST R-A-W)
After LUN create, ONTAP REST LUN collection can briefly return num_records=0. Introduce shared LunGetter-based wait with exponential backoff (retry only NotFound). Wire SAN and SAN economy create, clone, import, and GetVolumeForImport paths; drop driver-local lunGetByNameWithRetry. Add unit tests for the helper.
1 parent ef0b081 commit cdae514

7 files changed

Lines changed: 319 additions & 22 deletions

File tree

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
// Copyright 2026 NetApp, Inc. All Rights Reserved.
2+
3+
package api
4+
5+
import (
6+
"context"
7+
stderrors "errors"
8+
"fmt"
9+
"time"
10+
11+
"github.com/cenkalti/backoff/v4"
12+
13+
. "github.com/netapp/trident/logging"
14+
"github.com/netapp/trident/utils/errors"
15+
)
16+
17+
const (
18+
waitForLunInitialInterval = 100 * time.Millisecond
19+
waitForLunMaxInterval = 2 * time.Second
20+
waitForLunMaxElapsed = 30 * time.Second
21+
waitForLunMultiplier = 2
22+
waitForLunRandomization = 0.2
23+
)
24+
25+
// LunGetter is the minimal surface needed for WaitForLunToExist. OntapAPI satisfies it.
26+
type LunGetter interface {
27+
LunGetByName(ctx context.Context, name string) (*Lun, error)
28+
}
29+
30+
// WaitForLunToExist calls LunGetByName until the LUN at lunPath is visible, ctx is done, or attempts
31+
// exceed the backoff budget. Only errors.IsNotFoundError results are retried (e.g. ONTAP REST
32+
// read-after-write where the LUN collection is briefly empty right after create). Other errors fail
33+
// immediately.
34+
func WaitForLunToExist(ctx context.Context, o LunGetter, lunPath string) (*Lun, error) {
35+
var found *Lun
36+
operation := func() error {
37+
lun, err := o.LunGetByName(ctx, lunPath)
38+
if err != nil {
39+
if errors.IsNotFoundError(err) {
40+
return err
41+
}
42+
return backoff.Permanent(err)
43+
}
44+
if lun == nil {
45+
// Do not retry: a missing result without NotFoundError is unexpected API behavior.
46+
return backoff.Permanent(errors.NotFoundError("LUN %s not found", lunPath))
47+
}
48+
found = lun
49+
return nil
50+
}
51+
notify := func(err error, d time.Duration) {
52+
Logc(ctx).WithFields(LogFields{
53+
"lunPath": lunPath,
54+
"increment": d,
55+
}).Debug("LUN not visible yet, retrying.")
56+
}
57+
bo := backoff.NewExponentialBackOff()
58+
bo.InitialInterval = waitForLunInitialInterval
59+
bo.MaxInterval = waitForLunMaxInterval
60+
bo.Multiplier = waitForLunMultiplier
61+
bo.RandomizationFactor = waitForLunRandomization
62+
bo.MaxElapsedTime = waitForLunMaxElapsed
63+
if deadline, ok := ctx.Deadline(); ok {
64+
if remaining := time.Until(deadline); remaining > 0 && remaining < bo.MaxElapsedTime {
65+
bo.MaxElapsedTime = remaining
66+
}
67+
}
68+
if err := backoff.RetryNotify(operation, backoff.WithContext(bo, ctx), notify); err != nil {
69+
var perm *backoff.PermanentError
70+
if stderrors.As(err, &perm) {
71+
return nil, perm.Err
72+
}
73+
if stderrors.Is(err, context.Canceled) || stderrors.Is(err, context.DeadlineExceeded) {
74+
return nil, fmt.Errorf("waiting for LUN %s interrupted: %w", lunPath, err)
75+
}
76+
if ctxErr := ctx.Err(); ctxErr != nil {
77+
return nil, fmt.Errorf("waiting for LUN %s interrupted: %w", lunPath, ctxErr)
78+
}
79+
return nil, fmt.Errorf("timed out waiting for LUN %s: %w", lunPath, err)
80+
}
81+
return found, nil
82+
}
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
// Copyright 2026 NetApp, Inc. All Rights Reserved.
2+
3+
package api
4+
5+
import (
6+
"context"
7+
"errors"
8+
"testing"
9+
"time"
10+
11+
"github.com/stretchr/testify/assert"
12+
13+
terr "github.com/netapp/trident/utils/errors"
14+
)
15+
16+
type seqLunGetter struct {
17+
responses []struct {
18+
lun *Lun
19+
err error
20+
}
21+
i int
22+
}
23+
24+
func (s *seqLunGetter) LunGetByName(ctx context.Context, name string) (*Lun, error) {
25+
if s.i >= len(s.responses) {
26+
return nil, terr.NotFoundError("stub exhausted")
27+
}
28+
r := s.responses[s.i]
29+
s.i++
30+
return r.lun, r.err
31+
}
32+
33+
func TestWaitForLunToExist_RetriesNotFoundThenSucceeds(t *testing.T) {
34+
g := &seqLunGetter{
35+
responses: []struct {
36+
lun *Lun
37+
err error
38+
}{
39+
{nil, terr.NotFoundError("not found")},
40+
{&Lun{Name: "/vol/v/lun0", Size: "1073741824"}, nil},
41+
},
42+
}
43+
ctx := context.Background()
44+
lun, err := WaitForLunToExist(ctx, g, "/vol/v/lun0")
45+
assert.NoError(t, err)
46+
assert.NotNil(t, lun)
47+
assert.Equal(t, "/vol/v/lun0", lun.Name)
48+
assert.Equal(t, 2, g.i)
49+
}
50+
51+
func TestWaitForLunToExist_NonNotFoundFailsImmediately(t *testing.T) {
52+
g := &seqLunGetter{
53+
responses: []struct {
54+
lun *Lun
55+
err error
56+
}{
57+
{nil, errors.New("rpc failed")},
58+
},
59+
}
60+
ctx := context.Background()
61+
lun, err := WaitForLunToExist(ctx, g, "/vol/v/lun0")
62+
assert.Error(t, err)
63+
assert.Nil(t, lun)
64+
assert.Equal(t, 1, g.i)
65+
}
66+
67+
func TestWaitForLunToExist_NilLunWithoutErrorFailsImmediately(t *testing.T) {
68+
g := &seqLunGetter{
69+
responses: []struct {
70+
lun *Lun
71+
err error
72+
}{
73+
{nil, nil},
74+
},
75+
}
76+
ctx := context.Background()
77+
lun, err := WaitForLunToExist(ctx, g, "/vol/v/lun0")
78+
assert.Error(t, err)
79+
assert.Nil(t, lun)
80+
assert.Equal(t, 1, g.i)
81+
}
82+
83+
func TestWaitForLunToExist_ContextTimeout(t *testing.T) {
84+
g := &seqLunGetter{
85+
responses: []struct {
86+
lun *Lun
87+
err error
88+
}{
89+
{nil, terr.NotFoundError("not found")},
90+
},
91+
}
92+
ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
93+
defer cancel()
94+
lun, err := WaitForLunToExist(ctx, g, "/vol/v/lun0")
95+
assert.Error(t, err)
96+
assert.Nil(t, lun)
97+
assert.GreaterOrEqual(t, g.i, 1)
98+
}
99+
100+
func TestWaitForLunToExist_ContextCancelledBeforeRetry(t *testing.T) {
101+
g := &seqLunGetter{
102+
responses: []struct {
103+
lun *Lun
104+
err error
105+
}{
106+
{nil, terr.NotFoundError("not found")},
107+
},
108+
}
109+
ctx, cancel := context.WithCancel(context.Background())
110+
cancel()
111+
lun, err := WaitForLunToExist(ctx, g, "/vol/v/lun0")
112+
assert.Error(t, err)
113+
assert.ErrorContains(t, err, "interrupted")
114+
assert.ErrorIs(t, err, context.Canceled)
115+
assert.Nil(t, lun)
116+
}

storage_drivers/ontap/ontap_san.go

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,26 @@ func (d *SANStorageDriver) Create(
608608
continue
609609
}
610610

611+
if _, err = api.WaitForLunToExist(ctx, d.API, lunPath); err != nil {
612+
errMessage := fmt.Sprintf(
613+
"ONTAP-SAN pool %s/%s; LUN %s not visible after create: %v",
614+
storagePool.Name(), aggregate, name, err,
615+
)
616+
Logc(ctx).Error(errMessage)
617+
createErrors = append(createErrors, errors.New(errMessage))
618+
if err := d.API.LunDestroy(ctx, lunPath); err != nil {
619+
Logc(ctx).WithField("LUN", lunPath).Errorf("Could not clean up LUN; %v", err)
620+
} else {
621+
Logc(ctx).WithField("volume", name).Debugf("Cleaned up LUN after wait error.")
622+
}
623+
if err := d.API.VolumeDestroy(ctx, name, true, true); err != nil {
624+
Logc(ctx).WithField("volume", name).Errorf("Could not clean up volume; %v", err)
625+
} else {
626+
Logc(ctx).WithField("volume", name).Debugf("Cleaned up volume after LUN wait error.")
627+
}
628+
continue
629+
}
630+
611631
// Save the fstype in a LUN attribute so we know what to do in Attach. If this fails, clean up and
612632
// move on to the next pool.
613633
// Save the context, fstype, LUKS value, and pool name in LUN comment
@@ -777,12 +797,10 @@ func (d *SANStorageDriver) Import(ctx context.Context, volConfig *storage.Volume
777797
volConfig.LUKSEncryption = d.Config.LUKSEncryption
778798
}
779799

780-
// Ensure the volume has only one LUN
781-
lunInfo, err := d.API.LunGetByName(ctx, "/vol/"+originalName+"/*")
800+
// Ensure the volume has only one LUN (retry REST read-after-not-found when the LUN was just created).
801+
lunInfo, err := api.WaitForLunToExist(ctx, d.API, "/vol/"+originalName+"/*")
782802
if err != nil {
783803
return err
784-
} else if lunInfo == nil {
785-
return fmt.Errorf("lun not found in volume %s", originalName)
786804
}
787805
targetPath := "/vol/" + originalName + "/lun0"
788806

@@ -1394,7 +1412,7 @@ func (d *SANStorageDriver) GetVolumeForImport(ctx context.Context, volumeID stri
13941412
}
13951413

13961414
lunPath := fmt.Sprintf("/vol/%v/*", volumeID)
1397-
lunAttrs, err := d.API.LunGetByName(ctx, lunPath)
1415+
lunAttrs, err := api.WaitForLunToExist(ctx, d.API, lunPath)
13981416
if err != nil {
13991417
return nil, err
14001418
}

storage_drivers/ontap/ontap_san_economy.go

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -728,6 +728,29 @@ func (d *SANEconomyStorageDriver) Create(
728728
break // Try next aggregate
729729
}
730730

731+
if _, waitErr := api.WaitForLunToExist(ctx, d.API, lunPathEco); waitErr != nil {
732+
errMessage := fmt.Sprintf(
733+
"ONTAP-SAN-ECONOMY pool %s/%s; LUN %s/%s not visible after create: %v",
734+
storagePool.Name(), aggregate, bucketVol, name, waitErr,
735+
)
736+
Logc(ctx).Error(errMessage)
737+
createErrors = append(createErrors, errors.New(errMessage))
738+
if err := d.API.LunDestroy(ctx, lunPathEco); err != nil {
739+
Logc(ctx).WithField("LUN", lunPathEco).WithError(err).Error("Could not clean up LUN.")
740+
} else {
741+
Logc(ctx).WithField("volume", name).Debugf("Cleaned up LUN after wait error.")
742+
}
743+
if newVol {
744+
if err := d.API.VolumeDestroy(ctx, bucketVol, true, true); err != nil {
745+
Logc(ctx).WithField("volume", bucketVol).WithError(err).Error("Could not clean up volume.")
746+
} else {
747+
Logc(ctx).WithField("volume", name).Debugf("Cleaned up volume after LUN wait error.")
748+
}
749+
}
750+
lockedFlexvol.Unlock()
751+
break
752+
}
753+
731754
// LUN created successfully
732755
lunCreated = true
733756
}
@@ -891,6 +914,9 @@ func (d *SANEconomyStorageDriver) cloneLUNFromSnapshot(
891914
if err := d.API.LunCloneCreate(ctx, bucketVolume, sourcePath, lunName, qosPolicyGroup); err != nil {
892915
return "", err
893916
}
917+
if _, err := api.WaitForLunToExist(ctx, d.API, GetLUNPathEconomy(bucketVolume, lunName)); err != nil {
918+
return "", err
919+
}
894920
internalID := d.CreateLUNInternalID(d.Config.SVM, bucketVolume, lunName)
895921

896922
// Grow or shrink the FlexVol as needed (lock is still held)
@@ -977,6 +1003,10 @@ func (d *SANEconomyStorageDriver) createLUNClone(
9771003
return "", err
9781004
}
9791005

1006+
if _, err = api.WaitForLunToExist(ctx, client, GetLUNPathEconomy(flexvol, lunName)); err != nil {
1007+
return "", err
1008+
}
1009+
9801010
internalID := d.CreateLUNInternalID(config.SVM, flexvol, lunName)
9811011

9821012
// Grow or shrink the Flexvol as needed (lock is still held)
@@ -1027,12 +1057,10 @@ func (d *SANEconomyStorageDriver) Import(
10271057
volConfig.LUKSEncryption = d.Config.LUKSEncryption
10281058
}
10291059

1030-
// Ensure LUN found.
1031-
extantLUN, err := d.API.LunGetByName(ctx, "/vol/"+originalFlexvolName+"/"+originalLUNName)
1060+
// Ensure LUN found (retry REST read-after-not-found when the LUN is briefly invisible).
1061+
extantLUN, err := api.WaitForLunToExist(ctx, d.API, "/vol/"+originalFlexvolName+"/"+originalLUNName)
10321062
if err != nil {
10331063
return err
1034-
} else if extantLUN == nil {
1035-
return errors.NotFoundError("LUN %s not found in volume %s", originalLUNName, originalFlexvolName)
10361064
}
10371065
Logc(ctx).WithField("LUN", extantLUN.Name).Trace("Import - LUN found.")
10381066

@@ -2037,7 +2065,7 @@ func (d *SANEconomyStorageDriver) ProcessGroupSnapshot(
20372065
// Pull the size directly from the cloned LUN.
20382066
lunClonePath := GetLUNPathEconomy(bucketVol, lunCloneName)
20392067
fields["lunClonePath"] = lunClonePath
2040-
lunInfo, err := d.API.LunGetByName(ctx, lunClonePath)
2068+
lunInfo, err := api.WaitForLunToExist(ctx, d.API, lunClonePath)
20412069
if err != nil {
20422070
Logc(ctx).WithFields(fields).WithError(err).Error("Failed to find LUN clone from snapshot.")
20432071
errs = errors.Join(errs, err)
@@ -2523,7 +2551,7 @@ func (d *SANEconomyStorageDriver) GetVolumeForImport(
25232551
}
25242552

25252553
ecoLUNPath := GetLUNPathEconomy(originalFlexvolName, originalLUNName)
2526-
extantLUN, err := d.API.LunGetByName(ctx, ecoLUNPath)
2554+
extantLUN, err := api.WaitForLunToExist(ctx, d.API, ecoLUNPath)
25272555
if err != nil {
25282556
return nil, err
25292557
}

storage_drivers/ontap/ontap_san_economy_concurrency_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,8 @@ func TestSANEco_CloneLUN_ConcurrentSameSource(t *testing.T) {
272272
},
273273
).Times(numClones)
274274

275+
mockAPI.EXPECT().LunGetByName(ctx, gomock.Any()).Return(&api.Lun{Size: "1073741824"}, nil).AnyTimes()
276+
275277
// Mock: VolumeInfo for resize checks (called after each clone)
276278
mockAPI.EXPECT().VolumeInfo(ctx, bucketName).Return(&api.Volume{
277279
Size: "1073741824", // 1GB
@@ -350,6 +352,8 @@ func TestSANEco_CloneLUN_ConcurrentDifferentBuckets(t *testing.T) {
350352
},
351353
).Times(numBuckets)
352354

355+
mockAPI.EXPECT().LunGetByName(ctx, gomock.Any()).Return(&api.Lun{Size: "1073741824"}, nil).AnyTimes()
356+
353357
// Mock: VolumeInfo for each bucket
354358
mockAPI.EXPECT().VolumeInfo(ctx, gomock.Any()).Return(&api.Volume{
355359
Size: "1073741824",
@@ -426,6 +430,8 @@ func TestSANEco_CloneLUN_ConcurrentSameBucket(t *testing.T) {
426430
},
427431
).Times(numClones)
428432

433+
mockAPI.EXPECT().LunGetByName(ctx, gomock.Any()).Return(&api.Lun{Size: "1073741824"}, nil).AnyTimes()
434+
429435
// Mock: VolumeInfo for resize
430436
mockAPI.EXPECT().VolumeInfo(ctx, bucketName).Return(&api.Volume{
431437
Size: "1073741824",

0 commit comments

Comments
 (0)