Skip to content

Commit e92b60c

Browse files
authored
Fix NVMe namespace race condition during concurrent volume creation
1 parent 22fe861 commit e92b60c

File tree

2 files changed

+437
-41
lines changed

2 files changed

+437
-41
lines changed

storage_drivers/ontap/ontap_san_nvme.go

Lines changed: 147 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ var NVMeNamespaceRegExp = regexp.MustCompile(`[^(\/vol\/.+\/.+)?$]`)
4242
var (
4343
beforeNQNRemovalFromSubsystem = fiji.Register("beforeNQNRemovalFromSubsystem", "ontap_san_nvme")
4444
beforeNamespaceUnmapFromSubsystem = fiji.Register("beforeNamespaceUnmapFromSubsystem", "ontap_san_nvme")
45+
beforeNamespaceCreate = fiji.Register("beforeNamespaceCreate", "ontap_san_nvme")
46+
afterNamespaceCreate = fiji.Register("afterNamespaceCreate", "ontap_san_nvme")
4547
)
4648

4749
// NVMeStorageDriver is for NVMe storage provisioning.
@@ -267,36 +269,138 @@ func (d *NVMeStorageDriver) validate(ctx context.Context) error {
267269
}
268270

269271
// Create a Volume+Namespace with the specified options.
270-
func (d *NVMeStorageDriver) Create(
271-
ctx context.Context, volConfig *storage.VolumeConfig, storagePool storage.Pool, volAttributes map[string]sa.Request,
272+
// cleanupIncompleteNamespace checks if a FlexVol exists without its NVMe namespace.
273+
// Returns (true, nil) if both volume and namespace exist (complete).
274+
// Returns (false, nil) if no volume exists or orphaned volume was cleaned up.
275+
// Returns (_, error) on API errors or if cleanup fails.
276+
func (d *NVMeStorageDriver) cleanupIncompleteNamespace(
277+
ctx context.Context, volConfig *storage.VolumeConfig,
272278
) error {
273279
name := volConfig.InternalName
274280

275281
fields := LogFields{
276-
"method": "Create",
282+
"method": "cleanupIncompleteNamespace",
277283
"type": "NVMeStorageDriver",
278284
"name": name,
279-
"attrs": volAttributes,
280285
}
281-
Logd(ctx, d.Name(), d.Config.DebugTraceFlags["method"]).WithFields(fields).Trace(">>>> Create")
282-
defer Logd(ctx, d.Name(), d.Config.DebugTraceFlags["method"]).WithFields(fields).Trace("<<<< Create")
286+
Logd(ctx, d.Name(), d.Config.DebugTraceFlags["method"]).WithFields(fields).Trace(
287+
">>>> cleanupIncompleteNamespace")
288+
defer Logd(ctx, d.Name(), d.Config.DebugTraceFlags["method"]).WithFields(fields).Trace(
289+
"<<<< cleanupIncompleteNamespace")
283290

284-
// If the volume already exists, bail out.
285291
volExists, err := d.API.VolumeExists(ctx, name)
286292
if err != nil {
287293
return fmt.Errorf("error checking for existing volume: %v", err)
288294
}
289-
if volExists {
295+
if !volExists {
296+
return nil
297+
}
298+
if volConfig.IsMirrorDestination {
290299
return drivers.NewVolumeExistsError(name)
291300
}
292301

293-
// If volume shall be mirrored, check that the SVM is peered with the other side.
294-
if volConfig.PeerVolumeHandle != "" {
295-
if err = checkSVMPeered(ctx, volConfig, d.API.SVMName(), d.API); err != nil {
296-
return err
302+
nsName := extractNamespaceName(volConfig.InternalID)
303+
nsPath := createNamespacePath(name, nsName)
304+
305+
ns, nsErr := d.API.NVMeNamespaceGetByName(ctx, nsPath)
306+
if nsErr != nil && !errors.IsNotFoundError(nsErr) {
307+
return fmt.Errorf("error checking for existing namespace %s: %v", nsPath, nsErr)
308+
}
309+
310+
// If both volume and namespace exist and volConfig has all critical fields populated,
311+
// this is a fully formed volume from a prior successful create -- report it as existing.
312+
if ns != nil && volConfig.FileSystem != "" {
313+
Logc(ctx).WithFields(LogFields{
314+
"volume": name,
315+
"namespace": nsPath,
316+
"uuid": ns.UUID,
317+
}).Debug("Found existing volume and namespace.")
318+
volConfig.AccessInfo.NVMeNamespaceUUID = ns.UUID
319+
volConfig.InternalID = nsPath
320+
return drivers.NewVolumeExistsError(name)
321+
}
322+
323+
// Either the namespace is missing (partial create) or volConfig is incomplete (e.g. empty
324+
// FileSystem from a fresh CSI retry). In both cases, destroy the FlexVol so Create can
325+
// re-create it with a fully populated volConfig.
326+
reason := "namespace missing"
327+
if ns != nil {
328+
reason = "volConfig.FileSystem is empty"
329+
}
330+
Logc(ctx).WithFields(LogFields{
331+
"volume": name,
332+
"reason": reason,
333+
}).Warning("Cleaning up incomplete volume to allow re-creation.")
334+
// Destroying the FlexVol is safe: ONTAP implicitly removes any contained namespaces,
335+
// and no pod can be using this volume yet since Create has not returned success.
336+
if err = d.API.VolumeDestroy(ctx, name, true, true); err != nil {
337+
return fmt.Errorf("could not clean up incomplete volume %s: %v", name, err)
338+
}
339+
return nil
340+
}
341+
342+
// getNamespaceWithRetry retrieves a namespace by path, retrying with linear backoff to handle
343+
// ONTAP eventual consistency where a namespace may not be visible immediately after creation.
344+
func (d *NVMeStorageDriver) getNamespaceWithRetry(
345+
ctx context.Context, nsPath string,
346+
) (*api.NVMeNamespace, error) {
347+
const maxRetries = 3
348+
var ns *api.NVMeNamespace
349+
var err error
350+
351+
for attempt := 1; attempt <= maxRetries; attempt++ {
352+
ns, err = d.API.NVMeNamespaceGetByName(ctx, nsPath)
353+
if err == nil && ns != nil {
354+
return ns, nil
355+
}
356+
357+
if attempt < maxRetries {
358+
Logc(ctx).WithFields(LogFields{
359+
"namespace": nsPath,
360+
"attempt": attempt,
361+
"error": err,
362+
}).Trace("Namespace not yet visible after creation, retrying.")
363+
364+
select {
365+
case <-time.After(time.Duration(attempt) * time.Second):
366+
case <-ctx.Done():
367+
return nil, fmt.Errorf("context cancelled while waiting for namespace visibility: %v", ctx.Err())
368+
}
369+
} else {
370+
Logc(ctx).WithFields(LogFields{
371+
"namespace": nsPath,
372+
"attempt": attempt,
373+
"error": err,
374+
}).Error("Namespace not visible after all retries.")
297375
}
298376
}
299377

378+
if err != nil {
379+
return nil, fmt.Errorf("error retrieving namespace %s after creation: %v", nsPath, err)
380+
}
381+
return nil, fmt.Errorf("newly created namespace %s not found", nsPath)
382+
}
383+
384+
func (d *NVMeStorageDriver) Create(
385+
ctx context.Context, volConfig *storage.VolumeConfig, storagePool storage.Pool, volAttributes map[string]sa.Request,
386+
) error {
387+
name := volConfig.InternalName
388+
389+
fields := LogFields{
390+
"method": "Create",
391+
"type": "NVMeStorageDriver",
392+
"name": name,
393+
"attrs": volAttributes,
394+
}
395+
Logd(ctx, d.Name(), d.Config.DebugTraceFlags["method"]).WithFields(fields).Trace(">>>> Create")
396+
defer Logd(ctx, d.Name(), d.Config.DebugTraceFlags["method"]).WithFields(fields).Trace("<<<< Create")
397+
398+
// Check if volume+namespace already exist. Clean up orphaned FlexVols (volume without namespace).
399+
// Returns VolumeExistsError if both are present, nil if creation should proceed.
400+
if err := d.cleanupIncompleteNamespace(ctx, volConfig); err != nil {
401+
return err
402+
}
403+
300404
// Get candidate physical pools.
301405
physicalPools, err := getPoolsForCreate(ctx, volConfig, storagePool, volAttributes, d.physicalPools, d.virtualPools)
302406
if err != nil {
@@ -403,6 +507,13 @@ func (d *NVMeStorageDriver) Create(
403507
volConfig.LUKSEncryption = luksEncryption
404508
volConfig.FileSystem = fstype
405509

510+
// If volume shall be mirrored, check that the SVM is peered with the other side.
511+
if volConfig.PeerVolumeHandle != "" {
512+
if err = checkSVMPeered(ctx, volConfig, d.API.SVMName(), d.API); err != nil {
513+
return err
514+
}
515+
}
516+
406517
Logc(ctx).WithFields(LogFields{
407518
"name": name,
408519
"namespaceSize": namespaceSize,
@@ -472,20 +583,20 @@ func (d *NVMeStorageDriver) Create(
472583
DPVolume: volConfig.IsMirrorDestination,
473584
})
474585
if err != nil {
475-
if api.IsVolumeCreateJobExistsError(err) {
476-
// TODO(sphadnis): If it was decided that iSCSI has a bug here, make similar changes for NVMe.
477-
return nil
478-
}
479-
480-
errMessage := fmt.Sprintf(
481-
"ONTAP-NVMe pool %s/%s; error creating volume %s: %v", storagePool.Name(),
482-
aggregate, name, err,
483-
)
484-
Logc(ctx).Error(errMessage)
485-
createErrors = append(createErrors, errors.New(errMessage))
586+
if !api.IsVolumeCreateJobExistsError(err) {
587+
errMessage := fmt.Sprintf(
588+
"ONTAP-NVMe pool %s/%s; error creating volume %s: %v", storagePool.Name(),
589+
aggregate, name, err,
590+
)
591+
Logc(ctx).Error(errMessage)
592+
createErrors = append(createErrors, errors.New(errMessage))
486593

487-
// Move on to the next pool.
488-
continue
594+
// Move on to the next pool.
595+
continue
596+
}
597+
// Log a message. Proceed to create Namespace, hoping volume would have been created by the time
598+
// we send Namespace create request.
599+
Logc(ctx).WithField("volume", name).Debug("Volume create is already in progress.")
489600
}
490601

491602
osType := "linux"
@@ -509,6 +620,10 @@ func (d *NVMeStorageDriver) Create(
509620
return err
510621
}
511622

623+
if err := beforeNamespaceCreate.Inject(); err != nil {
624+
return err
625+
}
626+
512627
// Create namespace. If this fails, clean up and move on to the next pool.
513628
err = d.API.NVMeNamespaceCreate(
514629
ctx, api.NVMeNamespace{
@@ -537,17 +652,11 @@ func (d *NVMeStorageDriver) Create(
537652
continue
538653
}
539654

540-
// Get the newly created namespace and save the UUID
541-
newNamespace, err := d.API.NVMeNamespaceGetByName(ctx, nsPath)
655+
newNamespace, err := d.getNamespaceWithRetry(ctx, nsPath)
542656
if err != nil {
543-
return fmt.Errorf("failure checking for existence of volume: %v", err)
544-
}
545-
546-
if newNamespace == nil {
547-
return fmt.Errorf("newly created volume %s not found", name)
657+
return err
548658
}
549659

550-
// Store the Namespace UUID and Namespace Path for future operations.
551660
volConfig.AccessInfo.NVMeNamespaceUUID = newNamespace.UUID
552661
volConfig.InternalID = nsPath
553662

@@ -558,6 +667,11 @@ func (d *NVMeStorageDriver) Create(
558667
"internalID": volConfig.InternalID,
559668
}).Debug("Created FlexVol with NVMe namespace.")
560669
}
670+
671+
if err := afterNamespaceCreate.Inject(); err != nil {
672+
return err
673+
}
674+
561675
return nil
562676
}
563677

0 commit comments

Comments
 (0)