Skip to content

Commit 649932d

Browse files
authored
Fix stale tridentNode CR not cleaned up after soft-deleted node's vol…
1 parent 15f27b3 commit 649932d

3 files changed

Lines changed: 246 additions & 0 deletions

File tree

core/orchestrator_core.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6552,6 +6552,8 @@ func (o *TridentOrchestrator) DeleteNode(ctx context.Context, nodeName string) (
65526552
if err = o.storeClient.AddOrUpdateNode(ctx, node); err != nil {
65536553
return err
65546554
}
6555+
6556+
o.nodes.Set(nodeName, node)
65556557
}
65566558

65576559
return nil

core/orchestrator_core_test.go

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4484,6 +4484,140 @@ func TestDeleteNode(t *testing.T) {
44844484
}
44854485
}
44864486

4487+
// TestDeleteNode_SoftDeleteUpdatesCacheDeletedFlag verifies that when a node is
4488+
// soft-deleted (marked Deleted=true because it still has volume publications),
4489+
// the in-memory cache is updated with the Deleted flag.
4490+
func TestDeleteNode_SoftDeleteUpdatesCacheDeletedFlag(t *testing.T) {
4491+
mockCtrl := gomock.NewController(t)
4492+
defer mockCtrl.Finish()
4493+
4494+
mockStoreClient := mockpersistentstore.NewMockStoreClient(mockCtrl)
4495+
4496+
o := getOrchestrator(t, false)
4497+
o.storeClient = mockStoreClient
4498+
4499+
nodeName := "test-node-1"
4500+
volumeName := "test-vol-1"
4501+
4502+
// Set up a node.
4503+
o.nodes.Set(nodeName, &models.Node{
4504+
Name: nodeName,
4505+
IPs: []string{"10.0.0.1"},
4506+
Deleted: false,
4507+
})
4508+
4509+
// Set up a volume publication so DeleteNode takes the soft-delete path.
4510+
err := o.volumePublications.Set(volumeName, nodeName, &models.VolumePublication{
4511+
NodeName: nodeName,
4512+
VolumeName: volumeName,
4513+
})
4514+
assert.NoError(t, err)
4515+
4516+
// Mock: AddOrUpdateNode is called to persist the soft-delete.
4517+
mockStoreClient.EXPECT().AddOrUpdateNode(gomock.Any(), gomock.Any()).Return(nil)
4518+
4519+
// Act: Delete the node while it still has publications -> soft-delete.
4520+
err = o.DeleteNode(ctx(), nodeName)
4521+
assert.NoError(t, err)
4522+
4523+
// Assert: Node should still be in cache (not hard-deleted).
4524+
node := o.nodes.Get(nodeName)
4525+
assert.NotNil(t, node, "Node should still be in cache after soft-delete")
4526+
4527+
// Assert: The Deleted flag in the cache MUST be true.
4528+
// NodeCache.Get() returns a deep copy.
4529+
// DeleteNode was modifying the copy and persisting to store, but never
4530+
// calling o.nodes.Set() to update the cache. So the cache still had Deleted=false.
4531+
assert.True(t, node.Deleted,
4532+
"Node in cache should have Deleted=true after soft-delete; "+
4533+
"if this fails, the cache was not updated")
4534+
}
4535+
4536+
// TestDeleteNode_SoftDeletedNodeCleanedUpAfterLastUnpublish verifies the full
4537+
// end-to-end flow: when a node is soft-deleted and then all its volumes are
4538+
// unpublished, the tridentNode CR (and cache entry) should be cleaned up.
4539+
func TestDeleteNode_SoftDeletedNodeCleanedUpAfterLastUnpublish(t *testing.T) {
4540+
mockCtrl := gomock.NewController(t)
4541+
defer mockCtrl.Finish()
4542+
4543+
backendUUID := "backend-uuid-1"
4544+
nodeName := "test-node-1"
4545+
volumeName := "test-vol-1"
4546+
4547+
mockBackend := mockstorage.NewMockBackend(mockCtrl)
4548+
mockBackend.EXPECT().BackendUUID().Return(backendUUID).AnyTimes()
4549+
mockStoreClient := mockpersistentstore.NewMockStoreClient(mockCtrl)
4550+
4551+
o := getOrchestrator(t, false)
4552+
o.storeClient = mockStoreClient
4553+
4554+
// Set up a node.
4555+
o.nodes.Set(nodeName, &models.Node{
4556+
Name: nodeName,
4557+
IPs: []string{"10.0.0.1"},
4558+
Deleted: false,
4559+
})
4560+
4561+
// Set up a volume.
4562+
o.volumes = map[string]*storage.Volume{
4563+
volumeName: {
4564+
BackendUUID: backendUUID,
4565+
Config: &storage.VolumeConfig{Name: volumeName},
4566+
},
4567+
}
4568+
4569+
// Set up a single volume publication.
4570+
err := o.volumePublications.Set(volumeName, nodeName, &models.VolumePublication{
4571+
NodeName: nodeName,
4572+
VolumeName: volumeName,
4573+
})
4574+
assert.NoError(t, err)
4575+
4576+
// --- Step 1: Delete the node (soft-delete because publication exists) ---
4577+
// Note: Don't add the mock backend to o.backends yet, because DeleteNode calls
4578+
// updateMetrics() which would call backend methods we don't want to mock here.
4579+
mockStoreClient.EXPECT().AddOrUpdateNode(gomock.Any(), gomock.Any()).Return(nil)
4580+
4581+
err = o.DeleteNode(ctx(), nodeName)
4582+
assert.NoError(t, err)
4583+
4584+
// Node should still exist in cache (soft-deleted).
4585+
node := o.nodes.Get(nodeName)
4586+
assert.NotNil(t, node, "Node should still be in cache after soft-delete")
4587+
4588+
// --- Step 2: Unpublish the last volume from the soft-deleted node ---
4589+
// Now add the backend (needed for the unpublish flow).
4590+
o.backends[backendUUID] = mockBackend
4591+
4592+
// Mock expectations for the unpublish flow.
4593+
mockBackend.EXPECT().UnpublishVolume(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil)
4594+
mockStoreClient.EXPECT().UpdateVolume(gomock.Any(), gomock.Any()).Return(nil)
4595+
mockStoreClient.EXPECT().DeleteVolumePublication(gomock.Any(), gomock.Any()).Return(nil)
4596+
4597+
// These are called by deleteNode() when the soft-deleted node is finally cleaned up.
4598+
// We require exactly one call each to verify the persistent-store cleanup and
4599+
// backend invalidation actually occur on the success path.
4600+
nodeDeleted := false
4601+
mockStoreClient.EXPECT().DeleteNode(gomock.Any(), gomock.Any()).
4602+
DoAndReturn(func(_ context.Context, _ *models.Node) error {
4603+
nodeDeleted = true
4604+
return nil
4605+
}).Times(1)
4606+
mockBackend.EXPECT().InvalidateNodeAccess().Times(1)
4607+
4608+
err = o.unpublishVolume(coreCtx, volumeName, nodeName)
4609+
assert.NoError(t, err)
4610+
4611+
// Assert: deleteNode was called to remove the soft-deleted node from the persistent store.
4612+
assert.True(t, nodeDeleted,
4613+
"storeClient.DeleteNode should have been called to clean up the soft-deleted node")
4614+
4615+
// Assert: The node should now be fully removed from cache.
4616+
node = o.nodes.Get(nodeName)
4617+
assert.Nil(t, node,
4618+
"Node should be removed from cache after last volume is unpublished from soft-deleted node")
4619+
}
4620+
44874621
func TestSnapshotVolumes(t *testing.T) {
44884622
mockPools := tu.GetFakePools()
44894623
orchestrator := getOrchestrator(t, false)

storage_drivers/ontap/ontap_common_test.go

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3369,6 +3369,116 @@ func TestRemoveExportPolicyRules_CNVA_ContinueOnPartialErrors(t *testing.T) {
33693369
assert.NoError(t, err)
33703370
}
33713371

3372+
// TestRemoveExportPolicyRules_SameIPOnDepartingAndActiveNode reproduces the customer bug
3373+
// where a volume migrates from an old node to a new node that received the same IP address (DHCP/IP reuse).
3374+
// The publish for the new node arrives first (no-op since IP already in policy), then the unpublish for the old
3375+
// node arrives and must NOT remove the IP because the new node still needs it.
3376+
func TestRemoveExportPolicyRules_SameIPOnDepartingAndActiveNode(t *testing.T) {
3377+
ctx := context.TODO()
3378+
mockAPI := newMockOntapAPI(t)
3379+
exportPolicy := "trident_pvc_test"
3380+
3381+
// Scenario: NodeA (old) is being unpublished. NodeB (new) remains active.
3382+
// Both nodes share the same IP 10.10.1.1 due to DHCP/IP reuse after migration.
3383+
sharedIP := "10.10.1.1"
3384+
3385+
// publishInfo as constructed by the orchestrator during unpublish:
3386+
// - HostIP: departing node's IPs (NodeA)
3387+
// - Nodes: remaining active nodes (NodeB), which has the same IP
3388+
publishInfo := &tridentmodels.VolumePublishInfo{
3389+
HostName: "node-a",
3390+
HostIP: []string{sharedIP, "10.10.1.2"},
3391+
Nodes: []*tridentmodels.Node{
3392+
createTestNode("node-b", []string{sharedIP}),
3393+
},
3394+
}
3395+
3396+
// Export policy currently has rules for both of NodeA's IPs
3397+
existingRules := map[int]string{
3398+
1: sharedIP, // Shared IP - must be PRESERVED (NodeB still needs it)
3399+
2: "10.10.1.2", // NodeA-only IP - should be removed
3400+
}
3401+
finalRules := map[int]string{
3402+
1: sharedIP, // Preserved because NodeB is still active with this IP
3403+
}
3404+
3405+
mockAPI.EXPECT().ExportRuleList(ctx, exportPolicy).Return(existingRules, nil)
3406+
mockAPI.EXPECT().ExportRuleDestroy(ctx, exportPolicy, 2).Return(nil) // Only remove NodeA-only IP
3407+
// ExportRuleDestroy for rule 1 (shared IP) must NOT be called
3408+
mockAPI.EXPECT().ExportRuleList(ctx, exportPolicy).Return(finalRules, nil)
3409+
3410+
err := removeExportPolicyRules(ctx, exportPolicy, publishInfo, mockAPI)
3411+
assert.NoError(t, err)
3412+
}
3413+
3414+
// TestRemoveExportPolicyRules_SameIPOnDepartingAndActiveNode_OnlyIPIsShared tests the edge case
3415+
// where the departing node's only IP is the same as the active node's IP.
3416+
// No export policy rules should be removed since the active node still needs the rule.
3417+
func TestRemoveExportPolicyRules_SameIPOnDepartingAndActiveNode_OnlyIPIsShared(t *testing.T) {
3418+
ctx := context.TODO()
3419+
mockAPI := newMockOntapAPI(t)
3420+
exportPolicy := "trident_pvc_test"
3421+
3422+
sharedIP := "10.10.1.1"
3423+
3424+
publishInfo := &tridentmodels.VolumePublishInfo{
3425+
HostName: "node-a",
3426+
HostIP: []string{sharedIP},
3427+
Nodes: []*tridentmodels.Node{
3428+
createTestNode("node-b", []string{sharedIP}),
3429+
},
3430+
}
3431+
3432+
existingRules := map[int]string{
3433+
1: sharedIP,
3434+
}
3435+
3436+
mockAPI.EXPECT().ExportRuleList(ctx, exportPolicy).Return(existingRules, nil)
3437+
// No ExportRuleDestroy calls expected - the shared IP must be preserved
3438+
mockAPI.EXPECT().ExportRuleList(ctx, exportPolicy).Return(existingRules, nil)
3439+
3440+
err := removeExportPolicyRules(ctx, exportPolicy, publishInfo, mockAPI)
3441+
assert.NoError(t, err)
3442+
}
3443+
3444+
// TestRemoveExportPolicyRules_SameIPOnDepartingAndMultipleActiveNodes tests IP reuse across
3445+
// multiple active nodes where the departing node shares IPs with more than one remaining node.
3446+
func TestRemoveExportPolicyRules_SameIPOnDepartingAndMultipleActiveNodes(t *testing.T) {
3447+
ctx := context.TODO()
3448+
mockAPI := newMockOntapAPI(t)
3449+
exportPolicy := "trident_pvc_test"
3450+
3451+
sharedIP := "10.10.1.1"
3452+
3453+
publishInfo := &tridentmodels.VolumePublishInfo{
3454+
HostName: "node-a",
3455+
HostIP: []string{sharedIP, "10.10.1.2"},
3456+
Nodes: []*tridentmodels.Node{
3457+
createTestNode("node-b", []string{sharedIP}),
3458+
createTestNode("node-c", []string{"10.10.1.3"}),
3459+
},
3460+
}
3461+
3462+
existingRules := map[int]string{
3463+
1: sharedIP, // Keep - NodeB still needs it
3464+
2: "10.10.1.2", // Remove - only NodeA had this
3465+
3: "10.10.1.3", // Keep - NodeC still needs it
3466+
4: "10.10.1.4", // Remove - not in any active node
3467+
}
3468+
finalRules := map[int]string{
3469+
1: sharedIP,
3470+
3: "10.10.1.3",
3471+
}
3472+
3473+
mockAPI.EXPECT().ExportRuleList(ctx, exportPolicy).Return(existingRules, nil)
3474+
mockAPI.EXPECT().ExportRuleDestroy(ctx, exportPolicy, 2).Return(nil)
3475+
mockAPI.EXPECT().ExportRuleDestroy(ctx, exportPolicy, 4).Return(nil)
3476+
mockAPI.EXPECT().ExportRuleList(ctx, exportPolicy).Return(finalRules, nil)
3477+
3478+
err := removeExportPolicyRules(ctx, exportPolicy, publishInfo, mockAPI)
3479+
assert.NoError(t, err)
3480+
}
3481+
33723482
func TestDeleteExportPolicy(t *testing.T) {
33733483
// Test-1: Positive flow
33743484

0 commit comments

Comments
 (0)