Skip to content

Commit 9094db7

Browse files
committed
e2e: add nvmeof GroupLock stress tests for concurrent operations
Add e2e tests to validate the nvmeof NodeServer GroupLock implementation under concurrent NodeStage (Group A) and NodeUnstage (Group B) operations. The tests ensure that no deadlock occurs when multiple PVCs and Pods are created and deleted simultaneously. A new helper file (nvmeof_helper.go) provides reusable functions for concurrent PVC/Pod operations with proper error tracking. Two test cases cover: 1) sequential concurrent batches (create all, then delete all); 2) mixed operations with a pre-created batch to guarantee continuous Group A/B switching. Signed-off-by: gadi-didi <gadi.didi@ibm.com>
1 parent 8dd6a8e commit 9094db7

2 files changed

Lines changed: 516 additions & 0 deletions

File tree

e2e/nvmeof.go

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ package e2e
1818

1919
import (
2020
"context"
21+
"fmt"
2122

23+
"github.com/google/uuid"
2224
"github.com/onsi/ginkgo/v2"
2325
. "github.com/onsi/gomega"
2426
v1 "k8s.io/api/core/v1"
@@ -189,5 +191,125 @@ var _ = ginkgo.Describe("nvmeof", func() {
189191
validateRBDImageCount(f, 0, nvmeofPool)
190192
validateOmapCount(f, 0, rbdType, nvmeofPool, volumesType)
191193
})
194+
195+
ginkgo.It("Test GroupLock: Concurrent Create/Delete Pods Only", func() {
	// Exercises the NVMeoF NodeServer GroupLock using Pod-level concurrency
	// only: PVCs are provisioned and removed one at a time, so the
	// ControllerServer is never driven concurrently by this spec.
	//
	// Sequence:
	//   1. Provision the PVCs one by one (each validated on creation).
	//   2. Start one Pod per PVC concurrently (NodeStage -> Group A lock).
	//   3. Wait until every Pod reports Running.
	//   4. Remove all Pods concurrently (NodeUnstage -> Group B lock).
	//   5. Remove the PVCs one by one.
	//   6. Check that the backend holds no RBD images or omap entries.
	//
	// Any timeout or deadlock surfaces as a failed create/delete/wait step.
	totalCount := 3

	ginkgo.By("Creating PVCs sequentially")
	pvcTemplate, err := loadPVC(pvcPath)
	Expect(err).NotTo(HaveOccurred())
	pvcTemplate.Namespace = f.UniqueName
	pvcTemplate.Spec.StorageClassName = &nvmeofStorageClass

	// Every PVC is named "<uuid>-<index>"; the names are recorded so the
	// teardown loop below removes exactly what was created here.
	pvcBaseName := uuid.NewString()
	pvcNames := make([]string, 0, totalCount)
	for i := range totalCount {
		name := fmt.Sprintf("%s-%d", pvcBaseName, i)
		pvcNames = append(pvcNames, name)

		claim := pvcTemplate.DeepCopy()
		claim.Name = name

		framework.Logf("Creating PVC %d/%d: %s", i+1, totalCount, name)
		err = createPVCAndvalidatePV(f.ClientSet, claim, deployTimeout)
		Expect(err).NotTo(HaveOccurred())
	}

	ginkgo.By("Validating backend RBD images were created")
	validateRBDImageCount(f, totalCount, nvmeofPool)
	validateOmapCount(f, totalCount, rbdType, nvmeofPool, volumesType)

	ginkgo.By("Creating Pods concurrently using those PVCs")
	// NOTE(review): the literal 0 is presumably the index of the first PVC
	// to attach - confirm against createConcurrentPods.
	createResult := createConcurrentPods(totalCount, pvcBaseName, 0, appPath, f)

	// Surface the individual failures in the log before asserting on the count.
	if createResult.HasErrors() {
		createResult.LogErrors()
	}
	Expect(createResult.failed).To(Equal(0),
		"Expected all %d Pod create operations to succeed, but %d failed",
		totalCount, createResult.failed)

	ginkgo.By("Waiting for all Pods to be Running")
	for i := range totalCount {
		runningPod := fmt.Sprintf("%s-%d", createResult.uniqueName, i)
		err = waitForPodInRunningState(runningPod, f.UniqueName, f.ClientSet, deployTimeout, noError)
		Expect(err).NotTo(HaveOccurred())
	}

	ginkgo.By("Deleting Pods concurrently")
	deleteResult := deleteConcurrentPods(createResult, f)

	// Same pattern as for creation: log the details, then assert.
	if deleteResult.HasErrors() {
		deleteResult.LogErrors()
	}
	Expect(deleteResult.failed).To(Equal(0),
		"Expected all %d Pod delete operations to succeed, but %d failed",
		totalCount, deleteResult.failed)

	ginkgo.By("Deleting PVCs sequentially")
	for i, name := range pvcNames {
		claim := pvcTemplate.DeepCopy()
		claim.Name = name

		framework.Logf("Deleting PVC %d/%d: %s", i+1, totalCount, name)
		err = deletePVCAndValidatePV(f.ClientSet, claim, deployTimeout)
		Expect(err).NotTo(HaveOccurred())
	}

	ginkgo.By("Validating all backend RBD images were deleted")
	validateRBDImageCount(f, 0, nvmeofPool)
	validateOmapCount(f, 0, rbdType, nvmeofPool, volumesType)

	framework.Logf("GroupLock test passed: %d concurrent Pod creates and %d concurrent Pod deletes completed successfully",
		totalCount, totalCount)
})
282+
283+
ginkgo.It("Test GroupLock: Mixed Create/Delete Pods with Rapid Switching", func() {
284+
// This test validates the GroupLock implementation under rapid switching
285+
// between Group A (NodeStage) and Group B (NodeUnstage) operations.
286+
//
287+
// Test flow:
288+
// 1. Create 15 PVCs sequentially
289+
// 2. Create 5 Pods using PVCs 0-4 (Group A)
290+
// 3. Concurrently: Create 5 Pods using PVCs 5-9 (Group A) + Delete previous 5 Pods (Group B)
291+
// 4. Concurrently: Create 5 Pods using PVCs 10-14 (Group A) + Delete previous 5 Pods (Group B)
292+
// 5. Delete final 5 Pods
293+
// 6. Delete all 15 PVCs sequentially
294+
//
295+
// This tests rapid GroupLock switching between Group A and B in the NodeServer only,
296+
// without involving ControllerServer operations.
297+
totalCount := 15
298+
batchSize := 5
299+
300+
ginkgo.By(fmt.Sprintf("Running Pods-only mixed test: %d total PVCs, batches of %d Pods",
301+
totalCount, batchSize))
302+
303+
err := mixedCreateDeletePodsOnly(totalCount, batchSize, pvcPath, appPath, nvmeofStorageClass, f)
304+
Expect(err).ShouldNot(HaveOccurred(),
305+
"Mixed Pods-only operations should complete without errors")
306+
307+
ginkgo.By("Validating all backend RBD images were cleaned up")
308+
validateRBDImageCount(f, 0, nvmeofPool)
309+
validateOmapCount(f, 0, rbdType, nvmeofPool, volumesType)
310+
311+
framework.Logf("GroupLock Pods-only test passed: %d Pods created and deleted with rapid Group A/B switching",
312+
totalCount)
313+
})
192314
})
193315
})

0 commit comments

Comments
 (0)