Skip to content

Commit 56fc413

Browse files
committed
fixup storage tracking
1 parent 9be78b8 commit 56fc413

7 files changed

Lines changed: 242 additions & 42 deletions

File tree

apis/bases/rabbitmq.openstack.org_rabbitmqs.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1969,6 +1969,14 @@ spec:
19691969
the opentack-operator in the top-level CR (e.g. the ContainerImage)
19701970
format: int64
19711971
type: integer
1972+
pvsBeingDeleted:
1973+
description: |-
1974+
PVsBeingDeleted - list of PV names that are expected to be deleted during storage wipe
1975+
Tracked to avoid scanning all PVs in the cluster during cleanup verification
1976+
items:
1977+
type: string
1978+
type: array
1979+
x-kubernetes-list-type: set
19721980
queueType:
19731981
description: QueueType - store whether default ha-all policy is present
19741982
or not

apis/rabbitmq/v1beta1/rabbitmq_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,11 @@ type RabbitMqStatus struct {
151151
// StorageWipeStartedAt - timestamp when storage wipe process started
152152
// Used to implement timeout protection against stuck PV/PVC deletions
153153
StorageWipeStartedAt *metav1.Time `json:"storageWipeStartedAt,omitempty"`
154+
155+
// PVsBeingDeleted - list of PV names that are expected to be deleted during storage wipe
156+
// Tracked to avoid scanning all PVs in the cluster during cleanup verification
157+
// +listType=set
158+
PVsBeingDeleted []string `json:"pvsBeingDeleted,omitempty"`
154159
}
155160

156161
//+kubebuilder:object:root=true

apis/rabbitmq/v1beta1/zz_generated.deepcopy.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/rabbitmq.openstack.org_rabbitmqs.yaml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1969,6 +1969,14 @@ spec:
19691969
the opentack-operator in the top-level CR (e.g. the ContainerImage)
19701970
format: int64
19711971
type: integer
1972+
pvsBeingDeleted:
1973+
description: |-
1974+
PVsBeingDeleted - list of PV names that are expected to be deleted during storage wipe
1975+
Tracked to avoid scanning all PVs in the cluster during cleanup verification
1976+
items:
1977+
type: string
1978+
type: array
1979+
x-kubernetes-list-type: set
19721980
queueType:
19731981
description: QueueType - store whether default ha-all policy is present
19741982
or not

internal/controller/rabbitmq/rabbitmq_controller.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ct
598598
CurrentQueueType: string(instance.Status.QueueType),
599599
Reason: wipeReason,
600600
StorageWipeStartedAt: storageWipeStartedAt,
601+
PVsBeingDeleted: instance.Status.PVsBeingDeleted,
601602
DeleteCluster: func(ctx context.Context) error {
602603
err := rmqCluster.Delete(ctx, helper)
603604
if err != nil && !k8s_errors.IsNotFound(err) {
@@ -618,6 +619,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ct
618619
return result, err
619620
}
620621

622+
// Update status with PV tracking information (if changed during this reconcile)
623+
if len(wipeParams.PVsBeingDeleted) > 0 {
624+
instance.Status.PVsBeingDeleted = wipeParams.PVsBeingDeleted
625+
}
626+
621627
// If result has Requeue set, we're still in progress
622628
if result.Requeue || result.RequeueAfter > 0 {
623629
return result, nil
@@ -628,9 +634,10 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ct
628634
if instance.Annotations != nil {
629635
if targetVersion, hasTarget := instance.Annotations[rabbitmqv1beta1.AnnotationTargetVersion]; hasTarget && targetVersion != "" {
630636
instance.Status.CurrentVersion = targetVersion
631-
// Clear the upgrade phase and timestamp
637+
// Clear the upgrade phase, timestamp, and PV tracking
632638
instance.Status.UpgradePhase = ""
633639
instance.Status.StorageWipeStartedAt = nil
640+
instance.Status.PVsBeingDeleted = nil
634641

635642
// If queue type changed during upgrade, update Status.QueueType to prevent
636643
// triggering another wipe for "queue type migration"
@@ -654,6 +661,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ct
654661
// Queue migration complete - update Status.QueueType
655662
instance.Status.UpgradePhase = ""
656663
instance.Status.StorageWipeStartedAt = nil
664+
instance.Status.PVsBeingDeleted = nil
657665
if instance.Spec.QueueType != nil {
658666
switch *instance.Spec.QueueType {
659667
case rabbitmqv1beta1.QueueTypeQuorum:

internal/controller/rabbitmq/storage_upgrade.go

Lines changed: 111 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,6 @@ import (
7575
"context"
7676
"fmt"
7777
"sort"
78-
"strings"
7978
"time"
8079

8180
"github.com/go-logr/logr"
@@ -184,33 +183,102 @@ func removeProtectionFinalizer(pvc *corev1.PersistentVolumeClaim, log logr.Logge
184183
return false
185184
}
186185

187-
// verifyPVCleanupComplete checks if all PVs associated with the RabbitMQ instance
188-
// have been deleted. Returns true if cleanup is complete, false if still in progress.
186+
// removePVProtectionFinalizer removes the kubernetes.io/pv-protection finalizer from a PV.
187+
// Returns true if the finalizer was found and removed.
188+
//
189+
// Other finalizers (CSI drivers, storage backend) are preserved.
190+
//
191+
// The kubernetes.io/pv-protection finalizer prevents PV deletion while bound to a PVC.
192+
// When a PVC is deleted, the PV goes to Released state, but sometimes the finalizer
193+
// isn't automatically removed by Kubernetes. We remove it to allow the Delete reclaim
194+
// policy to take effect.
195+
func removePVProtectionFinalizer(pv *corev1.PersistentVolume, log logr.Logger) bool {
196+
originalCount := len(pv.Finalizers)
197+
if originalCount == 0 {
198+
return false
199+
}
200+
201+
// Filter out only the kubernetes.io/pv-protection finalizer
202+
var filtered []string
203+
for _, f := range pv.Finalizers {
204+
if f != "kubernetes.io/pv-protection" {
205+
filtered = append(filtered, f)
206+
}
207+
}
208+
209+
if len(filtered) < originalCount {
210+
log.Info("Removing PV protection finalizer to allow deletion during upgrade",
211+
"pvName", pv.Name,
212+
"removedFinalizer", "kubernetes.io/pv-protection",
213+
"remainingFinalizers", filtered)
214+
pv.Finalizers = filtered
215+
return true
216+
}
217+
218+
return false
219+
}
220+
221+
// verifyPVCleanupComplete checks if all PVs from the provided list have been deleted.
222+
// Returns true if cleanup is complete, false if still in progress.
189223
//
190224
// Verifies clean storage before recreating the cluster.
191225
//
192226
// Implements timeout protection (30 minutes) for PV deletion.
193-
func (r *Reconciler) verifyPVCleanupComplete(ctx context.Context, namespace, instanceName string, storageWipeStartedAt *time.Time, log logr.Logger) (bool, error) {
194-
pvList := &corev1.PersistentVolumeList{}
195-
if err := r.List(ctx, pvList); err != nil {
196-
return false, fmt.Errorf("failed to list PVs during upgrade verification: %w. "+
197-
"List PVs: kubectl get pv -A", err)
227+
//
228+
// pvNames is the list of PV names that were captured during PVC deletion.
229+
// This optimization avoids scanning all PVs in the cluster.
230+
func (r *Reconciler) verifyPVCleanupComplete(ctx context.Context, pvNames []string, storageWipeStartedAt *time.Time, log logr.Logger) (bool, error) {
231+
// If no PVs were tracked, cleanup is complete
232+
if len(pvNames) == 0 {
233+
log.Info("No PVs to clean up")
234+
return true, nil
198235
}
199236

200-
// Check if any PVs are still referencing our PVCs
201-
pvcPrefix := fmt.Sprintf("persistence-%s-", instanceName)
237+
// Check which PVs still exist and remove protection finalizers if needed
202238
var stuckPVs []string
203-
for _, pv := range pvList.Items {
204-
if pv.Spec.ClaimRef != nil &&
205-
pv.Spec.ClaimRef.Namespace == namespace &&
206-
strings.HasPrefix(pv.Spec.ClaimRef.Name, pvcPrefix) {
207-
208-
log.Info("Waiting for PV cleanup to complete",
209-
"pvName", pv.Name,
210-
"phase", pv.Status.Phase,
211-
"pvcName", pv.Spec.ClaimRef.Name)
212-
stuckPVs = append(stuckPVs, pv.Name)
239+
for _, pvName := range pvNames {
240+
pv := &corev1.PersistentVolume{}
241+
err := r.Get(ctx, types.NamespacedName{Name: pvName}, pv)
242+
243+
// If PV doesn't exist, it's been cleaned up (expected)
244+
if k8s_errors.IsNotFound(err) {
245+
continue
246+
}
247+
248+
// If we can't get the PV for other reasons, include it to be safe
249+
if err != nil {
250+
log.V(1).Info("Unable to get PV, including in cleanup check",
251+
"pvName", pvName,
252+
"error", err)
253+
stuckPVs = append(stuckPVs, pvName)
254+
continue
213255
}
256+
257+
// If PV is in Released phase (PVC deleted but PV still exists),
258+
// remove kubernetes.io/pv-protection finalizer to allow deletion
259+
if pv.Status.Phase == corev1.VolumeReleased {
260+
if removePVProtectionFinalizer(pv, log) {
261+
if err := r.Update(ctx, pv); err != nil {
262+
log.Error(err, "Failed to remove pv-protection finalizer from PV",
263+
"pvName", pvName)
264+
// Don't fail - just log and continue, will retry next reconcile
265+
} else {
266+
log.Info("Removed pv-protection finalizer from Released PV",
267+
"pvName", pvName)
268+
}
269+
}
270+
}
271+
272+
// PV still exists - track it
273+
pvcName := ""
274+
if pv.Spec.ClaimRef != nil {
275+
pvcName = pv.Spec.ClaimRef.Name
276+
}
277+
log.Info("Waiting for PV cleanup to complete",
278+
"pvName", pvName,
279+
"phase", pv.Status.Phase,
280+
"pvcName", pvcName)
281+
stuckPVs = append(stuckPVs, pvName)
214282
}
215283

216284
if len(stuckPVs) > 0 {
@@ -250,10 +318,10 @@ func (r *Reconciler) verifyPVCleanupComplete(ctx context.Context, namespace, ins
250318
// 1. PV deletion (due to Delete reclaim policy)
251319
// 2. Storage backend cleanup
252320
//
253-
// Returns true if PVCs were deleted.
254-
func (r *Reconciler) deletePVCsForUpgrade(ctx context.Context, pvcList *corev1.PersistentVolumeClaimList, log logr.Logger) (bool, error) {
321+
// Returns true if PVCs were deleted and the list of PV names that should be deleted.
322+
func (r *Reconciler) deletePVCsForUpgrade(ctx context.Context, pvcList *corev1.PersistentVolumeClaimList, log logr.Logger) (bool, []string, error) {
255323
if len(pvcList.Items) == 0 {
256-
return false, nil
324+
return false, nil, nil
257325
}
258326

259327
// Sort PVCs by name to ensure deterministic deletion order
@@ -264,14 +332,17 @@ func (r *Reconciler) deletePVCsForUpgrade(ctx context.Context, pvcList *corev1.P
264332

265333
log.Info("Deleting PVCs in deterministic order", "pvcCount", len(pvcList.Items))
266334

335+
// Track which PVs should be deleted (for efficient cleanup verification)
336+
pvNames := make([]string, 0, len(pvcList.Items))
337+
267338
for i := range pvcList.Items {
268339
pvc := &pvcList.Items[i]
269340

270341
// Remove kubernetes.io/pvc-protection finalizer to allow deletion
271342
// Other finalizers are preserved for CSI driver cleanup
272343
if removeProtectionFinalizer(pvc, log) {
273344
if err := r.Update(ctx, pvc); err != nil {
274-
return false, fmt.Errorf("failed to remove finalizer from PVC %s: %w. "+
345+
return false, nil, fmt.Errorf("failed to remove finalizer from PVC %s: %w. "+
275346
"Check PVC: kubectl get pvc %s -n %s -o yaml",
276347
pvc.Name, err, pvc.Name, pvc.Namespace)
277348
}
@@ -280,17 +351,22 @@ func (r *Reconciler) deletePVCsForUpgrade(ctx context.Context, pvcList *corev1.P
280351
// Delete the PVC
281352
// The Delete reclaim policy on the PV triggers automatic storage cleanup
282353
if err := r.Delete(ctx, pvc); err != nil && !k8s_errors.IsNotFound(err) {
283-
return false, fmt.Errorf("failed to delete PVC %s: %w. "+
354+
return false, nil, fmt.Errorf("failed to delete PVC %s: %w. "+
284355
"Check PVC: kubectl describe pvc %s -n %s",
285356
pvc.Name, err, pvc.Name, pvc.Namespace)
286357
}
287358

288359
log.Info("Deleted PVC - storage will be automatically wiped by Delete reclaim policy",
289360
"pvcName", pvc.Name,
290361
"pvName", pvc.Spec.VolumeName)
362+
363+
// Track the PV for cleanup verification
364+
if pvc.Spec.VolumeName != "" {
365+
pvNames = append(pvNames, pvc.Spec.VolumeName)
366+
}
291367
}
292368

293-
return true, nil
369+
return true, pvNames, nil
294370
}
295371

296372
// waitForPodsTermination checks if all pods associated with the RabbitMQ instance
@@ -340,6 +416,8 @@ type StorageWipeParams struct {
340416
Reason string
341417
// Timestamp when storage wipe started (for timeout tracking)
342418
StorageWipeStartedAt *time.Time
419+
// PV names being deleted (populated during PVC deletion for efficient verification)
420+
PVsBeingDeleted []string
343421
// Function to delete the RabbitMQCluster
344422
DeleteCluster func(ctx context.Context) error
345423
// Function to delete the ha-all mirrored policy (optional)
@@ -429,20 +507,24 @@ func (r *Reconciler) performStorageWipe(
429507

430508
// Step 4b: Delete all PVCs
431509
// The Delete reclaim policy will trigger automatic storage cleanup
432-
pvcDeleted, err := r.deletePVCsForUpgrade(ctx, pvcList, log)
510+
pvcDeleted, pvNames, err := r.deletePVCsForUpgrade(ctx, pvcList, log)
433511
if err != nil {
434512
log.Error(err, "Failed to delete PVCs during storage wipe")
435513
return ctrl.Result{}, err
436514
}
437515

438516
if pvcDeleted {
439-
log.Info("Deleted all PVCs, waiting for PV cleanup to complete", "pvcCount", len(pvcList.Items))
517+
// Store the PV names for efficient cleanup verification
518+
params.PVsBeingDeleted = pvNames
519+
log.Info("Deleted all PVCs, waiting for PV cleanup to complete",
520+
"pvcCount", len(pvcList.Items),
521+
"pvCount", len(pvNames))
440522
return ctrl.Result{RequeueAfter: UpgradeCheckInterval}, nil
441523
}
442524
}
443525

444526
// Step 5: Verify all PVs are completely cleaned up
445-
cleanupComplete, err := r.verifyPVCleanupComplete(ctx, params.Namespace, params.InstanceName, params.StorageWipeStartedAt, log)
527+
cleanupComplete, err := r.verifyPVCleanupComplete(ctx, params.PVsBeingDeleted, params.StorageWipeStartedAt, log)
446528
if err != nil {
447529
log.Error(err, "Failed to verify PV cleanup")
448530
return ctrl.Result{}, fmt.Errorf("PV cleanup verification failed: %w. "+

0 commit comments

Comments
 (0)