Skip to content

Commit 3ef7f86

Browse files
stuggi and claude committed
[b/r] Improve cleanup playbook: faster pod termination, delete all resources
- Replace 30s pause + multi-task pod polling with single loop that polls every 5s and force-deletes stuck pods after 60s
- Add live progress via /dev/tty showing remaining/terminating pods
- Delete Certificate CRs (cert-manager)
- Fix cert secret exclusion: grep -v ceph-conf instead of grep -v ceph
- Delete remaining user-provided secrets (osp-secret, ceph-conf-files, etc.)
- Delete ConfigMaps and DNSData CRs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d08d483 commit 3ef7f86

1 file changed

Lines changed: 60 additions & 39 deletions

File tree

docs/dev/backup-restore/cleanup/cleanup-openstack.yaml

Lines changed: 60 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,7 @@
9999
- OpenStackBackupConfig and OpenStackVersion CRs
100100
- Galera database PVCs (app=galera)
101101
- RabbitMQUser CRs and restored-user secrets
102-
- Certificate secrets (excluding EDPM/compute/Ceph certs)
102+
- Certificate CRs (cert-manager) and TLS secrets (excluding EDPM/compute certs, ceph-conf)
103103
- CA bundle secrets
104104
- PVC-pin dummy Deployments (from failed restores)
105105
{% endif %}
@@ -228,47 +228,36 @@
228228
failed_when: false
229229
when: cleanup_ctlplane | bool
230230

231-
# Removed by this commit (gutter numbers end in "-"): a fixed 30-second
# ansible.builtin.pause, gated on cleanup_ctlplane.
- name: Wait for initial cleanup (30 seconds)
232-
ansible.builtin.pause:
233-
seconds: 30
234-
when: cleanup_ctlplane | bool
235-
236231
# Waits for pods in openstack_namespace to disappear. In this rendered diff,
# lines whose gutter ends in "-" are the old implementation (one `oc get pods`
# count retried via until/retries: 60/delay: 10) and lines whose gutter ends
# in "+" are the replacement: a single shell loop.
#
# New loop behaviour:
#   - polls every 5 seconds until ELAPSED reaches TIMEOUT (300 s);
#   - RUNNING = number of pods whose phase is not Succeeded;
#     TERMINATING = number of `oc get pods` rows containing "Terminating";
#   - exits 0 with "All pods terminated" as soon as RUNNING is 0;
#   - once ELAPSED >= FORCE_AFTER (60 s), force-deletes pods with
#     --force --grace-period=0; the FORCED flag prevents a second attempt
#     after a non-empty list has been processed;
#   - progress lines are written to /dev/tty; on timeout a WARNING goes to
#     stderr and the script still exits 0.
# The old `failed_when: false` is dropped; `changed_when: false` is kept.
#
# NOTE(review): STUCK comes from `oc get pods -o name | head -50` with no
# field selector, so it is not limited to the pods counted in RUNNING and is
# capped at 50 names - confirm this is intended.
# NOTE(review): writing to /dev/tty presumably fails when the play runs
# without a controlling terminal (CI, automation controller) - verify.
# NOTE(review): stderr of `oc get pods` is discarded, so a failing `oc` call
# yields a count of 0 and is reported as "All pods terminated" - confirm
# this is acceptable.
- name: Wait for pods to terminate
237232
ansible.builtin.shell: |
238-
oc get pods -n {{ openstack_namespace }} --field-selector='status.phase!=Succeeded' --no-headers 2>/dev/null | wc -l
239-
register: pod_count
240-
until: pod_count.stdout|int == 0
241-
retries: 60
242-
delay: 10
233+
TIMEOUT=300
234+
FORCE_AFTER=60
235+
ELAPSED=0
236+
FORCED=false
237+
while [ $ELAPSED -lt $TIMEOUT ]; do
238+
RUNNING=$(oc get pods -n {{ openstack_namespace }} --field-selector='status.phase!=Succeeded' --no-headers 2>/dev/null | wc -l)
239+
TERMINATING=$(oc get pods -n {{ openstack_namespace }} --no-headers 2>/dev/null | grep -c Terminating || true)
240+
if [ "${RUNNING}" -eq 0 ]; then
241+
echo "All pods terminated"
242+
exit 0
243+
fi
244+
echo " $(date +%H:%M:%S) Pods: ${RUNNING} remaining (${TERMINATING} terminating)" > /dev/tty
245+
if [ $ELAPSED -ge $FORCE_AFTER ] && [ "${FORCED}" = "false" ]; then
246+
STUCK=$(oc get pods -n {{ openstack_namespace }} --no-headers -o name 2>/dev/null | head -50)
247+
if [ -n "${STUCK}" ]; then
248+
echo " $(date +%H:%M:%S) Force deleting stuck pods..." > /dev/tty
249+
echo ${STUCK} | xargs oc delete -n {{ openstack_namespace }} --force --grace-period=0 2>/dev/null || true
250+
FORCED=true
251+
fi
252+
fi
253+
sleep 5
254+
ELAPSED=$((ELAPSED + 5))
255+
done
256+
echo "WARNING: ${RUNNING} pods still remaining after timeout" >&2
257+
exit 0
243258
changed_when: false
244-
failed_when: false
245259
when: cleanup_ctlplane | bool
246260

247-
# Removed by this commit (gutter numbers end in "-"): listed the names of
# pods whose phase is not Succeeded and registered them as stuck_pods.
# Ran only when cleanup_ctlplane was set and the earlier pod_count was > 0.
- name: Check for stuck pods
248-
ansible.builtin.shell: |
249-
oc get pods -n {{ openstack_namespace }} --field-selector='status.phase!=Succeeded' --no-headers -o name 2>/dev/null
250-
register: stuck_pods
251-
changed_when: false
252-
failed_when: false
253-
when: cleanup_ctlplane | bool and pod_count.stdout|int > 0
254-
255-
# Removed by this commit (gutter numbers end in "-"): force-deleted the pods
# recorded in stuck_pods with --force --grace-period=0. Ran only when
# pod_count was > 0 and stuck_pods.stdout was non-empty. This task had no
# failed_when override.
- name: Force delete stuck pods
256-
ansible.builtin.shell: |
257-
oc delete {{ stuck_pods.stdout_lines | join(' ') }} -n {{ openstack_namespace }} --force --grace-period=0
258-
changed_when: true
259-
when: cleanup_ctlplane | bool and pod_count.stdout|int > 0 and stuck_pods.stdout != ""
260-
261-
# Removed by this commit (gutter numbers end in "-"): second polling pass
# after the force delete. Counted pods whose phase is not Succeeded and
# retried (retries: 30, delay: 5) until the count reached 0; failures were
# ignored via failed_when: false.
- name: Wait after force delete
262-
ansible.builtin.shell: |
263-
oc get pods -n {{ openstack_namespace }} --field-selector='status.phase!=Succeeded' --no-headers 2>/dev/null | wc -l
264-
register: pod_count_final
265-
until: pod_count_final.stdout|int == 0
266-
retries: 30
267-
delay: 5
268-
changed_when: false
269-
failed_when: false
270-
when: cleanup_ctlplane | bool and pod_count.stdout|int > 0 and stuck_pods.stdout != ""
271-
272261
- name: Print pod cleanup status
273262
ansible.builtin.debug:
274263
msg: "All pods terminated"
@@ -319,9 +308,16 @@
319308
failed_when: false
320309
when: cleanup_ctlplane | bool
321310

311+
# Added by this commit (gutter numbers end in "+"): deletes every
# `certificate` resource in openstack_namespace when cleanup_ctlplane is set.
# failed_when: false means a non-zero exit from `oc` never fails the play;
# changed_when: true means the task always reports "changed".
- name: Delete Certificate CRs (cert-manager)
312+
ansible.builtin.shell: |
313+
oc delete certificate --all -n {{ openstack_namespace }}
314+
changed_when: true
315+
failed_when: false
316+
when: cleanup_ctlplane | bool
317+
322318
- name: Delete cert secrets (with compute prefix exclusion)
323319
ansible.builtin.shell: |
324-
for i in $(oc get secret -n {{ openstack_namespace }} -o name | grep cert | grep -v edpm | grep -vE "({{ compute_prefixes.stdout }})" | grep -v ceph); do
320+
for i in $(oc get secret -n {{ openstack_namespace }} -o name | grep cert | grep -v edpm | grep -vE "({{ compute_prefixes.stdout }})" | grep -v ceph-conf); do
325321
oc delete -n {{ openstack_namespace }} $i
326322
done
327323
changed_when: true
@@ -330,7 +326,7 @@
330326

331327
- name: Delete cert secrets (fallback without compute prefix)
332328
ansible.builtin.shell: |
333-
for i in $(oc get secret -n {{ openstack_namespace }} -o name | grep cert | grep -v edpm | grep -v ceph); do
329+
for i in $(oc get secret -n {{ openstack_namespace }} -o name | grep cert | grep -v edpm | grep -v ceph-conf); do
334330
oc delete -n {{ openstack_namespace }} $i
335331
done
336332
changed_when: true
@@ -343,6 +339,31 @@
343339
changed_when: true
344340
when: cleanup_ctlplane | bool
345341

342+
# Added by this commit (gutter numbers end in "+"): loops over every secret
# in openstack_namespace whose name does not contain "dockercfg" or
# "service-account-token" and deletes it, one `oc delete` per secret.
# Errors are ignored (failed_when: false); the task always reports "changed".
# NOTE(review): this filter has no edpm / compute-prefix / ceph-conf
# exclusion, so secrets that the "Delete cert secrets" tasks deliberately
# skip with grep -v would still be removed here - confirm this is intended.
- name: Delete remaining user-provided secrets
343+
ansible.builtin.shell: |
344+
for i in $(oc get secret -n {{ openstack_namespace }} -o name \
345+
| grep -v dockercfg \
346+
| grep -v service-account-token); do
347+
oc delete -n {{ openstack_namespace }} $i
348+
done
349+
changed_when: true
350+
failed_when: false
351+
when: cleanup_ctlplane | bool
352+
353+
# Added by this commit (gutter numbers end in "+"): deletes all ConfigMaps in
# openstack_namespace when cleanup_ctlplane is set. `--all` applies no name
# filter. Errors are ignored (failed_when: false) and the task always
# reports "changed".
- name: Delete ConfigMaps
354+
ansible.builtin.shell: |
355+
oc delete configmap --all -n {{ openstack_namespace }}
356+
changed_when: true
357+
failed_when: false
358+
when: cleanup_ctlplane | bool
359+
360+
# Added by this commit (gutter numbers end in "+"): deletes every `dnsdata`
# resource in openstack_namespace when cleanup_ctlplane is set. Errors are
# ignored (failed_when: false) and the task always reports "changed".
- name: Delete DNSData CRs
361+
ansible.builtin.shell: |
362+
oc delete dnsdata --all -n {{ openstack_namespace }}
363+
changed_when: true
364+
failed_when: false
365+
when: cleanup_ctlplane | bool
366+
346367
- name: Print controlplane cleanup status
347368
ansible.builtin.debug:
348369
msg: "ControlPlane resources deleted"

0 commit comments

Comments
 (0)