Skip to content

Commit 5ef8fb8

Browse files
stuggi and claude
authored and committed
[cifmw_backup_restore] Fix post-restore validation and cleanup
- Wait for compute services and network agents to be ready with retry loops before proceeding to workload validation, preventing tempest from running against a partially recovered control plane
- Delete test-operator CRs (Tempest, Tobiko, AnsibleTest, HorizonTest) at the beginning of cleanup while controllers and dependencies are still running, so finalizers get processed properly
- Wait for test-operator pods to terminate after CR deletion
- Adapt GaleraRestore pod discovery to the shortened resource names from mariadb-operator, which drops the galera instance name prefix from generated resources (restore-<name> instead of <galera>-restore-<name>). Uses the galerarestore/name label selector when available, with fallback to the old naming convention so this change can land independently of the mariadb-operator PR
- Increase control plane ready timeout from 10m to 30m
- Fix loop_var collision with _delete_all_of_kind.yml

Related-To: openstack-k8s-operators/mariadb-operator#463
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: Martin Schuppert <mschuppert@redhat.com>
1 parent b578874 commit 5ef8fb8

5 files changed

Lines changed: 83 additions & 16 deletions

File tree

roles/cifmw_backup_restore/defaults/main.yml

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -66,11 +66,15 @@ cifmw_backup_restore_ovn_db_ready_timeout: 5m
6666
cifmw_backup_restore_restore_timeout: 900
6767
cifmw_backup_restore_edpm_deploy_timeout: 40m
6868
cifmw_backup_restore_infra_ready_timeout: 20m
69-
cifmw_backup_restore_ctlplane_ready_timeout: 10m
69+
cifmw_backup_restore_ctlplane_ready_timeout: 30m
7070
cifmw_backup_restore_strict_restore: true
7171
cifmw_backup_restore_restore_content: data
7272
cifmw_backup_restore_pin_pvcs: false
7373

74+
# Post-restore service readiness
75+
cifmw_backup_restore_service_retry_count: 30
76+
cifmw_backup_restore_service_retry_delay: 10
77+
7478
# Cleanup
7579
cifmw_backup_restore_cleanup_ctlplane: true
7680
cifmw_backup_restore_cleanup_dataplane: true

roles/cifmw_backup_restore/tasks/cleanup.yml

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -71,6 +71,38 @@
7171
msg: "Cleanup cancelled by user"
7272
when: not (cifmw_backup_restore_auto_ack | bool) and _delete_confirm.user_input != "yes"
7373

74+
# ========================================
75+
# Test-operator Cleanup (must happen first while controllers and
76+
# their dependencies are still running, so finalizers get processed)
77+
# ========================================
78+
- name: Delete test-operator CRs
79+
ansible.builtin.include_tasks: _delete_all_of_kind.yml
80+
vars:
81+
_resource_api_version: test.openstack.org/v1beta1
82+
_resource_kind: "{{ _test_cr_kind }}"
83+
_resource_wait: true
84+
loop:
85+
- Tempest
86+
- Tobiko
87+
- AnsibleTest
88+
- HorizonTest
89+
loop_control:
90+
loop_var: _test_cr_kind
91+
when: cifmw_backup_restore_cleanup_ctlplane | bool
92+
93+
- name: Wait for test-operator pods to terminate
94+
kubernetes.core.k8s_info:
95+
api_version: v1
96+
kind: Pod
97+
namespace: "{{ cifmw_backup_restore_namespace }}"
98+
label_selectors:
99+
- operator=test-operator
100+
register: _test_pods
101+
until: _test_pods.resources | length == 0
102+
retries: 12
103+
delay: 5
104+
when: cifmw_backup_restore_cleanup_ctlplane | bool
105+
74106
# ========================================
75107
# DataPlane Cleanup
76108
# ========================================

roles/cifmw_backup_restore/tasks/e2e.yml

Lines changed: 28 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -131,29 +131,51 @@
131131
_os_exec: >-
132132
oc exec -t openstackclient -n {{ cifmw_backup_restore_namespace }} --
133133
block:
134-
- name: Verify compute services are up
134+
- name: Wait for compute services to be up
135135
ansible.builtin.shell: |
136136
set -o pipefail
137137
{{ _os_exec }} openstack compute service list -f json | \
138-
jq -r '.[] | "\(.Binary) \(.Host) \(.State)"'
138+
jq -e '[.[] | select(.State != "up")] | length == 0'
139139
register: _compute_services
140140
changed_when: false
141+
retries: "{{ cifmw_backup_restore_service_retry_count }}"
142+
delay: "{{ cifmw_backup_restore_service_retry_delay }}"
143+
until: _compute_services.rc == 0
144+
145+
- name: Display compute services
146+
ansible.builtin.shell: |
147+
set -o pipefail
148+
{{ _os_exec }} openstack compute service list -f json | \
149+
jq -r '.[] | "\(.Binary) \(.Host) \(.State)"'
150+
register: _compute_services_display
151+
changed_when: false
141152

142153
- name: Display compute services
143154
ansible.builtin.debug:
144-
msg: "{{ _compute_services.stdout_lines }}"
155+
msg: "{{ _compute_services_display.stdout_lines }}"
145156

146-
- name: Verify network agents are up
157+
- name: Wait for network agents to be alive
147158
ansible.builtin.shell: |
148159
set -o pipefail
149160
{{ _os_exec }} openstack network agent list -f json | \
150-
jq -r '.[] | "\(.["Agent Type"]) \(.Host) \(.Alive)"'
161+
jq -e '[.[] | select(.Alive != true)] | length == 0'
151162
register: _network_agents
152163
changed_when: false
164+
retries: "{{ cifmw_backup_restore_service_retry_count }}"
165+
delay: "{{ cifmw_backup_restore_service_retry_delay }}"
166+
until: _network_agents.rc == 0
167+
168+
- name: Display network agents
169+
ansible.builtin.shell: |
170+
set -o pipefail
171+
{{ _os_exec }} openstack network agent list -f json | \
172+
jq -r '.[] | "\(.["Agent Type"]) \(.Host) \(.Alive)"'
173+
register: _network_agents_display
174+
changed_when: false
153175

154176
- name: Display network agents
155177
ansible.builtin.debug:
156-
msg: "{{ _network_agents.stdout_lines }}"
178+
msg: "{{ _network_agents_display.stdout_lines }}"
157179

158180
- name: Get instance info
159181
ansible.builtin.shell: |

roles/cifmw_backup_restore/tasks/restore.yml

Lines changed: 17 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -325,19 +325,28 @@
325325

326326
- name: Wait for GaleraRestore pods to be ready
327327
ansible.builtin.shell: |
328-
RESTORE_NAME="{{ item }}restore"
329-
BACKUP_SOURCE="{{ item }}"
330-
POD_NAME="${BACKUP_SOURCE}-restore-${RESTORE_NAME}"
331-
oc wait --for=condition=Ready pod/${POD_NAME} -n {{ cifmw_backup_restore_namespace }} --timeout=120s
328+
POD_NAME=$(oc get pod \
329+
-l galerarestore/name={{ item }} \
330+
-n {{ cifmw_backup_restore_namespace }} \
331+
-o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
332+
if [ -z "${POD_NAME}" ]; then
333+
POD_NAME="{{ item }}-restore-{{ item }}"
334+
fi
335+
oc wait --for=condition=Ready pod/${POD_NAME} \
336+
-n {{ cifmw_backup_restore_namespace }} --timeout=120s
332337
loop: "{{ _galerabackup_list }}"
333338
changed_when: false
334339
when: _galerabackup_list | length > 0
335340

336341
- name: Execute database restore for each GaleraRestore
337342
ansible.builtin.shell: |
338-
RESTORE_NAME="{{ item }}restore"
339-
BACKUP_SOURCE="{{ item }}"
340-
POD_NAME="${BACKUP_SOURCE}-restore-${RESTORE_NAME}"
343+
POD_NAME=$(oc get pod \
344+
-l galerarestore/name={{ item }} \
345+
-n {{ cifmw_backup_restore_namespace }} \
346+
-o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
347+
if [ -z "${POD_NAME}" ]; then
348+
POD_NAME="{{ item }}-restore-{{ item }}"
349+
fi
341350
TIMESTAMP="{{ cifmw_backup_restore_backup_timestamp }}"
342351
RESTORE_PATTERN="/backup/data/*_${TIMESTAMP}.sql.gz"
343352
oc exec -n {{ cifmw_backup_restore_namespace }} ${POD_NAME} -- \
@@ -348,7 +357,7 @@
348357

349358
- name: List GaleraRestore CRs kept for validation
350359
ansible.builtin.debug:
351-
msg: "GaleraRestore CR '{{ item }}restore' kept for post-restore validation (cleaned up by cleanup step)"
360+
msg: "GaleraRestore CR '{{ item }}' kept for post-restore validation (cleaned up by cleanup step)"
352361
loop: "{{ _galerabackup_list }}"
353362
when: _galerabackup_list | length > 0
354363

roles/cifmw_backup_restore/templates/06a-galerarestore.yaml.j2

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
apiVersion: mariadb.openstack.org/v1beta1
55
kind: GaleraRestore
66
metadata:
7-
name: {{ backup_name }}restore
7+
name: {{ backup_name }}
88
namespace: {{ cifmw_backup_restore_namespace }}
99
spec:
1010
backupSource: {{ backup_name }}

0 commit comments

Comments
 (0)