Skip to content

Commit 1b5c94c

Browse files
abaysclaude
authored andcommitted
[cifmw_backup_restore] Add backup/restore orchestration role
Orchestrate backup, restore, and cleanup of OpenStack control plane and data plane resources, including Galera database dumps, Velero CSI volume snapshots, and ordered multi-phase restore sequences. Also adds playbooks (backup_restore.yaml) and integrates backup and restore into the post-deployment pipeline. Signed-off-by: Andrew Bays <abays@redhat.com> Signed-off-by: Martin Schuppert <mschuppert@redhat.com> Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: Martin Schuppert <mschuppert@redhat.com>
1 parent 0f23480 commit 1b5c94c

31 files changed

Lines changed: 2564 additions & 0 deletions

docs/dictionary/en-custom.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,7 @@ fsid
232232
fultonj
233233
fusco
234234
fwcybtb
235+
Galera
235236
gapped
236237
genericcloud
237238
genindex
@@ -504,6 +505,8 @@ psathyan
504505
pubkey
505506
publicdomain
506507
pullsecret
508+
PVC
509+
PVCs
507510
pvs
508511
pwd
509512
pxe
@@ -580,6 +583,7 @@ sso
580583
stateful
581584
stderr
582585
stdout
586+
StorageClass
583587
stp
584588
str
585589
stricthostkeychecking
@@ -667,6 +671,7 @@ vvvv
667671
vxlan
668672
vynxgdagahaac
669673
vzcg
674+
WaitForFirstConsumer
670675
websso
671676
wget
672677
whitebox

playbooks/backup_restore.yaml

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
---
2+
# End-to-end backup/restore test playbook
3+
#
4+
# Aligns with the openstack-k8s-operators backup-restore user guide (Galera,
5+
# optional OVN NB/SB on PVC, OADP, ordered restore, Neutron–OVN sync post-EDPM).
6+
#
7+
# Used standalone or from post-deployment.yml (gated by
8+
# cifmw_run_backup_restore_test). Logic lives in
9+
# roles/cifmw_backup_restore/tasks/e2e.yml; variables are in the role defaults.
10+
#
11+
# Each step can be enabled/disabled independently for iterative testing.
12+
#
13+
# Prerequisites:
14+
# - OpenStack control plane deployed and healthy
15+
# - OpenStackBackupConfig CR created (for backup labeling)
16+
# - For manual testing on a reproducer, run post_deployment.sh first:
17+
# ./post_deployment.sh -e zuul_log_collection=true \
18+
# -e cifmw_nolog=false -e cifmw_run_tests=false
19+
#
20+
# Manual usage (reproducer):
21+
# COMMON_ARGS="-i ~/ci-framework-data/artifacts/zuul_inventory.yml \
22+
# -e @~/ci-framework-data/parameters/reproducer-variables.yml \
23+
# -e @~/ci-framework-data/parameters/openshift-environment.yml"
24+
#
25+
# # Full run (with test workload):
26+
# ansible-playbook $COMMON_ARGS playbooks/backup_restore.yaml \
27+
# -e cifmw_backup_restore_create_workload=true
28+
#
29+
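# # Full run (without workload; cifmw_backup_restore_create_workload defaults to true in the role, so disable it explicitly):
30+
# ansible-playbook $COMMON_ARGS playbooks/backup_restore.yaml -e cifmw_backup_restore_create_workload=false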
31+
#
32+
# # Install deps only:
33+
# ansible-playbook $COMMON_ARGS playbooks/backup_restore.yaml \
34+
# -e cifmw_backup_restore_run_backup=false \
35+
# -e cifmw_backup_restore_run_cleanup=false \
36+
# -e cifmw_backup_restore_run_restore=false
37+
#
38+
# # Backup only (deps already installed):
39+
# ansible-playbook $COMMON_ARGS playbooks/backup_restore.yaml \
40+
# -e cifmw_backup_restore_install_deps=false \
41+
# -e cifmw_backup_restore_run_cleanup=false \
42+
# -e cifmw_backup_restore_run_restore=false
43+
#
44+
# # Cleanup + restore (backup already done):
45+
# ansible-playbook $COMMON_ARGS playbooks/backup_restore.yaml \
46+
# -e cifmw_backup_restore_install_deps=false \
47+
# -e cifmw_backup_restore_run_backup=false \
48+
# -e cifmw_backup_restore_backup_timestamp=20260323-144546
49+
#
50+
# # Restore only (cleanup already done):
51+
# ansible-playbook $COMMON_ARGS playbooks/backup_restore.yaml \
52+
# -e cifmw_backup_restore_install_deps=false \
53+
# -e cifmw_backup_restore_run_backup=false \
54+
# -e cifmw_backup_restore_run_cleanup=false \
55+
# -e cifmw_backup_restore_backup_timestamp=20260323-144546
56+
#
57+
# # With PVC pinning (WaitForFirstConsumer storage):
58+
# ansible-playbook $COMMON_ARGS playbooks/backup_restore.yaml \
59+
# -e cifmw_backup_restore_pin_pvcs=true
60+
61+
# Single play: hands the whole end-to-end flow over to the role's e2e.yml
# entry point. The individual steps are switched on and off through the
# cifmw_backup_restore_* variables described in the header comment.
- name: Backup and Restore end-to-end test
  gather_facts: true
  hosts: "{{ cifmw_target_host | default('localhost') }}"
  tasks:
    - name: Run backup/restore end-to-end orchestration
      ansible.builtin.import_role:
        tasks_from: e2e.yml
        name: cifmw_backup_restore

post-deployment.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,18 @@
7373
tags:
7474
- compliance
7575

76+
# Optional post-deployment stage: the role's e2e.yml entry point is only
# executed when cifmw_run_backup_restore_test evaluates to true (it is
# treated as false when undefined). Tagged so it can be targeted with
# --tags backup-restore.
# NOTE(review): the original indentation was lost; `when` and `tags` are
# placed on the task here - confirm against the real file.
- name: Run backup and restore test
  gather_facts: true
  hosts: "{{ cifmw_target_host | default('localhost') }}"
  tasks:
    - name: Run backup/restore end-to-end orchestration
      when: cifmw_run_backup_restore_test | default(false) | bool
      tags:
        - backup-restore
      ansible.builtin.import_role:
        tasks_from: e2e.yml
        name: cifmw_backup_restore
87+
7688
- name: Run hooks and inject status flag
7789
hosts: "{{ cifmw_target_host | default('localhost') }}"
7890
gather_facts: true
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# cifmw_backup_restore
2+
3+
Automate OpenStack on OpenShift backup and restore operations using OADP
4+
(OpenShift API for Data Protection) and Velero. The role supports three
5+
actions: **backup**, **restore**, and **cleanup**.
6+
7+
- **backup** — creates Galera database dumps, optionally backs up OVN NB/SB
8+
databases onto their PVCs, then creates Velero backups of labeled PVCs
9+
(via CSI snapshots) and cluster resources.
10+
- **restore** — performs an ordered Velero restore sequence (PVCs,
11+
foundation, infrastructure, control plane, Galera, optional OVN file restore,
12+
full control plane resume, dataplane, EDPM), then Neutron–OVN verification and
13+
sync (**log** mode, then **repair**, matching the backup-restore user guide Step 12).
14+
- **cleanup** — tears down dataplane and control-plane resources so the
15+
namespace is ready for a fresh restore.
16+
17+
## Privilege escalation
18+
19+
None. All cluster operations are performed through `oc` against the target
20+
OpenShift cluster.
21+
22+
## Parameters
23+
24+
### Common
25+
26+
* `cifmw_backup_restore_action`: (String) Action to perform. Must be one of `backup`, `restore`, or `cleanup`. Defaults to `""` (role will fail if unset).
27+
* `cifmw_backup_restore_namespace`: (String) Target OpenStack namespace. Defaults to `openstack`.
28+
* `cifmw_backup_restore_oadp_namespace`: (String) Namespace where Velero/OADP is running. Defaults to `openshift-adp`.
29+
* `cifmw_backup_restore_auto_ack`: (Boolean) Skip interactive pause prompts when `true`. Defaults to `false`.
30+
* `cifmw_backup_restore_ovn_db`: (Boolean) When `true` (default), the **backup** path labels OVN NB/SB PVCs and runs `ovsdb-client` backup before the OADP PVC backup, and the **restore** path runs OVN NB/SB file restore after Galera (when timestamped files exist on the PVC) before resuming the full control plane. Set to `false` to skip both; the post-EDPM `neutron-ovn-db-sync-util` **repair** step then always runs, because no OVN NB/SB files were backed up.
31+
* `cifmw_backup_restore_ovn_db_ready_timeout`: (String) Timeout for `oc wait` on OVN database pods during OVN backup/restore. Defaults to `5m`.
32+
33+
### Backup
34+
35+
* `cifmw_backup_restore_galera_backup_timeout`: (String) Timeout for `oc wait` on Galera backup jobs. Defaults to `10m`.
36+
* `cifmw_backup_restore_galera_storage_class`: (String) StorageClass for Galera backup PVCs. Empty string uses the cluster default. Defaults to `""`.
37+
* `cifmw_backup_restore_galera_storage_request`: (String) Size of the Galera backup PVC. Defaults to `5Gi`.
38+
* `cifmw_backup_restore_galera_transfer_storage_request`: (String) Size of the Galera transfer storage PVC. Defaults to `5Gi`.
39+
* `cifmw_backup_restore_oadp_backup_timeout`: (String) Timeout for OADP PVC and resource backup completion. Defaults to `30m`.
40+
* `cifmw_backup_restore_storage_location`: (String) Velero `BackupStorageLocation` name. Defaults to `velero-1`.
41+
* `cifmw_backup_restore_backup_ttl`: (String) TTL for Velero backups. Defaults to `720h`.
42+
* `cifmw_backup_restore_snapshot_move_data`: (Boolean) Enable Velero snapshot data mover. When `true`, cleanup also deletes labeled PVCs. Defaults to `true`.
43+
44+
### Restore
45+
46+
* `cifmw_backup_restore_backup_timestamp`: (String) Timestamp suffix that identifies the backup to restore (e.g. `20260311-081234`). **Required** when `cifmw_backup_restore_action` is `restore`.
47+
* `cifmw_backup_restore_restore_timeout`: (Integer) Seconds to wait for each Velero Restore to reach a terminal phase. Defaults to `900`.
48+
* `cifmw_backup_restore_infra_ready_timeout`: (String) Timeout for `oc wait` on `OpenStackControlPlaneInfrastructureReady`. Defaults to `20m`.
49+
* `cifmw_backup_restore_ctlplane_ready_timeout`: (String) Timeout for `oc wait` on control plane `Ready` after removing the deployment-stage annotation. Defaults to `10m`.
50+
* `cifmw_backup_restore_strict_restore`: (Boolean) Fail on Velero `PartiallyFailed` status when `true`; only warn when `false`. Defaults to `true`.
51+
* `cifmw_backup_restore_restore_content`: (String) Content flag passed to `restore_galera` (`--content`). Defaults to `data`.
52+
* `cifmw_backup_restore_edpm_deploy_timeout`: (String) Timeout for `oc wait` on the post-restore EDPM deployment. Defaults to `40m`.
53+
* `cifmw_backup_restore_pin_pvcs`: (Boolean) Enable PVC-to-node pinning during restore for WaitForFirstConsumer storage classes. Defaults to `false`.
54+
* Post-EDPM **Neutron–OVN** steps follow [user guide Step 12](https://github.com/openstack-k8s-operators/dev-docs/blob/main/backup-restore/user-guide.md#step-12-verify-and-sync-neutron-to-ovn): run `neutron-ovn-db-sync-util` in `log` mode first (`neutron-dist.conf`, `neutron.conf`, `neutron.conf.d`). **Repair** runs if `cifmw_backup_restore_ovn_db` is `false` (no OVN NB/SB file backup was taken), or if log-mode stdout/stderr contains a `WARNING` line—Neutron reports drift that way while still exiting 0. If OVN file backup/restore was enabled and log output has no `WARNING` lines, repair is skipped as redundant.
55+
56+
### Cleanup
57+
58+
* `cifmw_backup_restore_cleanup_ctlplane`: (Boolean) Delete control-plane resources during cleanup. Defaults to `true`.
59+
* `cifmw_backup_restore_cleanup_dataplane`: (Boolean) Delete dataplane resources during cleanup. Defaults to `true`.
60+
61+
## Examples
62+
63+
### Running a backup
64+
65+
```YAML
66+
# Example: back up the "openstack" namespace with interactive prompts skipped.
- hosts: localhost
  tasks:
    - name: Backup OpenStack
      vars:
        cifmw_backup_restore_action: backup
        cifmw_backup_restore_namespace: openstack
        cifmw_backup_restore_auto_ack: true
      ansible.builtin.include_role:
        name: cifmw_backup_restore
75+
```
76+
77+
### Restoring from a backup
78+
79+
```YAML
80+
# Example: restore the backup identified by its timestamp suffix. The
# timestamp is quoted so that it is always handled as a string.
- hosts: localhost
  tasks:
    - name: Restore OpenStack
      vars:
        cifmw_backup_restore_action: restore
        cifmw_backup_restore_backup_timestamp: "20260311-081234"
        cifmw_backup_restore_auto_ack: true
      ansible.builtin.include_role:
        name: cifmw_backup_restore
89+
```
90+
91+
### Cleaning up before a restore
92+
93+
```YAML
94+
# Example: remove both control-plane and dataplane resources so that the
# namespace is ready for a fresh restore.
- hosts: localhost
  tasks:
    - name: Cleanup namespace
      vars:
        cifmw_backup_restore_action: cleanup
        cifmw_backup_restore_auto_ack: true
        cifmw_backup_restore_cleanup_ctlplane: true
        cifmw_backup_restore_cleanup_dataplane: true
      ansible.builtin.include_role:
        name: cifmw_backup_restore
104+
```
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
---
2+
# Copyright Red Hat, Inc.
3+
# All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
6+
# not use this file except in compliance with the License. You may obtain
7+
# a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14+
# License for the specific language governing permissions and limitations
15+
# under the License.
16+
17+
18+
# All variables intended for modification should be placed in this file.
19+
# All variables within this role should have a prefix of "cifmw_backup_restore"
20+
21+
# Operation the role performs: backup, restore, or cleanup.
# Empty by default, so the caller has to pick one explicitly.
cifmw_backup_restore_action: ""

# ---------------------------------------------------------------------------
# Common
# ---------------------------------------------------------------------------
# Target OpenStack namespace; follows cifmw_openstack_namespace when defined.
cifmw_backup_restore_namespace: "{{ cifmw_openstack_namespace | default('openstack') }}"
# Namespace where Velero/OADP is running.
cifmw_backup_restore_oadp_namespace: openshift-adp
# When true, interactive pause prompts are skipped.
cifmw_backup_restore_auto_ack: false

# ---------------------------------------------------------------------------
# End-to-end orchestration switches (tasks/e2e.yml; invoked from
# post-deployment or playbooks/backup_restore.yaml). Each step can be turned
# on or off independently.
# ---------------------------------------------------------------------------
cifmw_backup_restore_install_deps: true
cifmw_backup_restore_create_workload: true
cifmw_backup_restore_run_backup: true
cifmw_backup_restore_run_cleanup: true
cifmw_backup_restore_run_restore: true
cifmw_backup_restore_run_post_tempest: false

# ---------------------------------------------------------------------------
# Values handed through to the update role when the test workload is created.
# They intentionally carry the update role's prefix instead of this role's.
# ---------------------------------------------------------------------------
cifmw_update_namespace: "{{ cifmw_backup_restore_namespace }}"
cifmw_update_ping_test: true
cifmw_update_control_plane_check: false
cifmw_update_artifacts_basedir_suffix: "tests/update"
cifmw_update_artifacts_basedir: "{{ ansible_user_dir }}/ci-framework-data/{{ cifmw_update_artifacts_basedir_suffix }}"
cifmw_update_workload_launch_script: "{{ cifmw_update_artifacts_basedir }}/workload_launch.sh"
cifmw_update_ping_start_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_start_ping.sh"
cifmw_update_ping_stop_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_stop_ping.sh"
# Shell pipeline fragment that prefixes every output line with a timestamp.
cifmw_update_timestamper_cmd: >-
  | awk '{ print strftime("%Y-%m-%d %H:%M:%S |"), $0; fflush(); }'

# ---------------------------------------------------------------------------
# Backup
# ---------------------------------------------------------------------------
# Timeout for `oc wait` on Galera backup jobs.
cifmw_backup_restore_galera_backup_timeout: 10m
# StorageClass for Galera backup PVCs; empty string uses the cluster default.
cifmw_backup_restore_galera_storage_class: ""
# Size of the Galera backup PVC.
cifmw_backup_restore_galera_storage_request: 5Gi
# Size of the Galera transfer storage PVC.
cifmw_backup_restore_galera_transfer_storage_request: 5Gi
# Timeout for OADP PVC and resource backup completion.
cifmw_backup_restore_oadp_backup_timeout: 30m
# Velero BackupStorageLocation name.
cifmw_backup_restore_storage_location: velero-1
# TTL for Velero backups.
cifmw_backup_restore_backup_ttl: 720h
# Enable the Velero snapshot data mover; when true, cleanup also deletes
# labeled PVCs.
cifmw_backup_restore_snapshot_move_data: true
# NOTE(review): not described in the role README; presumably bounds a wait
# related to Swift xattrs - confirm against the tasks that consume it.
cifmw_backup_restore_swift_xattr_timeout: 600s

# ---------------------------------------------------------------------------
# OVN NB/SB database files on PVCs (user-guide backup Step 3 / restore Step 8)
# ---------------------------------------------------------------------------
# When true, OVN NB/SB files are backed up and later restored after Galera.
cifmw_backup_restore_ovn_db: true
# Timeout for `oc wait` on OVN database pods during OVN backup/restore.
cifmw_backup_restore_ovn_db_ready_timeout: 5m

# ---------------------------------------------------------------------------
# Restore
# ---------------------------------------------------------------------------
# cifmw_backup_restore_backup_timestamp: REQUIRED for restore (e.g., 20260311-081234)
# Seconds to wait for each Velero Restore to reach a terminal phase.
cifmw_backup_restore_restore_timeout: 900
# Timeout for `oc wait` on OpenStackControlPlaneInfrastructureReady.
cifmw_backup_restore_infra_ready_timeout: 20m
# Timeout for `oc wait` on control plane Ready.
cifmw_backup_restore_ctlplane_ready_timeout: 10m
# Timeout for `oc wait` on the post-restore EDPM deployment.
cifmw_backup_restore_edpm_deploy_timeout: 40m
# Fail on Velero PartiallyFailed when true; only warn when false.
cifmw_backup_restore_strict_restore: true
# Content flag passed to restore_galera (--content).
cifmw_backup_restore_restore_content: data
# PVC-to-node pinning during restore for WaitForFirstConsumer storage classes.
cifmw_backup_restore_pin_pvcs: false

# ---------------------------------------------------------------------------
# Cleanup
# ---------------------------------------------------------------------------
# Delete control-plane resources during cleanup.
cifmw_backup_restore_cleanup_ctlplane: true
# Delete dataplane resources during cleanup.
cifmw_backup_restore_cleanup_dataplane: true
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
---
2+
# Copyright Red Hat, Inc.
3+
# All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
6+
# not use this file except in compliance with the License. You may obtain
7+
# a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14+
# License for the specific language governing permissions and limitations
15+
# under the License.
16+
17+
18+
# Ansible Galaxy metadata describing the backup/restore role.
galaxy_info:
  namespace: cifmw
  author: CI Framework
  company: Red Hat
  description: CI Framework Role -- OpenStack Backup and Restore
  license: Apache-2.0
  # Quoted so the version is kept as a string and not parsed as a float.
  min_ansible_version: "2.14"
  galaxy_tags:
    - cifmw
    - openstack
    - backup
    - restore

# No other roles are pulled in by this one.
dependencies: []
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
---
2+
# Copyright Red Hat, Inc.
3+
# All Rights Reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
6+
# not use this file except in compliance with the License. You may obtain
7+
# a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14+
# License for the specific language governing permissions and limitations
15+
# under the License.
16+
17+
# Helper: delete all resources of a given kind in the backup/restore namespace.
18+
#
19+
# Required variables:
20+
# _resource_api_version - e.g. "core.openstack.org/v1beta1"
21+
# _resource_kind - e.g. "OpenStackControlPlane"
22+
#
23+
# Optional variables:
24+
# _resource_label_selectors - label selectors list (default: omitted)
25+
# _resource_wait - wait for deletion (default: false)
26+
# _resource_wait_timeout - wait timeout in seconds (default: 120)
27+
28+
# Look up every object of the requested kind in the backup/restore namespace,
# optionally narrowed by label selectors. `failed_when: false` keeps a failed
# lookup from stopping the play; the delete loop then falls back to an empty
# list.
- name: "List resources - {{ _resource_kind }}"
  register: _resources_to_delete
  failed_when: false
  kubernetes.core.k8s_info:
    kind: "{{ _resource_kind }}"
    api_version: "{{ _resource_api_version }}"
    namespace: "{{ cifmw_backup_restore_namespace }}"
    label_selectors: "{{ _resource_label_selectors | default(omit) }}"

# Delete each object found above by name. Waiting for the deletion is opt-in
# through _resource_wait (timeout in seconds, 120 unless overridden). Errors
# on individual items are not treated as failures.
- name: "Delete resources - {{ _resource_kind }}"
  failed_when: false
  loop: "{{ _resources_to_delete.resources | default([]) }}"
  loop_control:
    label: "{{ item.metadata.name }}"
  kubernetes.core.k8s:
    state: absent
    kind: "{{ _resource_kind }}"
    api_version: "{{ _resource_api_version }}"
    namespace: "{{ cifmw_backup_restore_namespace }}"
    name: "{{ item.metadata.name }}"
    wait: "{{ _resource_wait | default(false) | bool }}"
    wait_timeout: "{{ _resource_wait_timeout | default(120) }}"

0 commit comments

Comments
 (0)