Skip to content

Commit 505ec16

Browse files
committed
Improve chainsaw error reporting
On error, chainsaw deletes all resources that were created during the execution of a run, so it is difficult to pinpoint an error: by the time CI jobs capture must-gathers, the environment is already empty. Add a script for error reporting, so that errors happening during a run can be reported as a sort of exception. Also improve the cleanup steps of the currently failing script, so that they can easily be run locally with the --skip-delete option without blocking.
1 parent d17019f commit 505ec16

4 files changed

Lines changed: 74 additions & 6 deletions

File tree

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/bin/bash
2+
3+
# Dump various resources still running in the environment, for debugging purposes.
4+
# This is useful because all resources created by chainsaw are automatically deleted
5+
# before we capture logs or the environment state in CI. Reads $NAMESPACE from the environment.
6+
oc -n "$NAMESPACE" get jobs
7+
oc -n "$NAMESPACE" get pods
8+
oc -n "$NAMESPACE" get pvc
9+
oc -n "$NAMESPACE" get pv
10+
11+
# Optional arguments: objects (e.g. pod/<name>) to describe and get logs from
12+
for i in "$@"; do
13+
  echo "inspecting $i"
14+
  oc -n "$NAMESPACE" describe "$i"
15+
  oc -n "$NAMESPACE" logs "$i"
16+
done
17+
true  # always exit 0, regardless of the status of the commands above
18+

test/chainsaw/tests/backup-capture/chainsaw-test.yaml

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@ spec:
2727
cleanup:
2828
- script:
2929
content: |
30-
oc -n $NAMESPACE delete pvc mysql-backup-openstack || true
30+
if ! oc -n $NAMESPACE get galerabackup/openstack; then
31+
oc -n $NAMESPACE delete pvc mysql-backup-openstack || true
32+
fi
3133
3234
- name: setup a backup for a 1-node Galera cluster
3335
description: checks that backup CR is correctly set up
@@ -70,6 +72,16 @@ spec:
7072
value: ($replicas)
7173
content: |
7274
../../common/backup-assert.sh backup-${REPLICAS}node
75+
cleanup:
76+
- script: &backup_cleanup
77+
env:
78+
- name: REPLICAS
79+
value: ($replicas)
80+
content: |
81+
if oc -n $NAMESPACE get pods -l batch.kubernetes.io/job-name=backup-${REPLICAS}node --no-headers | grep -v -q Completed; then
82+
echo ERROR during backup job
83+
../../common/dump-resources.sh `oc -n $NAMESPACE get pods -l batch.kubernetes.io/job-name=backup-${REPLICAS}node -o name`
84+
fi
7385
7486
- name: Scale Galera cluster to 3-node
7587
bindings:
@@ -90,6 +102,8 @@ spec:
90102
try:
91103
- script: *backup
92104
- script: *backup_check
105+
cleanup:
106+
- script: *backup_cleanup
93107

94108
- name: Scale Galera cluster to 1-node
95109
bindings:
@@ -117,16 +131,17 @@ spec:
117131
cleanup:
118132
- script:
119133
content: |
120-
oc -n $NAMESPACE delete pvc mysql-backup-openstack2nd || true
121-
134+
if ! oc -n $NAMESPACE get galerabackup/openstack2nd; then
135+
oc -n $NAMESPACE delete pvc mysql-backup-openstack2nd || true
136+
fi
122137
123138
- name: backup the two running clusters
124139
description: checks that the two clusters can be backed up concurrently
125140
try:
126141
- script:
127142
content: |
128143
set -e
129-
oc -n $NAMESPACE delete --ignore-not-found=true job/backup-1node
144+
oc -n $NAMESPACE delete --ignore-not-found=true job/backup-1node --wait
130145
oc -n $NAMESPACE create job --from=cronjob/backup-openstack backup-1node
131146
oc -n $NAMESPACE create job --from=cronjob/backup-openstack2nd backup-1node-2nd
132147
- script:
@@ -137,6 +152,13 @@ spec:
137152
oc -n $NAMESPACE wait --timeout=${WAIT_TIMEOUT} --for=condition=complete job/backup-1node job/backup-1node-2nd
138153
../../common/backup-assert.sh backup-1node
139154
../../common/backup-assert.sh backup-1node-2nd
155+
cleanup:
156+
- script:
157+
content: |
158+
if [ "$(oc -n $NAMESPACE get pods | grep '^backup-1node' | grep -c Completed)" -ne 2 ]; then
159+
echo ERROR during backup job
160+
../../common/dump-resources.sh `oc -n $NAMESPACE get pods | sed -ne 's%^\(backup-1node[^ ]*\).*%pod/\1%p' | xargs echo`
161+
fi
140162
141163
- name: Trigger a backup with a custom timestamp marker
142164
description: make sure that a custom timestamp can be passed to the backup script
@@ -154,3 +176,10 @@ spec:
154176
oc -n $NAMESPACE wait --timeout=${WAIT_TIMEOUT} --for=condition=complete job/custom-timestamp
155177
../../common/backup-assert.sh custom-timestamp
156178
oc -n $NAMESPACE get pod -o name -l job-name=custom-timestamp | xargs oc -n $NAMESPACE logs | grep 'Starting backup .* - chainsaw_unit_test'
179+
cleanup:
180+
- script:
181+
content: |
182+
if oc -n $NAMESPACE get pods -l batch.kubernetes.io/job-name=custom-timestamp --no-headers | grep -v -q Completed; then
183+
echo ERROR during backup job
184+
../../common/dump-resources.sh `oc -n $NAMESPACE get pods -l batch.kubernetes.io/job-name=custom-timestamp -o name`
185+
fi

test/chainsaw/tests/backup-configs/chainsaw-test.yaml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,9 @@ spec:
3333
cleanup:
3434
- script:
3535
content: |
36-
oc -n $NAMESPACE delete pvc mysql-backup-openstack mysql-transfer-openstack || true
36+
if ! oc -n $NAMESPACE get galerabackup/openstack; then
37+
oc -n $NAMESPACE delete pvc mysql-backup-openstack mysql-transfer-openstack || true
38+
fi
3739
3840
- name: backup the 1-node Galera cluster
3941
description: checks that the cluster is correctly backed up
@@ -56,3 +58,10 @@ spec:
5658
value: ($replicas)
5759
content: |
5860
../../common/backup-assert.sh backup-${REPLICAS}node
61+
cleanup:
62+
- script:
63+
content: |
64+
if oc -n $NAMESPACE get pods -l batch.kubernetes.io/job-name=backup-1node --no-headers | grep -v -q Completed; then
65+
echo ERROR during backup job
66+
../../common/dump-resources.sh `oc -n $NAMESPACE get pods -l batch.kubernetes.io/job-name=backup-1node -o name`
67+
fi

test/chainsaw/tests/restore/chainsaw-test.yaml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,12 @@ spec:
4444
cleanup:
4545
- script:
4646
content: |
47-
oc -n $NAMESPACE delete pvc mysql-backup-cluster2backup mysql-backup-openstackbackup || true
47+
if ! oc -n $NAMESPACE get galerabackup/openstackbackup; then
48+
oc -n $NAMESPACE delete pvc mysql-backup-openstackbackup || true
49+
fi
50+
if ! oc -n $NAMESPACE get galerabackup/cluster2backup; then
51+
oc -n $NAMESPACE delete pvc mysql-backup-cluster2backup || true
52+
fi
4853
4954
- name: trigger a backup of Galera cluster
5055
description: checks that the cluster is correctly backed up
@@ -58,6 +63,13 @@ spec:
5863
oc -n $NAMESPACE create job --from=cronjob/backup-cluster2backup backupjob2
5964
oc -n $NAMESPACE wait --for=condition=complete --timeout=${WAIT_TIMEOUT} job/backupjob1
6065
oc -n $NAMESPACE wait --for=condition=complete --timeout=${WAIT_TIMEOUT} job/backupjob2
66+
cleanup:
67+
- script:
68+
content: |
69+
# Report an error unless exactly two matching pods are in the Completed state.
# NOTE(review): the jobs above are created as backupjob1/backupjob2 - confirm their pods really use the prefixes matched here.
if [ "$(oc -n "$NAMESPACE" get pods | grep -e '^openstackbackup' -e 'cluster2backup' | grep -c Completed)" -ne 2 ]; then
70+
  echo ERROR during backup job
71+
  # sed -E is required: in basic regex mode a bare '|' is a literal character, so the alternation never matched.
  # The command substitution is left unquoted on purpose, so that each pod/<name> becomes a separate argument.
  ../../common/dump-resources.sh $(oc -n "$NAMESPACE" get pods | sed -nE 's%^((openstackbackup|cluster2backup)[^ ]*).*%pod/\1%p' | xargs echo)
72+
fi
6173
6274
- name: remove data in the running Galera clusters
6375
try:

0 commit comments

Comments
 (0)