Skip to content

Commit 813d429

Browse files
feat(e2e): add backward compatibility e2e tests (#5681)
Signed-off-by: Monika Jakhar <jakharmonika364@gmail.com>
1 parent b5948c0 commit 813d429

6 files changed

Lines changed: 295 additions & 34 deletions

File tree

.github/scripts/build-all-images.sh

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,40 @@
11
#!/bin/bash
22
set -e
33

4-
function get_image_tag() {
4+
get_image_tag() {
55
version=$(grep "^VERSION := " ./Makefile)
6-
version=${version#VERSION := }
6+
version="${version#VERSION := }"
77

88
git_sha=$(git rev-parse --short HEAD || echo "HEAD")
9-
export IMAGE_TAG=${version}-${git_sha}
9+
export IMAGE_TAG="${version}-${git_sha}"
1010
}
1111

12-
function build_images() {
12+
build_images() {
1313
images=(
14-
${IMG_REPO}/dataset-controller:${IMAGE_TAG}
15-
${IMG_REPO}/application-controller:${IMAGE_TAG}
16-
${IMG_REPO}/alluxioruntime-controller:${IMAGE_TAG}
17-
${IMG_REPO}/jindoruntime-controller:${IMAGE_TAG}
18-
${IMG_REPO}/goosefsruntime-controller:${IMAGE_TAG}
19-
${IMG_REPO}/juicefsruntime-controller:${IMAGE_TAG}
20-
${IMG_REPO}/thinruntime-controller:${IMAGE_TAG}
21-
${IMG_REPO}/efcruntime-controller:${IMAGE_TAG}
22-
${IMG_REPO}/vineyardruntime-controller:${IMAGE_TAG}
23-
${IMG_REPO}/cacheruntime-controller:${IMAGE_TAG}
24-
${IMG_REPO}/fluid-csi:${IMAGE_TAG}
25-
${IMG_REPO}/fluid-webhook:${IMAGE_TAG}
26-
${IMG_REPO}/fluid-crd-upgrader:${IMAGE_TAG}
14+
"${IMG_REPO}/dataset-controller:${IMAGE_TAG}"
15+
"${IMG_REPO}/application-controller:${IMAGE_TAG}"
16+
"${IMG_REPO}/alluxioruntime-controller:${IMAGE_TAG}"
17+
"${IMG_REPO}/jindoruntime-controller:${IMAGE_TAG}"
18+
"${IMG_REPO}/goosefsruntime-controller:${IMAGE_TAG}"
19+
"${IMG_REPO}/juicefsruntime-controller:${IMAGE_TAG}"
20+
"${IMG_REPO}/thinruntime-controller:${IMAGE_TAG}"
21+
"${IMG_REPO}/efcruntime-controller:${IMAGE_TAG}"
22+
"${IMG_REPO}/vineyardruntime-controller:${IMAGE_TAG}"
23+
"${IMG_REPO}/cacheruntime-controller:${IMAGE_TAG}"
24+
"${IMG_REPO}/fluid-csi:${IMAGE_TAG}"
25+
"${IMG_REPO}/fluid-webhook:${IMAGE_TAG}"
26+
"${IMG_REPO}/fluid-crd-upgrader:${IMAGE_TAG}"
2727
)
2828

2929
make docker-build-all
3030

31-
for img in ${images[@]}; do
32-
echo "Loading image $img to kind cluster..."
33-
kind load docker-image $img --name ${KIND_CLUSTER}
31+
for img in "${images[@]}"; do
32+
echo "Loading image ${img} to kind cluster..."
33+
kind load docker-image "${img}" --name "${KIND_CLUSTER}"
3434
done
3535
}
3636

37-
function cleanup_docker_caches() {
37+
cleanup_docker_caches() {
3838
echo ">>> System disk usage after building fluid images"
3939
df -h
4040
echo ">>> Cleaning docker caches..."

.github/scripts/deploy-fluid-to-kind.sh

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,26 @@
1-
#!/bin/bash
1+
#! /bin/bash
22
set -e
33

4-
function get_image_tag() {
4+
get_image_tag() {
5+
local version=""
56
version=$(grep "^VERSION := " ./Makefile)
6-
version=${version#VERSION := }
7+
version="${version#VERSION := }"
78

9+
local git_sha=""
810
git_sha=$(git rev-parse --short HEAD || echo "HEAD")
9-
export IMAGE_TAG=${version}-${git_sha}
11+
export IMAGE_TAG="${version}-${git_sha}"
1012
}
1113

12-
function deploy_fluid() {
13-
echo "Replacing image tags in values.yaml with $IMAGE_TAG"
14-
sed -i -E "s/version: &defaultVersion v[0-9]\.[0-9]\.[0-9]-[a-z0-9]+$/version: \&defaultVersion $IMAGE_TAG/g" charts/fluid/fluid/values.yaml
15-
kubectl create ns fluid-system
16-
helm install --create-namespace --set runtime.jindo.smartdata.imagePrefix=registry-cn-hongkong.ack.aliyuncs.com/acs --set runtime.jindo.fuse.imagePrefix=registry-cn-hongkong.ack.aliyuncs.com/acs fluid charts/fluid/fluid
14+
deploy_fluid() {
15+
echo "Replacing image tags in values.yaml with ${IMAGE_TAG}"
16+
sed -i -E "s/version: &defaultVersion .+$/version: \&defaultVersion ${IMAGE_TAG}/g" charts/fluid/fluid/values.yaml
17+
kubectl create ns fluid-system || true
18+
helm upgrade --install --namespace fluid-system --create-namespace --set runtime.jindo.smartdata.imagePrefix=registry-cn-hongkong.ack.aliyuncs.com/acs --set runtime.jindo.fuse.imagePrefix=registry-cn-hongkong.ack.aliyuncs.com/acs --set runtime.cache.enabled=true fluid charts/fluid/fluid
1719
}
1820

19-
function main() {
21+
main() {
2022
get_image_tag
21-
if [[ -z "$IMAGE_TAG" ]];then
23+
if [[ -z "${IMAGE_TAG}" ]]; then
2224
echo "Failed to get image tag, exiting..."
2325
exit 1
2426
fi
Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
#!/bin/bash
2+
3+
# Print a log line on stdout, prefixed with ">>> ".
# Arguments: $1 - message text
# Returns:   always 0
syslog() {
  local message="${1}"
  printf '%s\n' ">>> ${message}"
  return 0
}
7+
8+
# Report a fatal failure of the backward compatibility test and stop.
# Arguments: $1 - reason for the failure
# Outputs:   one log line emitted through syslog
# Exits:     always with status 1; never returns to the caller
panic() {
  syslog "backward compatibility test failed: ${1}"
  exit 1
}
13+
14+
# Print the image tags used by pods in the fluid-system namespace, then
# poll until every pod that is not "Completed" is listed as "Running".
# Arguments: none
# Outputs:   image tags and periodic status reports on stdout
# Exits:     through panic when the pods are still not all Running after
#            `timeout` polls
check_control_plane_status() {
  echo "=== Unique image tags used by Fluid control plane ==="
  # sed strips everything up to the last colon, leaving only the tag.
  kubectl get pod -n fluid-system -o jsonpath='
{range .items[*]}{range .spec.containers[*]}{.image}{"\n"}{end}{range .spec.initContainers[*]}{.image}{"\n"}{end}{end}' \
    | sed 's/.*://' \
    | sort -u

  # Timeout counter (30 minutes = 360 polls * 5 seconds)
  local timeout=360
  local counter=0
  local status_interval=36
  local pod_list=""
  local total_pods=0
  local running_pods=0
  local not_running_pods=0

  while true; do
    # Take one listing per poll so that both counts below describe the
    # same set of pods.
    pod_list=$(kubectl get pod -n fluid-system --no-headers 2>/dev/null || true)
    # grep -c still prints the count when it is zero, but exits 1; the
    # `|| true` only neutralises that exit status.
    total_pods=$(printf '%s' "${pod_list}" | grep -cv "Completed" || true)
    total_pods=${total_pods:-0}
    running_pods=$(printf '%s' "${pod_list}" | grep -c "Running" || true)
    running_pods=${running_pods:-0}
    not_running_pods=$((total_pods - running_pods))

    # Report progress on every `status_interval`-th poll only.
    if ((counter % status_interval == 0)); then
      syslog "[Status Check $((counter / status_interval))] Pod status: ${running_pods}/${total_pods} running (${not_running_pods} not ready)"
      if [[ "${not_running_pods}" -gt 0 ]]; then
        echo "=== Not running pods ==="
        kubectl get pods -n fluid-system \
          --field-selector=status.phase!=Running \
          -o=custom-columns='NAME:.metadata.name,STATUS:.status.phase,REASON:.status.reason'
      fi
    fi

    # An empty namespace does not count as ready.
    if [[ "${total_pods}" -ne 0 ]] && [[ "${total_pods}" -eq "${running_pods}" ]]; then
      break
    fi

    if [[ "${counter}" -ge "${timeout}" ]]; then
      panic "Timeout waiting for control plane after ${counter} checks!"
    fi

    sleep 5
    # Plain assignment: ((counter++)) returns status 1 when counter is 0.
    counter=$((counter + 1))
  done
  syslog "Fluid control plane is ready after ${counter} checks!"
}
56+
57+
# Dump best-effort diagnostics for an Alluxio-backed dataset in the
# "default" namespace. Every command failure is deliberately ignored so
# that this helper can never abort the caller.
# Arguments: $1 - name shared by the AlluxioRuntime and its dataset
# Outputs:   runtime status, matching pods and the latest events on stdout
debug_alluxio_state() {
  local name="${1}"

  echo "=== AlluxioRuntime status ==="
  # Prefer pretty-printed JSON; fall back to the plain table when the
  # pipeline (i.e. python3's json.tool) reports failure.
  if ! kubectl get alluxioruntime "${name}" -n default -ojsonpath='{.status}' 2>/dev/null \
    | python3 -m json.tool 2>/dev/null; then
    kubectl get alluxioruntime "${name}" -n default 2>/dev/null || true
  fi

  echo "=== Pods for dataset ${name} ==="
  kubectl get pods -n default -l "release=${name}" -o wide 2>/dev/null || true

  echo "=== Recent events ==="
  kubectl get events -n default --sort-by=.metadata.creationTimestamp 2>/dev/null \
    | tail -20 || true
}
67+
68+
# Poll a dataset in the "default" namespace until its status.phase is
# "Bound".
# Arguments: $1 - dataset name
#            $2 - optional timeout in seconds (default: 600)
# Outputs:   progress messages through syslog
# Exits:     through panic (after dumping debug_alluxio_state) once the
#            elapsed time reaches the timeout
wait_dataset_bound() {
  local dataset_name="${1}"
  local deadline="${2:-600}"
  local last_state=""
  local log_interval=0
  local log_times=0
  local elapsed=0

  syslog "Waiting for dataset ${dataset_name} to be Bound (timeout: ${deadline}s)..."

  while true; do
    # A failing kubectl call (e.g. the object does not exist yet) is
    # reported as state "Unknown" and simply retried.
    last_state=$(kubectl get dataset "${dataset_name}" -n default -ojsonpath='{.status.phase}' 2>/dev/null || echo "Unknown")

    if [[ "${last_state}" == "Bound" ]]; then
      break
    fi

    # Progress is logged, and the deadline evaluated, only on every third
    # poll; with a 5 second sleep that is a 15 second granularity.
    if [[ "${log_interval}" -ge 3 ]]; then
      log_times=$((log_times + 1))
      elapsed=$((log_times * 3 * 5))
      syslog "checking dataset.status.phase==Bound (elapsed: ${elapsed}s, current state: ${last_state})"
      if [[ "${elapsed}" -ge "${deadline}" ]]; then
        debug_alluxio_state "${dataset_name}"
        panic "timeout for ${deadline}s waiting for dataset ${dataset_name} to become bound!"
      fi
      log_interval=0
    fi

    log_interval=$((log_interval + 1))
    sleep 5
  done
  syslog "Found dataset ${dataset_name} status.phase==Bound"
}
99+
100+
# Poll a Kubernetes job until it reports at least one succeeded pod.
# Arguments: $1 - job name (looked up in kubectl's current namespace)
#            $2 - optional timeout in seconds (default: 600, i.e. 10 minutes)
# Outputs:   a success message through syslog
# Exits:     through panic when the job reports a failed pod or when the
#            timeout is reached
wait_job_completed() {
  local job_name="${1}"
  local deadline="${2:-600}"
  local counter=0
  local succeed=""
  local failed=""

  while true; do
    # Handle missing fields gracefully: a failing kubectl call yields "0".
    succeed=$(kubectl get job "${job_name}" -ojsonpath='{.status.succeeded}' 2>/dev/null || echo "0")
    failed=$(kubectl get job "${job_name}" -ojsonpath='{.status.failed}' 2>/dev/null || echo "0")

    # An unset status field prints as an empty string; treat it as zero so
    # the integer comparisons below are well defined.
    succeed="${succeed:-0}"
    failed="${failed:-0}"

    if [[ "${failed}" -gt 0 ]]; then
      panic "job ${job_name} failed when accessing data"
    fi
    if [[ "${succeed}" -gt 0 ]]; then
      break
    fi

    # Plain assignment: ((counter++)) returns status 1 when counter is 0.
    counter=$((counter + 1))
    # Each poll is followed by a 5 second sleep.
    if [[ $((counter * 5)) -ge "${deadline}" ]]; then
      panic "timeout for ${deadline}s waiting for job ${job_name} completion!"
    fi
    sleep 5
  done
  syslog "Found succeeded job ${job_name}"
}
128+
129+
# Install Fluid from the public chart repository into the fluid-system
# namespace and wait for its control plane to come up.
# Arguments: none
# Exits:     through panic as soon as a helm step fails, instead of
#            letting check_control_plane_status poll a namespace that
#            will never get any pods
setup_old_fluid() {
  syslog "Setting up older version of Fluid from charts"
  helm repo add fluid https://fluid-cloudnative.github.io/charts \
    || panic "failed to add the fluid helm repository"
  helm repo update fluid \
    || panic "failed to update the fluid helm repository"

  # We ignore errors in case namespace exists
  kubectl create ns fluid-system || true

  helm install fluid fluid/fluid --namespace fluid-system --wait \
    || panic "failed to install the fluid chart from the helm repository"
  check_control_plane_status
}
140+
141+
# Apply the Alluxio dataset manifest and wait until the "zookeeper"
# dataset is Bound.
# Arguments: none
# Exits:     through panic when the manifest cannot be applied, so the
#            failure is reported immediately rather than as a bind timeout
create_dataset() {
  syslog "Creating alluxio dataset..."
  kubectl apply -f test/gha-e2e/alluxio/dataset.yaml \
    || panic "failed to apply test/gha-e2e/alluxio/dataset.yaml"
  # give it 15s to let the CRDs and controllers settle
  sleep 15
  wait_dataset_bound "zookeeper"
}
148+
149+
# Run the deploy script to move the cluster to the locally built Fluid,
# then wait for the control plane to be ready again.
# Arguments: none
# Exits:     through panic when the deploy script fails. Without this
#            check the previously installed control plane is still
#            Running, so the readiness check would pass and the test
#            would succeed without any upgrade having taken place.
upgrade_fluid() {
  syslog "Upgrading Fluid to the locally built current version..."
  ./.github/scripts/deploy-fluid-to-kind.sh \
    || panic "deploy-fluid-to-kind.sh failed while upgrading Fluid"
  check_control_plane_status
}
154+
155+
# After the upgrade, check that the "zookeeper" dataset is still Bound
# and that a job can access data through it, then remove the test
# resources.
# Arguments: none
# Exits:     through panic when the job manifest cannot be applied, so
#            the failure is not misreported as a job timeout
verify_backward_compatibility() {
  syslog "Verifying backward compatibility..."
  # Ensure the dataset created earlier is still bound
  wait_dataset_bound "zookeeper"

  # create job to access data over the runtime
  kubectl apply -f test/gha-e2e/alluxio/job.yaml \
    || panic "failed to apply test/gha-e2e/alluxio/job.yaml"
  wait_job_completed "fluid-test"

  # Clean up
  kubectl delete -f test/gha-e2e/alluxio/job.yaml
  kubectl delete -f test/gha-e2e/alluxio/dataset.yaml
}
168+
169+
# Entry point: run the four phases of the backward compatibility test in
# order, framed by timestamped start and success banners.
# Arguments: none
main() {
  local phase

  syslog "[BACKWARD COMPATIBILITY TEST STARTS AT $(date)]"

  for phase in \
    setup_old_fluid \
    create_dataset \
    upgrade_fluid \
    verify_backward_compatibility; do
    "${phase}"
  done

  syslog "[BACKWARD COMPATIBILITY TEST SUCCEEDED AT $(date)]"
}
179+
180+
main
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
name: E2E Backward Compatibility Check
2+
on:
3+
pull_request:
4+
branches: [master, release-*]
5+
paths-ignore:
6+
- "docs/**"
7+
- "addons/**"
8+
- "sdk/**"
9+
- "static/**"
10+
11+
permissions:
12+
contents: read
13+
actions: read
14+
15+
concurrency:
16+
group: ${{ github.workflow }}-${{ github.ref }}
17+
cancel-in-progress: true
18+
19+
env:
20+
GO_VERSION: 1.24.12
21+
22+
jobs:
23+
backward-compat-test:
24+
runs-on: ubuntu-latest
25+
strategy:
26+
fail-fast: false
27+
matrix:
28+
kubernetes-version:
29+
["v1.33.2", "v1.30.13", "v1.28.15", "v1.24.17", "v1.22.17"]
30+
env:
31+
GOPATH: ${{ github.workspace }}
32+
GO111MODULE: auto
33+
KIND_CLUSTER: fluid-cluster
34+
defaults:
35+
run:
36+
working-directory: ${{ env.GOPATH }}/src/github.com/fluid-cloudnative/fluid
37+
38+
steps:
39+
- name: Set up Go
40+
uses: actions/setup-go@7a3fe6cf4cb3a834922a1244abfce67bcef6a0c5 # v6.2.0
41+
with:
42+
go-version: ${{ env.GO_VERSION }}
43+
44+
- name: Set up Helm
45+
uses: azure/setup-helm@1a275c3b69536ee54be43f2070a358922e12c8d4 # v4.3.1
46+
47+
- name: Checkout code
48+
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
49+
with:
50+
path: ${{ env.GOPATH }}/src/github.com/fluid-cloudnative/fluid
51+
52+
- name: Create k8s Kind Cluster
53+
uses: helm/kind-action@92086f6be054225fa813e0a4b13787fc9088faab # v1.13.0
54+
with:
55+
version: v0.29.0
56+
node_image: kindest/node:${{ matrix.kubernetes-version }}
57+
cluster_name: ${{ env.KIND_CLUSTER }}
58+
kubectl_version: ${{ matrix.kubernetes-version }}
59+
60+
- name: Build current fluid docker images
61+
env:
62+
IMG_REPO: fluidcloudnative
63+
run: |
64+
echo ">>> System disk usage before build fluid images"
65+
df -h
66+
./.github/scripts/build-all-images.sh
67+
68+
- name: Run backward compatibility e2e tests
69+
timeout-minutes: 40
70+
run: |
71+
bash ./.github/scripts/gha-backward-compatibility.sh
72+
73+
- name: Dump environment
74+
if: ${{ !cancelled() }}
75+
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
76+
with:
77+
name: gha-backward-compat-logs-${{ github.job }}-${{ matrix.kubernetes-version }}
78+
path: "src/github.com/fluid-cloudnative/fluid/e2e-tmp/testcase-*.tgz"
79+
retention-days: 14

test/gha-e2e/alluxio/dataset.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ spec:
1616
tieredstore:
1717
levels:
1818
- mediumtype: SSD
19-
path: /var/lib/docker/alluxio
19+
path: /tmp/alluxio
2020
quota: 1Gi
2121
high: "0.95"
2222
low: "0.7"

test/gha-e2e/curvine/test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ function create_dataset() {
4747
}
4848

4949
function wait_dataset_bound() {
50-
local deadline=180 # 3 minutes
50+
local deadline=600 # 10 minutes
5151
local last_state=""
5252
local log_interval=0
5353
local log_times=0

0 commit comments

Comments
 (0)