Skip to content

Commit 90708f8

Browse files
committed
RHAIENG-2063: Add new RayCluster object
1 parent b4b26b9 commit 90708f8

38 files changed

Lines changed: 6203 additions & 659 deletions
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
# e2e tests workflow for CodeFlare-SDK
#
# Spins up a GPU-enabled KinD cluster, installs Kueue and KubeRay, switches to
# a restricted `sdk-user` kube context, and runs the `tests/e2e_v2` suite
# (pytest markers: `kind and gpu`). Logs are collected and uploaded as an
# artifact whenever the deploy step succeeded, even if later steps fail.
name: e2e-v2

on:
  pull_request:
    branches:
      - main
      - stable
      - "release-*"
    paths-ignore:
      - "docs/**"
      - "**.adoc"
      - "**.md"
      - "LICENSE"

# One active run per PR head branch and workflow; a newer run cancels the
# in-progress one.
concurrency:
  group: ${{ github.head_ref }}-${{ github.workflow }}
  cancel-in-progress: true

env:
  KUEUE_VERSION: v0.13.4
  KUBERAY_VERSION: v1.4.2

jobs:
  e2e_v2:
    runs-on: gpu-t4-4-core

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      # The shared composite actions referenced below as ./common/... come
      # from this second checkout.
      - name: Checkout common repo code
        uses: actions/checkout@v4
        with:
          repository: "project-codeflare/codeflare-common"
          ref: "main"
          path: "common"

      - name: Set up specific Python version
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: 'pip' # caching pip dependencies

      - name: Setup NVidia GPU environment for KinD
        uses: ./common/github-actions/nvidia-gpu-setup

      - name: Setup and start KinD cluster
        uses: ./common/github-actions/kind
        with:
          worker-nodes: 1

      - name: Install NVidia GPU operator for KinD
        uses: ./common/github-actions/nvidia-gpu-operator

      # `id: deploy` is referenced by the log-collection steps further down:
      # they only run when this step succeeded.
      - name: Deploy Kueue and KubeRay
        id: deploy
        run: |
          # Install Kueue
          echo "Installing Kueue ${KUEUE_VERSION}..."
          kubectl apply --server-side -f "https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml"
          kubectl wait --timeout=120s --for=condition=Available=true deployment -n kueue-system kueue-controller-manager

          # Install KubeRay from opendatahub-io fork (has RHOAI features)
          echo "Installing KubeRay ${KUBERAY_VERSION} from opendatahub-io..."
          kubectl create -k "github.com/opendatahub-io/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}"
          kubectl wait --timeout=120s --for=condition=Available=true deployment kuberay-operator

          # Create default Kueue resources for the tests
          echo "Creating Kueue resources..."
          kubectl apply -f - <<EOF
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: ResourceFlavor
          metadata:
            name: default-flavor
          ---
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: ClusterQueue
          metadata:
            name: cluster-queue
          spec:
            namespaceSelector: {}
            resourceGroups:
              - coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
                flavors:
                  - name: default-flavor
                    resources:
                      - name: cpu
                        nominalQuota: 100
                      - name: memory
                        nominalQuota: 100Gi
                      - name: nvidia.com/gpu
                        nominalQuota: 10
          ---
          apiVersion: kueue.x-k8s.io/v1beta1
          kind: LocalQueue
          metadata:
            name: local-queue
            namespace: default
            annotations:
              kueue.x-k8s.io/default-queue: "true"
          spec:
            clusterQueue: cluster-queue
          EOF

      - name: Add user to KinD
        uses: ./common/github-actions/kind-add-user
        with:
          user-name: sdk-user

      # Each ClusterRole below is bound to `sdk-user`; the final command
      # switches the active kube context to that user so the tests run with
      # these permissions only.
      # NOTE(review): some role names do not match their verbs
      # (`pod-creator` is get/list/watch only; `rayjob-status-reader` also
      # grants patch/update). Names are kept as-is to avoid changing behavior.
      - name: Configure RBAC for sdk user with limited permissions
        run: |
          kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses
          kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
          kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
          kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
          kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
          kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
          kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch --resource=rayjobs
          kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user
          kubectl create clusterrole rayjob-status-reader --verb=get,list,patch,update --resource=rayjobs/status
          kubectl create clusterrolebinding sdk-user-rayjob-status-reader --clusterrole=rayjob-status-reader --user=sdk-user
          kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
          kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
          kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
          kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
          kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
          kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
          kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
          kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
          kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
          kubectl create clusterrolebinding sdk-user-pod-creator --clusterrole=pod-creator --user=sdk-user
          kubectl create clusterrole service-reader --verb=get,list,watch --resource=services
          kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
          kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
          kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
          kubectl config use-context sdk-user

      - name: Run e2e tests
        run: |
          # Strict mode first so every command in this script is covered.
          set -euo pipefail

          # TEMP_DIR is not defined in this workflow; presumably an earlier
          # step (the KinD setup action?) exports it -- TODO confirm.
          export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
          # Persist the directory for the log-collection steps that follow.
          echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> "$GITHUB_ENV"

          pip install poetry
          poetry install --with test,docs
          echo "Running e2e_v2 tests (with GPU)..."
          # Output goes to a file only; the "Print Pytest output log" step
          # echoes it into the job log afterwards.
          poetry run pytest -v -s ./tests/e2e_v2/ -m 'kind and gpu' > "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log" 2>&1
        env:
          GRPC_DNS_RESOLVER: "native"

      # The steps below use `always()` so they still run after a test
      # failure, but only when the cluster deployment itself succeeded.

      # The test step ran under the `sdk-user` context; switch to the
      # `kind-cluster` context before gathering operator logs.
      - name: Switch to kind-cluster context to print logs
        if: always() && steps.deploy.outcome == 'success'
        run: kubectl config use-context kind-cluster

      - name: Print Pytest output log
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Pytest output logs"
          cat "${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log"

      - name: Print KubeRay operator logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing KubeRay operator logs"
          kubectl logs --tail -1 -l app.kubernetes.io/name=kuberay | tee "${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log"

      - name: Print Kueue controller logs
        if: always() && steps.deploy.outcome == 'success'
        run: |
          echo "Printing Kueue controller logs"
          kubectl logs -n kueue-system --tail -1 -l control-plane=controller-manager | tee "${CODEFLARE_TEST_OUTPUT_DIR}/kueue.log"

      - name: Export all KinD pod logs
        uses: ./common/github-actions/kind-export-logs
        if: always() && steps.deploy.outcome == 'success'
        with:
          # `with:` inputs are not shell-expanded, so resolve the directory
          # through the env context instead of passing a literal `${VAR}`.
          output-directory: ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}

      - name: Upload logs
        uses: actions/upload-artifact@v4
        if: always() && steps.deploy.outcome == 'success'
        with:
          name: logs
          retention-days: 10
          path: |
            ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log

0 commit comments

Comments
 (0)