Skip to content

Commit c14d2f2

Browse files
committed
RHOAIENG-45141: replace codeflare-operator with kuberay where applicable
1 parent 17485ee commit c14d2f2

7 files changed

Lines changed: 299 additions & 137 deletions

File tree

.github/workflows/e2e_tests.yaml

Lines changed: 50 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ concurrency:
1818
cancel-in-progress: true
1919

2020
env:
21-
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
21+
KUEUE_VERSION: v0.13.4
22+
KUBERAY_VERSION: v1.4.2
2223

2324
jobs:
2425
kubernetes:
@@ -37,23 +38,6 @@ jobs:
3738
ref: "main"
3839
path: "common"
3940

40-
- name: Checkout CodeFlare operator repository
41-
uses: actions/checkout@v4
42-
with:
43-
repository: project-codeflare/codeflare-operator
44-
path: codeflare-operator
45-
46-
- name: Set Go
47-
uses: actions/setup-go@v5
48-
with:
49-
go-version-file: "./codeflare-operator/go.mod"
50-
cache-dependency-path: "./codeflare-operator/go.sum"
51-
52-
- name: Set up gotestfmt
53-
uses: gotesttools/gotestfmt-action@v2
54-
with:
55-
token: ${{ secrets.GITHUB_TOKEN }}
56-
5741
- name: Set up specific Python version
5842
uses: actions/setup-python@v5
5943
with:
@@ -71,16 +55,55 @@ jobs:
7155
- name: Install NVidia GPU operator for KinD
7256
uses: ./common/github-actions/nvidia-gpu-operator
7357

74-
- name: Deploy CodeFlare stack
58+
- name: Deploy Kueue and KubeRay
7559
id: deploy
7660
run: |
77-
cd codeflare-operator
78-
echo Setting up CodeFlare stack
79-
make setup-e2e KUEUE_VERSION=v0.13.4 KUBERAY_VERSION=v1.4.0
80-
echo Deploying CodeFlare operator
81-
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
82-
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
83-
cd ..
61+
# Install Kueue
62+
echo "Installing Kueue ${KUEUE_VERSION}..."
63+
kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
64+
kubectl wait --timeout=120s --for=condition=Available=true deployment -n kueue-system kueue-controller-manager
65+
66+
# Install KubeRay from opendatahub-io fork (has RHOAI features)
67+
echo "Installing KubeRay ${KUBERAY_VERSION} from opendatahub-io..."
68+
kubectl create -k "github.com/opendatahub-io/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}"
69+
kubectl wait --timeout=120s --for=condition=Available=true deployment kuberay-operator
70+
71+
# Create default Kueue resources for the tests
72+
echo "Creating Kueue resources..."
73+
kubectl apply -f - <<EOF
74+
apiVersion: kueue.x-k8s.io/v1beta1
75+
kind: ResourceFlavor
76+
metadata:
77+
name: default-flavor
78+
---
79+
apiVersion: kueue.x-k8s.io/v1beta1
80+
kind: ClusterQueue
81+
metadata:
82+
name: cluster-queue
83+
spec:
84+
namespaceSelector: {}
85+
resourceGroups:
86+
- coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
87+
flavors:
88+
- name: default-flavor
89+
resources:
90+
- name: cpu
91+
nominalQuota: 100
92+
- name: memory
93+
nominalQuota: 100Gi
94+
- name: nvidia.com/gpu
95+
nominalQuota: 10
96+
---
97+
apiVersion: kueue.x-k8s.io/v1beta1
98+
kind: LocalQueue
99+
metadata:
100+
name: local-queue
101+
namespace: default
102+
annotations:
103+
kueue.x-k8s.io/default-queue: "true"
104+
spec:
105+
clusterQueue: cluster-queue
106+
EOF
84107
85108
- name: Add user to KinD
86109
uses: ./common/github-actions/kind-add-user
@@ -138,17 +161,11 @@ jobs:
138161
echo "Printing Pytest output logs"
139162
cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log
140163
141-
- name: Print CodeFlare operator logs
142-
if: always() && steps.deploy.outcome == 'success'
143-
run: |
144-
echo "Printing CodeFlare operator logs"
145-
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
146-
147164
- name: Print KubeRay operator logs
148165
if: always() && steps.deploy.outcome == 'success'
149166
run: |
150167
echo "Printing KubeRay operator logs"
151-
kubectl logs -n default --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
168+
kubectl logs --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
152169
153170
- name: Print Kueue controller logs
154171
if: always() && steps.deploy.outcome == 'success'

.github/workflows/rayjob_e2e_tests.yaml

Lines changed: 51 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ concurrency:
1818
cancel-in-progress: true
1919

2020
env:
21-
CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
21+
KUEUE_VERSION: v0.13.4
22+
KUBERAY_VERSION: v1.4.2
2223

2324
jobs:
2425
kubernetes-rayjob:
@@ -37,27 +38,10 @@ jobs:
3738
ref: "main"
3839
path: "common"
3940

40-
- name: Checkout CodeFlare operator repository
41-
uses: actions/checkout@v4
42-
with:
43-
repository: project-codeflare/codeflare-operator
44-
path: codeflare-operator
45-
46-
- name: Set Go
47-
uses: actions/setup-go@v5
48-
with:
49-
go-version-file: "./codeflare-operator/go.mod"
50-
cache-dependency-path: "./codeflare-operator/go.sum"
51-
52-
- name: Set up gotestfmt
53-
uses: gotesttools/gotestfmt-action@v2
54-
with:
55-
token: ${{ secrets.GITHUB_TOKEN }}
56-
5741
- name: Set up specific Python version
5842
uses: actions/setup-python@v5
5943
with:
60-
python-version: "3.11"
44+
python-version: "3.12"
6145
cache: "pip" # caching pip dependencies
6246

6347
- name: Setup NVidia GPU environment for KinD
@@ -71,16 +55,55 @@ jobs:
7155
- name: Install NVidia GPU operator for KinD
7256
uses: ./common/github-actions/nvidia-gpu-operator
7357

74-
- name: Deploy CodeFlare stack
58+
- name: Deploy Kueue and KubeRay
7559
id: deploy
7660
run: |
77-
cd codeflare-operator
78-
echo Setting up CodeFlare stack
79-
make setup-e2e KUEUE_VERSION=v0.13.4 KUBERAY_VERSION=v1.4.0
80-
echo Deploying CodeFlare operator
81-
make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
82-
kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
83-
cd ..
61+
# Install Kueue
62+
echo "Installing Kueue ${KUEUE_VERSION}..."
63+
kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
64+
kubectl wait --timeout=120s --for=condition=Available=true deployment -n kueue-system kueue-controller-manager
65+
66+
# Install KubeRay from opendatahub-io fork (has RHOAI features)
67+
echo "Installing KubeRay ${KUBERAY_VERSION} from opendatahub-io..."
68+
kubectl create -k "github.com/opendatahub-io/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}"
69+
kubectl wait --timeout=120s --for=condition=Available=true deployment kuberay-operator
70+
71+
# Create default Kueue resources for the tests
72+
echo "Creating Kueue resources..."
73+
kubectl apply -f - <<EOF
74+
apiVersion: kueue.x-k8s.io/v1beta1
75+
kind: ResourceFlavor
76+
metadata:
77+
name: default-flavor
78+
---
79+
apiVersion: kueue.x-k8s.io/v1beta1
80+
kind: ClusterQueue
81+
metadata:
82+
name: cluster-queue
83+
spec:
84+
namespaceSelector: {}
85+
resourceGroups:
86+
- coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
87+
flavors:
88+
- name: default-flavor
89+
resources:
90+
- name: cpu
91+
nominalQuota: 100
92+
- name: memory
93+
nominalQuota: 100Gi
94+
- name: nvidia.com/gpu
95+
nominalQuota: 10
96+
---
97+
apiVersion: kueue.x-k8s.io/v1beta1
98+
kind: LocalQueue
99+
metadata:
100+
name: local-queue
101+
namespace: default
102+
annotations:
103+
kueue.x-k8s.io/default-queue: "true"
104+
spec:
105+
clusterQueue: cluster-queue
106+
EOF
84107
85108
- name: Add user to KinD
86109
uses: ./common/github-actions/kind-add-user
@@ -142,17 +165,11 @@ jobs:
142165
echo "Printing Pytest output logs"
143166
cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output_rayjob.log
144167
145-
- name: Print CodeFlare operator logs
146-
if: always() && steps.deploy.outcome == 'success'
147-
run: |
148-
echo "Printing CodeFlare operator logs"
149-
kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log
150-
151168
- name: Print KubeRay operator logs
152169
if: always() && steps.deploy.outcome == 'success'
153170
run: |
154171
echo "Printing KubeRay operator logs"
155-
kubectl logs -n default --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
172+
kubectl logs --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log
156173
157174
- name: Print Kueue controller logs
158175
if: always() && steps.deploy.outcome == 'success'

docs/designs/History/CodeFlareSDK_Design_Doc.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,6 @@ We will rely on the Kubernetes cluster’s default security, where users cannot
9797
* System tests of SDK as part of the entire CodeFlare stack for main scenarios
9898
* Unit testing, integration testing, and system testing approaches
9999
* Unit testing will occur with every PR.
100-
* For system testing we can leverage [current e2e](https://github.com/project-codeflare/codeflare-operator/tree/main/test/e2e) tests from the operator repo.
101100
* Validation criteria and expected outcomes
102101
* Minimum of 95% code coverage at all times.
103102
* Expect all unit tests to pass before a PR is merged.

0 commit comments

Comments
 (0)