Skip to content

Commit bbb45f9

Browse files
committed
test: notebooks
1 parent 70dd86e commit bbb45f9

1 file changed

Lines changed: 38 additions & 267 deletions

File tree

.github/workflows/additional_demo_notebook_tests.yaml

Lines changed: 38 additions & 267 deletions
Original file line number | Diff line number | Diff line change
@@ -16,277 +16,48 @@ env:
1616
jobs:
1717
verify-local_interactive:
1818
if: ${{ github.event.label.name == 'test-additional-notebooks' }}
19-
runs-on: ubuntu-latest-4core
19+
runs-on: ubuntu-latest
2020

2121
steps:
22-
- name: Checkout code
23-
uses: actions/checkout@v4
24-
with:
25-
submodules: recursive
26-
27-
- name: Checkout common repo code
28-
uses: actions/checkout@v4
29-
with:
30-
repository: 'project-codeflare/codeflare-common'
31-
ref: 'main'
32-
path: 'common'
33-
34-
- name: Set up specific Python version
35-
uses: actions/setup-python@v5
36-
with:
37-
python-version: '3.11'
38-
cache: 'pip' # caching pip dependencies
39-
40-
- name: Setup and start KinD cluster
41-
uses: ./common/github-actions/kind
42-
43-
- name: Deploy Kueue and KubeRay
44-
id: deploy
45-
run: |
46-
# Install Kueue
47-
echo "Installing Kueue ${KUEUE_VERSION}..."
48-
kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
49-
kubectl wait --timeout=120s --for=condition=Available=true deployment -n kueue-system kueue-controller-manager
50-
51-
# Install KubeRay from opendatahub-io fork (has RHOAI features)
52-
echo "Installing KubeRay ${KUBERAY_VERSION} from opendatahub-io..."
53-
kubectl create -k "github.com/opendatahub-io/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}"
54-
kubectl wait --timeout=120s --for=condition=Available=true deployment kuberay-operator
55-
56-
# Create default Kueue resources for the tests
57-
echo "Creating Kueue resources..."
58-
kubectl apply -f - <<EOF
59-
apiVersion: kueue.x-k8s.io/v1beta1
60-
kind: ResourceFlavor
61-
metadata:
62-
name: default-flavor
63-
---
64-
apiVersion: kueue.x-k8s.io/v1beta1
65-
kind: ClusterQueue
66-
metadata:
67-
name: cluster-queue
68-
spec:
69-
namespaceSelector: {}
70-
resourceGroups:
71-
- coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
72-
flavors:
73-
- name: default-flavor
74-
resources:
75-
- name: cpu
76-
nominalQuota: 100
77-
- name: memory
78-
nominalQuota: 100Gi
79-
- name: nvidia.com/gpu
80-
nominalQuota: 10
81-
---
82-
apiVersion: kueue.x-k8s.io/v1beta1
83-
kind: LocalQueue
84-
metadata:
85-
name: local-queue
86-
namespace: default
87-
annotations:
88-
kueue.x-k8s.io/default-queue: "true"
89-
spec:
90-
clusterQueue: cluster-queue
91-
EOF
92-
93-
- name: Setup Additional demo notebooks execution
94-
run: |
95-
echo "Installing papermill and dependencies..."
96-
pip install poetry papermill ipython ipykernel
97-
# Disable virtualenv due to problems using packaged in virtualenv in papermill
98-
poetry config virtualenvs.create false
99-
100-
echo "Installing SDK..."
101-
poetry install --with test,docs
102-
103-
- name: Run local_interactive.ipynb
104-
run: |
105-
set -euo pipefail
106-
107-
# Remove login/logout cells, as KinD doesn't support authentication using token
108-
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object and log in to desired user account")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
109-
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
110-
# Rewrite cluster_uri() to local_client_url() to retrieve client URL available out of cluster, as the test is executed outside of cluster
111-
sed -i "s/cluster_uri()/local_client_url()/g" local_interactive.ipynb
112-
# Replace async logs with waiting for job to finish, async logs don't work properly in papermill
113-
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
114-
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' local_interactive.ipynb > local_interactive.ipynb.tmp && mv local_interactive.ipynb.tmp local_interactive.ipynb
115-
# Set explicit namespace as SDK need it (currently) to resolve local queues
116-
sed -i "s/worker_cpu_requests=1,/worker_cpu_requests='250m', namespace='default',/" local_interactive.ipynb
117-
# Disable dashboard check as KinD doesn't have HTTPRoute/Route configured
118-
sed -i "s/cluster.wait_ready()/cluster.wait_ready(dashboard_check=False)/" local_interactive.ipynb
119-
# Run notebook
120-
poetry run papermill local_interactive.ipynb local_interactive_out.ipynb --log-output --execution-timeout 1200
121-
env:
122-
GRPC_DNS_RESOLVER: "native"
123-
working-directory: demo-notebooks/additional-demos
124-
125-
- name: Print Kueue operator logs
126-
if: always() && steps.deploy.outcome == 'success'
127-
run: |
128-
echo "Printing Kueue operator logs"
129-
KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
130-
kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log
131-
132-
- name: Print KubeRay operator logs
133-
if: always() && steps.deploy.outcome == 'success'
134-
run: |
135-
echo "Printing KubeRay operator logs"
136-
kubectl logs --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log
137-
138-
- name: Export all KinD pod logs
139-
uses: ./common/github-actions/kind-export-logs
140-
if: always() && steps.deploy.outcome == 'success'
141-
with:
142-
output-directory: ${TEMP_DIR}
143-
144-
- name: Upload logs
145-
uses: actions/upload-artifact@v4
146-
if: always() && steps.deploy.outcome == 'success'
147-
with:
148-
name: logs-local_interactive
149-
retention-days: 10
150-
path: |
151-
${{ env.TEMP_DIR }}/**/*.log
22+
- name: Skip notification
23+
run: |
24+
echo "::notice::SKIPPED: verify-local_interactive test is currently disabled."
25+
echo ""
26+
echo "=============================================================================="
27+
echo " TEST SKIPPED: local_interactive.ipynb"
28+
echo "=============================================================================="
29+
echo ""
30+
echo " Reason: This notebook requires mTLS (mutual TLS) certificates for"
31+
echo " interactive Ray connections via ray.init(address=cluster.local_client_url())."
32+
echo ""
33+
echo " The mTLS CA secret was previously created by codeflare-operator, which has"
34+
echo " been removed from the RHOAI 3.x stack. While opendatahub-io/kuberay includes"
35+
echo " some mTLS features, they require OpenShift-specific components that are not"
36+
echo " available in KinD."
37+
echo ""
38+
echo " This test should be run on a full OpenShift cluster with RHOAI installed."
39+
echo "=============================================================================="
15240
15341
verify-ray_job_client:
15442
if: ${{ github.event.label.name == 'test-additional-notebooks' }}
155-
runs-on: ubuntu-latest-4core
43+
runs-on: ubuntu-latest
15644

15745
steps:
158-
- name: Checkout code
159-
uses: actions/checkout@v4
160-
with:
161-
submodules: recursive
162-
163-
- name: Checkout common repo code
164-
uses: actions/checkout@v4
165-
with:
166-
repository: 'project-codeflare/codeflare-common'
167-
ref: 'main'
168-
path: 'common'
169-
170-
- name: Set up specific Python version
171-
uses: actions/setup-python@v5
172-
with:
173-
python-version: '3.11'
174-
cache: 'pip' # caching pip dependencies
175-
176-
- name: Setup and start KinD cluster
177-
uses: ./common/github-actions/kind
178-
179-
- name: Deploy Kueue and KubeRay
180-
id: deploy
181-
run: |
182-
# Install Kueue
183-
echo "Installing Kueue ${KUEUE_VERSION}..."
184-
kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml
185-
kubectl wait --timeout=120s --for=condition=Available=true deployment -n kueue-system kueue-controller-manager
186-
187-
# Install KubeRay from opendatahub-io fork (has RHOAI features)
188-
echo "Installing KubeRay ${KUBERAY_VERSION} from opendatahub-io..."
189-
kubectl create -k "github.com/opendatahub-io/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}"
190-
kubectl wait --timeout=120s --for=condition=Available=true deployment kuberay-operator
191-
192-
# Create default Kueue resources for the tests
193-
echo "Creating Kueue resources..."
194-
kubectl apply -f - <<EOF
195-
apiVersion: kueue.x-k8s.io/v1beta1
196-
kind: ResourceFlavor
197-
metadata:
198-
name: default-flavor
199-
---
200-
apiVersion: kueue.x-k8s.io/v1beta1
201-
kind: ClusterQueue
202-
metadata:
203-
name: cluster-queue
204-
spec:
205-
namespaceSelector: {}
206-
resourceGroups:
207-
- coveredResources: ["cpu", "memory", "nvidia.com/gpu"]
208-
flavors:
209-
- name: default-flavor
210-
resources:
211-
- name: cpu
212-
nominalQuota: 100
213-
- name: memory
214-
nominalQuota: 100Gi
215-
- name: nvidia.com/gpu
216-
nominalQuota: 10
217-
---
218-
apiVersion: kueue.x-k8s.io/v1beta1
219-
kind: LocalQueue
220-
metadata:
221-
name: local-queue
222-
namespace: default
223-
annotations:
224-
kueue.x-k8s.io/default-queue: "true"
225-
spec:
226-
clusterQueue: cluster-queue
227-
EOF
228-
229-
- name: Setup Additional demo notebooks execution
230-
run: |
231-
echo "Installing papermill and dependencies..."
232-
pip install poetry papermill ipython ipykernel
233-
# Disable virtualenv due to problems using packaged in virtualenv in papermill
234-
poetry config virtualenvs.create false
235-
236-
echo "Installing SDK..."
237-
poetry install --with test,docs
238-
239-
- name: Run ray_job_client.ipynb
240-
run: |
241-
set -euo pipefail
242-
243-
# Remove login/logout cells, as KinD doesn't support authentication using token
244-
jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
245-
jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
246-
# Rewrite cluster_uri() to local_client_url() to retrieve client URL available out of cluster, as the test is executed outside of cluster
247-
sed -i "s/cluster_uri()/local_client_url()/g" ray_job_client.ipynb
248-
# Replace async logs with waiting for job to finish, async logs don't work properly in papermill
249-
JOB_WAIT=$(jq -r '.' ${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json)
250-
jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' ray_job_client.ipynb > ray_job_client.ipynb.tmp && mv ray_job_client.ipynb.tmp ray_job_client.ipynb
251-
# Set explicit namespace as SDK need it (currently) to resolve local queues
252-
sed -i "s/worker_cpu_requests=1,/worker_cpu_requests='250m', namespace='default',/" ray_job_client.ipynb
253-
sed -i "s/worker_memory_requests=4,/worker_memory_requests=1,/" ray_job_client.ipynb
254-
sed -i "s/worker_memory_limits=4,/worker_memory_limits=1,/" ray_job_client.ipynb
255-
sed -i "s/'Authorization': .*/'Authorization': None\",/" ray_job_client.ipynb
256-
sed -i "s/num_workers=2/num_workers=1/" ray_job_client.ipynb
257-
sed -i "s/RayJobClient(address=ray_dashboard, headers=header, verify=True)/RayJobClient(address=ray_dashboard, verify=False)/" ray_job_client.ipynb
258-
# Disable dashboard check as KinD doesn't have HTTPRoute/Route configured
259-
sed -i "s/cluster.wait_ready()/cluster.wait_ready(dashboard_check=False)/" ray_job_client.ipynb
260-
# Run notebook
261-
poetry run papermill ray_job_client.ipynb hf_interactive_out.ipynb --log-output --execution-timeout 1200
262-
env:
263-
GRPC_DNS_RESOLVER: "native"
264-
working-directory: demo-notebooks/additional-demos
265-
266-
- name: Print Kueue operator logs
267-
if: always() && steps.deploy.outcome == 'success'
268-
run: |
269-
echo "Printing Kueue operator logs"
270-
KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}')
271-
kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log
272-
273-
- name: Print KubeRay operator logs
274-
if: always() && steps.deploy.outcome == 'success'
275-
run: |
276-
echo "Printing KubeRay operator logs"
277-
kubectl logs --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log
278-
279-
- name: Export all KinD pod logs
280-
uses: ./common/github-actions/kind-export-logs
281-
if: always() && steps.deploy.outcome == 'success'
282-
with:
283-
output-directory: ${TEMP_DIR}
284-
285-
- name: Upload logs
286-
uses: actions/upload-artifact@v4
287-
if: always() && steps.deploy.outcome == 'success'
288-
with:
289-
name: logs-ray_job_client
290-
retention-days: 10
291-
path: |
292-
${{ env.TEMP_DIR }}/**/*.log
46+
- name: Skip notification
47+
run: |
48+
echo "::notice::SKIPPED: verify-ray_job_client test is currently disabled."
49+
echo ""
50+
echo "=============================================================================="
51+
echo " TEST SKIPPED: ray_job_client.ipynb"
52+
echo "=============================================================================="
53+
echo ""
54+
echo " Reason: This notebook requires mTLS (mutual TLS) certificates for"
55+
echo " interactive Ray connections via the Ray Job Client."
56+
echo ""
57+
echo " The mTLS CA secret was previously created by codeflare-operator, which has"
58+
echo " been removed from the RHOAI 3.x stack. While opendatahub-io/kuberay includes"
59+
echo " some mTLS features, they require OpenShift-specific components that are not"
60+
echo " available in KinD."
61+
echo ""
62+
echo " This test should be run on a full OpenShift cluster with RHOAI installed."
63+
echo "=============================================================================="

0 commit comments

Comments (0)