2525 },
2626 {
2727 "cell_type" : " code" ,
28+ "execution_count" : null ,
2829 "metadata" : {},
30+ "outputs" : [],
2931 "source" : [
3032 " from codeflare_sdk import RayJob, ManagedClusterConfig, set_api_client, get_cluster\n " ,
3133 " from kube_authkit import AuthConfig, get_k8s_client\n " ,
3234 " import time"
33- ],
34- "execution_count" : null ,
35- "outputs" : []
35+ ]
3636 },
3737 {
3838 "cell_type" : " markdown" ,
4343 },
4444 {
4545 "cell_type" : " code" ,
46+ "execution_count" : null ,
4647 "metadata" : {},
48+ "outputs" : [],
4749 "source" : [
4850 " import urllib3\n " ,
4951 " \n " ,
8385 " JOB_NAME = \" checkpointing-job\"\n " ,
8486 " # Must match metadata.name of a LocalQueue in NAMESPACE (create per OpenShift Kueue docs).\n " ,
8587 " LOCAL_QUEUE = \" default\" "
86- ],
87- "execution_count" : null ,
88- "outputs" : []
88+ ]
8989 },
9090 {
9191 "cell_type" : " markdown" ,
105105 },
106106 {
107107 "cell_type" : " code" ,
108+ "execution_count" : null ,
108109 "metadata" : {},
110+ "outputs" : [],
109111 "source" : [
110112 " # Optional: verify Kueue objects exist (cluster admin / user with read access)\n " ,
111113 " # !oc get resourceflavor.kueue.x-k8s.io\n " ,
112114 " # !oc get clusterqueue.kueue.x-k8s.io\n " ,
113115 " # !oc get localqueue.kueue.x-k8s.io -n $NAMESPACE\n " ,
114116 " \n " ,
115117 " print(f\" Namespace: {NAMESPACE!r}, RayJob name: {JOB_NAME!r}, LocalQueue: {LOCAL_QUEUE!r}\" )"
116- ],
117- "execution_count" : null ,
118- "outputs" : []
118+ ]
119119 },
120120 {
121121 "cell_type" : " markdown" ,
126126 },
127127 {
128128 "cell_type" : " code" ,
129+ "execution_count" : null ,
129130 "metadata" : {},
131+ "outputs" : [],
130132 "source" : [
131133 " # Set your AWS credentials\n " ,
132134 " # WARNING: Do not commit credentials to version control. For production,\n " ,
142144 " # AWS_CREDENTIALS[\" AWS_SESSION_TOKEN\" ] = \" your-session-token\"\n " ,
143145 " \n " ,
144146 " print(f\" Using bucket: {AWS_CREDENTIALS['AWS_S3_BUCKET']}\" )"
145- ],
146- "execution_count" : null ,
147- "outputs" : []
148- },
149- {
150- "cell_type" : " markdown" ,
151- "metadata" : {},
152- "source" : [
153- " ## Grant workbench permissions (one-time, admin)\n " ,
154- " \n " ,
155- " To create RayJob resources from a workbench, the workbench service account typically needs elevated RBAC in the project. Run **once** with a user that has `cluster-admin` or sufficient privileges (replace `your-namespace`):\n " ,
156- " \n " ,
157- " ```bash\n " ,
158- " oc adm policy add-role-to-user admin \\\n " ,
159- " system:serviceaccount:your-namespace:your-namespace-wb \\\n " ,
160- " -n your-namespace\n " ,
161- " ```"
162147 ]
163148 },
164149 {
170155 },
171156 {
172157 "cell_type" : " code" ,
158+ "execution_count" : null ,
173159 "metadata" : {},
160+ "outputs" : [],
174161 "source" : [
175162 " managed = ManagedClusterConfig(\n " ,
176163 " num_workers=2,\n " ,
206193 " )\n " ,
207194 " print(f\" RayCluster name (when assigned): {job.cluster_name}\" )\n " ,
208195 " print(\" Watch logs for: NO CHECKPOINT FOUND - Starting fresh\" )"
209- ],
210- "execution_count" : null ,
211- "outputs" : []
196+ ]
212197 },
213198 {
214199 "cell_type" : " markdown" ,
221206 },
222207 {
223208 "cell_type" : " code" ,
209+ "execution_count" : null ,
224210 "metadata" : {},
211+ "outputs" : [],
225212 "source" : [
226213 " print(job.status())\n " ,
227214 " \n " ,
244231 " print(\n " ,
245232 " \" Wait for at least one full epoch and a checkpoint to S3 before running the suspend cell.\"\n " ,
246233 " )"
247- ],
248- "execution_count" : null ,
249- "outputs" : []
234+ ]
250235 },
251236 {
252237 "cell_type" : " markdown" ,
261246 },
262247 {
263248 "cell_type" : " code" ,
249+ "execution_count" : null ,
264250 "metadata" : {},
251+ "outputs" : [],
265252 "source" : [
266253 " print(\" =\" * 60)\n " ,
267254 " print(\" SUSPENDING RayJob (checkpoint demo — not deleting the RayJob CR)\" )\n " ,
270257 " \n " ,
271258 " job.stop()\n " ,
272259 " print(\" Stop requested; poll job.status() until the RayJob reports suspended / non-running.\" )"
273- ],
274- "execution_count" : null ,
275- "outputs" : []
260+ ]
276261 },
277262 {
278263 "cell_type" : " markdown" ,
285270 },
286271 {
287272 "cell_type" : " code" ,
273+ "execution_count" : null ,
288274 "metadata" : {},
275+ "outputs" : [],
289276 "source" : [
290277 " print(\" =\" * 60)\n " ,
291278 " print(\" RESUMING RayJob after suspend\" )\n " ,
295282 " job.resubmit()\n " ,
296283 " time.sleep(10)\n " ,
297284 " print(job.status())"
298- ],
299- "execution_count" : null ,
300- "outputs" : []
285+ ]
301286 },
302287 {
303288 "cell_type" : " markdown" ,
317302 },
318303 {
319304 "cell_type" : " code" ,
305+ "execution_count" : null ,
320306 "metadata" : {},
307+ "outputs" : [],
321308 "source" : [
322309 " print(job.status())\n " ,
323310 " try:\n " ,
326313 " except Exception as e:\n " ,
327314 " print(f\" Could not resolve cluster yet: {e}\" )\n " ,
328315 " print(\" Check Jobs tab for: RESUMING FROM CHECKPOINT - Starting at epoch N\" )"
329- ],
330- "execution_count" : null ,
331- "outputs" : []
316+ ]
332317 },
333318 {
334319 "cell_type" : " markdown" ,
341326 },
342327 {
343328 "cell_type" : " code" ,
329+ "execution_count" : null ,
344330 "metadata" : {},
331+ "outputs" : [],
345332 "source" : [
346333 " print(\" Cleaning up...\" )\n " ,
347334 " cluster_name = job.cluster_name\n " ,
357344 " pass\n " ,
358345 " \n " ,
359346 " print(\" Cleanup attempted (RayJob delete; cluster.down if RayCluster still exists).\" )"
360- ],
361- "execution_count" : null ,
362- "outputs" : []
347+ ]
363348 },
364349 {
365350 "cell_type" : " code" ,
351+ "execution_count" : null ,
366352 "metadata" : {},
353+ "outputs" : [],
367354 "source" : [
368355 " # No explicit logout needed - authentication is managed automatically by kube-authkit"
369- ],
370- "execution_count" : null ,
371- "outputs" : []
356+ ]
372357 }
373358 ],
374359 "metadata" : {
392377 },
393378 "nbformat" : 4 ,
394379 "nbformat_minor" : 2
395- }
380+ }
0 commit comments