From 1702d86ea554fb714f8468831b26864afd6a956e Mon Sep 17 00:00:00 2001 From: Pat O'Connor Date: Wed, 11 Feb 2026 09:14:17 +0000 Subject: [PATCH] task(RHOAIENG-30976): Added RayJobs section to SDK docs Signed-off-by: Pat O'Connor --- docs/sphinx/conf.py | 4 +- docs/sphinx/index.rst | 1 + .../user-docs/ray-cluster-interaction.rst | 3 +- docs/sphinx/user-docs/rayjob.rst | 105 ++++++++++++++++++ 4 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 docs/sphinx/user-docs/rayjob.rst diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py index 75f6f16fd..070f9eafe 100644 --- a/docs/sphinx/conf.py +++ b/docs/sphinx/conf.py @@ -12,9 +12,9 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = "CodeFlare SDK" -copyright = "2024, Project CodeFlare" +copyright = "2026, Project CodeFlare" author = "Project CodeFlare" -release = "v0.21.1" +release = "v0.34.0" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/sphinx/index.rst b/docs/sphinx/index.rst index 3c6fe876f..8e1b1cdb6 100644 --- a/docs/sphinx/index.rst +++ b/docs/sphinx/index.rst @@ -22,6 +22,7 @@ The CodeFlare SDK is an intuitive, easy-to-use python interface for batch resour user-docs/authentication user-docs/cluster-configuration user-docs/ray-cluster-interaction + user-docs/rayjob user-docs/e2e user-docs/s3-compatible-storage user-docs/setup-kueue diff --git a/docs/sphinx/user-docs/ray-cluster-interaction.rst b/docs/sphinx/user-docs/ray-cluster-interaction.rst index 7eaa5f980..ea6a1fae6 100644 --- a/docs/sphinx/user-docs/ray-cluster-interaction.rst +++ b/docs/sphinx/user-docs/ray-cluster-interaction.rst @@ -2,7 +2,8 @@ Ray Cluster Interaction ======================= The CodeFlare SDK offers multiple ways to interact with Ray Clusters -including the below methods. +including the below methods. For submitting batch jobs that create or +use a Ray cluster, see :doc:`./rayjob`. get_cluster() ------------- diff --git a/docs/sphinx/user-docs/rayjob.rst b/docs/sphinx/user-docs/rayjob.rst new file mode 100644 index 000000000..3359e3434 --- /dev/null +++ b/docs/sphinx/user-docs/rayjob.rst @@ -0,0 +1,105 @@ +Submitting RayJobs +================== + +The CodeFlare SDK provides a ``RayJob`` interface for submitting and +managing Ray jobs via the KubeRay operator (RayJob custom resource). +You can either create a short-lived Ray cluster for the job (managed by +the operator and cleaned up after the job finishes) or run the job on an +existing Ray cluster. + +Import the following to use RayJob: + +:: + + from codeflare_sdk import RayJob, ManagedClusterConfig + +Submitting a job with a new cluster (ManagedClusterConfig) +--------------------------------------------------------- + +When you provide ``cluster_config``, the KubeRay operator creates a +Ray cluster for the job and tears it down after the job completes. You +do not need to manage the cluster lifecycle yourself. + +| Required: ``job_name`` (str), ``entrypoint`` (str), ``cluster_config`` (ManagedClusterConfig). +| Optional: ``namespace``, ``runtime_env``, ``ttl_seconds_after_finished``, ``active_deadline_seconds``, ``local_queue``, ``priority_class``. + +.. code:: python + + from codeflare_sdk import RayJob, ManagedClusterConfig + + cluster_config = ManagedClusterConfig( + head_memory_requests=6, + head_memory_limits=8, + num_workers=2, + worker_cpu_requests=1, + worker_cpu_limits=1, + worker_memory_requests=4, + worker_memory_limits=6, + head_accelerators={"nvidia.com/gpu": 0}, + worker_accelerators={"nvidia.com/gpu": 0}, + ) + + job = RayJob( + job_name="my-rayjob", + entrypoint="python -c 'print(\"Hello from RayJob!\")'", + cluster_config=cluster_config, + namespace="default", + ) + job.submit() + +Submitting a job to an existing cluster +-------------------------------------- + +When you provide ``cluster_name``, the job runs on an existing Ray +cluster. The cluster is not shut down when the job finishes. + +| Required: ``job_name`` (str), ``entrypoint`` (str), ``cluster_name`` (str). +| Optional: ``namespace``, ``runtime_env``, ``active_deadline_seconds``, ``local_queue``, ``priority_class``. +| Note: ``ttl_seconds_after_finished`` cannot be set when using an existing cluster. + +.. code:: python + + from codeflare_sdk import RayJob + + job = RayJob( + job_name="my-rayjob", + entrypoint="python my_script.py", + cluster_name="my-existing-cluster", + namespace="default", + ) + job.submit() + +RayJob methods +-------------- + +| ``job.submit()`` — Submits the RayJob to the KubeRay operator. Returns the job name on success. When using ``cluster_config``, the operator creates the cluster and runs the job; when using ``cluster_name``, the job is submitted to the specified cluster. +| ``job.status(print_to_console=True)`` — Returns the job status (e.g. RUNNING, COMPLETE, FAILED) and a ready flag; optionally prints a formatted status to the console. +| ``job.stop()`` — Suspends the Ray job. +| ``job.resubmit()`` — Resubmits the Ray job. +| ``job.delete()`` — Deletes the RayJob custom resource (and the cluster if it was created by this RayJob). + +Runtime environment +------------------- + +You can pass ``runtime_env`` when creating a ``RayJob`` to set the Ray +runtime environment (e.g. working directory, pip packages, environment +variables). It can be a Ray ``RuntimeEnv`` object from ``ray.runtime_env`` +or a dict with keys such as ``working_dir``, ``pip``, ``env_vars``. For +example: ``runtime_env={"working_dir": "./my-scripts", "pip": ["requests"]}``. +See the Ray documentation for runtime environment options. + +Kueue integration +----------------- + +When Kueue is installed, you can set ``local_queue`` to the name of a +Kueue LocalQueue and ``priority_class`` to a WorkloadPriorityClass name +for preemption control. These apply to both new clusters (``cluster_config``) +and existing clusters (``cluster_name``). For Kueue setup, see :doc:`./setup-kueue`. + +.. note:: + + ``RayJob`` is used for the **RayJob custom resource** (batch job + lifecycle managed by the KubeRay operator). For submitting jobs + interactively to an already-running cluster via the Ray dashboard API, + the SDK exposes ``RayJobClient``; see the Code Documentation (modules) + for the API reference.