diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml index c925b6a52..0dcdbb992 100644 --- a/.github/workflows/e2e_tests.yaml +++ b/.github/workflows/e2e_tests.yaml @@ -18,7 +18,8 @@ concurrency: cancel-in-progress: true env: - CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev" + KUEUE_VERSION: v0.13.4 + KUBERAY_VERSION: v1.4.2 jobs: kubernetes: @@ -37,23 +38,6 @@ jobs: ref: "main" path: "common" - - name: Checkout CodeFlare operator repository - uses: actions/checkout@v4 - with: - repository: project-codeflare/codeflare-operator - path: codeflare-operator - - - name: Set Go - uses: actions/setup-go@v5 - with: - go-version-file: "./codeflare-operator/go.mod" - cache-dependency-path: "./codeflare-operator/go.sum" - - - name: Set up gotestfmt - uses: gotesttools/gotestfmt-action@v2 - with: - token: ${{ secrets.GITHUB_TOKEN }} - - name: Set up specific Python version uses: actions/setup-python@v5 with: @@ -71,16 +55,55 @@ jobs: - name: Install NVidia GPU operator for KinD uses: ./common/github-actions/nvidia-gpu-operator - - name: Deploy CodeFlare stack + - name: Deploy Kueue and KubeRay id: deploy run: | - cd codeflare-operator - echo Setting up CodeFlare stack - make setup-e2e KUEUE_VERSION=v0.13.4 KUBERAY_VERSION=v1.4.0 - echo Deploying CodeFlare operator - make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e" - kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager - cd .. + # Install Kueue + echo "Installing Kueue ${KUEUE_VERSION}..." + kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml + kubectl wait --timeout=120s --for=condition=Available=true deployment -n kueue-system kueue-controller-manager + + # Install KubeRay from opendatahub-io fork (has RHOAI features) + echo "Installing KubeRay ${KUBERAY_VERSION} from opendatahub-io..." + kubectl create -k "github.com/opendatahub-io/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}" + kubectl wait --timeout=120s --for=condition=Available=true deployment kuberay-operator + + # Create default Kueue resources for the tests + echo "Creating Kueue resources..." + kubectl apply -f - <`__. - Setup Phase: - - Pull the `codeflare-operator - repo `__ - and run the following make targets: + - Create a KinD cluster: :: - make kind-e2e - export CLUSTER_HOSTNAME=kind - make setup-e2e - make deploy -e IMG=quay.io/project-codeflare/codeflare-operator:v1.3.0 + kind create cluster - For running tests locally on Kind cluster, we need to disable `rayDashboardOAuthEnabled` in `codeflare-operator-config` ConfigMap and then restart CodeFlare Operator + - Install Kueue: + + :: + + KUEUE_VERSION=v0.13.4 + kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml + kubectl wait --timeout=120s --for=condition=Available=true deployment -n kueue-system kueue-controller-manager + + - Install KubeRay from the opendatahub-io fork (includes RHOAI features): + + :: + + KUBERAY_VERSION=v1.4.2 + kubectl create -k "github.com/opendatahub-io/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}" + kubectl wait --timeout=120s --for=condition=Available=true deployment kuberay-operator + + - Create Kueue resources (ResourceFlavor, ClusterQueue, LocalQueue): + + :: + + kubectl apply -f - <`__. kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user - kubectl create clusterrole list-rayclusters --verb=get,list --resource=rayclusters - kubectl create clusterrolebinding sdk-user-list-rayclusters --clusterrole=list-rayclusters --user=sdk-user + kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters + kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user + kubectl create clusterrole rayjob-creator --verb=get,list,create,delete,patch --resource=rayjobs + kubectl create clusterrolebinding sdk-user-rayjob-creator --clusterrole=rayjob-creator --user=sdk-user + kubectl create clusterrole list-secrets --verb=get,list --resource=secrets + kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user kubectl config use-context sdk-user - - Install the latest development version of kueue - - :: - - kubectl apply --server-side -k "github.com/opendatahub-io/kueue/config/rhoai?ref=dev" - - Test Phase: - - Once we have the codeflare-operator, kuberay-operator and kueue - running and ready, we can run the e2e test on the codeflare-sdk - repository: + - Once we have kuberay-operator and kueue running and ready, we can + run the e2e test on the codeflare-sdk repository: :: @@ -102,21 +147,60 @@ On OpenShift clusters - Setup Phase: - - Pull the `codeflare-operator - repo `__ - and run the following make targets: + - Install Kueue: :: + KUEUE_VERSION=v0.13.4 + kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml + kubectl wait --timeout=120s --for=condition=Available=true deployment -n kueue-system kueue-controller-manager + + - Install KubeRay from the opendatahub-io fork (includes RHOAI features): + + :: - make setup-e2e - make deploy -e IMG=quay.io/project-codeflare/codeflare-operator:v1.3.0 + KUBERAY_VERSION=v1.4.2 + kubectl create -k "github.com/opendatahub-io/kuberay/ray-operator/config/default?ref=${KUBERAY_VERSION}" + kubectl wait --timeout=120s --for=condition=Available=true deployment kuberay-operator - - Install the latest development version of kueue + - Create Kueue resources (ResourceFlavor, ClusterQueue, LocalQueue): :: - kubectl apply --server-side -k "github.com/opendatahub-io/kueue/config/rhoai?ref=dev" + kubectl apply -f - <= timeout: - raise TimeoutError(f"job has timed out after waiting {timeout}s") - sleep(5) - time += 5 - - logs = client.get_job_logs(submission_id) - print(logs) - - self.assert_job_completion(status) - - client.delete_job(submission_id) + try: + submission_id = client.submit_job( + entrypoint="python mnist.py", + runtime_env={ + "working_dir": "./tests/e2e/", + "pip": "./tests/e2e/mnist_pip_requirements.txt", + "env_vars": get_setup_env_variables(ACCELERATOR=accelerator), + }, + entrypoint_num_gpus=number_of_gpus, + ) + print(f"Submitted job with ID: {submission_id}") + done = False + time = 0 + timeout = 900 + while not done: + status = client.get_job_status(submission_id) + if status.is_terminal(): + break + if not done: + print(status) + if timeout and time >= timeout: + raise TimeoutError( + f"job has timed out after waiting {timeout}s" + ) + sleep(5) + time += 5 + + logs = client.get_job_logs(submission_id) + print(logs) + + self.assert_job_completion(status) + + client.delete_job(submission_id) + finally: + self.cleanup_port_forward() def assert_job_completion(self, status): if status == "SUCCEEDED": diff --git a/tests/e2e/rayjob/rayjob_existing_cluster_test.py b/tests/e2e/rayjob/rayjob_existing_cluster_test.py index 00faa2d9e..8acdc1c5f 100644 --- a/tests/e2e/rayjob/rayjob_existing_cluster_test.py +++ b/tests/e2e/rayjob/rayjob_existing_cluster_test.py @@ -67,8 +67,9 @@ def test_existing_kueue_cluster(self): cluster.apply() # Wait for cluster to be ready (with Kueue admission) + # On KinD, disable dashboard check as HTTPRoute/Route is not available print(f"Waiting for cluster '{cluster_name}' to be ready...") - cluster.wait_ready(timeout=600) + cluster.wait_ready(timeout=600, dashboard_check=is_openshift()) print(f"✓ Cluster '{cluster_name}' is ready") # RayJob with explicit local_queue (will be ignored for existing clusters)