Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 11 additions & 15 deletions tests/e2e/mnist_raycluster_sdk_oauth_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,13 +142,10 @@ def assert_jobsubmit_withoutLogin(self, cluster):
# API endpoint is directly under the hostname
api_url = dashboard_url + "/api/jobs/"

job_spec = get_mnist_job_submission_spec()
jobdata = {
"entrypoint": "python mnist.py",
"runtime_env": {
"working_dir": "./tests/e2e/",
"pip": "./tests/e2e/mnist_pip_requirements.txt",
"env_vars": get_setup_env_variables(),
},
"entrypoint": job_spec["entrypoint"],
"runtime_env": job_spec["runtime_env"],
}

# Try to submit a job without authentication
Expand Down Expand Up @@ -286,13 +283,11 @@ def assert_jobsubmit_withlogin(self, cluster):
"Verified: No jobs exist from the previous unauthenticated submission attempt."
)

job_spec = get_mnist_job_submission_spec()
print(f"Submitting job: {job_spec['entrypoint']}")
submission_id = client.submit_job(
entrypoint="python mnist.py",
runtime_env={
"working_dir": "./tests/e2e/",
"pip": "./tests/e2e/mnist_pip_requirements.txt",
"env_vars": get_setup_env_variables(),
},
entrypoint=job_spec["entrypoint"],
runtime_env=job_spec["runtime_env"],
entrypoint_num_cpus=1,
)
print(f"Submitted job with ID: {submission_id}")
Expand All @@ -318,9 +313,10 @@ def assert_jobsubmit_withlogin(self, cluster):
client.delete_job(submission_id)

def assert_job_completion(self, status):
    """Assert that a finished job reported success.

    Args:
        status: Final job status. Either a plain string or an enum-like
            object exposing its string through a ``value`` attribute.

    Raises:
        AssertionError: If the normalised status is not ``"SUCCEEDED"``.
    """
    # Enum-like statuses carry their string in ``.value``; plain strings
    # have no such attribute and are used as-is.
    status_value = getattr(status, "value", status)
    print(f"Job has completed: '{status_value}'")
    # A single assertion with a message replaces the bare
    # ``assert True`` / ``assert False`` pair so failures explain themselves.
    assert (
        status_value == "SUCCEEDED"
    ), f"Job finished with status '{status_value}', expected 'SUCCEEDED'"
140 changes: 129 additions & 11 deletions tests/e2e/support.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,130 @@ def get_setup_env_variables(**kwargs):
return env_vars


def _env_flag_enabled(name):
    """Return True when environment variable ``name`` holds a truthy flag.

    Matching ignores case and surrounding whitespace. An unset or empty
    variable, or any unrecognised value, yields False.

    Args:
        name: Name of the environment variable to inspect.
    """
    # "on" is accepted as the counterpart of the "off" value that
    # _env_flag_disabled recognises, keeping the two helpers symmetric.
    return os.environ.get(name, "").strip().lower() in ("1", "true", "yes", "on")


def _env_flag_disabled(name):
    """Report whether environment variable ``name`` is explicitly switched off.

    The value is trimmed and lower-cased before being compared against the
    recognised "off" spellings. An unset variable is treated as an empty
    string and therefore is not considered disabled.
    """
    raw_value = os.environ.get(name, "")
    normalized = raw_value.strip().lower()
    return normalized in ("0", "false", "no", "off")


def _disconnected_cluster_signals():
    """Return True when the cluster is likely disconnected / air-gapped.

    Signals, checked in order:
      1. ``DISCONNECTED_CLUSTER`` or ``IS_DISCONNECTED_CLUSTER`` holds a
         truthy flag value.
      2. The server string returned by
         ``run_oc_command(["whoami", "--show-server=true"])`` contains
         ``-dis-`` or ``disconnected`` (case-insensitive).

    The server probe is best-effort: a failure is reported on stdout and
    treated as "no signal" instead of being raised.
    """
    if _env_flag_enabled("DISCONNECTED_CLUSTER") or _env_flag_enabled(
        "IS_DISCONNECTED_CLUSTER"
    ):
        return True
    try:
        # A None/empty result is normalised to "" so the substring checks
        # below simply do not match.
        server = (run_oc_command(["whoami", "--show-server=true"]) or "").lower()
    except Exception as exc:
        # Keep the probe non-fatal, but leave a trace so a broken probe is
        # not mistaken for a connected cluster without any hint in the logs.
        print(
            "Could not query API server URL for disconnected-cluster "
            f"detection: {exc}"
        )
        return False
    return "-dis-" in server or "disconnected" in server


def _mnist_prerequisites_met():
    """
    Return True when full MNIST can run (pip packages + dataset reachable).

    Connected labs may use public PyPI and MNIST mirrors with no extra env.
    Disconnected labs need an internal PIP_INDEX_URL and AWS_DEFAULT_ENDPOINT
    (MinIO); both must be set, and PIP_INDEX_URL must not point at pypi.org.
    """
    if not _disconnected_cluster_signals():
        return True

    pip_url = (os.environ.get("PIP_INDEX_URL") or "").strip()
    aws_endpoint = (os.environ.get("AWS_DEFAULT_ENDPOINT") or "").strip()
    # URL host names are case-insensitive, so compare in lower case;
    # otherwise "https://PyPI.org/simple" would pass as an internal mirror.
    pip_ok = bool(pip_url) and "pypi.org" not in pip_url.lower()
    aws_ok = bool(aws_endpoint)
    if not (pip_ok and aws_ok):
        return False

    print(
        "Disconnected cluster with PIP mirror and S3 endpoint configured; "
        "using full MNIST job"
    )
    return True


def use_smoke_job():
    """
    Use a lightweight Ray job when full MNIST is not viable.

    Detection order (first match wins):
      1. USE_SMOKE_JOB / UPGRADE_USE_SMOKE_JOB set to a truthy or falsy flag
         (explicit override; USE_SMOKE_JOB is consulted first).
      2. Full MNIST prerequisites met (connected, or disconnected with
         mirrors) -> full MNIST.
      3. Otherwise the smoke job is used. The log line names the signal that
         marked the cluster as disconnected: the DISCONNECTED_CLUSTER /
         IS_DISCONNECTED_CLUSTER env (Jenkins), else the API server URL
         heuristic (-dis- / disconnected).

    ImageDigestMirrorSet / ICSP are intentionally not used: many connected
    OpenShift clusters mirror container registries without blocking pip/PyPI.

    Returns:
        bool: True to submit the smoke job, False to submit full MNIST.
    """
    for name in ("USE_SMOKE_JOB", "UPGRADE_USE_SMOKE_JOB"):
        if _env_flag_enabled(name):
            print(f"{name} enabled; using smoke job (no pip install)")
            return True
        if _env_flag_disabled(name):
            print(f"{name} disabled; using full MNIST job")
            return False

    if _mnist_prerequisites_met():
        return False

    # From here on full MNIST is known not to be viable:
    # _mnist_prerequisites_met() only returns False for a cluster already
    # detected as disconnected and lacking the PIP/S3 mirrors. The cluster
    # probe is therefore not repeated, and there is no fall-through to the
    # full MNIST job.
    if _env_flag_enabled("DISCONNECTED_CLUSTER") or _env_flag_enabled(
        "IS_DISCONNECTED_CLUSTER"
    ):
        print(
            "Disconnected cluster env set without PIP/S3 mirrors; "
            "using smoke job (no pip install)"
        )
        return True

    # No env flag is set, so the disconnected verdict came from the API
    # server URL heuristic.
    print(
        "Detected disconnected cluster from API server URL; "
        "using smoke job (no pip install)"
    )
    return True


def use_upgrade_smoke_job():
    """Legacy name kept for upgrade tests; defers entirely to ``use_smoke_job``."""
    smoke_selected = use_smoke_job()
    return smoke_selected


def get_mnist_job_submission_spec(**kwargs):
    """Build the job submission spec for tier1 / upgrade MNIST job tests.

    Keyword arguments are forwarded to ``get_setup_env_variables`` to build
    the job's environment variables.

    Returns:
        dict: ``{"entrypoint": ..., "runtime_env": ...}``. When
        ``use_smoke_job()`` is true the entrypoint is the smoke script and the
        runtime env carries no ``pip`` entry; otherwise it is the full MNIST
        script with its pip requirements file.
    """
    job_env = get_setup_env_variables(**kwargs)

    # Keys are inserted in the order working_dir, (pip), env_vars.
    runtime_env = {"working_dir": "./tests/e2e/"}
    if use_smoke_job():
        entrypoint = "python upgrade_job_smoke.py"
    else:
        entrypoint = "python mnist.py"
        runtime_env["pip"] = "./tests/e2e/mnist_pip_requirements.txt"
    runtime_env["env_vars"] = job_env

    return {"entrypoint": entrypoint, "runtime_env": runtime_env}


def get_upgrade_job_submission_spec(**kwargs):
    """Backward-compatible alias for post-upgrade job submission tests.

    Keyword arguments (for example ``ACCELERATOR``) are forwarded unchanged to
    ``get_mnist_job_submission_spec`` so the alias accepts everything its
    target does. Calling it with no arguments behaves as before.
    """
    return get_mnist_job_submission_spec(**kwargs)


def random_choice():
    """Return a random 5-character string of lowercase ASCII letters and digits."""
    suffix_length = 5
    pool = string.ascii_lowercase + string.digits
    picked = random.choices(pool, k=suffix_length)
    return "".join(picked)
Expand Down Expand Up @@ -694,18 +818,12 @@ def assert_get_cluster_and_jobsubmit(
client = cluster.job_client

# Submit a job and get the submission ID
env_vars = (
get_setup_env_variables(ACCELERATOR=accelerator)
if accelerator
else get_setup_env_variables()
)
spec_kwargs = {"ACCELERATOR": accelerator} if accelerator else {}
job_spec = get_mnist_job_submission_spec(**spec_kwargs)
print(f"Submitting job: {job_spec['entrypoint']}")
submission_id = client.submit_job(
entrypoint="python mnist.py",
runtime_env={
"working_dir": "./tests/e2e/",
"pip": "./tests/e2e/mnist_pip_requirements.txt",
"env_vars": env_vars,
},
entrypoint=job_spec["entrypoint"],
runtime_env=job_spec["runtime_env"],
entrypoint_num_cpus=1 if number_of_gpus is None else None,
entrypoint_num_gpus=number_of_gpus,
)
Expand Down
16 changes: 16 additions & 0 deletions tests/e2e/upgrade_job_smoke.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright 2024 IBM, Red Hat
#
# Minimal Ray job for upgrade qualification on disconnected clusters.
# Validates job submission and execution without pip installs or external datasets.

import sys


def main() -> int:
    """Print the smoke job's start and finish markers and return exit code 0."""
    progress_lines = (
        "upgrade-job-smoke: job started",
        "upgrade-job-smoke: job finished successfully",
    )
    for line in progress_lines:
        print(line)
    return 0


# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    raise SystemExit(main())
28 changes: 12 additions & 16 deletions tests/upgrade/01_raycluster_sdk_upgrade_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,10 @@ def assert_jobsubmit_withoutLogin(self, cluster):
# API endpoint is directly under the hostname
api_url = dashboard_url + "/api/jobs/"

job_spec = get_upgrade_job_submission_spec()
jobdata = {
"entrypoint": "python mnist.py",
"runtime_env": {
"working_dir": "./tests/e2e/",
"pip": "./tests/e2e/mnist_pip_requirements.txt",
"env_vars": get_setup_env_variables(),
},
"entrypoint": job_spec["entrypoint"],
"runtime_env": job_spec["runtime_env"],
}

# Try to submit a job without authentication
Expand Down Expand Up @@ -282,14 +279,12 @@ def assert_jobsubmit_withlogin(self, cluster):
header = {"Authorization": f"Bearer {auth_token}"}
client = RayJobClient(address=ray_dashboard, headers=header, verify=False)

# Submit the job
job_spec = get_upgrade_job_submission_spec()
print(f"Submitting upgrade job: {job_spec['entrypoint']}")

submission_id = client.submit_job(
entrypoint="python mnist.py",
runtime_env={
"working_dir": "./tests/e2e/",
"pip": "./tests/e2e/mnist_pip_requirements.txt",
"env_vars": get_setup_env_variables(),
},
entrypoint=job_spec["entrypoint"],
runtime_env=job_spec["runtime_env"],
)
print(f"Submitted job with ID: {submission_id}")
done = False
Expand All @@ -314,9 +309,10 @@ def assert_jobsubmit_withlogin(self, cluster):
client.delete_job(submission_id)

def assert_job_completion(self, status):
    """Assert that a finished job reported success.

    Args:
        status: Final job status. Either a plain string or an enum-like
            object exposing its string through a ``value`` attribute.

    Raises:
        AssertionError: If the normalised status is not ``"SUCCEEDED"``.
    """
    # Enum-like statuses carry their string in ``.value``; plain strings
    # have no such attribute and are used as-is.
    status_value = getattr(status, "value", status)
    print(f"Job has completed: '{status_value}'")
    # A single assertion with a message replaces the bare
    # ``assert True`` / ``assert False`` pair so failures explain themselves.
    assert (
        status_value == "SUCCEEDED"
    ), f"Job finished with status '{status_value}', expected 'SUCCEEDED'"
17 changes: 17 additions & 0 deletions tests/upgrade/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,23 @@ Post-upgrade job tests use `is_byoidc_cluster_detected()` from `tests/e2e/suppor

On **htpasswd/LDAP** QE clusters, job tests use `oc whoami --show-token=true` with `OCP_ADMIN_USER_*` — not Keycloak password grant.

## Disconnected clusters (post-upgrade job submission)

Full MNIST installs torch via pip and may fetch datasets from the public internet. On disconnected clusters that fails unless mirrors are configured.

Post-upgrade and tier1 MNIST job tests use `get_mnist_job_submission_spec()` from `tests/e2e/support.py`:

| Job | When |
|-----|------|
| **`upgrade_job_smoke.py`** (no pip) | Disconnected cluster where an internal `PIP_INDEX_URL` (non-pypi.org) or `AWS_DEFAULT_ENDPOINT` is missing |
| **`mnist.py`** (full pip + training) | Connected cluster, or disconnected with `PIP_INDEX_URL` (non-pypi.org) **and** `AWS_DEFAULT_ENDPOINT` set |

**Detection order:** `USE_SMOKE_JOB` / `UPGRADE_USE_SMOKE_JOB` override → MNIST prerequisites → `DISCONNECTED_CLUSTER` / `IS_DISCONNECTED_CLUSTER` (Jenkins) → API URL `-dis-` heuristic (last resort).

Registry image mirrors (`ImageDigestMirrorSet` / `ICSP`) are **not** used for detection — connected OpenShift clusters often have them without blocking pip.

Set `USE_SMOKE_JOB=false` or `UPGRADE_USE_SMOKE_JOB=false` to force full MNIST when mirrors are configured. Set `DISCONNECTED_CLUSTER=true` in the test env file for explicit lab config (aligns with Jenkins `IS_DISCONNECTED_CLUSTER`).

---

## Migration script
Expand Down
Loading