Skip to content

Commit 1bc3451

Browse files
RHOAIENG-69301: fix disconnected env job submission
1 parent 4a12c8e commit 1bc3451

4 files changed

Lines changed: 199 additions & 15 deletions

File tree

tests/e2e/support.py

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,160 @@ def get_setup_env_variables(**kwargs):
157157
return env_vars
158158

159159

160+
def _env_flag_enabled(name):
161+
return os.environ.get(name, "").strip().lower() in ("1", "true", "yes")
162+
163+
164+
def _env_flag_disabled(name):
165+
return os.environ.get(name, "").strip().lower() in ("0", "false", "no", "off")
166+
167+
168+
def _has_registry_mirror_configured():
169+
"""
170+
Detect mirror-only registry layout typical of disconnected OpenShift installs.
171+
172+
Checks ImageDigestMirrorSet (OCP 4.14+) and ImageContentSourcePolicy (legacy).
173+
"""
174+
try:
175+
from kubernetes import client as k8s_client
176+
177+
custom_api = k8s_client.CustomObjectsApi()
178+
mirror_types = [
179+
("config.openshift.io", "v1", "imagedigestmirrorsets"),
180+
("operator.openshift.io", "v1alpha1", "imagecontentsourcepolicies"),
181+
]
182+
for group, version, plural in mirror_types:
183+
try:
184+
result = custom_api.list_cluster_custom_object(
185+
group=group,
186+
version=version,
187+
plural=plural,
188+
)
189+
if result.get("items"):
190+
return True
191+
except Exception:
192+
continue
193+
except Exception:
194+
pass
195+
return False
196+
197+
198+
def _disconnected_cluster_signals():
    """
    Return True when the cluster is likely disconnected / air-gapped.

    Signals are checked in this order, stopping at the first hit:
    the DISCONNECTED_CLUSTER / IS_DISCONNECTED_CLUSTER env flags, a registry
    mirror resource on the cluster, and finally the API server URL containing
    "-dis-" or "disconnected". A failure to obtain the URL counts as no signal.
    """
    flagged_by_env = _env_flag_enabled("DISCONNECTED_CLUSTER") or _env_flag_enabled(
        "IS_DISCONNECTED_CLUSTER"
    )
    if flagged_by_env or _has_registry_mirror_configured():
        return True

    try:
        api_url = (run_oc_command(["whoami", "--show-server=true"]) or "").lower()
    except Exception:
        # Could not query the server URL; treat the heuristic as inconclusive.
        return False
    return "-dis-" in api_url or "disconnected" in api_url
213+
214+
215+
def _upgrade_mnist_prerequisites_met():
    """
    Return True when full MNIST can run (pip packages + dataset reachable).

    Connected labs may use public PyPI and MNIST mirrors with no extra env,
    so a cluster with no disconnected signal always qualifies.
    Disconnected labs qualify only when both of these are set:

    * ``PIP_INDEX_URL`` - non-empty and not pointing at pypi.org
    * ``AWS_DEFAULT_ENDPOINT`` - non-empty (e.g. a MinIO endpoint)

    A False result therefore always means "disconnected and mirrors missing".
    """
    if not _disconnected_cluster_signals():
        return True

    pip_url = (os.environ.get("PIP_INDEX_URL") or "").strip()
    aws_endpoint = (os.environ.get("AWS_DEFAULT_ENDPOINT") or "").strip()
    # Host names are case-insensitive, so compare in lower case; otherwise a
    # URL such as https://PyPI.org/simple would pass as an internal mirror.
    pip_ok = bool(pip_url) and "pypi.org" not in pip_url.lower()
    aws_ok = bool(aws_endpoint)
    if not (pip_ok and aws_ok):
        return False

    print(
        "Disconnected cluster with PIP mirror and S3 endpoint configured; "
        "using full MNIST upgrade job"
    )
    return True
236+
237+
238+
def use_upgrade_smoke_job():
    """
    Use a lightweight Ray job for upgrade tests when full MNIST is not viable.

    Full MNIST pulls torch/pytorch via pip and may download datasets from the
    public internet - both fail on disconnected clusters without mirrors.

    Decision order (first match wins):
      1. UPGRADE_USE_SMOKE_JOB=true|false (explicit override)
      2. Full MNIST prerequisites met (connected, or disconnected with
         mirrors) -> full MNIST
      3. Otherwise -> smoke job

    When step 3 is reached the prerequisite check has already classified the
    cluster as disconnected without PIP/S3 mirrors. The remaining probes
    (env flags, ImageDigestMirrorSet / ImageContentSourcePolicy, API server
    URL) are repeated only to report which signal applies; they never change
    the result, so a transient probe failure cannot silently select the full
    MNIST job on a disconnected cluster.

    Returns:
        True to submit the smoke job, False to submit the full MNIST job.
    """
    if _env_flag_enabled("UPGRADE_USE_SMOKE_JOB"):
        print("UPGRADE_USE_SMOKE_JOB enabled; using upgrade smoke job")
        return True
    if _env_flag_disabled("UPGRADE_USE_SMOKE_JOB"):
        print("UPGRADE_USE_SMOKE_JOB disabled; using full MNIST upgrade job")
        return False

    if _upgrade_mnist_prerequisites_met():
        return False

    # Everything below only selects the explanatory message.
    if _env_flag_enabled("DISCONNECTED_CLUSTER") or _env_flag_enabled(
        "IS_DISCONNECTED_CLUSTER"
    ):
        print(
            "Disconnected cluster env set without PIP/S3 mirrors; "
            "using upgrade smoke job (no pip install)"
        )
    elif _has_registry_mirror_configured():
        print(
            "Registry mirror detected (ImageDigestMirrorSet/ICSP) without "
            "PIP/S3 mirrors; using upgrade smoke job (no pip install)"
        )
    else:
        try:
            server = (run_oc_command(["whoami", "--show-server=true"]) or "").lower()
        except Exception:
            # Best effort: the URL is only needed for the log message.
            server = ""
        if "-dis-" in server or "disconnected" in server:
            print(
                "Detected disconnected cluster from API server URL; "
                "using upgrade smoke job (no pip install)"
            )
        else:
            # The earlier detection could not be reproduced (flaky probe);
            # stay on the safe side and keep the smoke job.
            print(
                "Disconnected cluster detected without PIP/S3 mirrors; "
                "using upgrade smoke job (no pip install)"
            )

    return True
290+
291+
292+
def get_upgrade_job_submission_spec():
    """
    Return entrypoint and runtime_env for post-upgrade job submission tests.

    The result is a dict with two keys, "entrypoint" and "runtime_env".
    The smoke variant runs ``upgrade_job_smoke.py`` with no "pip" entry; the
    full variant runs ``mnist.py`` and installs
    ``mnist_pip_requirements.txt``. Both share the same working directory and
    the environment variables from ``get_setup_env_variables()``.
    """
    smoke = use_upgrade_smoke_job()

    runtime_env = {"working_dir": "./tests/e2e/"}
    if not smoke:
        # Only the full MNIST job needs packages installed at runtime.
        runtime_env["pip"] = "./tests/e2e/mnist_pip_requirements.txt"
    runtime_env["env_vars"] = get_setup_env_variables()

    if smoke:
        entrypoint = "python upgrade_job_smoke.py"
    else:
        entrypoint = "python mnist.py"
    return {"entrypoint": entrypoint, "runtime_env": runtime_env}
312+
313+
160314
def random_choice():
    """Return a random 5-character string of lowercase ASCII letters and digits."""
    charset = string.ascii_lowercase + string.digits
    picked = random.choices(charset, k=5)
    return "".join(picked)

tests/e2e/upgrade_job_smoke.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Copyright 2024 IBM, Red Hat
2+
#
3+
# Minimal Ray job for upgrade qualification on disconnected clusters.
4+
# Validates job submission and execution without pip installs or external datasets.
5+
6+
import sys
7+
8+
9+
def main() -> int:
    """Print the start and finish markers of the smoke job and return 0."""
    progress_lines = (
        "upgrade-job-smoke: job started",
        "upgrade-job-smoke: job finished successfully",
    )
    for line in progress_lines:
        print(line)
    return 0


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    exit_code = main()
    sys.exit(exit_code)

tests/upgrade/01_raycluster_sdk_upgrade_test.py

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -166,13 +166,10 @@ def assert_jobsubmit_withoutLogin(self, cluster):
166166
else:
167167
api_url = dashboard_url + "/api/jobs/"
168168

169+
job_spec = get_upgrade_job_submission_spec()
169170
jobdata = {
170-
"entrypoint": "python mnist.py",
171-
"runtime_env": {
172-
"working_dir": "./tests/e2e/",
173-
"pip": "./tests/e2e/mnist_pip_requirements.txt",
174-
"env_vars": get_setup_env_variables(),
175-
},
171+
"entrypoint": job_spec["entrypoint"],
172+
"runtime_env": job_spec["runtime_env"],
176173
}
177174

178175
response = requests.post(
@@ -251,14 +248,13 @@ def assert_jobsubmit_withlogin(self, cluster):
251248
header = {"Authorization": f"Bearer {auth_token}"}
252249
client = RayJobClient(address=ray_dashboard, headers=header, verify=False)
253250

251+
job_spec = get_upgrade_job_submission_spec()
252+
print(f"Submitting upgrade job: {job_spec['entrypoint']}")
253+
254254
# Submit the job
255255
submission_id = client.submit_job(
256-
entrypoint="python mnist.py",
257-
runtime_env={
258-
"working_dir": "./tests/e2e/",
259-
"pip": "./tests/e2e/mnist_pip_requirements.txt",
260-
"env_vars": get_setup_env_variables(),
261-
},
256+
entrypoint=job_spec["entrypoint"],
257+
runtime_env=job_spec["runtime_env"],
262258
)
263259
print(f"Submitted job with ID: {submission_id}")
264260
done = False
@@ -283,9 +279,10 @@ def assert_jobsubmit_withlogin(self, cluster):
283279
client.delete_job(submission_id)
284280

285281
def assert_job_completion(self, status):
    """
    Assert that a finished job reported the "SUCCEEDED" status.

    ``status`` may be a plain string or an enum-like object exposing the
    string through a ``value`` attribute; both forms are compared by their
    string value.

    Raises:
        AssertionError: if the status is anything other than "SUCCEEDED".
            The message names the status that was actually reported.
    """
    status_value = getattr(status, "value", status)
    print(f"Job has completed: '{status_value}'")
    assert (
        status_value == "SUCCEEDED"
    ), f"Job finished with status '{status_value}', expected 'SUCCEEDED'"

tests/upgrade/README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,23 @@ Post-upgrade job tests use `is_byoidc_cluster_detected()` from `tests/e2e/suppor
4949

5050
On **htpasswd/LDAP** QE clusters (e.g. ods-qe-psi-17), job tests use `oc whoami --show-token=true` with `OCP_ADMIN_USER_*` credentials — not Keycloak password grant.
5151

52+
## Disconnected clusters (post-upgrade job submission)
53+
54+
Full MNIST installs torch via pip and may fetch datasets from the public internet. On disconnected clusters that fails unless mirrors are configured.
55+
56+
Post-upgrade job tests use `get_upgrade_job_submission_spec()` from `tests/e2e/support.py`:
57+
58+
| Job | When |
59+
|-----|------|
60+
| **`upgrade_job_smoke.py`** (no pip) | Disconnected / mirror-only cluster where a non-pypi.org `PIP_INDEX_URL` and `AWS_DEFAULT_ENDPOINT` are not both set |
61+
| **`mnist.py`** (full pip + training) | Connected cluster, or disconnected with `PIP_INDEX_URL` (non-pypi.org) **and** `AWS_DEFAULT_ENDPOINT` set |
62+
63+
**Detection order:** `UPGRADE_USE_SMOKE_JOB` override → MNIST prerequisites → `DISCONNECTED_CLUSTER` / `IS_DISCONNECTED_CLUSTER` (Jenkins) → `ImageDigestMirrorSet` / `ImageContentSourcePolicy` → API URL `-dis-` heuristic (last resort).
64+
65+
Set `UPGRADE_USE_SMOKE_JOB=false` to force full MNIST when mirrors are configured. Set `DISCONNECTED_CLUSTER=true` in the test env file for explicit lab config (aligns with Jenkins `IS_DISCONNECTED_CLUSTER`).
66+
67+
**Porting:** apply the same helpers (`use_upgrade_smoke_job`, `upgrade_job_smoke.py`, `get_upgrade_job_submission_spec` usage) to **`main`** (3.5 / `2.x-3.x-upgrade-tests-post-upgrade`) and **`3.3`** upgrade branches.
68+
5269
## Migration script
5370

5471
`scripts/migration/ray_cluster_migration.py` (keep in sync with [rhoai-upgrade-helpers](https://github.com/red-hat-data-services/rhoai-upgrade-helpers) `main`).

0 commit comments

Comments
 (0)