Skip to content

Commit 48302fe

Browse files
committed
RHOAIENG-48973: fix notebook tests and add required auto param for trainer accelerator
1 parent 3153fe9 commit 48302fe

9 files changed

Lines changed: 177 additions & 36 deletions

File tree

.github/workflows/ui_notebooks_test.yaml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,38 @@ jobs:
104104
poetry run yarn playwright install chromium
105105
working-directory: ui-tests
106106

107+
- name: Verify Kind cluster connectivity
108+
run: |
109+
echo "Checking kubeconfig location..."
110+
echo "HOME: $HOME"
111+
ls -la ~/.kube/ || echo "No .kube directory found"
112+
113+
echo "Checking kubeconfig format..."
114+
kubectl config view --minify
115+
116+
# Check if kubeconfig has embedded certs or file references
117+
if grep -q "certificate-authority:" ~/.kube/config 2>/dev/null; then
118+
echo "WARNING: kubeconfig uses external certificate files"
119+
fi
120+
if grep -q "certificate-authority-data:" ~/.kube/config 2>/dev/null; then
121+
echo "OK: kubeconfig has embedded certificate data"
122+
fi
123+
124+
echo "Testing cluster connectivity..."
125+
kubectl cluster-info
126+
kubectl get nodes
127+
kubectl get pods -A
128+
129+
# Test Python kubernetes client connectivity
130+
echo "Testing Python kubernetes client..."
131+
python3 -c "
132+
from kubernetes import client, config
133+
config.load_kube_config()
134+
v1 = client.CoreV1Api()
135+
nodes = v1.list_node()
136+
print(f'Successfully connected! Found {len(nodes.items)} nodes')
137+
"
138+
107139
- name: Fix 3_widget_example.ipynb notebook for test
108140
run: |
109141
# Remove login/logout cells, as KinD doesn't support authentication using token
@@ -118,6 +150,13 @@ jobs:
118150
run: |
119151
set -euo pipefail
120152
153+
# Verify kubectl can connect to the cluster
154+
kubectl cluster-info
155+
kubectl get nodes
156+
157+
# Debug: Show kubeconfig location
158+
echo "KUBECONFIG: ${KUBECONFIG:-~/.kube/config}"
159+
121160
poetry run yarn test
122161
working-directory: ui-tests
123162

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
pytorch_lightning==1.9.5
1+
pytorch_lightning==2.6.0
22
ray_lightning
3-
torchmetrics==0.9.1
4-
torchvision==0.19.0
3+
torchmetrics==1.8.2
4+
torchvision==0.20.1
55
minio

src/codeflare_sdk/common/kubernetes_cluster/auth.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -284,15 +284,27 @@ def config_check() -> str:
284284
return config_path
285285

286286

287-
def _client_with_cert(client: client.ApiClient, ca_cert_path: Optional[str] = None):
288-
client.configuration.verify_ssl = True
287+
def _client_with_cert(api_client: client.ApiClient, ca_cert_path: Optional[str] = None):
288+
"""
289+
Configure SSL certificate verification for a Kubernetes API client.
290+
291+
If a custom CA cert path is provided or configured via environment variable,
292+
it will be used. Otherwise, the existing ssl_ca_cert configuration from the
293+
kubeconfig (which may include embedded certificates) is preserved.
294+
295+
Args:
296+
api_client: The Kubernetes API client to configure.
297+
ca_cert_path: Optional path to a custom CA certificate file.
298+
"""
299+
api_client.configuration.verify_ssl = True
289300
cert_path = _gen_ca_cert_path(ca_cert_path)
290-
if cert_path is None:
291-
client.configuration.ssl_ca_cert = None
292-
elif os.path.isfile(cert_path):
293-
client.configuration.ssl_ca_cert = cert_path
294-
else:
295-
raise FileNotFoundError(f"Certificate file not found at {cert_path}")
301+
if cert_path is not None:
302+
if os.path.isfile(cert_path):
303+
api_client.configuration.ssl_ca_cert = cert_path
304+
else:
305+
raise FileNotFoundError(f"Certificate file not found at {cert_path}")
306+
# If cert_path is None, preserve the existing ssl_ca_cert from kubeconfig
307+
# (which may contain embedded certificate data from certificate-authority-data)
296308

297309

298310
def _gen_ca_cert_path(ca_cert_path: Optional[str]):

src/codeflare_sdk/common/kubernetes_cluster/test_auth_edge_cases.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,17 +77,20 @@ def test_gen_ca_cert_path_defaults_to_none(monkeypatch):
7777

7878

7979
def test_client_with_cert_none(mocker, monkeypatch):
80-
"""Test _client_with_cert when cert_path is None."""
80+
"""Test _client_with_cert when cert_path is None preserves existing ssl_ca_cert."""
8181
# Clear env var to ensure we're testing the None case
8282
monkeypatch.delenv("CF_SDK_CA_CERT_PATH", raising=False)
8383

8484
mock_client = mocker.MagicMock()
8585
mock_client.configuration = mocker.MagicMock()
86+
# Set an existing ssl_ca_cert to verify it's preserved
87+
mock_client.configuration.ssl_ca_cert = "/existing/ca.crt"
8688

8789
_client_with_cert(mock_client, None)
8890

8991
assert mock_client.configuration.verify_ssl is True
90-
assert mock_client.configuration.ssl_ca_cert is None
92+
# ssl_ca_cert should be preserved (not overwritten to None)
93+
assert mock_client.configuration.ssl_ca_cert == "/existing/ca.crt"
9194

9295

9396
def test_client_with_cert_file_not_found(mocker):
@@ -99,6 +102,22 @@ def test_client_with_cert_file_not_found(mocker):
99102
_client_with_cert(mock_client, "/nonexistent/cert.crt")
100103

101104

105+
def test_client_with_cert_valid_file(mocker, tmp_path):
106+
"""Test _client_with_cert sets ssl_ca_cert when valid cert file exists."""
107+
# Create a temporary cert file
108+
cert_file = tmp_path / "ca.crt"
109+
cert_file.write_text("-----BEGIN CERTIFICATE-----\ntest\n-----END CERTIFICATE-----")
110+
111+
mock_client = mocker.MagicMock()
112+
mock_client.configuration = mocker.MagicMock()
113+
mock_client.configuration.ssl_ca_cert = None
114+
115+
_client_with_cert(mock_client, str(cert_file))
116+
117+
assert mock_client.configuration.verify_ssl is True
118+
assert mock_client.configuration.ssl_ca_cert == str(cert_file)
119+
120+
102121
def test_token_auth_exception_handling(mocker):
103122
"""Test TokenAuthentication.login() handles ApiException."""
104123
from kubernetes.client import ApiException

src/codeflare_sdk/common/widgets/test_widgets.py

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,16 @@ def test_cluster_apply_down_buttons(mocker):
7676

7777
# Check if the `apply` and `down` methods were called
7878
mock_wait_ready.assert_called_once()
79-
mock_apply.assert_called_once()
79+
# Widget button uses shorter TLS timeout (60s) to avoid blocking UI
80+
mock_apply.assert_called_once_with(timeout=60)
8081
mock_down.assert_called_once()
8182

8283

8384
@patch.dict("os.environ", {}, clear=True) # Mock environment with no variables
8485
def test_is_notebook_false():
85-
assert cf_widgets.is_notebook() is False
86+
# Mock get_ipython to return None (no IPython shell)
87+
with patch("IPython.get_ipython", return_value=None):
88+
assert cf_widgets.is_notebook() is False
8689

8790

8891
@patch.dict(
@@ -92,6 +95,43 @@ def test_is_notebook_true():
9295
assert cf_widgets.is_notebook() is True
9396

9497

98+
@patch.dict("os.environ", {}, clear=True) # No env vars
99+
def test_is_notebook_with_zmq_shell():
100+
"""Test is_notebook returns True when running in ZMQInteractiveShell (Jupyter)."""
101+
mock_shell = MagicMock()
102+
mock_shell.__class__.__name__ = "ZMQInteractiveShell"
103+
104+
with patch("IPython.get_ipython", return_value=mock_shell):
105+
assert cf_widgets.is_notebook() is True
106+
107+
108+
@patch.dict("os.environ", {}, clear=True) # No env vars
109+
def test_is_notebook_with_terminal_shell():
110+
"""Test is_notebook returns False when running in TerminalInteractiveShell (ipython CLI)."""
111+
mock_shell = MagicMock()
112+
mock_shell.__class__.__name__ = "TerminalInteractiveShell"
113+
114+
with patch("IPython.get_ipython", return_value=mock_shell):
115+
assert cf_widgets.is_notebook() is False
116+
117+
118+
@patch.dict(
119+
"os.environ", {"JPY_PARENT_PID": "12345"}
120+
) # Standard Jupyter environment variable
121+
def test_is_notebook_with_jpy_parent_pid():
122+
"""Test is_notebook returns True when JPY_PARENT_PID env var is set."""
123+
# Mock get_ipython to return None to test env var fallback
124+
with patch("IPython.get_ipython", return_value=None):
125+
assert cf_widgets.is_notebook() is True
126+
127+
128+
@patch.dict("os.environ", {}, clear=True) # No env vars
129+
def test_is_notebook_ipython_import_error():
130+
"""Test is_notebook handles ImportError gracefully."""
131+
with patch.dict("sys.modules", {"IPython": None}):
132+
assert cf_widgets.is_notebook() is False
133+
134+
95135
def test_view_clusters(mocker, capsys):
96136
# If is not a notebook environment, a warning should be raised
97137
with pytest.warns(

src/codeflare_sdk/common/widgets/widgets.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -299,7 +299,9 @@ def cluster_apply_down_buttons(
299299
def on_apply_button_clicked(b): # Handle the apply button click event
300300
with output:
301301
output.clear_output()
302-
cluster.apply()
302+
# Use shorter TLS timeout (60s) for widget button clicks to avoid blocking UI
303+
# Users who need full TLS wait can use wait_ready() checkbox or call apply() directly
304+
cluster.apply(timeout=60)
303305

304306
# If the wait_ready Checkbox is clicked(value == True) trigger the wait_ready function
305307
if wait_ready_check.value:
@@ -327,15 +329,34 @@ def _wait_ready_check_box():
327329

328330
def is_notebook() -> bool:
329331
"""
330-
The is_notebook function checks if Jupyter Notebook environment variables exist in the given environment and return True/False based on that.
332+
The is_notebook function checks if we're running in a Jupyter Notebook environment.
333+
334+
Detection methods:
335+
1. Check for IPython's ZMQInteractiveShell (standard Jupyter kernel)
336+
2. Check for known environment variables (VSCode, RHOAI/ODH)
331337
"""
338+
# First, try the standard IPython detection method
339+
try:
340+
from IPython import get_ipython
341+
342+
shell = get_ipython()
343+
if shell is not None:
344+
shell_class = shell.__class__.__name__
345+
# ZMQInteractiveShell = Jupyter notebook/lab, qtconsole
346+
if shell_class == "ZMQInteractiveShell":
347+
return True
348+
except (ImportError, NameError):
349+
pass
350+
351+
# Fallback: check for known environment variables
332352
if (
333353
"PYDEVD_IPYTHON_COMPATIBLE_DEBUGGING" in os.environ
334354
or "JPY_SESSION_NAME" in os.environ
335-
): # If running Jupyter NBs in VsCode or RHOAI/ODH display UI buttons
355+
or "JPY_PARENT_PID" in os.environ # Standard Jupyter
356+
):
336357
return True
337-
else:
338-
return False
358+
359+
return False
339360

340361

341362
def view_clusters(namespace: str = None):

tests/e2e/mnist.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,13 @@
3939
print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
4040
print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))
4141

42-
print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
43-
ACCELERATOR = os.getenv("ACCELERATOR")
42+
# Get accelerator from environment variable, default to "auto" for pytorch_lightning 2.x compatibility
43+
# Valid values: auto, cuda, tpu, cpu, mps, gpu
44+
ACCELERATOR = os.getenv("ACCELERATOR", "auto")
45+
print("ACCELERATOR: is ", ACCELERATOR)
4446

45-
# If GPU is requested but CUDA is not available, fall back to CPU
46-
if ACCELERATOR == "gpu" and not torch.cuda.is_available():
47+
# If GPU/CUDA is requested but CUDA is not available, fall back to CPU
48+
if ACCELERATOR in ("gpu", "cuda") and not torch.cuda.is_available():
4749
print("Warning: GPU requested but CUDA is not available. Falling back to CPU.")
4850
ACCELERATOR = "cpu"
4951

@@ -249,7 +251,7 @@ def test_dataloader(self):
249251
callbacks=[TQDMProgressBar(refresh_rate=20)],
250252
num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)),
251253
devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)),
252-
replace_sampler_ddp=False,
254+
use_distributed_sampler=False,
253255
strategy="ddp",
254256
)
255257

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
--extra-index-url https://download.pytorch.org/whl/cu118
22
torch==2.7.1+cu118
33
torchvision==0.22.1+cu118
4-
pytorch_lightning==1.9.5
4+
pytorch_lightning==2.6.0
55
torchmetrics==1.8.2
66
minio

ui-tests/tests/widget_notebook_example.test.ts

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ test.describe("Widget Functionality", () => {
5050

5151
// Verify widgets render correctly
5252
await waitForWidget(page, applyDownWidgetCellIndex, 'input[type="checkbox"]', 30000);
53+
5354
await waitForWidget(page, applyDownWidgetCellIndex, 'button:has-text("Cluster Down")', 10000);
5455
await waitForWidget(page, applyDownWidgetCellIndex, 'button:has-text("Cluster Apply")', 10000);
5556

@@ -71,7 +72,7 @@ test.describe("Widget Functionality", () => {
7172
});
7273

7374
// Test Cluster Apply button WITHOUT the wait_ready checkbox checked
74-
// This avoids the 300s TLS timeout + dashboard_check issues in KinD
75+
// This avoids the long TLS timeout + dashboard_check issues in KinD
7576
await interactWithWidget(page, applyDownWidgetCellIndex, 'button:has-text("Cluster Apply")', async (button) => {
7677
await button.click();
7778

@@ -81,6 +82,10 @@ test.describe("Widget Functionality", () => {
8182
expect(successMessage).not.toBeNull();
8283
});
8384

85+
// Wait for apply() to complete (widget uses 60s TLS timeout)
86+
// We wait for either success or failure message from the TLS setup phase
87+
await page.waitForSelector('text=/Cluster .widgettest. (is ready|resources applied but TLS setup incomplete)/', { timeout: 90000 });
88+
8489
// Test view_clusters widget
8590
const viewClustersCellIndex = 4; // 5 on OpenShift
8691
await page.notebook.runCell(cellCount - 2, true);
@@ -104,8 +109,8 @@ test.describe("Widget Functionality", () => {
104109
// Test Delete Cluster button to clean up
105110
await interactWithWidget(page, viewClustersCellIndex, 'button:has-text("Delete Cluster")', async (button) => {
106111
await button.click();
107-
// Wait for deletion confirmation
108-
const successMessage = await page.waitForSelector(`text=Cluster widgettest in the ${namespace} namespace was deleted successfully.`, { timeout: 10000 });
112+
// Wait for deletion confirmation - increase timeout as cluster deletion can take time
113+
const successMessage = await page.waitForSelector(`text=Cluster widgettest in the ${namespace} namespace was deleted successfully.`, { timeout: 30000 });
109114
expect(successMessage).not.toBeNull();
110115
});
111116
});
@@ -114,18 +119,21 @@ test.describe("Widget Functionality", () => {
114119
async function waitForWidget(page, cellIndex: number, widgetSelector: string, timeout = 5000) {
115120
const widgetCell = await page.notebook.getCellOutput(cellIndex);
116121

117-
if (widgetCell) {
118-
await widgetCell.waitForSelector(widgetSelector, { timeout });
122+
if (!widgetCell) {
123+
throw new Error(`Cell ${cellIndex} has no output - widgets not rendered. Check if is_notebook() detection is working.`);
119124
}
125+
await widgetCell.waitForSelector(widgetSelector, { timeout });
120126
}
121127

122128
async function interactWithWidget(page, cellIndex: number, widgetSelector: string, action: (widget) => Promise<void>) {
123129
const widgetCell = await page.notebook.getCellOutput(cellIndex);
124130

125-
if (widgetCell) {
126-
const widget = await widgetCell.$(widgetSelector);
127-
if (widget) {
128-
await action(widget);
129-
}
131+
if (!widgetCell) {
132+
throw new Error(`Cell ${cellIndex} has no output - cannot interact with widget.`);
133+
}
134+
const widget = await widgetCell.$(widgetSelector);
135+
if (!widget) {
136+
throw new Error(`Widget '${widgetSelector}' not found in cell ${cellIndex}.`);
130137
}
138+
await action(widget);
131139
}

0 commit comments

Comments
 (0)