diff --git a/.github/workflows/ui_notebooks_test.yaml b/.github/workflows/ui_notebooks_test.yaml index df4ac2c11..e1670ee64 100644 --- a/.github/workflows/ui_notebooks_test.yaml +++ b/.github/workflows/ui_notebooks_test.yaml @@ -104,6 +104,38 @@ jobs: poetry run yarn playwright install chromium working-directory: ui-tests + - name: Verify Kind cluster connectivity + run: | + echo "Checking kubeconfig location..." + echo "HOME: $HOME" + ls -la ~/.kube/ || echo "No .kube directory found" + + echo "Checking kubeconfig format..." + kubectl config view --minify + + # Check if kubeconfig has embedded certs or file references + if grep -q "certificate-authority:" ~/.kube/config 2>/dev/null; then + echo "WARNING: kubeconfig uses external certificate files" + fi + if grep -q "certificate-authority-data:" ~/.kube/config 2>/dev/null; then + echo "OK: kubeconfig has embedded certificate data" + fi + + echo "Testing cluster connectivity..." + kubectl cluster-info + kubectl get nodes + kubectl get pods -A + + # Test Python kubernetes client connectivity + echo "Testing Python kubernetes client..." + python3 -c " + from kubernetes import client, config + config.load_kube_config() + v1 = client.CoreV1Api() + nodes = v1.list_node() + print(f'Successfully connected! Found {len(nodes.items)} nodes') + " + - name: Fix 3_widget_example.ipynb notebook for test run: | # Remove login/logout cells, as KinD doesn't support authentication using token @@ -118,6 +150,13 @@ jobs: run: | set -euo pipefail + # Verify kubectl can connect to the cluster + kubectl cluster-info + kubectl get nodes + + # Debug: Show kubeconfig location + echo "KUBECONFIG: ${KUBECONFIG:-~/.kube/config}" + poetry run yarn test working-directory: ui-tests diff --git a/demo-notebooks/additional-demos/requirements.txt b/demo-notebooks/additional-demos/requirements.txt index 5f86ab536..0bc10c60d 100644 --- a/demo-notebooks/additional-demos/requirements.txt +++ b/demo-notebooks/additional-demos/requirements.txt @@ -1,5 +1,5 @@ -pytorch_lightning==1.9.5 +pytorch_lightning==2.6.0 ray_lightning -torchmetrics==0.9.1 -torchvision==0.19.0 +torchmetrics==1.8.2 +torchvision==0.20.1 minio diff --git a/src/codeflare_sdk/common/kubernetes_cluster/auth.py b/src/codeflare_sdk/common/kubernetes_cluster/auth.py index 82773a16a..1f933ef0b 100644 --- a/src/codeflare_sdk/common/kubernetes_cluster/auth.py +++ b/src/codeflare_sdk/common/kubernetes_cluster/auth.py @@ -284,15 +284,27 @@ def config_check() -> str: return config_path -def _client_with_cert(client: client.ApiClient, ca_cert_path: Optional[str] = None): - client.configuration.verify_ssl = True +def _client_with_cert(api_client: client.ApiClient, ca_cert_path: Optional[str] = None): + """ + Configure SSL certificate verification for a Kubernetes API client. + + If a custom CA cert path is provided or configured via environment variable, + it will be used. Otherwise, the existing ssl_ca_cert configuration from the + kubeconfig (which may include embedded certificates) is preserved. + + Args: + api_client: The Kubernetes API client to configure. + ca_cert_path: Optional path to a custom CA certificate file. + """ + api_client.configuration.verify_ssl = True cert_path = _gen_ca_cert_path(ca_cert_path) - if cert_path is None: - client.configuration.ssl_ca_cert = None - elif os.path.isfile(cert_path): - client.configuration.ssl_ca_cert = cert_path - else: - raise FileNotFoundError(f"Certificate file not found at {cert_path}") + if cert_path is not None: + if os.path.isfile(cert_path): + api_client.configuration.ssl_ca_cert = cert_path + else: + raise FileNotFoundError(f"Certificate file not found at {cert_path}") + # If cert_path is None, preserve the existing ssl_ca_cert from kubeconfig + # (which may contain embedded certificate data from certificate-authority-data) def _gen_ca_cert_path(ca_cert_path: Optional[str]): diff --git a/src/codeflare_sdk/common/kubernetes_cluster/test_auth_edge_cases.py b/src/codeflare_sdk/common/kubernetes_cluster/test_auth_edge_cases.py index 954648ed7..39d29b801 100644 --- a/src/codeflare_sdk/common/kubernetes_cluster/test_auth_edge_cases.py +++ b/src/codeflare_sdk/common/kubernetes_cluster/test_auth_edge_cases.py @@ -77,17 +77,20 @@ def test_gen_ca_cert_path_defaults_to_none(monkeypatch): def test_client_with_cert_none(mocker, monkeypatch): - """Test _client_with_cert when cert_path is None.""" + """Test _client_with_cert when cert_path is None preserves existing ssl_ca_cert.""" # Clear env var to ensure we're testing the None case monkeypatch.delenv("CF_SDK_CA_CERT_PATH", raising=False) mock_client = mocker.MagicMock() mock_client.configuration = mocker.MagicMock() + # Set an existing ssl_ca_cert to verify it's preserved + mock_client.configuration.ssl_ca_cert = "/existing/ca.crt" _client_with_cert(mock_client, None) assert mock_client.configuration.verify_ssl is True - assert mock_client.configuration.ssl_ca_cert is None + # ssl_ca_cert should be preserved (not overwritten to None) + assert mock_client.configuration.ssl_ca_cert == "/existing/ca.crt" def test_client_with_cert_file_not_found(mocker): @@ -99,6 +102,22 @@ def test_client_with_cert_file_not_found(mocker): _client_with_cert(mock_client, "/nonexistent/cert.crt") +def test_client_with_cert_valid_file(mocker, tmp_path): + """Test _client_with_cert sets ssl_ca_cert when valid cert file exists.""" + # Create a temporary cert file + cert_file = tmp_path / "ca.crt" + cert_file.write_text("-----BEGIN CERTIFICATE-----\ntest\n-----END CERTIFICATE-----") + + mock_client = mocker.MagicMock() + mock_client.configuration = mocker.MagicMock() + mock_client.configuration.ssl_ca_cert = None + + _client_with_cert(mock_client, str(cert_file)) + + assert mock_client.configuration.verify_ssl is True + assert mock_client.configuration.ssl_ca_cert == str(cert_file) + + def test_token_auth_exception_handling(mocker): """Test TokenAuthentication.login() handles ApiException.""" from kubernetes.client import ApiException diff --git a/src/codeflare_sdk/common/widgets/test_widgets.py b/src/codeflare_sdk/common/widgets/test_widgets.py index 2d72b728c..ef83f8582 100644 --- a/src/codeflare_sdk/common/widgets/test_widgets.py +++ b/src/codeflare_sdk/common/widgets/test_widgets.py @@ -76,13 +76,16 @@ def test_cluster_apply_down_buttons(mocker): # Check if the `apply` and `down` methods were called mock_wait_ready.assert_called_once() - mock_apply.assert_called_once() + # Widget button uses shorter TLS timeout (60s) to avoid blocking UI + mock_apply.assert_called_once_with(timeout=60) mock_down.assert_called_once() @patch.dict("os.environ", {}, clear=True) # Mock environment with no variables def test_is_notebook_false(): - assert cf_widgets.is_notebook() is False + # Mock get_ipython to return None (no IPython shell) + with patch("IPython.get_ipython", return_value=None): + assert cf_widgets.is_notebook() is False @patch.dict( @@ -92,6 +95,43 @@ def test_is_notebook_true(): assert cf_widgets.is_notebook() is True +@patch.dict("os.environ", {}, clear=True) # No env vars +def test_is_notebook_with_zmq_shell(): + """Test is_notebook returns True when running in ZMQInteractiveShell (Jupyter).""" + mock_shell = MagicMock() + mock_shell.__class__.__name__ = "ZMQInteractiveShell" + + with patch("IPython.get_ipython", return_value=mock_shell): + assert cf_widgets.is_notebook() is True + + +@patch.dict("os.environ", {}, clear=True) # No env vars +def test_is_notebook_with_terminal_shell(): + """Test is_notebook returns False when running in TerminalInteractiveShell (ipython CLI).""" + mock_shell = MagicMock() + mock_shell.__class__.__name__ = "TerminalInteractiveShell" + + with patch("IPython.get_ipython", return_value=mock_shell): + assert cf_widgets.is_notebook() is False + + +@patch.dict( + "os.environ", {"JPY_PARENT_PID": "12345"} +) # Standard Jupyter environment variable +def test_is_notebook_with_jpy_parent_pid(): + """Test is_notebook returns True when JPY_PARENT_PID env var is set.""" + # Mock get_ipython to return None to test env var fallback + with patch("IPython.get_ipython", return_value=None): + assert cf_widgets.is_notebook() is True + + +@patch.dict("os.environ", {}, clear=True) # No env vars +def test_is_notebook_ipython_import_error(): + """Test is_notebook handles ImportError gracefully.""" + with patch.dict("sys.modules", {"IPython": None}): + assert cf_widgets.is_notebook() is False + + def test_view_clusters(mocker, capsys): # If is not a notebook environment, a warning should be raised with pytest.warns( diff --git a/src/codeflare_sdk/common/widgets/widgets.py b/src/codeflare_sdk/common/widgets/widgets.py index 48a3b3c71..64a7a3ad0 100644 --- a/src/codeflare_sdk/common/widgets/widgets.py +++ b/src/codeflare_sdk/common/widgets/widgets.py @@ -299,7 +299,9 @@ def cluster_apply_down_buttons( def on_apply_button_clicked(b): # Handle the apply button click event with output: output.clear_output() - cluster.apply() + # Use shorter TLS timeout (60s) for widget button clicks to avoid blocking UI + # Users who need full TLS wait can use wait_ready() checkbox or call apply() directly + cluster.apply(timeout=60) # If the wait_ready Checkbox is clicked(value == True) trigger the wait_ready function if wait_ready_check.value: @@ -327,15 +329,34 @@ def _wait_ready_check_box(): def is_notebook() -> bool: """ - The is_notebook function checks if Jupyter Notebook environment variables exist in the given environment and return True/False based on that. + The is_notebook function checks if we're running in a Jupyter Notebook environment. + + Detection methods: + 1. Check for IPython's ZMQInteractiveShell (standard Jupyter kernel) + 2. Check for known environment variables (VSCode, RHOAI/ODH) """ + # First, try the standard IPython detection method + try: + from IPython import get_ipython + + shell = get_ipython() + if shell is not None: + shell_class = shell.__class__.__name__ + # ZMQInteractiveShell = Jupyter notebook/lab, qtconsole + if shell_class == "ZMQInteractiveShell": + return True + except (ImportError, NameError): + pass + + # Fallback: check for known environment variables if ( "PYDEVD_IPYTHON_COMPATIBLE_DEBUGGING" in os.environ or "JPY_SESSION_NAME" in os.environ - ): # If running Jupyter NBs in VsCode or RHOAI/ODH display UI buttons + or "JPY_PARENT_PID" in os.environ # Standard Jupyter + ): return True - else: - return False + + return False def view_clusters(namespace: str = None): diff --git a/tests/e2e/mnist.py b/tests/e2e/mnist.py index 26f37a4b2..b8135d0fc 100644 --- a/tests/e2e/mnist.py +++ b/tests/e2e/mnist.py @@ -39,11 +39,13 @@ print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR")) print("MASTER_PORT: is ", os.getenv("MASTER_PORT")) -print("ACCELERATOR: is ", os.getenv("ACCELERATOR")) -ACCELERATOR = os.getenv("ACCELERATOR") +# Get accelerator from environment variable, default to "auto" for pytorch_lightning 2.x compatibility +# Valid values: auto, cuda, tpu, cpu, mps, gpu +ACCELERATOR = os.getenv("ACCELERATOR", "auto") +print("ACCELERATOR: is ", ACCELERATOR) -# If GPU is requested but CUDA is not available, fall back to CPU -if ACCELERATOR == "gpu" and not torch.cuda.is_available(): +# If GPU/CUDA is requested but CUDA is not available, fall back to CPU +if ACCELERATOR in ("gpu", "cuda") and not torch.cuda.is_available(): print("Warning: GPU requested but CUDA is not available. Falling back to CPU.") ACCELERATOR = "cpu" @@ -249,7 +251,7 @@ def test_dataloader(self): callbacks=[TQDMProgressBar(refresh_rate=20)], num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)), devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)), - replace_sampler_ddp=False, + use_distributed_sampler=False, strategy="ddp", ) diff --git a/tests/e2e/mnist_pip_requirements.txt b/tests/e2e/mnist_pip_requirements.txt index ec87256b3..f898c0d02 100644 --- a/tests/e2e/mnist_pip_requirements.txt +++ b/tests/e2e/mnist_pip_requirements.txt @@ -1,6 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cu118 torch==2.7.1+cu118 torchvision==0.22.1+cu118 -pytorch_lightning==1.9.5 +pytorch_lightning==2.6.0 torchmetrics==1.8.2 minio diff --git a/ui-tests/tests/widget_notebook_example.test.ts b/ui-tests/tests/widget_notebook_example.test.ts index c3b6dc159..c8d3ecfb6 100644 --- a/ui-tests/tests/widget_notebook_example.test.ts +++ b/ui-tests/tests/widget_notebook_example.test.ts @@ -50,6 +50,7 @@ test.describe("Widget Functionality", () => { // Verify widgets render correctly await waitForWidget(page, applyDownWidgetCellIndex, 'input[type="checkbox"]', 30000); + await waitForWidget(page, applyDownWidgetCellIndex, 'button:has-text("Cluster Down")', 10000); await waitForWidget(page, applyDownWidgetCellIndex, 'button:has-text("Cluster Apply")', 10000); @@ -71,7 +72,7 @@ test.describe("Widget Functionality", () => { }); // Test Cluster Apply button WITHOUT the wait_ready checkbox checked - // This avoids the 300s TLS timeout + dashboard_check issues in KinD + // This avoids the long TLS timeout + dashboard_check issues in KinD await interactWithWidget(page, applyDownWidgetCellIndex, 'button:has-text("Cluster Apply")', async (button) => { await button.click(); @@ -81,6 +82,10 @@ test.describe("Widget Functionality", () => { expect(successMessage).not.toBeNull(); }); + // Wait for apply() to complete (widget uses 60s TLS timeout) + // We wait for either success or failure message from the TLS setup phase + await page.waitForSelector('text=/Cluster .widgettest. (is ready|resources applied but TLS setup incomplete)/', { timeout: 90000 }); + // Test view_clusters widget const viewClustersCellIndex = 4; // 5 on OpenShift await page.notebook.runCell(cellCount - 2, true); @@ -104,8 +109,8 @@ test.describe("Widget Functionality", () => { // Test Delete Cluster button to clean up await interactWithWidget(page, viewClustersCellIndex, 'button:has-text("Delete Cluster")', async (button) => { await button.click(); - // Wait for deletion confirmation - const successMessage = await page.waitForSelector(`text=Cluster widgettest in the ${namespace} namespace was deleted successfully.`, { timeout: 10000 }); + // Wait for deletion confirmation - increase timeout as cluster deletion can take time + const successMessage = await page.waitForSelector(`text=Cluster widgettest in the ${namespace} namespace was deleted successfully.`, { timeout: 30000 }); expect(successMessage).not.toBeNull(); }); }); @@ -114,18 +119,21 @@ test.describe("Widget Functionality", () => { async function waitForWidget(page, cellIndex: number, widgetSelector: string, timeout = 5000) { const widgetCell = await page.notebook.getCellOutput(cellIndex); - if (widgetCell) { - await widgetCell.waitForSelector(widgetSelector, { timeout }); + if (!widgetCell) { + throw new Error(`Cell ${cellIndex} has no output - widgets not rendered. Check if is_notebook() detection is working.`); } + await widgetCell.waitForSelector(widgetSelector, { timeout }); } async function interactWithWidget(page, cellIndex: number, widgetSelector: string, action: (widget) => Promise) { const widgetCell = await page.notebook.getCellOutput(cellIndex); - if (widgetCell) { - const widget = await widgetCell.$(widgetSelector); - if (widget) { - await action(widget); - } + if (!widgetCell) { + throw new Error(`Cell ${cellIndex} has no output - cannot interact with widget.`); + } + const widget = await widgetCell.$(widgetSelector); + if (!widget) { + throw new Error(`Widget '${widgetSelector}' not found in cell ${cellIndex}.`); } + await action(widget); }