Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions .github/workflows/ui_notebooks_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,38 @@ jobs:
poetry run yarn playwright install chromium
working-directory: ui-tests

- name: Verify Kind cluster connectivity
run: |
echo "Checking kubeconfig location..."
echo "HOME: $HOME"
ls -la ~/.kube/ || echo "No .kube directory found"

echo "Checking kubeconfig format..."
kubectl config view --minify

# Check if kubeconfig has embedded certs or file references
if grep -q "certificate-authority:" ~/.kube/config 2>/dev/null; then
echo "WARNING: kubeconfig uses external certificate files"
fi
if grep -q "certificate-authority-data:" ~/.kube/config 2>/dev/null; then
echo "OK: kubeconfig has embedded certificate data"
fi

echo "Testing cluster connectivity..."
kubectl cluster-info
kubectl get nodes
kubectl get pods -A

# Test Python kubernetes client connectivity
echo "Testing Python kubernetes client..."
python3 -c "
from kubernetes import client, config
config.load_kube_config()
v1 = client.CoreV1Api()
nodes = v1.list_node()
print(f'Successfully connected! Found {len(nodes.items)} nodes')
"

- name: Fix 3_widget_example.ipynb notebook for test
run: |
# Remove login/logout cells, as KinD doesn't support authentication using token
Expand All @@ -118,6 +150,13 @@ jobs:
run: |
set -euo pipefail

# Verify kubectl can connect to the cluster
kubectl cluster-info
kubectl get nodes

# Debug: Show kubeconfig location
echo "KUBECONFIG: ${KUBECONFIG:-~/.kube/config}"

poetry run yarn test
working-directory: ui-tests

Expand Down
6 changes: 3 additions & 3 deletions demo-notebooks/additional-demos/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
pytorch_lightning==1.9.5
pytorch_lightning==2.6.0
ray_lightning
torchmetrics==0.9.1
torchvision==0.19.0
torchmetrics==1.8.2
torchvision==0.20.1
minio
28 changes: 20 additions & 8 deletions src/codeflare_sdk/common/kubernetes_cluster/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,15 +284,27 @@ def config_check() -> str:
return config_path


def _client_with_cert(client: client.ApiClient, ca_cert_path: Optional[str] = None):
client.configuration.verify_ssl = True
def _client_with_cert(api_client: client.ApiClient, ca_cert_path: Optional[str] = None):
"""
Configure SSL certificate verification for a Kubernetes API client.

If a custom CA cert path is provided or configured via environment variable,
it will be used. Otherwise, the existing ssl_ca_cert configuration from the
kubeconfig (which may include embedded certificates) is preserved.

Args:
api_client: The Kubernetes API client to configure.
ca_cert_path: Optional path to a custom CA certificate file.
"""
api_client.configuration.verify_ssl = True
cert_path = _gen_ca_cert_path(ca_cert_path)
if cert_path is None:
client.configuration.ssl_ca_cert = None
elif os.path.isfile(cert_path):
client.configuration.ssl_ca_cert = cert_path
else:
raise FileNotFoundError(f"Certificate file not found at {cert_path}")
if cert_path is not None:
if os.path.isfile(cert_path):
api_client.configuration.ssl_ca_cert = cert_path
else:
raise FileNotFoundError(f"Certificate file not found at {cert_path}")
# If cert_path is None, preserve the existing ssl_ca_cert from kubeconfig
# (which may contain embedded certificate data from certificate-authority-data)


def _gen_ca_cert_path(ca_cert_path: Optional[str]):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,17 +77,20 @@ def test_gen_ca_cert_path_defaults_to_none(monkeypatch):


def test_client_with_cert_none(mocker, monkeypatch):
"""Test _client_with_cert when cert_path is None."""
"""Test _client_with_cert when cert_path is None preserves existing ssl_ca_cert."""
# Clear env var to ensure we're testing the None case
monkeypatch.delenv("CF_SDK_CA_CERT_PATH", raising=False)

mock_client = mocker.MagicMock()
mock_client.configuration = mocker.MagicMock()
# Set an existing ssl_ca_cert to verify it's preserved
mock_client.configuration.ssl_ca_cert = "/existing/ca.crt"

_client_with_cert(mock_client, None)

assert mock_client.configuration.verify_ssl is True
assert mock_client.configuration.ssl_ca_cert is None
# ssl_ca_cert should be preserved (not overwritten to None)
assert mock_client.configuration.ssl_ca_cert == "/existing/ca.crt"


def test_client_with_cert_file_not_found(mocker):
Expand All @@ -99,6 +102,22 @@ def test_client_with_cert_file_not_found(mocker):
_client_with_cert(mock_client, "/nonexistent/cert.crt")


def test_client_with_cert_valid_file(mocker, tmp_path):
"""Test _client_with_cert sets ssl_ca_cert when valid cert file exists."""
# Create a temporary cert file
cert_file = tmp_path / "ca.crt"
cert_file.write_text("-----BEGIN CERTIFICATE-----\ntest\n-----END CERTIFICATE-----")

mock_client = mocker.MagicMock()
mock_client.configuration = mocker.MagicMock()
mock_client.configuration.ssl_ca_cert = None

_client_with_cert(mock_client, str(cert_file))

assert mock_client.configuration.verify_ssl is True
assert mock_client.configuration.ssl_ca_cert == str(cert_file)


def test_token_auth_exception_handling(mocker):
"""Test TokenAuthentication.login() handles ApiException."""
from kubernetes.client import ApiException
Expand Down
44 changes: 42 additions & 2 deletions src/codeflare_sdk/common/widgets/test_widgets.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,13 +76,16 @@ def test_cluster_apply_down_buttons(mocker):

# Check if the `apply` and `down` methods were called
mock_wait_ready.assert_called_once()
mock_apply.assert_called_once()
# Widget button uses shorter TLS timeout (60s) to avoid blocking UI
mock_apply.assert_called_once_with(timeout=60)
mock_down.assert_called_once()


@patch.dict("os.environ", {}, clear=True) # Mock environment with no variables
def test_is_notebook_false():
assert cf_widgets.is_notebook() is False
# Mock get_ipython to return None (no IPython shell)
with patch("IPython.get_ipython", return_value=None):
assert cf_widgets.is_notebook() is False


@patch.dict(
Expand All @@ -92,6 +95,43 @@ def test_is_notebook_true():
assert cf_widgets.is_notebook() is True


@patch.dict("os.environ", {}, clear=True) # No env vars
def test_is_notebook_with_zmq_shell():
"""Test is_notebook returns True when running in ZMQInteractiveShell (Jupyter)."""
mock_shell = MagicMock()
mock_shell.__class__.__name__ = "ZMQInteractiveShell"

with patch("IPython.get_ipython", return_value=mock_shell):
assert cf_widgets.is_notebook() is True


@patch.dict("os.environ", {}, clear=True) # No env vars
def test_is_notebook_with_terminal_shell():
"""Test is_notebook returns False when running in TerminalInteractiveShell (ipython CLI)."""
mock_shell = MagicMock()
mock_shell.__class__.__name__ = "TerminalInteractiveShell"

with patch("IPython.get_ipython", return_value=mock_shell):
assert cf_widgets.is_notebook() is False


@patch.dict(
"os.environ", {"JPY_PARENT_PID": "12345"}
) # Standard Jupyter environment variable
def test_is_notebook_with_jpy_parent_pid():
"""Test is_notebook returns True when JPY_PARENT_PID env var is set."""
# Mock get_ipython to return None to test env var fallback
with patch("IPython.get_ipython", return_value=None):
assert cf_widgets.is_notebook() is True


@patch.dict("os.environ", {}, clear=True) # No env vars
def test_is_notebook_ipython_import_error():
"""Test is_notebook handles ImportError gracefully."""
with patch.dict("sys.modules", {"IPython": None}):
assert cf_widgets.is_notebook() is False


def test_view_clusters(mocker, capsys):
# If is not a notebook environment, a warning should be raised
with pytest.warns(
Expand Down
31 changes: 26 additions & 5 deletions src/codeflare_sdk/common/widgets/widgets.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,9 @@ def cluster_apply_down_buttons(
def on_apply_button_clicked(b): # Handle the apply button click event
with output:
output.clear_output()
cluster.apply()
# Use shorter TLS timeout (60s) for widget button clicks to avoid blocking UI
# Users who need full TLS wait can use wait_ready() checkbox or call apply() directly
cluster.apply(timeout=60)

# If the wait_ready Checkbox is clicked(value == True) trigger the wait_ready function
if wait_ready_check.value:
Expand Down Expand Up @@ -327,15 +329,34 @@ def _wait_ready_check_box():

def is_notebook() -> bool:
"""
The is_notebook function checks if Jupyter Notebook environment variables exist in the given environment and return True/False based on that.
The is_notebook function checks if we're running in a Jupyter Notebook environment.

Detection methods:
1. Check for IPython's ZMQInteractiveShell (standard Jupyter kernel)
2. Check for known environment variables (VSCode, RHOAI/ODH)
"""
# First, try the standard IPython detection method
try:
from IPython import get_ipython

shell = get_ipython()
if shell is not None:
shell_class = shell.__class__.__name__
# ZMQInteractiveShell = Jupyter notebook/lab, qtconsole
if shell_class == "ZMQInteractiveShell":
return True
except (ImportError, NameError):
pass

# Fallback: check for known environment variables
if (
"PYDEVD_IPYTHON_COMPATIBLE_DEBUGGING" in os.environ
or "JPY_SESSION_NAME" in os.environ
): # If running Jupyter NBs in VsCode or RHOAI/ODH display UI buttons
or "JPY_PARENT_PID" in os.environ # Standard Jupyter
):
return True
else:
return False

return False


def view_clusters(namespace: str = None):
Expand Down
12 changes: 7 additions & 5 deletions tests/e2e/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,13 @@
print("MASTER_ADDR: is ", os.getenv("MASTER_ADDR"))
print("MASTER_PORT: is ", os.getenv("MASTER_PORT"))

print("ACCELERATOR: is ", os.getenv("ACCELERATOR"))
ACCELERATOR = os.getenv("ACCELERATOR")
# Get accelerator from environment variable, default to "auto" for pytorch_lightning 2.x compatibility
# Valid values: auto, cuda, tpu, cpu, mps, gpu
ACCELERATOR = os.getenv("ACCELERATOR", "auto")
print("ACCELERATOR: is ", ACCELERATOR)

# If GPU is requested but CUDA is not available, fall back to CPU
if ACCELERATOR == "gpu" and not torch.cuda.is_available():
# If GPU/CUDA is requested but CUDA is not available, fall back to CPU
if ACCELERATOR in ("gpu", "cuda") and not torch.cuda.is_available():
print("Warning: GPU requested but CUDA is not available. Falling back to CPU.")
ACCELERATOR = "cpu"

Expand Down Expand Up @@ -249,7 +251,7 @@ def test_dataloader(self):
callbacks=[TQDMProgressBar(refresh_rate=20)],
num_nodes=int(os.environ.get("GROUP_WORLD_SIZE", 1)),
devices=int(os.environ.get("LOCAL_WORLD_SIZE", 1)),
replace_sampler_ddp=False,
use_distributed_sampler=False,
strategy="ddp",
)

Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/mnist_pip_requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.7.1+cu118
torchvision==0.22.1+cu118
pytorch_lightning==1.9.5
pytorch_lightning==2.6.0
torchmetrics==1.8.2
minio
28 changes: 18 additions & 10 deletions ui-tests/tests/widget_notebook_example.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ test.describe("Widget Functionality", () => {

// Verify widgets render correctly
await waitForWidget(page, applyDownWidgetCellIndex, 'input[type="checkbox"]', 30000);

await waitForWidget(page, applyDownWidgetCellIndex, 'button:has-text("Cluster Down")', 10000);
await waitForWidget(page, applyDownWidgetCellIndex, 'button:has-text("Cluster Apply")', 10000);

Expand All @@ -71,7 +72,7 @@ test.describe("Widget Functionality", () => {
});

// Test Cluster Apply button WITHOUT the wait_ready checkbox checked
// This avoids the 300s TLS timeout + dashboard_check issues in KinD
// This avoids the long TLS timeout + dashboard_check issues in KinD
await interactWithWidget(page, applyDownWidgetCellIndex, 'button:has-text("Cluster Apply")', async (button) => {
await button.click();

Expand All @@ -81,6 +82,10 @@ test.describe("Widget Functionality", () => {
expect(successMessage).not.toBeNull();
});

// Wait for apply() to complete (widget uses 60s TLS timeout)
// We wait for either success or failure message from the TLS setup phase
await page.waitForSelector('text=/Cluster .widgettest. (is ready|resources applied but TLS setup incomplete)/', { timeout: 90000 });

// Test view_clusters widget
const viewClustersCellIndex = 4; // 5 on OpenShift
await page.notebook.runCell(cellCount - 2, true);
Expand All @@ -104,8 +109,8 @@ test.describe("Widget Functionality", () => {
// Test Delete Cluster button to clean up
await interactWithWidget(page, viewClustersCellIndex, 'button:has-text("Delete Cluster")', async (button) => {
await button.click();
// Wait for deletion confirmation
const successMessage = await page.waitForSelector(`text=Cluster widgettest in the ${namespace} namespace was deleted successfully.`, { timeout: 10000 });
// Wait for deletion confirmation - increase timeout as cluster deletion can take time
const successMessage = await page.waitForSelector(`text=Cluster widgettest in the ${namespace} namespace was deleted successfully.`, { timeout: 30000 });
expect(successMessage).not.toBeNull();
});
});
Expand All @@ -114,18 +119,21 @@ test.describe("Widget Functionality", () => {
async function waitForWidget(page, cellIndex: number, widgetSelector: string, timeout = 5000) {
const widgetCell = await page.notebook.getCellOutput(cellIndex);

if (widgetCell) {
await widgetCell.waitForSelector(widgetSelector, { timeout });
if (!widgetCell) {
throw new Error(`Cell ${cellIndex} has no output - widgets not rendered. Check if is_notebook() detection is working.`);
}
await widgetCell.waitForSelector(widgetSelector, { timeout });
}

async function interactWithWidget(page, cellIndex: number, widgetSelector: string, action: (widget) => Promise<void>) {
const widgetCell = await page.notebook.getCellOutput(cellIndex);

if (widgetCell) {
const widget = await widgetCell.$(widgetSelector);
if (widget) {
await action(widget);
}
if (!widgetCell) {
throw new Error(`Cell ${cellIndex} has no output - cannot interact with widget.`);
}
const widget = await widgetCell.$(widgetSelector);
if (!widget) {
throw new Error(`Widget '${widgetSelector}' not found in cell ${cellIndex}.`);
}
await action(widget);
}
Loading