Merge pull request #24 from NVIDIA-NeMo/pablo-garay/add-cicd-testing

pablo-garay · web-flow · commit 9d74ab18e54c · 2025-11-04T10:59:24.000-08:00
CPU & GPU: init + sample tests
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
@@ -19,6 +19,7 @@ on:
     branches:
       - main
       - "pull-request/[0-9]+"
+      - "r[0-9]+.[0-9]+.[0-9]+"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci
@@ -11,8 +11,26 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-FROM nvcr.io/nvidia/pytorch:25.05-py3
-
-ENV PIP_CONSTRAINT=""
+FROM nvcr.io/nvidia/pytorch:25.09-py3
 
 WORKDIR /workspace
+
+# Install uv
+ENV UV_VERSION="0.8.22"
+RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh
+ENV PATH="/root/.local/bin:$PATH"
+
+# Set up virtual environment for uv
+ENV UV_PROJECT_ENVIRONMENT=/opt/venv
+ENV UV_CACHE_DIR=/opt/uv_cache
+ENV PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH"
+ENV UV_LINK_MODE=copy
+
+# Create virtual environment
+RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
+
+# Copy dependency files
+COPY pyproject.toml uv.lock ./
+
+# Install test dependencies using uv sync
+RUN uv sync --link-mode copy --locked --group test --no-install-project
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -19,39 +19,16 @@
 def pytest_addoption(parser):
     """
     Additional command-line arguments passed to pytest.
-    For now:
-        --cpu: use CPU during testing (DEFAULT: GPU)
-        --use_local_test_data: use local test data/skip downloading from URL/GitHub (DEFAULT: False)
     """
-    parser.addoption(
-        "--cpu", action="store_true", help="pass that argument to use CPU during testing (DEFAULT: False = GPU)"
-    )
     parser.addoption(
         "--with_downloads",
         action="store_true",
         help="pass this argument to active tests which download models from the cloud.",
     )
 
 
-@pytest.fixture
-def device(request):
-    """Simple fixture returning string denoting the device [CPU | GPU]"""
-    if request.config.getoption("--cpu"):
-        return "CPU"
-    else:
-        return "GPU"
-
-
-@pytest.fixture(autouse=True)
-def run_only_on_device_fixture(request, device):
-    """Fixture to skip tests based on the device"""
-    if request.node.get_closest_marker("run_only_on"):
-        if request.node.get_closest_marker("run_only_on").args[0] != device:
-            pytest.skip("skipped on this device: {}".format(device))
-
-
 @pytest.fixture(autouse=True)
-def downloads_weights(request, device):
+def downloads_weights(request):
     """Fixture to validate if the with_downloads flag is passed if necessary"""
     if request.node.get_closest_marker("with_downloads"):
         if not request.config.getoption("--with_downloads"):
@@ -77,15 +54,16 @@ def reset_env_vars():
 def pytest_configure(config):
     """
     Initial configuration of conftest.
-    The function checks if test_data.tar.gz is present in tests/.data.
-    If so, compares its size with github's test_data.tar.gz.
-    If file absent or sizes not equal, function downloads the archive from github and unpacks it.
+
+    Note: DFM separates CPU and GPU test runs without device-specific markers:
+    GPU visibility is controlled by CUDA_VISIBLE_DEVICES
+    in the shell scripts (L0_Unit_Tests_CPU.sh and L0_Unit_Tests_GPU.sh).
     """
     config.addinivalue_line(
         "markers",
-        "run_only_on(device): runs the test only on a given device [CPU | GPU]",
+        "with_downloads: runs the test using data present in tests/.data",
     )
     config.addinivalue_line(
         "markers",
-        "with_downloads: runs the test using data present in tests/.data",
+        "pleasefixme: marks test as needing fixes (will be skipped in CI)",
     )
diff --git a/tests/unit_tests/L0_Unit_Tests_CPU.sh b/tests/unit_tests/L0_Unit_Tests_CPU.sh
@@ -11,4 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-CUDA_VISIBLE_DEVICES="" coverage run -a --data-file=/workspace/.coverage --source=/workspace/ -m pytest tests/unit_tests -m "not pleasefixme" --cpu --with_downloads
+
+# Hide GPU from PyTorch by setting CUDA_VISIBLE_DEVICES to empty
+# This makes torch.cuda.is_available() return False
+CUDA_VISIBLE_DEVICES="" coverage run -a --data-file=/workspace/.coverage --source=/workspace/ -m pytest tests/unit_tests -m "not pleasefixme" --with_downloads
diff --git a/tests/unit_tests/test_cpu_sample.py b/tests/unit_tests/test_cpu_sample.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+
+def test_cpu_tensor_operations():
+    """Test basic CPU tensor operations"""
+    # Create tensors on CPU
+    a = torch.tensor([1.0, 2.0, 3.0], device="cpu")
+    b = torch.tensor([4.0, 5.0, 6.0], device="cpu")
+
+    # Test addition
+    c = a + b
+    expected = torch.tensor([5.0, 7.0, 9.0])
+    assert torch.allclose(c, expected), f"Expected {expected}, got {c}"
+
+    # Test multiplication
+    d = a * b
+    expected = torch.tensor([4.0, 10.0, 18.0])
+    assert torch.allclose(d, expected), f"Expected {expected}, got {d}"
+
+    print("✓ CPU tensor operations test passed")
+
+
+def test_cpu_only_environment():
+    """Verify that CUDA is not visible in CPU-only environment"""
+    # In CPU tests, CUDA should not be available or visible
+    # When CUDA_VISIBLE_DEVICES="" is set, torch.cuda.is_available() should be False
+    print(f"CUDA available: {torch.cuda.is_available()}")
+    print(f"CUDA device count: {torch.cuda.device_count()}")
+    print("✓ CPU environment test completed")
+
+
+def test_cpu_matrix_multiplication():
+    """Test matrix multiplication on CPU"""
+    # Create random matrices
+    matrix_a = torch.randn(10, 20, device="cpu")
+    matrix_b = torch.randn(20, 30, device="cpu")
+
+    # Perform matrix multiplication
+    result = torch.matmul(matrix_a, matrix_b)
+
+    # Verify shape
+    assert result.shape == (10, 30), f"Expected shape (10, 30), got {result.shape}"
+
+    # Verify result is finite
+    assert torch.isfinite(result).all(), "Result contains non-finite values"
+
+    print("✓ CPU matrix multiplication test passed")
+
+
+def test_pytorch_version():
+    """Test that PyTorch is properly installed"""
+    print(f"PyTorch version: {torch.__version__}")
+    assert torch.__version__ is not None, "PyTorch version not found"
+    print("✓ PyTorch version check passed")
diff --git a/tests/unit_tests/test_gpu_sample.py b/tests/unit_tests/test_gpu_sample.py
@@ -0,0 +1,162 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+
+
+def test_gpu_availability():
+    """Test that GPU is available and accessible"""
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+
+    gpu_count = torch.cuda.device_count()
+    print(f"Number of GPUs available: {gpu_count}")
+    assert gpu_count >= 1, f"Expected at least 1 GPU, found {gpu_count}"
+
+    # Print GPU information
+    for i in range(gpu_count):
+        gpu_name = torch.cuda.get_device_name(i)
+        print(f"GPU {i}: {gpu_name}")
+        print(f"  Memory allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
+        print(f"  Memory reserved: {torch.cuda.memory_reserved(i) / 1024**2:.2f} MB")
+
+    print("✓ GPU availability test passed")
+
+
+def test_gpu_tensor_operations():
+    """Test basic GPU tensor operations"""
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+
+    # Create tensors on GPU
+    device = torch.device("cuda:0")
+    a = torch.tensor([1.0, 2.0, 3.0], device=device)
+    b = torch.tensor([4.0, 5.0, 6.0], device=device)
+
+    # Verify tensors are on GPU
+    assert a.is_cuda, "Tensor a is not on GPU"
+    assert b.is_cuda, "Tensor b is not on GPU"
+
+    # Test addition
+    c = a + b
+    expected = torch.tensor([5.0, 7.0, 9.0], device=device)
+    assert torch.allclose(c, expected), f"Expected {expected}, got {c}"
+    assert c.is_cuda, "Result tensor is not on GPU"
+
+    # Test multiplication
+    d = a * b
+    expected = torch.tensor([4.0, 10.0, 18.0], device=device)
+    assert torch.allclose(d, expected), f"Expected {expected}, got {d}"
+
+    print("✓ GPU tensor operations test passed")
+
+
+def test_gpu_matrix_multiplication():
+    """Test matrix multiplication on GPU"""
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+
+    device = torch.device("cuda:0")
+
+    # Create random matrices on GPU
+    matrix_a = torch.randn(100, 200, device=device)
+    matrix_b = torch.randn(200, 300, device=device)
+
+    # Perform matrix multiplication
+    result = torch.matmul(matrix_a, matrix_b)
+
+    # Verify shape
+    assert result.shape == (100, 300), f"Expected shape (100, 300), got {result.shape}"
+
+    # Verify result is on GPU
+    assert result.is_cuda, "Result is not on GPU"
+
+    # Verify result is finite
+    assert torch.isfinite(result).all(), "Result contains non-finite values"
+
+    print("✓ GPU matrix multiplication test passed")
+
+
+def test_multi_gpu_tensor_transfer():
+    """Test tensor transfer between GPUs if multiple GPUs are available"""
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+
+    gpu_count = torch.cuda.device_count()
+    print(f"Testing with {gpu_count} GPU(s)")
+
+    if gpu_count < 2:
+        print("Only 1 GPU available, testing single GPU operations")
+        device = torch.device("cuda:0")
+        tensor = torch.randn(10, 10, device=device)
+        assert tensor.is_cuda, "Tensor is not on GPU"
+    else:
+        print("Multiple GPUs available, testing cross-GPU transfer")
+        # Create tensor on GPU 0
+        tensor_gpu0 = torch.randn(10, 10, device="cuda:0")
+        assert tensor_gpu0.device.index == 0, "Tensor not on GPU 0"
+
+        # Transfer to GPU 1
+        tensor_gpu1 = tensor_gpu0.to("cuda:1")
+        assert tensor_gpu1.device.index == 1, "Tensor not on GPU 1"
+
+        # Verify data is preserved
+        assert torch.allclose(tensor_gpu0.cpu(), tensor_gpu1.cpu()), "Data changed during transfer"
+
+    print("✓ Multi-GPU tensor transfer test passed")
+
+
+def test_gpu_memory_allocation():
+    """Test GPU memory allocation and deallocation"""
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+
+    device = torch.device("cuda:0")
+
+    # Record initial memory
+    torch.cuda.empty_cache()
+    initial_memory = torch.cuda.memory_allocated(0)
+    print(f"Initial GPU memory allocated: {initial_memory / 1024**2:.2f} MB")
+
+    # Allocate large tensor
+    large_tensor = torch.randn(1000, 1000, device=device)
+    memory_after_alloc = torch.cuda.memory_allocated(0)
+    print(f"Memory after allocation: {memory_after_alloc / 1024**2:.2f} MB")
+
+    # Verify memory increased
+    assert memory_after_alloc > initial_memory, "GPU memory did not increase after allocation"
+
+    # Delete tensor and clear cache
+    del large_tensor
+    torch.cuda.empty_cache()
+    memory_after_dealloc = torch.cuda.memory_allocated(0)
+    print(f"Memory after deallocation: {memory_after_dealloc / 1024**2:.2f} MB")
+
+    print("✓ GPU memory allocation test passed")
+
+
+def test_cuda_compute_capability():
+    """Test CUDA compute capability"""
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+
+    for i in range(torch.cuda.device_count()):
+        capability = torch.cuda.get_device_capability(i)
+        print(f"GPU {i} compute capability: {capability[0]}.{capability[1]}")
+
+        # Verify compute capability is reasonable (major version at least 3)
+        assert capability[0] >= 3, f"GPU {i} compute capability too old: {capability}"
+
+    print("✓ CUDA compute capability test passed")
diff --git a/tests/unit_tests/test_placeholder.py b/tests/unit_tests/test_placeholder.py