Skip to content
This repository was archived by the owner on May 20, 2026. It is now read-only.

Commit 9d74ab1

Browse files
authored
Merge pull request #24 from NVIDIA-NeMo/pablo-garay/add-cicd-testing
CPU & GPU: init + sample tests
2 parents 23d4a67 + 5888465 commit 9d74ab1

7 files changed

Lines changed: 263 additions & 51 deletions

File tree

.github/workflows/cicd-main.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ on:
1919
branches:
2020
- main
2121
- "pull-request/[0-9]+"
22+
- "r[0-9]+.[0-9]+.[0-9]+"
2223

2324
concurrency:
2425
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}

docker/Dockerfile.ci

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,26 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
FROM nvcr.io/nvidia/pytorch:25.05-py3
15-
16-
ENV PIP_CONSTRAINT=""
14+
FROM nvcr.io/nvidia/pytorch:25.09-py3
1715

1816
WORKDIR /workspace
17+
18+
# Install uv
19+
ENV UV_VERSION="0.8.22"
20+
RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh
21+
ENV PATH="/root/.local/bin:$PATH"
22+
23+
# Set up virtual environment for uv
24+
ENV UV_PROJECT_ENVIRONMENT=/opt/venv
25+
ENV UV_CACHE_DIR=/opt/uv_cache
26+
ENV PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH"
27+
ENV UV_LINK_MODE=copy
28+
29+
# Create virtual environment
30+
RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
31+
32+
# Copy dependency files
33+
COPY pyproject.toml uv.lock ./
34+
35+
# Install test dependencies using uv sync
36+
RUN uv sync --link-mode copy --locked --group test --no-install-project

tests/conftest.py

Lines changed: 7 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -19,39 +19,16 @@
1919
def pytest_addoption(parser):
2020
"""
2121
Additional command-line arguments passed to pytest.
22-
For now:
23-
--cpu: use CPU during testing (DEFAULT: GPU)
24-
--use_local_test_data: use local test data/skip downloading from URL/GitHub (DEFAULT: False)
2522
"""
26-
parser.addoption(
27-
"--cpu", action="store_true", help="pass that argument to use CPU during testing (DEFAULT: False = GPU)"
28-
)
2923
parser.addoption(
3024
"--with_downloads",
3125
action="store_true",
3226
help="pass this argument to active tests which download models from the cloud.",
3327
)
3428

3529

36-
@pytest.fixture
37-
def device(request):
38-
"""Simple fixture returning string denoting the device [CPU | GPU]"""
39-
if request.config.getoption("--cpu"):
40-
return "CPU"
41-
else:
42-
return "GPU"
43-
44-
45-
@pytest.fixture(autouse=True)
46-
def run_only_on_device_fixture(request, device):
47-
"""Fixture to skip tests based on the device"""
48-
if request.node.get_closest_marker("run_only_on"):
49-
if request.node.get_closest_marker("run_only_on").args[0] != device:
50-
pytest.skip("skipped on this device: {}".format(device))
51-
52-
5330
@pytest.fixture(autouse=True)
54-
def downloads_weights(request, device):
31+
def downloads_weights(request):
5532
"""Fixture to validate if the with_downloads flag is passed if necessary"""
5633
if request.node.get_closest_marker("with_downloads"):
5734
if not request.config.getoption("--with_downloads"):
@@ -77,15 +54,16 @@ def reset_env_vars():
7754
def pytest_configure(config):
7855
"""
7956
Initial configuration of conftest.
80-
The function checks if test_data.tar.gz is present in tests/.data.
81-
If so, compares its size with github's test_data.tar.gz.
82-
If file absent or sizes not equal, function downloads the archive from github and unpacks it.
57+
58+
Note: DFM uses the following pattern for CPU/GPU test separation:
59+
Tests don't use markers - GPU visibility is controlled by CUDA_VISIBLE_DEVICES
60+
in the shell scripts (L0_Unit_Tests_CPU.sh and L0_Unit_Tests_GPU.sh).
8361
"""
8462
config.addinivalue_line(
8563
"markers",
86-
"run_only_on(device): runs the test only on a given device [CPU | GPU]",
64+
"with_downloads: runs the test using data present in tests/.data",
8765
)
8866
config.addinivalue_line(
8967
"markers",
90-
"with_downloads: runs the test using data present in tests/.data",
68+
"pleasefixme: marks test as needing fixes (will be skipped in CI)",
9169
)

tests/unit_tests/L0_Unit_Tests_CPU.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,7 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
CUDA_VISIBLE_DEVICES="" coverage run -a --data-file=/workspace/.coverage --source=/workspace/ -m pytest tests/unit_tests -m "not pleasefixme" --cpu --with_downloads
14+
15+
# Hide GPU from PyTorch by setting CUDA_VISIBLE_DEVICES to empty
16+
# This makes torch.cuda.is_available() return False
17+
CUDA_VISIBLE_DEVICES="" coverage run -a --data-file=/workspace/.coverage --source=/workspace/ -m pytest tests/unit_tests -m "not pleasefixme" --with_downloads
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import torch
16+
17+
18+
def test_cpu_tensor_operations():
    """Check element-wise addition and multiplication of two CPU tensors."""
    # Both operands are placed on the CPU explicitly.
    a = torch.tensor([1.0, 2.0, 3.0], device="cpu")
    b = torch.tensor([4.0, 5.0, 6.0], device="cpu")

    # Element-wise sum.
    c = torch.add(a, b)
    expected = torch.tensor([5.0, 7.0, 9.0])
    assert torch.allclose(c, expected), f"Expected {expected}, got {c}"

    # Element-wise product.
    d = torch.mul(a, b)
    expected = torch.tensor([4.0, 10.0, 18.0])
    assert torch.allclose(d, expected), f"Expected {expected}, got {d}"

    print("✓ CPU tensor operations test passed")
35+
36+
37+
def test_cpu_only_environment():
    """Report CUDA visibility and verify it is hidden when GPUs are masked.

    The CUDA availability flag and device count are always printed. They are
    only asserted when ``CUDA_VISIBLE_DEVICES`` is set to the empty string, so
    the test remains a pure report in any other environment.
    """
    import os  # local import: the only use of ``os`` is inside this test

    cuda_available = torch.cuda.is_available()
    device_count = torch.cuda.device_count()
    print(f"CUDA available: {cuda_available}")
    print(f"CUDA device count: {device_count}")

    # An empty CUDA_VISIBLE_DEVICES hides every GPU from the process, so
    # PyTorch must report no CUDA support at all in that configuration.
    if os.environ.get("CUDA_VISIBLE_DEVICES") == "":
        assert not cuda_available, "CUDA is available although CUDA_VISIBLE_DEVICES is empty"
        assert device_count == 0, f"Expected 0 CUDA devices, found {device_count}"

    print("✓ CPU environment test completed")
44+
45+
46+
def test_cpu_matrix_multiplication():
    """Multiply two random CPU matrices and check the product's shape and values."""
    # Random operands of shape (10, 20) and (20, 30).
    lhs = torch.randn(10, 20, device="cpu")
    rhs = torch.randn(20, 30, device="cpu")

    result = lhs @ rhs

    # A (10, 20) x (20, 30) product must have shape (10, 30).
    assert result.shape == (10, 30), f"Expected shape (10, 30), got {result.shape}"

    # No NaN or infinity may appear in the product.
    assert torch.isfinite(result).all(), "Result contains non-finite values"

    print("✓ CPU matrix multiplication test passed")
62+
63+
64+
def test_pytorch_version():
    """Check that the imported PyTorch module reports a version."""
    print(f"PyTorch version: {torch.__version__}")
    version_is_set = torch.__version__ is not None
    assert version_is_set, "PyTorch version not found"
    print("✓ PyTorch version check passed")
Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
import torch
17+
18+
19+
def test_gpu_availability():
    """Check that at least one CUDA device is visible and report each one.

    Skipped when CUDA is not available.
    """
    cuda_ready = torch.cuda.is_available()
    if not cuda_ready:
        pytest.skip("CUDA not available")

    gpu_count = torch.cuda.device_count()
    print(f"Number of GPUs available: {gpu_count}")
    assert gpu_count > 0, f"Expected at least 1 GPU, found {gpu_count}"

    # Report the name and current memory usage (in MB) of every visible device.
    for i in range(gpu_count):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {gpu_name}")
        print(f"  Memory allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
        print(f"  Memory reserved: {torch.cuda.memory_reserved(i) / 1024**2:.2f} MB")

    print("✓ GPU availability test passed")
36+
37+
38+
def test_gpu_tensor_operations():
    """Check element-wise addition and multiplication on the first CUDA device.

    Skipped when CUDA is not available.
    """
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    # Both operands are created directly on cuda:0.
    device = torch.device("cuda:0")
    a, b = (
        torch.tensor(values, device=device)
        for values in ([1.0, 2.0, 3.0], [4.0, 5.0, 6.0])
    )

    # The operands must actually live on the GPU.
    assert a.is_cuda, "Tensor a is not on GPU"
    assert b.is_cuda, "Tensor b is not on GPU"

    # Element-wise sum; the result must stay on the GPU.
    c = torch.add(a, b)
    expected = torch.tensor([5.0, 7.0, 9.0], device=device)
    assert torch.allclose(c, expected), f"Expected {expected}, got {c}"
    assert c.is_cuda, "Result tensor is not on GPU"

    # Element-wise product.
    d = torch.mul(a, b)
    expected = torch.tensor([4.0, 10.0, 18.0], device=device)
    assert torch.allclose(d, expected), f"Expected {expected}, got {d}"

    print("✓ GPU tensor operations test passed")
64+
65+
66+
def test_gpu_matrix_multiplication():
    """Multiply two random matrices on the first CUDA device and check the product.

    Skipped when CUDA is not available.
    """
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    device = torch.device("cuda:0")

    # Random operands of shape (100, 200) and (200, 300), created on the GPU.
    lhs = torch.randn(100, 200, device=device)
    rhs = torch.randn(200, 300, device=device)

    result = lhs @ rhs

    # A (100, 200) x (200, 300) product must have shape (100, 300).
    assert result.shape == (100, 300), f"Expected shape (100, 300), got {result.shape}"

    # The product must stay on the GPU.
    assert result.is_cuda, "Result is not on GPU"

    # No NaN or infinity may appear in the product.
    assert torch.isfinite(result).all(), "Result contains non-finite values"

    print("✓ GPU matrix multiplication test passed")
90+
91+
92+
def test_multi_gpu_tensor_transfer():
    """Check tensor placement, and cross-device copies when two or more GPUs exist.

    With fewer than two devices only a single-GPU placement check runs.
    Skipped when CUDA is not available.
    """
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    gpu_count = torch.cuda.device_count()
    print(f"Testing with {gpu_count} GPU(s)")

    if gpu_count >= 2:
        print("Multiple GPUs available, testing cross-GPU transfer")
        # Start on device 0 ...
        tensor_gpu0 = torch.randn(10, 10, device="cuda:0")
        assert tensor_gpu0.device.index == 0, "Tensor not on GPU 0"

        # ... copy to device 1 ...
        tensor_gpu1 = tensor_gpu0.to("cuda:1")
        assert tensor_gpu1.device.index == 1, "Tensor not on GPU 1"

        # ... and compare both copies on the host.
        assert torch.allclose(tensor_gpu0.cpu(), tensor_gpu1.cpu()), "Data changed during transfer"
    else:
        print("Only 1 GPU available, testing single GPU operations")
        tensor = torch.randn(10, 10, device=torch.device("cuda:0"))
        assert tensor.is_cuda, "Tensor is not on GPU"

    print("✓ Multi-GPU tensor transfer test passed")
119+
120+
121+
def test_gpu_memory_allocation():
    """Check that GPU memory accounting tracks allocation and deallocation.

    Allocates a 1000x1000 tensor on ``cuda:0`` and checks that the allocated
    byte count rises, then drops again once the tensor has been deleted.
    Skipped when CUDA is not available.
    """
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    device = torch.device("cuda:0")

    # Take the baseline after releasing cached blocks.
    torch.cuda.empty_cache()
    initial_memory = torch.cuda.memory_allocated(device)
    print(f"Initial GPU memory allocated: {initial_memory / 1024**2:.2f} MB")

    # Allocate a large tensor and re-read the allocated byte count.
    large_tensor = torch.randn(1000, 1000, device=device)
    memory_after_alloc = torch.cuda.memory_allocated(device)
    print(f"Memory after allocation: {memory_after_alloc / 1024**2:.2f} MB")

    assert memory_after_alloc > initial_memory, "GPU memory did not increase after allocation"

    # Drop the only reference and release cached blocks.
    del large_tensor
    torch.cuda.empty_cache()
    memory_after_dealloc = torch.cuda.memory_allocated(device)
    print(f"Memory after deallocation: {memory_after_dealloc / 1024**2:.2f} MB")

    # The deallocation half of the test: the count must fall once the tensor is gone.
    assert memory_after_dealloc < memory_after_alloc, "GPU memory did not decrease after deallocation"

    print("✓ GPU memory allocation test passed")
148+
149+
150+
def test_cuda_compute_capability():
    """Check that every visible GPU has compute capability 3.5 or newer.

    Prints the capability of each device. Skipped when CUDA is not available.
    """
    if not torch.cuda.is_available():
        pytest.skip("CUDA not available")

    min_capability = (3, 5)
    for i in range(torch.cuda.device_count()):
        capability = torch.cuda.get_device_capability(i)
        print(f"GPU {i} compute capability: {capability[0]}.{capability[1]}")

        # Compare (major, minor) as a pair: checking the major version alone
        # would let 3.0-3.4 devices through despite the 3.5 minimum.
        assert tuple(capability) >= min_capability, f"GPU {i} compute capability too old: {capability}"

    print("✓ CUDA compute capability test passed")

tests/unit_tests/test_placeholder.py

Lines changed: 0 additions & 18 deletions
This file was deleted.

0 commit comments

Comments
 (0)