Skip to content

Commit 75e6960

Browse files
authored
TST: Use tempdir for cufile tests (and mark some as thread unsafe) (#2218)
* Add pytest-run-parallel markers to pytest.ini
* TST: Use tempdir for cufile tests (and mark some as thread unsafe)

It seems that changing these to use a temp_dir (necessary to run them in parallel in multiple threads) also fixes the CI failures.
1 parent 671af69 commit 75e6960

2 files changed

Lines changed: 35 additions & 98 deletions

File tree

cuda_bindings/tests/test_cufile.py

Lines changed: 31 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
33

44
import ctypes
5-
import errno
65
import logging
76
import os
87
import pathlib
@@ -118,12 +117,6 @@ def get_tegra_kind():
118117
),
119118
]
120119

121-
xfail_handle_register = pytest.mark.xfail(
122-
condition=isSupportedFilesystem() and os.environ.get("CI") is not None,
123-
raises=cufile.cuFileError,
124-
reason="handle_register call fails in CI for unknown reasons",
125-
)
126-
127120

128121
def test_cufile_success_defined():
129122
"""Check if CUFILE_SUCCESS is defined in OpError enum."""
@@ -204,11 +197,10 @@ def driver(ctx):
204197

205198
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
206199
@pytest.mark.usefixtures("driver")
207-
@xfail_handle_register
208-
def test_handle_register():
200+
def test_handle_register(tmpdir):
209201
"""Test file handle registration with cuFile."""
210202
# Create test file
211-
file_path = "test_handle_register.bin"
203+
file_path = tmpdir / "test_handle_register.bin"
212204

213205
# Create file with POSIX operations
214206
fd = os.open(file_path, os.O_CREAT | os.O_RDWR, 0o600)
@@ -242,8 +234,6 @@ def test_handle_register():
242234

243235
finally:
244236
os.close(fd)
245-
with suppress(OSError):
246-
os.unlink(file_path)
247237

248238

249239
@pytest.mark.usefixtures("driver")
@@ -397,11 +387,10 @@ def test_buf_register_already_registered():
397387

398388
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
399389
@pytest.mark.usefixtures("driver")
400-
@xfail_handle_register
401-
def test_cufile_read_write():
390+
def test_cufile_read_write(tmpdir):
402391
"""Test cuFile read and write operations."""
403392
# Create test file
404-
file_path = "test_cufile_rw.bin"
393+
file_path = tmpdir / "test_cufile_rw.bin"
405394

406395
# Allocate CUDA memory for write and read
407396
write_size = 65536 # 64KB, aligned to 4096 bytes (65536 % 4096 == 0)
@@ -478,21 +467,14 @@ def test_cufile_read_write():
478467
# Free CUDA memory
479468
cuda.cuMemFree(write_buf)
480469
cuda.cuMemFree(read_buf)
481-
# Clean up test file
482-
try:
483-
os.unlink(file_path)
484-
except OSError as e:
485-
if e.errno != errno.ENOENT:
486-
raise
487470

488471

489472
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
490473
@pytest.mark.usefixtures("driver")
491-
@xfail_handle_register
492-
def test_cufile_read_write_host_memory():
474+
def test_cufile_read_write_host_memory(tmpdir):
493475
"""Test cuFile read and write operations using host memory."""
494476
# Create test file
495-
file_path = "test_cufile_rw_host.bin"
477+
file_path = tmpdir / "test_cufile_rw_host.bin"
496478

497479
# Allocate host memory for write and read
498480
write_size = 65536 # 64KB, aligned to 4096 bytes (65536 % 4096 == 0)
@@ -565,21 +547,14 @@ def test_cufile_read_write_host_memory():
565547
# Free host memory
566548
cuda.cuMemFreeHost(write_buf)
567549
cuda.cuMemFreeHost(read_buf)
568-
# Clean up test file
569-
try:
570-
os.unlink(file_path)
571-
except OSError as e:
572-
if e.errno != errno.ENOENT:
573-
raise
574550

575551

576552
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
577553
@pytest.mark.usefixtures("driver")
578-
@xfail_handle_register
579-
def test_cufile_read_write_large():
554+
def test_cufile_read_write_large(tmpdir):
580555
"""Test cuFile read and write operations with large data."""
581556
# Create test file
582-
file_path = "test_cufile_rw_large.bin"
557+
file_path = tmpdir / "test_cufile_rw_large.bin"
583558

584559
# Allocate large CUDA memory (1MB, aligned to 4096 bytes)
585560
write_size = 1024 * 1024 # 1MB, aligned to 4096 bytes (1048576 % 4096 == 0)
@@ -659,21 +634,14 @@ def test_cufile_read_write_large():
659634
# Free CUDA memory
660635
cuda.cuMemFree(write_buf)
661636
cuda.cuMemFree(read_buf)
662-
# Clean up test file
663-
try:
664-
os.unlink(file_path)
665-
except OSError as e:
666-
if e.errno != errno.ENOENT:
667-
raise
668637

669638

670639
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
671640
@pytest.mark.usefixtures("ctx", "cufile_env_json", "driver")
672-
@xfail_handle_register
673-
def test_cufile_write_async():
641+
def test_cufile_write_async(tmpdir):
674642
"""Test cuFile asynchronous write operations."""
675643
# Create test file
676-
file_path = "test_cufile_write_async.bin"
644+
file_path = tmpdir / "test_cufile_write_async.bin"
677645
fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)
678646

679647
try:
@@ -741,17 +709,14 @@ def test_cufile_write_async():
741709

742710
finally:
743711
os.close(fd)
744-
with suppress(OSError):
745-
os.unlink(file_path)
746712

747713

748714
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
749715
@pytest.mark.usefixtures("ctx", "cufile_env_json", "driver")
750-
@xfail_handle_register
751-
def test_cufile_read_async():
716+
def test_cufile_read_async(tmpdir):
752717
"""Test cuFile asynchronous read operations."""
753718
# Create test file
754-
file_path = "test_cufile_read_async.bin"
719+
file_path = tmpdir / "test_cufile_read_async.bin"
755720

756721
# First create and write test data without O_DIRECT
757722
fd_temp = os.open(file_path, os.O_CREAT | os.O_RDWR, 0o600)
@@ -832,17 +797,14 @@ def test_cufile_read_async():
832797

833798
finally:
834799
os.close(fd)
835-
with suppress(OSError):
836-
os.unlink(file_path)
837800

838801

839802
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
840-
@xfail_handle_register
841803
@pytest.mark.usefixtures("ctx", "cufile_env_json", "driver")
842-
def test_cufile_async_read_write():
804+
def test_cufile_async_read_write(tmpdir):
843805
"""Test cuFile asynchronous read and write operations in sequence."""
844806
# Create test file
845-
file_path = "test_cufile_async_rw.bin"
807+
file_path = tmpdir / "test_cufile_async_rw.bin"
846808
fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)
847809

848810
try:
@@ -946,17 +908,14 @@ def test_cufile_async_read_write():
946908

947909
finally:
948910
os.close(fd)
949-
with suppress(OSError):
950-
os.unlink(file_path)
951911

952912

953913
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
954914
@pytest.mark.usefixtures("driver")
955-
@xfail_handle_register
956-
def test_batch_io_basic():
915+
def test_batch_io_basic(tmpdir):
957916
"""Test basic batch IO operations with multiple read/write operations."""
958917
# Create test file
959-
file_path = "test_batch_io.bin"
918+
file_path = tmpdir / "test_batch_io.bin"
960919

961920
# Allocate CUDA memory for multiple operations
962921
buf_size = 65536 # 64KB
@@ -1145,21 +1104,14 @@ def test_batch_io_basic():
11451104
# Free CUDA memory
11461105
for buf in buffers + read_buffers:
11471106
cuda.cuMemFree(buf)
1148-
# Clean up test file
1149-
try:
1150-
os.unlink(file_path)
1151-
except OSError as e:
1152-
if e.errno != errno.ENOENT:
1153-
raise
11541107

11551108

11561109
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
11571110
@pytest.mark.usefixtures("driver")
1158-
@xfail_handle_register
1159-
def test_batch_io_cancel():
1111+
def test_batch_io_cancel(tmpdir):
11601112
"""Test batch IO cancellation."""
11611113
# Create test file
1162-
file_path = "test_batch_cancel.bin"
1114+
file_path = tmpdir / "test_batch_cancel.bin"
11631115

11641116
# Allocate CUDA memory
11651117
buf_size = 4096 # 4KB, aligned to 4096 bytes
@@ -1229,21 +1181,14 @@ def test_batch_io_cancel():
12291181
# Free CUDA memory
12301182
for buf in buffers:
12311183
cuda.cuMemFree(buf)
1232-
# Clean up test file
1233-
try:
1234-
os.unlink(file_path)
1235-
except OSError as e:
1236-
if e.errno != errno.ENOENT:
1237-
raise
12381184

12391185

12401186
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
12411187
@pytest.mark.usefixtures("driver")
1242-
@xfail_handle_register
1243-
def test_batch_io_large_operations():
1188+
def test_batch_io_large_operations(tmpdir):
12441189
"""Test batch IO with large buffer operations."""
12451190
# Create test file
1246-
file_path = "test_batch_large.bin"
1191+
file_path = tmpdir / "test_batch_large.bin"
12471192

12481193
# Allocate large CUDA memory (1MB, aligned to 4096 bytes)
12491194
buf_size = 1024 * 1024 # 1MB, aligned to 4096 bytes
@@ -1421,12 +1366,6 @@ def test_batch_io_large_operations():
14211366
# Free CUDA memory
14221367
for buf in all_buffers:
14231368
cuda.cuMemFree(buf)
1424-
# Clean up test file
1425-
try:
1426-
os.unlink(file_path)
1427-
except OSError as e:
1428-
if e.errno != errno.ENOENT:
1429-
raise
14301369

14311370

14321371
@pytest.mark.skipif(
@@ -1631,6 +1570,7 @@ def test_get_parameter_min_max_value():
16311570
cufileVersionLessThan(1150), reason="cuFile parameter APIs require cuFile library version 13.0 or later"
16321571
)
16331572
@pytest.mark.usefixtures("stats")
1573+
@pytest.mark.thread_unsafe(reason="not safe to stats_start() from multiple threads")
16341574
def test_stats_start_stop():
16351575
"""Test cuFile statistics collection stop."""
16361576
# Set statistics level first (required before starting stats)
@@ -1647,11 +1587,11 @@ def test_stats_start_stop():
16471587
)
16481588
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
16491589
@pytest.mark.usefixtures("stats")
1650-
@xfail_handle_register
1651-
def test_get_stats_l1():
1590+
@pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global")
1591+
def test_get_stats_l1(tmpdir):
16521592
"""Test cuFile L1 statistics retrieval with file operations."""
16531593
# Create test file directly with O_DIRECT
1654-
file_path = "test_stats_l1.bin"
1594+
file_path = tmpdir / "test_stats_l1.bin"
16551595
fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)
16561596

16571597
try:
@@ -1718,20 +1658,18 @@ def test_get_stats_l1():
17181658

17191659
finally:
17201660
os.close(fd)
1721-
with suppress(OSError):
1722-
os.unlink(file_path)
17231661

17241662

17251663
@pytest.mark.skipif(
17261664
cufileVersionLessThan(1150), reason="cuFile parameter APIs require cuFile library version 13.0 or later"
17271665
)
17281666
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
17291667
@pytest.mark.usefixtures("stats")
1730-
@xfail_handle_register
1731-
def test_get_stats_l2():
1668+
@pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global")
1669+
def test_get_stats_l2(tmpdir):
17321670
"""Test cuFile L2 statistics retrieval with file operations."""
17331671
# Create test file directly with O_DIRECT
1734-
file_path = "test_stats_l2.bin"
1672+
file_path = tmpdir / "test_stats_l2.bin"
17351673
fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)
17361674

17371675
try:
@@ -1802,20 +1740,18 @@ def test_get_stats_l2():
18021740

18031741
finally:
18041742
os.close(fd)
1805-
with suppress(OSError):
1806-
os.unlink(file_path)
18071743

18081744

18091745
@pytest.mark.skipif(
18101746
cufileVersionLessThan(1150), reason="cuFile parameter APIs require cuFile library version 13.0 or later"
18111747
)
18121748
@pytest.mark.skipif(not isSupportedFilesystem(), reason="cuFile handle_register requires ext4 or xfs filesystem")
18131749
@pytest.mark.usefixtures("stats")
1814-
@xfail_handle_register
1815-
def test_get_stats_l3():
1750+
@pytest.mark.thread_unsafe(reason="cuFile stats counters and collection state are process-global")
1751+
def test_get_stats_l3(tmpdir):
18161752
"""Test cuFile L3 statistics retrieval with file operations."""
18171753
# Create test file directly with O_DIRECT
1818-
file_path = "test_stats_l3.bin"
1754+
file_path = tmpdir / "test_stats_l3.bin"
18191755
fd = os.open(file_path, os.O_CREAT | os.O_RDWR | os.O_DIRECT, 0o600)
18201756

18211757
try:
@@ -1896,8 +1832,6 @@ def test_get_stats_l3():
18961832

18971833
finally:
18981834
os.close(fd)
1899-
with suppress(OSError):
1900-
os.unlink(file_path)
19011835

19021836

19031837
@pytest.mark.skipif(

pytest.ini

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

44
[pytest]
@@ -21,3 +21,6 @@ markers =
2121
cython: cython tests
2222
smoke: meta-level smoke tests
2323
flaky: mark test as flaky (provided by pytest-rerunfailures)
24+
# pytest-run-parallel related markers
25+
thread_unsafe: mark test as thread unsafe (provided by pytest-run-parallel)
26+
parallel_threads_limit: max number of threads (provided by pytest-run-parallel)

0 commit comments

Comments
 (0)