44
55# Real GPU tests for cuda.core.checkpoint — no mocks.
66#
7- # Lifecycle tests self-checkpoint the current process (os.getpid()) and
8- # exercise lock / checkpoint / restore / unlock through the real driver.
7+ # Lifecycle tests exercise lightweight state/lock operations in-process and
8+ # mutating checkpoint / restore cycles through an isolated coordinator/target
9+ # process pair.
910#
1011# Migration tests attempt GPU UUID remapping following the pattern from
1112# NVIDIA/cuda-checkpoint r580-migration-api.c. They require ≥2 GPUs of
1213# the same chip type and a driver that supports migration; the tests skip
1314# gracefully when the hardware or driver cannot satisfy this.
1415
1516import os
17+ import signal
18+ import subprocess
1619import sys
20+ import textwrap
1721from contextlib import suppress
1822
1923import pytest
2024
21- from cuda .core import Device , checkpoint
22- from cuda .core ._utils .cuda_utils import CUDAError
25+ from cuda .core import checkpoint
2326
2427# -- Skip condition -------------------------------------------------------
2528
@@ -42,18 +45,65 @@ def _checkpoint_available():
4245# -- Helpers ---------------------------------------------------------------
4346
4447
45- def _build_rotation_mapping (devices ):
46- """GPU i UUID -> GPU (i+1) % N UUID for every visible device.
def _run_or_skip_unsupported(func, *args, **kwargs):
    """Invoke *func*, turning "not supported" errors into a pytest skip.

    Any other RuntimeError propagates unchanged so real failures still fail.
    """
    try:
        return func(*args, **kwargs)
    except RuntimeError as exc:
        # Guard clause: only the driver's "unsupported" error becomes a skip.
        if "CUDA checkpointing is not supported" not in str(exc):
            raise
        pytest.skip(str(exc))
4755
48- Returns a ``{str: str}`` dict of UUID strings suitable for
49- :meth:`~checkpoint.Process.restore`.
50- """
56+
57+ _SCENARIO_SKIP_EXIT_CODE = 77
58+
59+ _SCENARIO_COMMON = r"""
60+ import subprocess
61+ import sys
62+ from contextlib import suppress
63+
64+ from cuda.core import Device, checkpoint
65+ from cuda.core._utils.cuda_utils import CUDAError
66+
67+ EXIT_SKIP = 77
68+
69+ TARGET_SCRIPT = r'''
70+ import sys
71+
72+ from cuda.core import Device
73+
74+ device_index = int(sys.argv[1])
75+ Device(device_index).set_current()
76+ print(f"READY:{Device().uuid}", flush=True)
77+
78+ for line in sys.stdin:
79+ command = line.strip()
80+ if command == "uuid":
81+ print(f"UUID:{Device().uuid}", flush=True)
82+ elif command == "exit":
83+ break
84+ '''
85+
86+
87+ def skip(reason):
88+ print(f"SKIP: {reason}", flush=True)
89+ raise SystemExit(EXIT_SKIP)
90+
91+
92+ def run_or_skip_unsupported(func, *args, **kwargs):
93+ try:
94+ return func(*args, **kwargs)
95+ except RuntimeError as exc:
96+ if "CUDA checkpointing is not supported" in str(exc):
97+ skip(str(exc))
98+ raise
99+
100+
101+ def build_rotation_mapping(devices):
51102 n = len(devices)
52103 return {devices[i].uuid: devices[(i + 1) % n].uuid for i in range(n)}
53104
54105
55- def _find_same_chip_pair (devices ):
56- """Return (i, j) indices of two devices with the same name, or None."""
106+ def find_same_chip_pair(devices):
57107 seen = {}
58108 for i, dev in enumerate(devices):
59109 name = dev.name
@@ -63,13 +113,105 @@ def _find_same_chip_pair(devices):
63113 return None
64114
65115
66- def _run_or_skip_unsupported (func , * args , ** kwargs ):
116+ def read_prefixed(target, prefix):
117+ line = target.stdout.readline()
118+ if not line:
119+ stderr = target.stderr.read()
120+ raise RuntimeError(f"checkpoint target exited before {prefix!r}; stderr:\n{stderr}")
121+ line = line.strip()
122+ if not line.startswith(prefix):
123+ raise RuntimeError(f"expected target output prefix {prefix!r}, got {line!r}")
124+ return line[len(prefix):]
125+
126+
127+ def start_target(device_index=0):
128+ target = subprocess.Popen(
129+ [sys.executable, "-c", TARGET_SCRIPT, str(device_index)],
130+ stdin=subprocess.PIPE,
131+ stdout=subprocess.PIPE,
132+ stderr=subprocess.PIPE,
133+ text=True,
134+ )
67135 try:
68- return func (* args , ** kwargs )
69- except RuntimeError as exc :
70- if "CUDA checkpointing is not supported" in str (exc ):
71- pytest .skip (str (exc ))
136+ ready_uuid = read_prefixed(target, "READY:")
137+ except Exception:
138+ stop_target(target)
72139 raise
140+ return target, ready_uuid
141+
142+
143+ def stop_target(target):
144+ if target.poll() is None:
145+ with suppress(Exception):
146+ target.stdin.write("exit\n")
147+ target.stdin.flush()
148+ try:
149+ target.wait(timeout=5)
150+ except subprocess.TimeoutExpired:
151+ target.kill()
152+ target.wait()
153+
154+
155+ def target_uuid(target):
156+ target.stdin.write("uuid\n")
157+ target.stdin.flush()
158+ return read_prefixed(target, "UUID:")
159+
160+
161+ def checkpoint_restore(proc, gpu_mapping=None):
162+ run_or_skip_unsupported(proc.lock, timeout_ms=5000)
163+ run_or_skip_unsupported(proc.checkpoint)
164+ try:
165+ run_or_skip_unsupported(proc.restore, gpu_mapping=gpu_mapping)
166+ except (CUDAError, RuntimeError) as exc:
167+ with suppress(Exception):
168+ proc.restore()
169+ with suppress(Exception):
170+ proc.unlock()
171+ if "INVALID_VALUE" in str(exc):
172+ skip(
173+ "Driver does not support GPU migration on this hardware "
174+ "(CUDA_ERROR_INVALID_VALUE; see NVBug 5437334)"
175+ )
176+ raise
177+ proc.unlock()
178+ """
179+
180+
def _run_checkpoint_scenario_or_skip(body: str, *, timeout: int = 90) -> None:
    """Run mutating checkpoint/restore scenarios out-of-process.

    The CUDA checkpoint APIs can block inside the driver when a runner exposes
    symbols but the platform path cannot complete checkpoint/restore. Running
    the scenario in its own process group lets the parent test skip that runner
    cleanly instead of hanging the entire CI job.
    """
    script = _SCENARIO_COMMON + "\n" + textwrap.dedent(body)
    runner = subprocess.Popen(  # noqa: S603 - controlled test subprocess using this Python executable.
        [sys.executable, "-c", script],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        start_new_session=True,  # own process group, so we can SIGKILL the whole tree
    )
    try:
        out, err = runner.communicate(timeout=timeout)
    except subprocess.TimeoutExpired:
        # The scenario (or the driver underneath it) hung: kill the entire
        # process group, collect whatever output exists, and skip.
        with suppress(ProcessLookupError):
            os.killpg(runner.pid, signal.SIGKILL)
        out, err = runner.communicate()
        pytest.skip(
            f"CUDA checkpoint scenario timed out after {timeout}s; driver/hardware did not complete "
            f"checkpoint/restore.\nstdout:\n{out}\nstderr:\n{err}"
        )

    # The scenario signals a deliberate skip via a dedicated exit code.
    if runner.returncode == _SCENARIO_SKIP_EXIT_CODE:
        pytest.skip(out.strip() or err.strip() or "CUDA checkpoint scenario skipped")
    if runner.returncode != 0:
        pytest.fail(
            f"CUDA checkpoint scenario failed with exit code {runner.returncode}.\nstdout:\n{out}\nstderr:\n{err}"
        )
73215
74216
75217# -- Fixtures --------------------------------------------------------------
@@ -154,19 +296,28 @@ def test_lock_with_timeout(self, self_process):
154296 assert self_process .state == "locked"
155297 self_process .unlock ()
156298
    def test_full_cycle_no_migration(self):
        """lock -> checkpoint -> restore -> unlock, verify state at each step."""
        # Runs in an isolated coordinator process so a driver that blocks inside
        # checkpoint/restore makes this test skip instead of hanging the CI job.
        # The scenario text below executes with the helpers from _SCENARIO_COMMON
        # (start_target / stop_target / run_or_skip_unsupported) in scope.
        _run_checkpoint_scenario_or_skip(
            """
            target, _ = start_target()
            proc = checkpoint.Process(target.pid)
            try:
                run_or_skip_unsupported(proc.lock, timeout_ms=5000)
                assert proc.state == "locked"

                run_or_skip_unsupported(proc.checkpoint)
                assert proc.state == "checkpointed"

                run_or_skip_unsupported(proc.restore)
                assert proc.state == "locked"  # restore leaves process locked

                proc.unlock()
                assert proc.state == "running"
            finally:
                stop_target(target)
            """
        )
170321
171322
172323# -- GPU migration (>= 2 same-chip GPUs, real driver) ---------------------
@@ -183,83 +334,72 @@ class TestCheckpointGpuMigration:
183334 NVBug 5437334).
184335 """
185336
186- @staticmethod
187- def _try_migration (proc , gpu_mapping ):
188- """Attempt a single checkpoint-restore with migration.
189-
190- Returns True on success. Skips the test if the driver rejects
191- the migration with CUDA_ERROR_INVALID_VALUE (known limitation
192- on some architectures / driver versions).
193- """
194- _run_or_skip_unsupported (proc .lock )
195- _run_or_skip_unsupported (proc .checkpoint )
196- try :
197- _run_or_skip_unsupported (proc .restore , gpu_mapping = gpu_mapping )
198- except (CUDAError , RuntimeError ) as exc :
199- # Recover: restore without migration, then unlock.
200- proc .restore ()
201- proc .unlock ()
202- if "INVALID_VALUE" in str (exc ):
203- pytest .skip (
204- "Driver does not support GPU migration on this hardware "
205- "(CUDA_ERROR_INVALID_VALUE — see NVBug 5437334)"
206- )
207- raise
208- proc .unlock ()
209- return True
210-
    def test_rotation_migrates_context(self):
        """Rotate context through all GPUs and back to the origin.

        Builds a rotation mapping (device i -> device (i+1) % N) for
        every visible device and performs N rotations. After each step
        the context device UUID is checked. After N steps the context
        should be back on the original device.
        """
        # The scenario text runs with _SCENARIO_COMMON's helpers in scope;
        # skip() inside it exits with the dedicated skip code.
        _run_checkpoint_scenario_or_skip(
            """
            devices = Device.get_all_devices()
            if len(devices) < 2:
                skip("GPU migration tests require at least 2 GPUs")
            if find_same_chip_pair(devices) is None:
                skip("GPU migration requires at least 2 GPUs of the same chip type")

            gpu_mapping = build_rotation_mapping(devices)
            target, uuid_origin = start_target(0)
            proc = checkpoint.Process(target.pid)
            try:
                for step in range(len(devices)):
                    expected_uuid = devices[(step + 1) % len(devices)].uuid
                    checkpoint_restore(proc, gpu_mapping=gpu_mapping)
                    observed_uuid = target_uuid(target)
                    assert observed_uuid == expected_uuid, (
                        f"Step {step}: expected UUID {expected_uuid}, got {observed_uuid}"
                    )

                assert target_uuid(target) == uuid_origin
            finally:
                stop_target(target)
            """,
            # N full checkpoint/restore cycles need more headroom than the 90 s default.
            timeout=180,
        )
371+
    def test_swap_identical_gpus(self):
        """Swap context between two GPUs of the same chip type.

        Sets the context on one of the pair members so that a successful
        migration is observable (the context UUID changes).
        """
        # The scenario text runs with _SCENARIO_COMMON's helpers in scope.
        _run_checkpoint_scenario_or_skip(
            """
            devices = Device.get_all_devices()
            pair = find_same_chip_pair(devices)
            if pair is None:
                skip("No two GPUs of the same chip type found")

            i, j = pair
            # Identity mapping for every device, then swap the same-chip pair.
            gpu_mapping = {d.uuid: d.uuid for d in devices}
            gpu_mapping[devices[i].uuid] = devices[j].uuid
            gpu_mapping[devices[j].uuid] = devices[i].uuid

            target, uuid_before = start_target(i)
            proc = checkpoint.Process(target.pid)
            try:
                assert uuid_before == devices[i].uuid

                checkpoint_restore(proc, gpu_mapping=gpu_mapping)
                uuid_after = target_uuid(target)

                if uuid_after == devices[i].uuid:
                    skip("Driver accepted GPU swap but migration is a no-op on this hardware/driver version")
                assert uuid_after == devices[j].uuid
            finally:
                stop_target(target)
            """,
            # Single checkpoint/restore cycle plus target startup; modest extra headroom.
            timeout=120,
        )