Skip to content

Commit 8f798f4

Browse files
committed
Isolate checkpoint lifecycle tests
1 parent fbb8037 commit 8f798f4

1 file changed

Lines changed: 233 additions & 93 deletions

File tree

cuda_core/tests/test_checkpoint.py

Lines changed: 233 additions & 93 deletions
Original file line number | Diff line number | Diff line change
@@ -4,22 +4,25 @@
44

55
# Real GPU tests for cuda.core.checkpoint — no mocks.
66
#
7-
# Lifecycle tests self-checkpoint the current process (os.getpid()) and
8-
# exercise lock / checkpoint / restore / unlock through the real driver.
7+
# Lifecycle tests exercise lightweight state/lock operations in-process and run
8+
# mutating checkpoint / restore cycles through an isolated coordinator/target
9+
# process pair.
910
#
1011
# Migration tests attempt GPU UUID remapping following the pattern from
1112
# NVIDIA/cuda-checkpoint r580-migration-api.c. They require ≥2 GPUs of
1213
# the same chip type and a driver that supports migration; the tests skip
1314
# gracefully when the hardware or driver cannot satisfy this.
1415

1516
import os
17+
import signal
18+
import subprocess
1619
import sys
20+
import textwrap
1721
from contextlib import suppress
1822

1923
import pytest
2024

21-
from cuda.core import Device, checkpoint
22-
from cuda.core._utils.cuda_utils import CUDAError
25+
from cuda.core import checkpoint
2326

2427
# -- Skip condition -------------------------------------------------------
2528

@@ -42,18 +45,65 @@ def _checkpoint_available():
4245
# -- Helpers ---------------------------------------------------------------
4346

4447

45-
def _build_rotation_mapping(devices):
46-
"""GPU i UUID -> GPU (i+1) % N UUID for every visible device.
48+
def _run_or_skip_unsupported(func, *args, **kwargs):
49+
try:
50+
return func(*args, **kwargs)
51+
except RuntimeError as exc:
52+
if "CUDA checkpointing is not supported" in str(exc):
53+
pytest.skip(str(exc))
54+
raise
4755

48-
Returns a ``{str: str}`` dict of UUID strings suitable for
49-
:meth:`~checkpoint.Process.restore`.
50-
"""
56+
57+
_SCENARIO_SKIP_EXIT_CODE = 77
58+
59+
_SCENARIO_COMMON = r"""
60+
import subprocess
61+
import sys
62+
from contextlib import suppress
63+
64+
from cuda.core import Device, checkpoint
65+
from cuda.core._utils.cuda_utils import CUDAError
66+
67+
EXIT_SKIP = 77
68+
69+
TARGET_SCRIPT = r'''
70+
import sys
71+
72+
from cuda.core import Device
73+
74+
device_index = int(sys.argv[1])
75+
Device(device_index).set_current()
76+
print(f"READY:{Device().uuid}", flush=True)
77+
78+
for line in sys.stdin:
79+
command = line.strip()
80+
if command == "uuid":
81+
print(f"UUID:{Device().uuid}", flush=True)
82+
elif command == "exit":
83+
break
84+
'''
85+
86+
87+
def skip(reason):
88+
print(f"SKIP: {reason}", flush=True)
89+
raise SystemExit(EXIT_SKIP)
90+
91+
92+
def run_or_skip_unsupported(func, *args, **kwargs):
93+
try:
94+
return func(*args, **kwargs)
95+
except RuntimeError as exc:
96+
if "CUDA checkpointing is not supported" in str(exc):
97+
skip(str(exc))
98+
raise
99+
100+
101+
def build_rotation_mapping(devices):
51102
n = len(devices)
52103
return {devices[i].uuid: devices[(i + 1) % n].uuid for i in range(n)}
53104
54105
55-
def _find_same_chip_pair(devices):
56-
"""Return (i, j) indices of two devices with the same name, or None."""
106+
def find_same_chip_pair(devices):
57107
seen = {}
58108
for i, dev in enumerate(devices):
59109
name = dev.name
@@ -63,13 +113,105 @@ def _find_same_chip_pair(devices):
63113
return None
64114
65115
66-
def _run_or_skip_unsupported(func, *args, **kwargs):
116+
def read_prefixed(target, prefix):
117+
line = target.stdout.readline()
118+
if not line:
119+
stderr = target.stderr.read()
120+
raise RuntimeError(f"checkpoint target exited before {prefix!r}; stderr:\n{stderr}")
121+
line = line.strip()
122+
if not line.startswith(prefix):
123+
raise RuntimeError(f"expected target output prefix {prefix!r}, got {line!r}")
124+
return line[len(prefix):]
125+
126+
127+
def start_target(device_index=0):
128+
target = subprocess.Popen(
129+
[sys.executable, "-c", TARGET_SCRIPT, str(device_index)],
130+
stdin=subprocess.PIPE,
131+
stdout=subprocess.PIPE,
132+
stderr=subprocess.PIPE,
133+
text=True,
134+
)
67135
try:
68-
return func(*args, **kwargs)
69-
except RuntimeError as exc:
70-
if "CUDA checkpointing is not supported" in str(exc):
71-
pytest.skip(str(exc))
136+
ready_uuid = read_prefixed(target, "READY:")
137+
except Exception:
138+
stop_target(target)
72139
raise
140+
return target, ready_uuid
141+
142+
143+
def stop_target(target):
144+
if target.poll() is None:
145+
with suppress(Exception):
146+
target.stdin.write("exit\n")
147+
target.stdin.flush()
148+
try:
149+
target.wait(timeout=5)
150+
except subprocess.TimeoutExpired:
151+
target.kill()
152+
target.wait()
153+
154+
155+
def target_uuid(target):
156+
target.stdin.write("uuid\n")
157+
target.stdin.flush()
158+
return read_prefixed(target, "UUID:")
159+
160+
161+
def checkpoint_restore(proc, gpu_mapping=None):
162+
run_or_skip_unsupported(proc.lock, timeout_ms=5000)
163+
run_or_skip_unsupported(proc.checkpoint)
164+
try:
165+
run_or_skip_unsupported(proc.restore, gpu_mapping=gpu_mapping)
166+
except (CUDAError, RuntimeError) as exc:
167+
with suppress(Exception):
168+
proc.restore()
169+
with suppress(Exception):
170+
proc.unlock()
171+
if "INVALID_VALUE" in str(exc):
172+
skip(
173+
"Driver does not support GPU migration on this hardware "
174+
"(CUDA_ERROR_INVALID_VALUE; see NVBug 5437334)"
175+
)
176+
raise
177+
proc.unlock()
178+
"""
179+
180+
181+
def _run_checkpoint_scenario_or_skip(body: str, *, timeout: int = 90) -> None:
182+
"""Run mutating checkpoint/restore scenarios out-of-process.
183+
184+
The CUDA checkpoint APIs can block inside the driver when a runner exposes
185+
the checkpoint API symbols but the platform path cannot complete checkpoint/restore. Running
186+
the scenario in its own process group lets the parent test skip that runner
187+
cleanly instead of hanging the entire CI job.
188+
"""
189+
script = _SCENARIO_COMMON + "\n" + textwrap.dedent(body)
190+
proc = subprocess.Popen( # noqa: S603 - controlled test subprocess using this Python executable.
191+
[sys.executable, "-c", script],
192+
stdout=subprocess.PIPE,
193+
stderr=subprocess.PIPE,
194+
text=True,
195+
start_new_session=True,
196+
)
197+
try:
198+
stdout, stderr = proc.communicate(timeout=timeout)
199+
except subprocess.TimeoutExpired:
200+
with suppress(ProcessLookupError):
201+
os.killpg(proc.pid, signal.SIGKILL)
202+
stdout, stderr = proc.communicate()
203+
pytest.skip(
204+
f"CUDA checkpoint scenario timed out after {timeout}s; driver/hardware did not complete "
205+
f"checkpoint/restore.\nstdout:\n{stdout}\nstderr:\n{stderr}"
206+
)
207+
208+
if proc.returncode == _SCENARIO_SKIP_EXIT_CODE:
209+
reason = stdout.strip() or stderr.strip() or "CUDA checkpoint scenario skipped"
210+
pytest.skip(reason)
211+
if proc.returncode != 0:
212+
pytest.fail(
213+
f"CUDA checkpoint scenario failed with exit code {proc.returncode}.\nstdout:\n{stdout}\nstderr:\n{stderr}"
214+
)
73215

74216

75217
# -- Fixtures --------------------------------------------------------------
@@ -154,19 +296,28 @@ def test_lock_with_timeout(self, self_process):
154296
assert self_process.state == "locked"
155297
self_process.unlock()
156298

157-
def test_full_cycle_no_migration(self, self_process):
299+
def test_full_cycle_no_migration(self):
158300
"""lock -> checkpoint -> restore -> unlock, verify state at each step."""
159-
_run_or_skip_unsupported(self_process.lock)
160-
assert self_process.state == "locked"
301+
_run_checkpoint_scenario_or_skip(
302+
"""
303+
target, _ = start_target()
304+
proc = checkpoint.Process(target.pid)
305+
try:
306+
run_or_skip_unsupported(proc.lock, timeout_ms=5000)
307+
assert proc.state == "locked"
161308
162-
_run_or_skip_unsupported(self_process.checkpoint)
163-
assert self_process.state == "checkpointed"
309+
run_or_skip_unsupported(proc.checkpoint)
310+
assert proc.state == "checkpointed"
164311
165-
_run_or_skip_unsupported(self_process.restore)
166-
assert self_process.state == "locked" # restore leaves process locked
312+
run_or_skip_unsupported(proc.restore)
313+
assert proc.state == "locked" # restore leaves process locked
167314
168-
self_process.unlock()
169-
assert self_process.state == "running"
315+
proc.unlock()
316+
assert proc.state == "running"
317+
finally:
318+
stop_target(target)
319+
"""
320+
)
170321

171322

172323
# -- GPU migration (>= 2 same-chip GPUs, real driver) ---------------------
@@ -183,83 +334,72 @@ class TestCheckpointGpuMigration:
183334
NVBug 5437334).
184335
"""
185336

186-
@staticmethod
187-
def _try_migration(proc, gpu_mapping):
188-
"""Attempt a single checkpoint-restore with migration.
189-
190-
Returns True on success. Skips the test if the driver rejects
191-
the migration with CUDA_ERROR_INVALID_VALUE (known limitation
192-
on some architectures / driver versions).
193-
"""
194-
_run_or_skip_unsupported(proc.lock)
195-
_run_or_skip_unsupported(proc.checkpoint)
196-
try:
197-
_run_or_skip_unsupported(proc.restore, gpu_mapping=gpu_mapping)
198-
except (CUDAError, RuntimeError) as exc:
199-
# Recover: restore without migration, then unlock.
200-
proc.restore()
201-
proc.unlock()
202-
if "INVALID_VALUE" in str(exc):
203-
pytest.skip(
204-
"Driver does not support GPU migration on this hardware "
205-
"(CUDA_ERROR_INVALID_VALUE — see NVBug 5437334)"
206-
)
207-
raise
208-
proc.unlock()
209-
return True
210-
211-
def test_rotation_migrates_context(self, self_process):
337+
def test_rotation_migrates_context(self):
212338
"""Rotate context through all GPUs and back to the origin.
213339
214340
Builds a rotation mapping (device i -> device (i+1) % N) for
215341
every visible device and performs N rotations. After each step
216342
the context device UUID is checked. After N steps the context
217343
should be back on the original device.
218344
"""
219-
devices = Device.get_all_devices()
220-
if len(devices) < 2:
221-
pytest.skip("GPU migration tests require at least 2 GPUs")
222-
if _find_same_chip_pair(devices) is None:
223-
pytest.skip("GPU migration requires at least 2 GPUs of the same chip type")
224-
225-
gpu_mapping = _build_rotation_mapping(devices)
226-
uuid_origin = Device().uuid
227-
228-
for step in range(len(devices)):
229-
expected_uuid = devices[(step + 1) % len(devices)].uuid
230-
231-
self._try_migration(self_process, gpu_mapping)
232-
233-
assert Device().uuid == expected_uuid, f"Step {step}: expected UUID {expected_uuid}, got {Device().uuid}"
234-
235-
# After N rotations, back at the origin.
236-
assert Device().uuid == uuid_origin
237-
238-
def test_swap_identical_gpus(self, self_process):
345+
_run_checkpoint_scenario_or_skip(
346+
"""
347+
devices = Device.get_all_devices()
348+
if len(devices) < 2:
349+
skip("GPU migration tests require at least 2 GPUs")
350+
if find_same_chip_pair(devices) is None:
351+
skip("GPU migration requires at least 2 GPUs of the same chip type")
352+
353+
gpu_mapping = build_rotation_mapping(devices)
354+
target, uuid_origin = start_target(0)
355+
proc = checkpoint.Process(target.pid)
356+
try:
357+
for step in range(len(devices)):
358+
expected_uuid = devices[(step + 1) % len(devices)].uuid
359+
checkpoint_restore(proc, gpu_mapping=gpu_mapping)
360+
observed_uuid = target_uuid(target)
361+
assert observed_uuid == expected_uuid, (
362+
f"Step {step}: expected UUID {expected_uuid}, got {observed_uuid}"
363+
)
364+
365+
assert target_uuid(target) == uuid_origin
366+
finally:
367+
stop_target(target)
368+
""",
369+
timeout=180,
370+
)
371+
372+
def test_swap_identical_gpus(self):
239373
"""Swap context between two GPUs of the same chip type.
240374
241375
Sets the context on one of the pair members so that a successful
242376
migration is observable (the context UUID changes).
243377
"""
244-
devices = Device.get_all_devices()
245-
pair = _find_same_chip_pair(devices)
246-
if pair is None:
247-
pytest.skip("No two GPUs of the same chip type found")
248-
249-
i, j = pair
250-
# Place context on device i so the swap is observable.
251-
devices[i].set_current()
252-
253-
# Build an identity mapping, then swap the pair (using UUID strings).
254-
gpu_mapping = {d.uuid: d.uuid for d in devices}
255-
gpu_mapping[devices[i].uuid] = devices[j].uuid
256-
gpu_mapping[devices[j].uuid] = devices[i].uuid
257-
258-
assert Device().uuid == devices[i].uuid
259-
260-
self._try_migration(self_process, gpu_mapping)
261-
uuid_after = Device().uuid
262-
263-
if uuid_after == devices[i].uuid:
264-
pytest.skip("Driver accepted GPU swap but migration is a no-op on this hardware/driver version")
265-
assert uuid_after == devices[j].uuid
378+
_run_checkpoint_scenario_or_skip(
379+
"""
380+
devices = Device.get_all_devices()
381+
pair = find_same_chip_pair(devices)
382+
if pair is None:
383+
skip("No two GPUs of the same chip type found")
384+
385+
i, j = pair
386+
gpu_mapping = {d.uuid: d.uuid for d in devices}
387+
gpu_mapping[devices[i].uuid] = devices[j].uuid
388+
gpu_mapping[devices[j].uuid] = devices[i].uuid
389+
390+
target, uuid_before = start_target(i)
391+
proc = checkpoint.Process(target.pid)
392+
try:
393+
assert uuid_before == devices[i].uuid
394+
395+
checkpoint_restore(proc, gpu_mapping=gpu_mapping)
396+
uuid_after = target_uuid(target)
397+
398+
if uuid_after == devices[i].uuid:
399+
skip("Driver accepted GPU swap but migration is a no-op on this hardware/driver version")
400+
assert uuid_after == devices[j].uuid
401+
finally:
402+
stop_target(target)
403+
""",
404+
timeout=120,
405+
)

0 commit comments

Comments
 (0)