Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cuda_core/cuda/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def _import_versioned_module():
del _import_versioned_module


from cuda.core import system, utils
from cuda.core import checkpoint, system, utils
from cuda.core._device import Device
from cuda.core._event import Event, EventOptions
from cuda.core._graphics import GraphicsResource
Expand Down
248 changes: 248 additions & 0 deletions cuda_core/cuda/core/checkpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

Comment thread
leofang marked this conversation as resolved.
import ctypes as _ctypes
from collections.abc import Mapping as _Mapping
from typing import Any as _Any

from cuda.core._utils.cuda_utils import handle_return as _handle_cuda_return
from cuda.core._utils.version import binding_version as _binding_version
from cuda.core._utils.version import driver_version as _driver_version
from cuda.core.typing import ProcessStateT as _ProcessStateT

try:
from cuda.bindings import driver as _driver
except ImportError:
from cuda import cuda as _driver
Comment thread
leofang marked this conversation as resolved.


# Driver ``CUprocessState`` attribute names paired with the public string
# state names that ``Process.state`` returns.
_PROCESS_STATE_NAME_ATTRS: tuple[tuple[str, _ProcessStateT], ...] = (
    ("CU_PROCESS_STATE_RUNNING", "running"),
    ("CU_PROCESS_STATE_LOCKED", "locked"),
    ("CU_PROCESS_STATE_CHECKPOINTED", "checkpointed"),
    ("CU_PROCESS_STATE_FAILED", "failed"),
)

# Binding attributes that must exist on the driver module for checkpointing
# to work; verified once in ``_get_driver``.
_REQUIRED_BINDING_ATTRS = (
    "cuCheckpointProcessCheckpoint",
    "cuCheckpointProcessGetRestoreThreadId",
    "cuCheckpointProcessGetState",
    "cuCheckpointProcessLock",
    "cuCheckpointProcessRestore",
    "cuCheckpointProcessUnlock",
    "CUcheckpointGpuPair",
    "CUcheckpointLockArgs",
    "CUprocessState",
    "CUcheckpointRestoreArgs",
)
# Minimum NVIDIA driver version (major, minor, patch) that ships the CUDA
# checkpoint driver entry points.
_REQUIRED_DRIVER_VERSION = (12, 8, 0)
# One-shot latch: set to True by ``_get_driver`` after the binding/driver
# capability checks pass, so subsequent calls skip them.
_driver_capability_checked = False


class Process:
    """
    CUDA process that can be locked, checkpointed, restored, and unlocked.

    Parameters
    ----------
    pid : int
        Process ID of the CUDA process.
    """

    __slots__ = ("pid",)

    def __init__(self, pid: int):
        # Validate eagerly so every driver call below receives a sane PID.
        self.pid = _check_pid(pid)

    @property
    def state(self) -> _ProcessStateT:
        """
        CUDA checkpoint state for this process.
        """
        drv = _get_driver()
        raw = _call_driver(drv, drv.cuCheckpointProcessGetState, self.pid)
        try:
            return _get_process_state_names(drv)[raw]
        except KeyError as exc:
            # A state outside the known enum values means a driver newer than
            # this wrapper (or a corrupted result); surface it explicitly.
            raise RuntimeError(f"Unknown CUDA checkpoint process state: {int(raw)}") from exc

    @property
    def restore_thread_id(self) -> int:
        """
        CUDA restore thread ID for this process.
        """
        drv = _get_driver()
        return _call_driver(drv, drv.cuCheckpointProcessGetRestoreThreadId, self.pid)

    def lock(self, timeout_ms: int = 0) -> None:
        """
        Lock this process, blocking further CUDA API calls.

        Parameters
        ----------
        timeout_ms : int, optional
            Timeout in milliseconds. A value of 0 indicates no timeout.
        """
        drv = _get_driver()
        lock_args = drv.CUcheckpointLockArgs()
        lock_args.timeoutMs = _check_timeout_ms(timeout_ms)
        _call_driver(drv, drv.cuCheckpointProcessLock, self.pid, lock_args)

    def checkpoint(self) -> None:
        """
        Checkpoint the GPU memory contents of this locked process.
        """
        drv = _get_driver()
        _call_driver(drv, drv.cuCheckpointProcessCheckpoint, self.pid, None)

    def restore(self, gpu_mapping: _Mapping[_Any, _Any] | None = None) -> None:
        """
        Restore this checkpointed process.

        Parameters
        ----------
        gpu_mapping : mapping, optional
            GPU UUID remapping from each checkpointed GPU UUID to the GPU UUID
            to restore onto. For migration workflows, provide mappings for
            every CUDA-visible GPU.
        """
        drv = _get_driver()
        restore_args = _make_restore_args(drv, gpu_mapping)
        _call_driver(drv, drv.cuCheckpointProcessRestore, self.pid, restore_args)

    def unlock(self) -> None:
        """
        Unlock this locked process so it can resume CUDA API calls.
        """
        drv = _get_driver()
        _call_driver(drv, drv.cuCheckpointProcessUnlock, self.pid, None)


def _get_driver():
    """Return the driver binding module, validating checkpoint support once.

    On the first call this verifies the ``cuda.bindings`` version, the
    presence of the checkpoint entry points, and the installed driver
    version; later calls return immediately via the module-level latch.

    Raises
    ------
    RuntimeError
        If the installed cuda.bindings or NVIDIA driver lacks CUDA
        checkpoint API support.
    """
    global _driver_capability_checked
    if _driver_capability_checked:
        return _driver

    bver = _binding_version()
    if not _binding_version_supports_checkpoint(bver):
        found = ".".join(str(part) for part in bver[:3])
        raise RuntimeError(
            "CUDA checkpointing requires cuda.bindings with CUDA checkpoint API support. "
            f"Found cuda.bindings {found}."
        )

    absent = [attr for attr in _REQUIRED_BINDING_ATTRS if not hasattr(_driver, attr)]
    if absent:
        raise RuntimeError(
            f"CUDA checkpointing requires cuda.bindings with CUDA checkpoint API support. Missing: {', '.join(absent)}"
        )

    if _driver_version() < _REQUIRED_DRIVER_VERSION:
        raise RuntimeError(
            "CUDA checkpointing is not supported by the installed NVIDIA driver. "
            "Upgrade to a driver version with CUDA checkpoint API support."
        )

    # Only latch after every check passed so a failed probe is retried.
    _driver_capability_checked = True
    return _driver


def _binding_version_supports_checkpoint(version) -> bool:
major, minor, patch = version[:3]
return (major == 12 and (minor, patch) >= (8, 0)) or (major == 13 and (minor, patch) >= (0, 2)) or major > 13


def _get_process_state_names(driver) -> dict[_Any, _ProcessStateT]:
    """Map driver ``CUprocessState`` enum values to their public string names."""
    state_enum = driver.CUprocessState
    names: dict[_Any, _ProcessStateT] = {}
    for attr, state_name in _PROCESS_STATE_NAME_ATTRS:
        names[getattr(state_enum, attr)] = state_name
    return names


def _call_driver(driver, func, *args):
try:
result = func(*args)
except RuntimeError as e:
if "cuCheckpointProcess" in str(e) and "not found" in str(e):
raise RuntimeError(
"CUDA checkpointing is not supported by the installed NVIDIA driver. "
"Upgrade to a driver version with CUDA checkpoint API support."
) from e
raise
return _handle_return(driver, result)


def _handle_return(driver, result):
Comment thread
leofang marked this conversation as resolved.
Outdated
err = result[0]
not_supported_errors = (
getattr(driver.CUresult, "CUDA_ERROR_NOT_FOUND", None),
getattr(driver.CUresult, "CUDA_ERROR_NOT_SUPPORTED", None),
)
if err in not_supported_errors:
raise RuntimeError(
"CUDA checkpointing is not supported by the installed NVIDIA driver. "
"Upgrade to a driver version with CUDA checkpoint API support."
)

return _handle_cuda_return(result)


def _check_pid(pid: int) -> int:
if isinstance(pid, bool) or not isinstance(pid, int):
raise TypeError("pid must be an int")
if pid <= 0:
raise ValueError("pid must be a positive int")
return pid


def _check_timeout_ms(timeout_ms: int) -> int:
if isinstance(timeout_ms, bool) or not isinstance(timeout_ms, int):
raise TypeError("timeout_ms must be an int")
if timeout_ms < 0:
raise ValueError("timeout_ms must be >= 0")
return timeout_ms


def _make_restore_args(driver, gpu_mapping: _Mapping[_Any, _Any] | None):
    """Build a ``CUcheckpointRestoreArgs`` from an optional GPU UUID mapping.

    Returns ``None`` when *gpu_mapping* is ``None`` or empty, which tells
    the driver to restore without remapping GPUs.

    Raises
    ------
    TypeError
        If *gpu_mapping* is not a mapping.
    """
    if gpu_mapping is None:
        return None
    if not isinstance(gpu_mapping, _Mapping):
        raise TypeError("gpu_mapping must be a mapping from checkpointed GPU UUID to restore GPU UUID")

    pairs = []
    for old_uuid, new_uuid in gpu_mapping.items():
        pair = driver.CUcheckpointGpuPair()
        # NOTE(review): ``buffers`` is recreated on every iteration and goes
        # out of scope when this function returns, so the ctypes buffers that
        # ``_as_cuuuid`` appends are only guaranteed alive through the two
        # assignments below. That is safe only if the ``oldUuid``/``newUuid``
        # setters copy the UUID bytes out of the buffer — confirm against
        # cuda.bindings; otherwise the buffers must be kept alive until the
        # ``cuCheckpointProcessRestore`` call completes.
        buffers = []
        pair.oldUuid = _as_cuuuid(driver, old_uuid, buffers)
        pair.newUuid = _as_cuuuid(driver, new_uuid, buffers)
        pairs.append(pair)

    # An empty mapping is treated the same as no mapping at all.
    if not pairs:
        return None

    args = driver.CUcheckpointRestoreArgs()
    args.gpuPairs = pairs
    args.gpuPairsCount = len(pairs)
    return args


def _as_cuuuid(driver, value, buffers):
"""Convert *value* to a ``CUuuid``.

Accepts a ``CUuuid`` instance (returned as-is) or a UUID string in
the ``"xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"`` format returned by
:attr:`Device.uuid`.
"""
if isinstance(value, str):
raw = bytes.fromhex(value.replace("-", ""))
if len(raw) != 16:
raise ValueError(f"GPU UUID string must be 32 hex characters (with optional hyphens), got {value!r}")
buf = _ctypes.create_string_buffer(raw, 16)
buffers.append(buf)
return driver.CUuuid(_ctypes.addressof(buf))
return value


# Public API: only Process is exported; all other names are internal helpers.
__all__ = [
    "Process",
]
5 changes: 5 additions & 0 deletions cuda_core/cuda/core/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,15 @@

"""Public type aliases and protocols used in cuda.core API signatures."""

from typing import Literal as _Literal

from cuda.core._memory._buffer import DevicePointerT
from cuda.core._stream import IsStreamT

ProcessStateT = _Literal["running", "locked", "checkpointed", "failed"]

__all__ = [
"DevicePointerT",
"IsStreamT",
"ProcessStateT",
]
59 changes: 59 additions & 0 deletions cuda_core/docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,65 @@ CUDA compilation toolchain
LinkerOptions


CUDA process checkpointing
--------------------------

The :mod:`cuda.core.checkpoint` module wraps the CUDA driver process
checkpoint APIs. These APIs are intended for Linux process checkpoint and
restore workflows, and require a CUDA driver with checkpoint API support and
a ``cuda-bindings`` version that exposes those driver entry points.

Checkpointing is typically driven by a coordinator process acting on a target
CUDA process, similar to attaching a debugger or sending a signal. The target
process is identified by process ID. Linux and the CUDA driver enforce process
permissions; checkpointing another user's process may require elevated
permissions such as ``CAP_SYS_PTRACE`` or administrator privileges.

The CUDA checkpoint APIs prepare CUDA-managed GPU state for process-level
checkpoint and restore. They do not capture the CPU process image by
themselves; full process checkpoint workflows still need a CPU-side process
checkpointing tool such as CRIU. A minimal coordinator-side sequence looks like
this:

.. code-block:: python

import os

from cuda.core import checkpoint

    target_pid = os.getpid()  # for illustration only; normally the PID of a separate target CUDA process
process = checkpoint.Process(target_pid)
process.lock(timeout_ms=5000)
process.checkpoint()

# Capture or restore the CPU process image outside cuda.core.

process.restore()
process.unlock()

``Process.state`` returns one of ``"running"``, ``"locked"``,
``"checkpointed"``, or ``"failed"``. Restore may optionally remap GPUs by
passing ``gpu_mapping`` from each checkpointed GPU UUID to the GPU UUID that
should be used during restore. For migration workflows, provide mappings for
every CUDA-visible GPU. The mapping may use ``CUuuid`` objects or the UUID
strings returned by :attr:`Device.uuid`. A successful restore returns the
process to the locked state; call ``Process.unlock`` after restore to allow
CUDA API calls to resume.

The CUDA driver requires restore to run from the process restore thread.
Use ``Process.restore_thread_id`` to discover that thread before calling
``Process.restore`` from a checkpoint coordinator. Restore also requires
persistence mode to be enabled or ``cuInit`` to have been called before
execution.

.. autosummary::
:toctree: generated/

:template: class.rst

checkpoint.Process


CUDA system information and NVIDIA Management Library (NVML)
------------------------------------------------------------

Expand Down
1 change: 1 addition & 0 deletions cuda_core/docs/source/api_private.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ CUDA runtime
:toctree: generated/

typing.DevicePointerT
typing.ProcessStateT
_memory._virtual_memory_resource.VirtualMemoryAllocationTypeT
_memory._virtual_memory_resource.VirtualMemoryLocationTypeT
_memory._virtual_memory_resource.VirtualMemoryGranularityT
Expand Down
5 changes: 4 additions & 1 deletion cuda_core/docs/source/release/1.0.0-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ Highlights
New features
------------

- TBD
- Added the :mod:`cuda.core.checkpoint` module for CUDA process checkpointing,
including string process state queries, lock/checkpoint/restore/unlock
operations, and GPU UUID remapping support for restore.
(`#1343 <https://github.com/NVIDIA/cuda-python/issues/1343>`__)


Fixes and enhancements
Expand Down
Loading
Loading