From 3ce42b7eaa7f773789fbbde204242a51d301cb23 Mon Sep 17 00:00:00 2001 From: mdzurick Date: Mon, 6 Apr 2026 19:12:43 +0000 Subject: [PATCH 1/2] Use pinned host memory for GPU state-to-NumPy conversion (#2797) - Fix performance bottleneck in `np.array(cudaq.get_state(kernel))` for GPU-backed states by using pinned (`cudaMallocHost`) host memory instead of pageable (`new[]`) for the device-to-host transfer in the pybind11 buffer protocol handler - Add `SimulationState::toHostBuffer()` virtual method so GPU backends can provide optimized host allocation, with a pageable fallback for non-GPU backends and for systems that cannot pin the requested amount of memory - Fix `to_cupy()` passing a hardcoded 1024-byte size to `UnownedMemory` instead of the actual buffer size Signed-off-by: mdzurick --- docs/sphinx/api/languages/cpp_api.rst | 3 +++ python/cudaq/runtime/state.py | 3 ++- python/runtime/cudaq/algorithms/py_state.cpp | 25 ++++--------------- runtime/common/SimulationState.h | 25 +++++++++++++++++++ runtime/cudaq/qis/state.h | 6 +++++ .../nvqir/cudensitymat/CuDensityMatState.cpp | 22 ++++++++++++++++ .../nvqir/cudensitymat/CuDensityMatState.h | 3 +++ runtime/nvqir/custatevec/CuStateVecState.h | 25 +++++++++++++++++++ 8 files changed, 91 insertions(+), 21 deletions(-) diff --git a/docs/sphinx/api/languages/cpp_api.rst b/docs/sphinx/api/languages/cpp_api.rst index ba842449820..1eb2776bc66 100644 --- a/docs/sphinx/api/languages/cpp_api.rst +++ b/docs/sphinx/api/languages/cpp_api.rst @@ -137,6 +137,9 @@ Common .. doxygenstruct:: cudaq::SimulationState::Tensor :members: +.. doxygenstruct:: cudaq::SimulationState::HostBuffer + :members: + .. doxygenenum:: cudaq::SimulationState::precision .. 
doxygenenum:: cudaq::simulation_precision diff --git a/python/cudaq/runtime/state.py b/python/cudaq/runtime/state.py index 4c642a599f3..b70f8a2404f 100644 --- a/python/cudaq/runtime/state.py +++ b/python/cudaq/runtime/state.py @@ -97,7 +97,8 @@ def to_cupy(state, dtype=None): arrays = [] for tensor in state.getTensors(): - mem = cp.cuda.UnownedMemory(tensor.data(), 1024, owner=None) + total_bytes = tensor.get_num_elements() * tensor.get_element_size() + mem = cp.cuda.UnownedMemory(tensor.data(), total_bytes, owner=None) memptr = cp.cuda.MemoryPointer(mem, offset=0) arrays.append(cp.ndarray(tensor.extents, dtype=dtype, memptr=memptr)) return arrays diff --git a/python/runtime/cudaq/algorithms/py_state.cpp b/python/runtime/cudaq/algorithms/py_state.cpp index d7e4b9aa372..4919a3454f6 100644 --- a/python/runtime/cudaq/algorithms/py_state.cpp +++ b/python/runtime/cudaq/algorithms/py_state.cpp @@ -353,28 +353,13 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { auto stateVector = self.get_tensor(); auto precision = self.get_precision(); if (self.is_on_gpu()) { - // This is device data, transfer to host, which gives us - // ownership of a new data pointer on host. Store it globally + // This is device data, transfer to host. GPU backends use pinned + // host memory for faster DMA transfers. Store the buffer globally // here so we ensure that it gets cleaned up. 
auto numElements = stateVector.get_num_elements(); - if (precision == SimulationState::precision::fp32) { - auto *hostData = new std::complex<float>[numElements]; - self.to_host(hostData, numElements); - dataPtr = reinterpret_cast<void *>(hostData); - } else { - auto *hostData = new std::complex<double>[numElements]; - self.to_host(hostData, numElements); - dataPtr = reinterpret_cast<void *>(hostData); - } - hostDataFromDevice.emplace_back(dataPtr, [precision](void *data) { - CUDAQ_INFO("freeing data that was copied from GPU device for " - "compatibility with NumPy"); - // Use delete[] to match new[] allocation (not free()) - if (precision == SimulationState::precision::fp32) - delete[] static_cast<std::complex<float> *>(data); - else - delete[] static_cast<std::complex<double> *>(data); - }); + auto hostBuf = self.toHostBuffer(numElements); + dataPtr = hostBuf.data; + hostDataFromDevice.emplace_back(dataPtr, std::move(hostBuf.deleter)); } else { dataPtr = self.get_tensor().data; } diff --git a/runtime/common/SimulationState.h b/runtime/common/SimulationState.h index 1fdf45ad23d..8ec4638120a 100644 --- a/runtime/common/SimulationState.h +++ b/runtime/common/SimulationState.h @@ -12,6 +12,7 @@ #include <cassert> #include <complex> #include <cstring> +#include <functional> #include <map> #include <memory> #include <vector> @@ -256,6 +257,30 @@ class SimulationState { "SimulationState::toHost complex64 not implemented."); } + /// @brief A host-side buffer with a custom cleanup function. + struct HostBuffer { + void *data = nullptr; + std::function<void(void *)> deleter; + }; + + /// @brief Allocate a host buffer and copy all state data into it. + /// GPU-backed states should override to use pinned memory for faster + /// device-to-host transfers. Returns a HostBuffer that knows how to + /// free the memory correctly.
+ virtual HostBuffer toHostBuffer(std::size_t numElements) const { + auto prec = getPrecision(); + if (prec == precision::fp32) { + auto *ptr = new std::complex<float>[numElements]; + toHost(ptr, numElements); + return {ptr, + [](void *p) { delete[] static_cast<std::complex<float> *>(p); }}; + } + auto *ptr = new std::complex<double>[numElements]; + toHost(ptr, numElements); + return {ptr, + [](void *p) { delete[] static_cast<std::complex<double> *>(p); }}; + } + /// @brief Destructor virtual ~SimulationState() {} }; diff --git a/runtime/cudaq/qis/state.h b/runtime/cudaq/qis/state.h index 3723c80be0c..819e49c90de 100644 --- a/runtime/cudaq/qis/state.h +++ b/runtime/cudaq/qis/state.h @@ -143,6 +143,12 @@ class state { internal->toHost(hostPtr, numElements); } + /// @brief Allocate a host buffer and copy state data from device. + /// GPU backends use pinned memory for optimal transfer speed. + SimulationState::HostBuffer toHostBuffer(std::size_t numElements) const { + return internal->toHostBuffer(numElements); + } + /// @brief Dump the state to standard out void dump() const; diff --git a/runtime/nvqir/cudensitymat/CuDensityMatState.cpp b/runtime/nvqir/cudensitymat/CuDensityMatState.cpp index 53469e13f8a..1e1596d6832 100644 --- a/runtime/nvqir/cudensitymat/CuDensityMatState.cpp +++ b/runtime/nvqir/cudensitymat/CuDensityMatState.cpp @@ -228,6 +228,28 @@ void CuDensityMatState::toHost(std::complex<double> *userData, "double-precision array."); } +// Allocate a pinned host buffer and copy state data into it.
+SimulationState::HostBuffer +CuDensityMatState::toHostBuffer(std::size_t numElements) const { + if (numElements != dimension) + throw std::runtime_error( + fmt::format("toHostBuffer: element count mismatch: provided {}, " + "expected {}.", + numElements, dimension)); + std::size_t numBytes = numElements * sizeof(std::complex<double>); + void *pinnedPtr = nullptr; + auto err = cudaMallocHost(&pinnedPtr, numBytes); + if (err != cudaSuccess) + return SimulationState::toHostBuffer(numElements); + auto cpyErr = + cudaMemcpy(pinnedPtr, devicePtr, numBytes, cudaMemcpyDeviceToHost); + if (cpyErr != cudaSuccess) { + cudaFreeHost(pinnedPtr); + HANDLE_CUDA_ERROR(cpyErr); + } + return {pinnedPtr, [](void *p) { cudaFreeHost(p); }}; +} + // Free the device data. void CuDensityMatState::destroyState() { if (cudmState) { diff --git a/runtime/nvqir/cudensitymat/CuDensityMatState.h b/runtime/nvqir/cudensitymat/CuDensityMatState.h index e8e9117638f..fdc398f4016 100644 --- a/runtime/nvqir/cudensitymat/CuDensityMatState.h +++ b/runtime/nvqir/cudensitymat/CuDensityMatState.h @@ -116,6 +116,9 @@ class CuDensityMatState : public cudaq::SimulationState { // Copy the state device data to the user-provided host data pointer. void toHost(std::complex<double> *userData, std::size_t numElements) const override; + // Allocate a pinned host buffer and copy state data into it. + HostBuffer toHostBuffer(std::size_t numElements) const override; + // Free the device data. void destroyState() override; diff --git a/runtime/nvqir/custatevec/CuStateVecState.h b/runtime/nvqir/custatevec/CuStateVecState.h index a70bb2a8a4d..2559eb1b2da 100644 --- a/runtime/nvqir/custatevec/CuStateVecState.h +++ b/runtime/nvqir/custatevec/CuStateVecState.h @@ -284,6 +284,31 @@ class CusvState : public cudaq::SimulationState { return; } + /// @brief Allocate a pinned host buffer and copy state data into it.
+ /// Pinned memory enables direct DMA transfers, significantly improving + /// bandwidth compared to non-pinned memory for large device-to-host copies. + HostBuffer toHostBuffer(std::size_t numElements) const override { + if (numElements != size) + throw std::runtime_error( + "[custatevec-state] toHostBuffer: invalid number of elements."); + checkAndSetDevice(); + std::size_t numBytes = numElements * sizeof(std::complex<ScalarType>); + void *pinnedPtr = nullptr; + auto err = cudaMallocHost(&pinnedPtr, numBytes); + if (err != cudaSuccess) { + // Fallback to non-pinned allocation if pinning fails (e.g., very large + // states that exceed the OS limit for pinned memory). + return SimulationState::toHostBuffer(numElements); + } + auto cpyErr = + cudaMemcpy(pinnedPtr, devicePtr, numBytes, cudaMemcpyDeviceToHost); + if (cpyErr != cudaSuccess) { + cudaFreeHost(pinnedPtr); + HANDLE_CUDA_ERROR(cpyErr); + } + return {pinnedPtr, [](void *p) { cudaFreeHost(p); }}; + } + /// @brief Free the device data. void destroyState() override { if (!ownsDevicePtr) From 80b17cff0613ecd4746b69994e757229d31fe9df Mon Sep 17 00:00:00 2001 From: mdzurick Date: Mon, 13 Apr 2026 13:05:45 +0000 Subject: [PATCH 2/2] reuse state data Signed-off-by: mdzurick --- runtime/nvqir/custatevec/CuStateVecState.h | 60 ++++++++++++++++------ 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/runtime/nvqir/custatevec/CuStateVecState.h b/runtime/nvqir/custatevec/CuStateVecState.h index 2559eb1b2da..5ece98c13bc 100644 --- a/runtime/nvqir/custatevec/CuStateVecState.h +++ b/runtime/nvqir/custatevec/CuStateVecState.h @@ -56,6 +56,12 @@ class CusvState : public cudaq::SimulationState { /// @brief Flag indicating ownership of the state data. bool ownsDevicePtr = true; + /// @brief Cached pinned host buffer for device-to-host transfers.
+ /// Allocated once on the first toHostBuffer call and reused on subsequent + /// calls, avoiding repeated cudaMallocHost/cudaFreeHost overhead that + /// dominates for large state vectors (>= 64 MiB). + mutable void *cachedPinnedPtr = nullptr; + /// @brief Check that we are currently /// using the correct CUDA device, set it /// to the correct one if not @@ -284,33 +290,46 @@ class CusvState : public cudaq::SimulationState { return; } - /// @brief Allocate a pinned host buffer and copy state data into it. - /// Pinned memory enables direct DMA transfers, significantly improving - /// bandwidth compared to non-pinned memory for large device-to-host copies. + /// @brief Copy state data into a pinned host buffer and return it. + /// + /// The pinned buffer is allocated once on the first call and reused on + /// subsequent calls. This eliminates repeated cudaMallocHost/cudaFreeHost + /// overhead, which dominates transfer time for large state vectors (>= 64 + /// MiB) and negates the DMA bandwidth benefit of pinned memory. + /// + /// The returned HostBuffer uses a no-op deleter because this object owns + /// the pinned allocation and frees it in destroyState(). np.array() copies + /// the data synchronously, so the buffer is never in use across two + /// concurrent calls on the same state object. HostBuffer toHostBuffer(std::size_t numElements) const override { if (numElements != size) throw std::runtime_error( "[custatevec-state] toHostBuffer: invalid number of elements."); checkAndSetDevice(); - std::size_t numBytes = numElements * sizeof(std::complex<ScalarType>); - void *pinnedPtr = nullptr; - auto err = cudaMallocHost(&pinnedPtr, numBytes); - if (err != cudaSuccess) { - // Fallback to non-pinned allocation if pinning fails (e.g., very large - // states that exceed the OS limit for pinned memory).
- return SimulationState::toHostBuffer(numElements); + const std::size_t numBytes = numElements * sizeof(std::complex<ScalarType>); + // Allocate pinned buffer on first call; reuse it on all subsequent calls. + if (cachedPinnedPtr == nullptr) { + auto err = cudaMallocHost(&cachedPinnedPtr, numBytes); + if (err != cudaSuccess) { + // Fallback to pageable allocation if pinning fails (e.g., the OS + // pinned-memory limit is exhausted). + return SimulationState::toHostBuffer(numElements); + } } - auto cpyErr = - cudaMemcpy(pinnedPtr, devicePtr, numBytes, cudaMemcpyDeviceToHost); - if (cpyErr != cudaSuccess) { - cudaFreeHost(pinnedPtr); + auto cpyErr = cudaMemcpy(cachedPinnedPtr, devicePtr, numBytes, + cudaMemcpyDeviceToHost); + if (cpyErr != cudaSuccess) HANDLE_CUDA_ERROR(cpyErr); - } - return {pinnedPtr, [](void *p) { cudaFreeHost(p); }}; + // No-op deleter: this state object owns the pinned buffer. + return {cachedPinnedPtr, [](void *) {}}; } - /// @brief Free the device data. + /// @brief Free the device data and the cached pinned host buffer. void destroyState() override { + if (cachedPinnedPtr) { + cudaFreeHost(cachedPinnedPtr); + cachedPinnedPtr = nullptr; + } if (!ownsDevicePtr) return; @@ -327,6 +346,13 @@ class CusvState : public cudaq::SimulationState { HANDLE_CUDA_ERROR(cudaFree(devicePtr)); } + + ~CusvState() override { + if (cachedPinnedPtr) { + cudaFreeHost(cachedPinnedPtr); + cachedPinnedPtr = nullptr; + } + } }; } // namespace cudaq