diff --git a/docs/sphinx/api/languages/cpp_api.rst b/docs/sphinx/api/languages/cpp_api.rst
index ba842449820..1eb2776bc66 100644
--- a/docs/sphinx/api/languages/cpp_api.rst
+++ b/docs/sphinx/api/languages/cpp_api.rst
@@ -137,6 +137,9 @@ Common
 .. doxygenstruct:: cudaq::SimulationState::Tensor
     :members:

+.. doxygenstruct:: cudaq::SimulationState::HostBuffer
+    :members:
+
 .. doxygenenum:: cudaq::SimulationState::precision

 .. doxygenenum:: cudaq::simulation_precision
diff --git a/python/cudaq/runtime/state.py b/python/cudaq/runtime/state.py
index 4c642a599f3..b70f8a2404f 100644
--- a/python/cudaq/runtime/state.py
+++ b/python/cudaq/runtime/state.py
@@ -97,7 +97,8 @@ def to_cupy(state, dtype=None):

     arrays = []
     for tensor in state.getTensors():
-        mem = cp.cuda.UnownedMemory(tensor.data(), 1024, owner=None)
+        total_bytes = tensor.get_num_elements() * tensor.get_element_size()
+        mem = cp.cuda.UnownedMemory(tensor.data(), total_bytes, owner=None)
         memptr = cp.cuda.MemoryPointer(mem, offset=0)
         arrays.append(cp.ndarray(tensor.extents, dtype=dtype, memptr=memptr))
     return arrays
diff --git a/python/runtime/cudaq/algorithms/py_state.cpp b/python/runtime/cudaq/algorithms/py_state.cpp
index 38fa842f020..37a4c08ad0a 100644
--- a/python/runtime/cudaq/algorithms/py_state.cpp
+++ b/python/runtime/cudaq/algorithms/py_state.cpp
@@ -357,28 +357,13 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) {
         auto stateVector = self.get_tensor();
         auto precision = self.get_precision();
         if (self.is_on_gpu()) {
-          // This is device data, transfer to host, which gives us
-          // ownership of a new data pointer on host. Store it globally
+          // This is device data, transfer to host. GPU backends use pinned
+          // host memory for faster DMA transfers. Store the buffer globally
           // here so we ensure that it gets cleaned up.
           auto numElements = stateVector.get_num_elements();
-          if (precision == SimulationState::precision::fp32) {
-            auto *hostData = new std::complex<float>[numElements];
-            self.to_host(hostData, numElements);
-            dataPtr = reinterpret_cast<void *>(hostData);
-          } else {
-            auto *hostData = new std::complex<double>[numElements];
-            self.to_host(hostData, numElements);
-            dataPtr = reinterpret_cast<void *>(hostData);
-          }
-          hostDataFromDevice.emplace_back(dataPtr, [precision](void *data) {
-            CUDAQ_INFO("freeing data that was copied from GPU device for "
-                       "compatibility with NumPy");
-            // Use delete[] to match new[] allocation (not free())
-            if (precision == SimulationState::precision::fp32)
-              delete[] static_cast<std::complex<float> *>(data);
-            else
-              delete[] static_cast<std::complex<double> *>(data);
-          });
+          auto hostBuf = self.toHostBuffer(numElements);
+          dataPtr = hostBuf.data;
+          hostDataFromDevice.emplace_back(dataPtr, std::move(hostBuf.deleter));
         } else {
           dataPtr = self.get_tensor().data;
         }
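
The py_state.cpp hunk above delegates the precision branching to the new SimulationState::toHostBuffer API. Below is a minimal caller-side sketch of the same pattern, not part of this change: it assumes compilation with nvq++ against a GPU simulator target (e.g. --target nvidia), and the bell kernel is a hypothetical example; the toHostBuffer, get_tensor, is_on_gpu, and get_precision calls are the ones exercised by this patch.

#include <cudaq.h>

// Hypothetical two-qubit Bell-state kernel, only used here to produce a
// GPU-resident simulation state.
auto bell = []() __qpu__ {
  cudaq::qvector q(2);
  h(q[0]);
  x<cudaq::ctrl>(q[0], q[1]);
};

int main() {
  auto state = cudaq::get_state(bell);
  auto numElements = state.get_tensor().get_num_elements();
  if (state.is_on_gpu()) {
    // One call replaces the old precision-branched new[] / to_host pattern.
    auto buf = state.toHostBuffer(numElements);
    // buf.data holds numElements complex values whose width matches
    // state.get_precision(); interpret accordingly before use.
    buf.deleter(buf.data); // release through the buffer's own cleanup function
  }
  return 0;
}
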
diff --git a/runtime/common/SimulationState.h b/runtime/common/SimulationState.h
index 1fdf45ad23d..8ec4638120a 100644
--- a/runtime/common/SimulationState.h
+++ b/runtime/common/SimulationState.h
@@ -12,6 +12,7 @@
 #include <...>
 #include <...>
 #include <...>
+#include <functional>
 #include <...>
 #include <...>
 #include <...>
@@ -256,6 +257,30 @@ class SimulationState {
         "SimulationState::toHost complex64 not implemented.");
   }

+  /// @brief A host-side buffer with a custom cleanup function.
+  struct HostBuffer {
+    void *data = nullptr;
+    std::function<void(void *)> deleter;
+  };
+
+  /// @brief Allocate a host buffer and copy all state data into it.
+  /// GPU-backed states should override to use pinned memory for faster
+  /// device-to-host transfers. Returns a HostBuffer that knows how to
+  /// free the memory correctly.
+  virtual HostBuffer toHostBuffer(std::size_t numElements) const {
+    auto prec = getPrecision();
+    if (prec == precision::fp32) {
+      auto *ptr = new std::complex<float>[numElements];
+      toHost(ptr, numElements);
+      return {ptr,
+              [](void *p) { delete[] static_cast<std::complex<float> *>(p); }};
+    }
+    auto *ptr = new std::complex<double>[numElements];
+    toHost(ptr, numElements);
+    return {ptr,
+            [](void *p) { delete[] static_cast<std::complex<double> *>(p); }};
+  }
+
   /// @brief Destructor
   virtual ~SimulationState() {}
 };
diff --git a/runtime/cudaq/qis/state.h b/runtime/cudaq/qis/state.h
index 3723c80be0c..819e49c90de 100644
--- a/runtime/cudaq/qis/state.h
+++ b/runtime/cudaq/qis/state.h
@@ -143,6 +143,12 @@ class state {
     internal->toHost(hostPtr, numElements);
   }

+  /// @brief Allocate a host buffer and copy the state data into it.
+  /// GPU backends use pinned memory for faster device-to-host transfers.
+  SimulationState::HostBuffer toHostBuffer(std::size_t numElements) const {
+    return internal->toHostBuffer(numElements);
+  }
+
   /// @brief Dump the state to standard out
   void dump() const;

diff --git a/runtime/nvqir/cudensitymat/CuDensityMatState.cpp b/runtime/nvqir/cudensitymat/CuDensityMatState.cpp
index 53469e13f8a..1e1596d6832 100644
--- a/runtime/nvqir/cudensitymat/CuDensityMatState.cpp
+++ b/runtime/nvqir/cudensitymat/CuDensityMatState.cpp
@@ -228,6 +228,28 @@ void CuDensityMatState::toHost(std::complex<float> *userData,
       "double-precision array.");
 }

+// Allocate a pinned host buffer and copy state data into it.
+SimulationState::HostBuffer
+CuDensityMatState::toHostBuffer(std::size_t numElements) const {
+  if (numElements != dimension)
+    throw std::runtime_error(
+        fmt::format("toHostBuffer: element count mismatch: provided {}, "
+                    "expected {}.",
+                    numElements, dimension));
+  std::size_t numBytes = numElements * sizeof(std::complex<double>);
+  void *pinnedPtr = nullptr;
+  auto err = cudaMallocHost(&pinnedPtr, numBytes);
+  if (err != cudaSuccess)
+    return SimulationState::toHostBuffer(numElements);
+  auto cpyErr =
+      cudaMemcpy(pinnedPtr, devicePtr, numBytes, cudaMemcpyDeviceToHost);
+  if (cpyErr != cudaSuccess) {
+    cudaFreeHost(pinnedPtr);
+    HANDLE_CUDA_ERROR(cpyErr);
+  }
+  return {pinnedPtr, [](void *p) { cudaFreeHost(p); }};
+}
+
 // Free the device data.
 void CuDensityMatState::destroyState() {
   if (cudmState) {
diff --git a/runtime/nvqir/cudensitymat/CuDensityMatState.h b/runtime/nvqir/cudensitymat/CuDensityMatState.h
index e8e9117638f..fdc398f4016 100644
--- a/runtime/nvqir/cudensitymat/CuDensityMatState.h
+++ b/runtime/nvqir/cudensitymat/CuDensityMatState.h
@@ -116,6 +116,9 @@ class CuDensityMatState : public cudaq::SimulationState {
   // Copy the state device data to the user-provided host data pointer.
   void toHost(std::complex<double> *userData,
               std::size_t numElements) const override;

+  // Allocate a pinned host buffer and copy state data into it.
+  HostBuffer toHostBuffer(std::size_t numElements) const override;
+
   // Free the device data.
   void destroyState() override;
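
Both backend overrides in this patch follow the same pinned-with-fallback copy pattern: try cudaMallocHost, copy with cudaMemcpy, and fall back to a pageable allocation if pinning fails. The standalone sketch below restates that pattern outside of CUDA-Q using only CUDA runtime calls; the HostBuffer and makeHostCopy names here are illustrative stand-ins, not the CUDA-Q definitions.

#include <complex>
#include <cstddef>
#include <functional>
#include <cuda_runtime.h>

struct HostBuffer {
  void *data = nullptr;
  std::function<void(void *)> deleter;
};

HostBuffer makeHostCopy(const void *devicePtr, std::size_t numBytes) {
  void *pinned = nullptr;
  // Prefer pinned (page-locked) memory: a device-to-host cudaMemcpy into
  // pinned memory can DMA directly instead of staging through an internal
  // bounce buffer.
  if (cudaMallocHost(&pinned, numBytes) == cudaSuccess) {
    if (cudaMemcpy(pinned, devicePtr, numBytes, cudaMemcpyDeviceToHost) ==
        cudaSuccess)
      return {pinned, [](void *p) { cudaFreeHost(p); }};
    cudaFreeHost(pinned);
  }
  // Fallback: pageable allocation (e.g., when the pinned-memory limit is hit).
  auto *pageable = new char[numBytes];
  cudaMemcpy(pageable, devicePtr, numBytes, cudaMemcpyDeviceToHost);
  return {pageable, [](void *p) { delete[] static_cast<char *>(p); }};
}

int main() {
  const std::size_t n = 8;
  std::complex<double> host[n] = {};
  void *dev = nullptr;
  cudaMalloc(&dev, sizeof(host));
  cudaMemcpy(dev, host, sizeof(host), cudaMemcpyHostToDevice);
  auto buf = makeHostCopy(dev, sizeof(host));
  buf.deleter(buf.data);
  cudaFree(dev);
  return 0;
}
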
diff --git a/runtime/nvqir/custatevec/CuStateVecState.h b/runtime/nvqir/custatevec/CuStateVecState.h
index a70bb2a8a4d..5ece98c13bc 100644
--- a/runtime/nvqir/custatevec/CuStateVecState.h
+++ b/runtime/nvqir/custatevec/CuStateVecState.h
@@ -56,6 +56,12 @@ class CusvState : public cudaq::SimulationState {
   /// @brief Flag indicating ownership of the state data.
   bool ownsDevicePtr = true;

+  /// @brief Cached pinned host buffer for device-to-host transfers.
+  /// Allocated once on the first toHostBuffer call and reused on subsequent
+  /// calls, avoiding repeated cudaMallocHost/cudaFreeHost overhead that
+  /// dominates for large state vectors (>= 64 MiB).
+  mutable void *cachedPinnedPtr = nullptr;
+
   /// @brief Check that we are currently
   /// using the correct CUDA device, set it
   /// to the correct one if not
@@ -284,8 +290,46 @@ class CusvState : public cudaq::SimulationState {
       return;
   }

-  /// @brief Free the device data.
+  /// @brief Copy state data into a pinned host buffer and return it.
+  ///
+  /// The pinned buffer is allocated once on the first call and reused on
+  /// subsequent calls. This eliminates repeated cudaMallocHost/cudaFreeHost
+  /// overhead, which dominates transfer time for large state vectors (>= 64
+  /// MiB) and negates the DMA bandwidth benefit of pinned memory.
+  ///
+  /// The returned HostBuffer uses a no-op deleter because this object owns
+  /// the pinned allocation and frees it in destroyState(). np.array() copies
+  /// the data synchronously, so the buffer is never in use across two
+  /// concurrent calls on the same state object.
+  HostBuffer toHostBuffer(std::size_t numElements) const override {
+    if (numElements != size)
+      throw std::runtime_error(
+          "[custatevec-state] toHostBuffer: invalid number of elements.");
+    checkAndSetDevice();
+    const std::size_t numBytes =
+        numElements * sizeof(std::complex<ScalarType>);
+    // Allocate the pinned buffer on the first call; reuse it on all
+    // subsequent calls.
+    if (cachedPinnedPtr == nullptr) {
+      auto err = cudaMallocHost(&cachedPinnedPtr, numBytes);
+      if (err != cudaSuccess) {
+        // Fall back to a pageable allocation if pinning fails (e.g., the OS
+        // pinned-memory limit is exhausted).
+        return SimulationState::toHostBuffer(numElements);
+      }
+    }
+    auto cpyErr = cudaMemcpy(cachedPinnedPtr, devicePtr, numBytes,
+                             cudaMemcpyDeviceToHost);
+    if (cpyErr != cudaSuccess)
+      HANDLE_CUDA_ERROR(cpyErr);
+    // No-op deleter: this state object owns the pinned buffer.
+    return {cachedPinnedPtr, [](void *) {}};
+  }
+
+  /// @brief Free the device data and the cached pinned host buffer.
   void destroyState() override {
+    if (cachedPinnedPtr) {
+      cudaFreeHost(cachedPinnedPtr);
+      cachedPinnedPtr = nullptr;
+    }
     if (!ownsDevicePtr)
       return;

@@ -302,6 +346,13 @@ class CusvState : public cudaq::SimulationState {
     HANDLE_CUDA_ERROR(cudaFree(devicePtr));
   }
+
+  ~CusvState() override {
+    if (cachedPinnedPtr) {
+      cudaFreeHost(cachedPinnedPtr);
+      cachedPinnedPtr = nullptr;
+    }
+  }
 };

} // namespace cudaq
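
Note on ownership: the CusvState override above hands out its cached buffer with a no-op deleter, so the pinned allocation stays owned by the state object, while the default and CuDensityMatState overrides transfer ownership to the caller through the deleter. The sketch below shows one way a caller could hold either kind of HostBuffer uniformly, mirroring how py_state.cpp stores (data, deleter) pairs in hostDataFromDevice. The OwnedHostBuffer alias, takeOwnership helper, and the local HostBuffer stand-in are assumptions for illustration, not part of this change.

#include <functional>
#include <memory>
#include <utility>

struct HostBuffer {  // stand-in for cudaq::SimulationState::HostBuffer
  void *data = nullptr;
  std::function<void(void *)> deleter;
};

// Owning handle: runs the buffer's deleter exactly once on destruction.
using OwnedHostBuffer = std::unique_ptr<void, std::function<void(void *)>>;

OwnedHostBuffer takeOwnership(HostBuffer buf) {
  // If the deleter is a no-op (the CusvState case), destruction does nothing
  // and the backing allocation remains owned by the state object.
  return OwnedHostBuffer(buf.data, std::move(buf.deleter));
}

int main() {
  HostBuffer buf{new double[4],
                 [](void *p) { delete[] static_cast<double *>(p); }};
  auto owned = takeOwnership(std::move(buf));
  return 0; // `owned` frees the allocation on scope exit
}
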