3 changes: 3 additions & 0 deletions docs/sphinx/api/languages/cpp_api.rst
@@ -137,6 +137,9 @@ Common
.. doxygenstruct:: cudaq::SimulationState::Tensor
:members:

.. doxygenstruct:: cudaq::SimulationState::HostBuffer
:members:

.. doxygenenum:: cudaq::SimulationState::precision

.. doxygenenum:: cudaq::simulation_precision
3 changes: 2 additions & 1 deletion python/cudaq/runtime/state.py
@@ -97,7 +97,8 @@ def to_cupy(state, dtype=None):

arrays = []
for tensor in state.getTensors():
mem = cp.cuda.UnownedMemory(tensor.data(), 1024, owner=None)
total_bytes = tensor.get_num_elements() * tensor.get_element_size()
mem = cp.cuda.UnownedMemory(tensor.data(), total_bytes, owner=None)
memptr = cp.cuda.MemoryPointer(mem, offset=0)
arrays.append(cp.ndarray(tensor.extents, dtype=dtype, memptr=memptr))
return arrays
25 changes: 5 additions & 20 deletions python/runtime/cudaq/algorithms/py_state.cpp
@@ -357,28 +357,13 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) {
auto stateVector = self.get_tensor();
auto precision = self.get_precision();
if (self.is_on_gpu()) {
// This is device data, transfer to host, which gives us
// ownership of a new data pointer on host. Store it globally
// This is device data, transfer to host. GPU backends use pinned
// host memory for faster DMA transfers. Store the buffer globally
// here so we ensure that it gets cleaned up.
auto numElements = stateVector.get_num_elements();
if (precision == SimulationState::precision::fp32) {
auto *hostData = new std::complex<float>[numElements];
self.to_host(hostData, numElements);
dataPtr = reinterpret_cast<void *>(hostData);
} else {
auto *hostData = new std::complex<double>[numElements];
self.to_host(hostData, numElements);
dataPtr = reinterpret_cast<void *>(hostData);
}
hostDataFromDevice.emplace_back(dataPtr, [precision](void *data) {
CUDAQ_INFO("freeing data that was copied from GPU device for "
"compatibility with NumPy");
// Use delete[] to match new[] allocation (not free())
if (precision == SimulationState::precision::fp32)
delete[] static_cast<std::complex<float> *>(data);
else
delete[] static_cast<std::complex<double> *>(data);
});
auto hostBuf = self.toHostBuffer(numElements);
dataPtr = hostBuf.data;
hostDataFromDevice.emplace_back(dataPtr, std::move(hostBuf.deleter));
} else {
dataPtr = self.get_tensor().data;
}
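Usage sketch (not part of the diff): how a caller on the binding side can consume the new HostBuffer, keeping the host copy alive by pairing the pointer with the deleter that came with it. The guard type, helper name, and include path are assumptions for illustration; the actual binding simply stores the pair in hostDataFromDevice as shown above.

// Sketch only: tie a HostBuffer's lifetime to an RAII guard so its deleter
// (pinned, pageable, or no-op) runs exactly once.
#include <functional>
#include <memory>
#include <vector>
#include "cudaq/qis/state.h" // include path assumed

using HostBufferGuard = std::unique_ptr<void, std::function<void(void *)>>;

void *hostPointerFor(cudaq::state &s, std::vector<HostBufferGuard> &keepAlive) {
  auto tensor = s.get_tensor();
  if (!s.is_on_gpu())
    return tensor.data; // already host-resident, no copy needed
  auto buf = s.toHostBuffer(tensor.get_num_elements());
  keepAlive.emplace_back(buf.data, std::move(buf.deleter));
  return keepAlive.back().get();
}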
25 changes: 25 additions & 0 deletions runtime/common/SimulationState.h
@@ -12,6 +12,7 @@
#include <algorithm>
#include <bitset>
#include <complex>
#include <functional>
#include <memory>
#include <optional>
#include <variant>
@@ -256,6 +257,30 @@ class SimulationState {
"SimulationState::toHost complex64 not implemented.");
}

/// @brief A host-side buffer with a custom cleanup function.
struct HostBuffer {
void *data = nullptr;
std::function<void(void *)> deleter;
};

/// @brief Allocate a host buffer and copy all state data into it.
/// GPU-backed states should override to use pinned memory for faster
/// device-to-host transfers. Returns a HostBuffer that knows how to
/// free the memory correctly.
virtual HostBuffer toHostBuffer(std::size_t numElements) const {
auto prec = getPrecision();
if (prec == precision::fp32) {
auto *ptr = new std::complex<float>[numElements];
toHost(ptr, numElements);
return {ptr,
[](void *p) { delete[] static_cast<std::complex<float> *>(p); }};
}
auto *ptr = new std::complex<double>[numElements];
toHost(ptr, numElements);
return {ptr,
[](void *p) { delete[] static_cast<std::complex<double> *>(p); }};
}

/// @brief Destructor
virtual ~SimulationState() {}
};
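Because a HostBuffer bundles the pointer with the deleter that matches its allocation (new[] in the default path above, cudaMallocHost in the GPU overrides below), callers can wrap it in standard RAII without knowing which allocator was used. A minimal sketch, assuming nothing beyond the struct shown above; the wrapper and helper names are hypothetical:

#include <complex>
#include <functional>
#include <memory>

// Owns a HostBuffer's memory and runs its deleter at scope exit.
using OwnedHostBuffer = std::unique_ptr<void, std::function<void(void *)>>;

inline OwnedHostBuffer own(cudaq::SimulationState::HostBuffer buf) {
  return OwnedHostBuffer(buf.data, std::move(buf.deleter));
}

// Usage (illustrative):
//   auto owned = own(someState.toHostBuffer(numElements));
//   auto *amps = static_cast<std::complex<double> *>(owned.get()); // fp64 assumed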
6 changes: 6 additions & 0 deletions runtime/cudaq/qis/state.h
@@ -143,6 +143,12 @@ class state {
internal->toHost(hostPtr, numElements);
}

/// @brief Allocate a host buffer and copy state data from device.
/// GPU backends use pinned memory for optimal transfer speed.
SimulationState::HostBuffer toHostBuffer(std::size_t numElements) const {
return internal->toHostBuffer(numElements);
}

/// @brief Dump the state to standard out
void dump() const;

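End-to-end usage sketch for the new wrapper (not part of the diff): the kernel is illustrative, cudaq::get_state is the existing entry point for obtaining a state, and fp64 simulation is assumed.

#include <complex>
#include <cstddef>
#include <cstdio>
#include <cudaq.h>

int main() {
  // Prepare a 2-qubit Bell state on whichever backend is active.
  auto bell = []() __qpu__ {
    cudaq::qvector q(2);
    h(q[0]);
    x<cudaq::ctrl>(q[0], q[1]);
  };
  auto s = cudaq::get_state(bell);

  std::size_t numElements = 4; // 2 qubits -> 4 amplitudes
  auto buf = s.toHostBuffer(numElements);
  auto *amps = static_cast<std::complex<double> *>(buf.data); // fp64 assumed
  std::printf("|00> amplitude: (%f, %f)\n", amps[0].real(), amps[0].imag());

  if (buf.deleter)
    buf.deleter(buf.data); // no-op for backends that keep ownership (e.g. the
                           // cached pinned buffer below), frees otherwise
  return 0;
}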
22 changes: 22 additions & 0 deletions runtime/nvqir/cudensitymat/CuDensityMatState.cpp
@@ -228,6 +228,28 @@ void CuDensityMatState::toHost(std::complex<float> *userData,
"double-precision array.");
}

// Allocate a pinned host buffer and copy state data into it.
SimulationState::HostBuffer
CuDensityMatState::toHostBuffer(std::size_t numElements) const {
if (numElements != dimension)
throw std::runtime_error(
fmt::format("toHostBuffer: element count mismatch: provided {}, "
"expected {}.",
numElements, dimension));
std::size_t numBytes = numElements * sizeof(std::complex<double>);
void *pinnedPtr = nullptr;
auto err = cudaMallocHost(&pinnedPtr, numBytes);
if (err != cudaSuccess)
return SimulationState::toHostBuffer(numElements);
auto cpyErr =
cudaMemcpy(pinnedPtr, devicePtr, numBytes, cudaMemcpyDeviceToHost);
if (cpyErr != cudaSuccess) {
cudaFreeHost(pinnedPtr);
HANDLE_CUDA_ERROR(cpyErr);
}
return {pinnedPtr, [](void *p) { cudaFreeHost(p); }};
}

// Free the device data.
void CuDensityMatState::destroyState() {
if (cudmState) {
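The allocate-pinned-or-fall-back logic above (and the equivalent logic in CuStateVecState.h below) follows a common CUDA pattern: try cudaMallocHost and quietly degrade to pageable memory if pinning fails. A standalone sketch of that pattern, with hypothetical helper names:

#include <cstddef>
#include <cstdlib>
#include <cuda_runtime_api.h>

// Try page-locked (pinned) host memory; fall back to malloc if the request
// fails, e.g. when the OS pinned-memory limit is exhausted. The flag tells
// the caller which free function to use.
struct HostAlloc {
  void *ptr = nullptr;
  bool pinned = false;
};

inline HostAlloc allocateHost(std::size_t numBytes) {
  HostAlloc out;
  if (cudaMallocHost(&out.ptr, numBytes) == cudaSuccess) {
    out.pinned = true;
    return out;
  }
  out.ptr = std::malloc(numBytes); // pageable fallback
  out.pinned = false;
  return out;
}

inline void freeHost(const HostAlloc &a) {
  if (a.pinned)
    cudaFreeHost(a.ptr);
  else
    std::free(a.ptr);
}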
3 changes: 3 additions & 0 deletions runtime/nvqir/cudensitymat/CuDensityMatState.h
@@ -116,6 +116,9 @@ class CuDensityMatState : public cudaq::SimulationState {
// Copy the state device data to the user-provided host data pointer.
void toHost(std::complex<float> *userData,
std::size_t numElements) const override;
// Allocate a pinned host buffer and copy state data into it.
HostBuffer toHostBuffer(std::size_t numElements) const override;

// Free the device data.
void destroyState() override;

53 changes: 52 additions & 1 deletion runtime/nvqir/custatevec/CuStateVecState.h
@@ -56,6 +56,12 @@ class CusvState : public cudaq::SimulationState {
/// @brief Flag indicating ownership of the state data.
bool ownsDevicePtr = true;

/// @brief Cached pinned host buffer for device-to-host transfers.
/// Allocated once on the first toHostBuffer call and reused on subsequent
/// calls, avoiding repeated cudaMallocHost/cudaFreeHost overhead that
/// dominates for large state vectors (>= 64 MiB).
mutable void *cachedPinnedPtr = nullptr;

/// @brief Check that we are currently
/// using the correct CUDA device, set it
/// to the correct one if not
@@ -284,8 +290,46 @@
return;
}

/// @brief Free the device data.
/// @brief Copy state data into a pinned host buffer and return it.
///
/// The pinned buffer is allocated once on the first call and reused on
/// subsequent calls. This eliminates repeated cudaMallocHost/cudaFreeHost
/// overhead, which dominates transfer time for large state vectors (>= 64
/// MiB) and negates the DMA bandwidth benefit of pinned memory.
///
/// The returned HostBuffer uses a no-op deleter because this object owns
/// the pinned allocation and frees it in destroyState(). np.array() copies
/// the data synchronously, so the buffer is never in use across two
/// concurrent calls on the same state object.
HostBuffer toHostBuffer(std::size_t numElements) const override {
if (numElements != size)
throw std::runtime_error(
"[custatevec-state] toHostBuffer: invalid number of elements.");
checkAndSetDevice();
const std::size_t numBytes = numElements * sizeof(std::complex<ScalarType>);
// Allocate pinned buffer on first call; reuse it on all subsequent calls.
if (cachedPinnedPtr == nullptr) {
auto err = cudaMallocHost(&cachedPinnedPtr, numBytes);
if (err != cudaSuccess) {
// Fallback to pageable allocation if pinning fails (e.g., the OS
// pinned-memory limit is exhausted).
return SimulationState::toHostBuffer(numElements);
}
}
auto cpyErr = cudaMemcpy(cachedPinnedPtr, devicePtr, numBytes,
cudaMemcpyDeviceToHost);
if (cpyErr != cudaSuccess)
HANDLE_CUDA_ERROR(cpyErr);
// No-op deleter: this state object owns the pinned buffer.
return {cachedPinnedPtr, [](void *) {}};
}

/// @brief Free the device data and the cached pinned host buffer.
void destroyState() override {
if (cachedPinnedPtr) {
cudaFreeHost(cachedPinnedPtr);
cachedPinnedPtr = nullptr;
}
if (!ownsDevicePtr)
return;

@@ -302,6 +346,13 @@

HANDLE_CUDA_ERROR(cudaFree(devicePtr));
}

~CusvState() override {
if (cachedPinnedPtr) {
cudaFreeHost(cachedPinnedPtr);
cachedPinnedPtr = nullptr;
}
}
};

} // namespace cudaq
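To illustrate the performance claims in the comments above (pinned memory gives faster DMA transfers, but cudaMallocHost is expensive enough that the allocation should be cached and amortized), here is a standalone timing sketch. Sizes, output format, and the main() harness are illustrative, not measurements or code from the PR.

// Standalone sketch: time a device-to-host copy into pageable vs. pinned
// memory. Reusing one pinned allocation across many copies (as the cached
// cachedPinnedPtr above does) amortizes the cudaMallocHost cost.
#include <chrono>
#include <complex>
#include <cstddef>
#include <cstdio>
#include <cuda_runtime_api.h>

int main() {
  const std::size_t n = 1ull << 22; // 4M complex<double> amplitudes = 64 MiB
  const std::size_t bytes = n * sizeof(std::complex<double>);
  void *device = nullptr;
  cudaMalloc(&device, bytes);

  // Pageable destination.
  auto *pageable = new std::complex<double>[n];
  auto t0 = std::chrono::steady_clock::now();
  cudaMemcpy(pageable, device, bytes, cudaMemcpyDeviceToHost);
  auto t1 = std::chrono::steady_clock::now();

  // Pinned destination, allocated once and reusable for later copies.
  void *pinned = nullptr;
  cudaMallocHost(&pinned, bytes);
  auto t2 = std::chrono::steady_clock::now();
  cudaMemcpy(pinned, device, bytes, cudaMemcpyDeviceToHost);
  auto t3 = std::chrono::steady_clock::now();

  // A real benchmark would add warm-up iterations and average several runs.
  std::printf("pageable copy: %lld us, pinned copy: %lld us\n",
              (long long)std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count(),
              (long long)std::chrono::duration_cast<std::chrono::microseconds>(t3 - t2).count());

  cudaFreeHost(pinned);
  delete[] pageable;
  cudaFree(device);
  return 0;
}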