3 changes: 3 additions & 0 deletions docs/sphinx/api/languages/cpp_api.rst
@@ -137,6 +137,9 @@ Common
.. doxygenstruct:: cudaq::SimulationState::Tensor
:members:

.. doxygenstruct:: cudaq::SimulationState::HostBuffer
:members:

.. doxygenenum:: cudaq::SimulationState::precision

.. doxygenenum:: cudaq::simulation_precision
3 changes: 2 additions & 1 deletion python/cudaq/runtime/state.py
@@ -97,7 +97,8 @@ def to_cupy(state, dtype=None):

arrays = []
for tensor in state.getTensors():
mem = cp.cuda.UnownedMemory(tensor.data(), 1024, owner=None)
total_bytes = tensor.get_num_elements() * tensor.get_element_size()
mem = cp.cuda.UnownedMemory(tensor.data(), total_bytes, owner=None)
memptr = cp.cuda.MemoryPointer(mem, offset=0)
arrays.append(cp.ndarray(tensor.extents, dtype=dtype, memptr=memptr))
return arrays
25 changes: 5 additions & 20 deletions python/runtime/cudaq/algorithms/py_state.cpp
@@ -357,28 +357,13 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) {
auto stateVector = self.get_tensor();
auto precision = self.get_precision();
if (self.is_on_gpu()) {
// This is device data, transfer to host, which gives us
// ownership of a new data pointer on host. Store it globally
// This is device data, transfer to host. GPU backends use pinned
// host memory for faster DMA transfers. Store the buffer globally
// here so we ensure that it gets cleaned up.
auto numElements = stateVector.get_num_elements();
if (precision == SimulationState::precision::fp32) {
auto *hostData = new std::complex<float>[numElements];
self.to_host(hostData, numElements);
dataPtr = reinterpret_cast<void *>(hostData);
} else {
auto *hostData = new std::complex<double>[numElements];
self.to_host(hostData, numElements);
dataPtr = reinterpret_cast<void *>(hostData);
}
hostDataFromDevice.emplace_back(dataPtr, [precision](void *data) {
CUDAQ_INFO("freeing data that was copied from GPU device for "
"compatibility with NumPy");
// Use delete[] to match new[] allocation (not free())
if (precision == SimulationState::precision::fp32)
delete[] static_cast<std::complex<float> *>(data);
else
delete[] static_cast<std::complex<double> *>(data);
});
auto hostBuf = self.toHostBuffer(numElements);
dataPtr = hostBuf.data;
hostDataFromDevice.emplace_back(dataPtr, std::move(hostBuf.deleter));
} else {
dataPtr = self.get_tensor().data;
}
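Usage sketch (not part of the diff): how a caller on the binding side can consume the new HostBuffer, keeping the host copy alive by pairing the pointer with the deleter that came with it. The guard type, helper name, and include path are assumptions for illustration; the actual binding simply stores the pair in hostDataFromDevice as shown above.

// Sketch only: tie a HostBuffer's lifetime to an RAII guard so its deleter
// (pinned, pageable, or no-op) runs exactly once.
#include <functional>
#include <memory>
#include <vector>
#include "cudaq/qis/state.h" // include path assumed

using HostBufferGuard = std::unique_ptr<void, std::function<void(void *)>>;

void *hostPointerFor(cudaq::state &s, std::vector<HostBufferGuard> &keepAlive) {
  auto tensor = s.get_tensor();
  if (!s.is_on_gpu())
    return tensor.data; // already host-resident, no copy needed
  auto buf = s.toHostBuffer(tensor.get_num_elements());
  keepAlive.emplace_back(buf.data, std::move(buf.deleter));
  return keepAlive.back().get();
}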
25 changes: 25 additions & 0 deletions runtime/common/SimulationState.h
@@ -12,6 +12,7 @@
#include <algorithm>
#include <bitset>
#include <complex>
#include <functional>
#include <memory>
#include <optional>
#include <variant>
@@ -256,6 +257,30 @@ class SimulationState {
"SimulationState::toHost complex64 not implemented.");
}

/// @brief A host-side buffer with a custom cleanup function.
struct HostBuffer {
void *data = nullptr;
std::function<void(void *)> deleter;
};

/// @brief Allocate a host buffer and copy all state data into it.
/// GPU-backed states should override to use pinned memory for faster
/// device-to-host transfers. Returns a HostBuffer that knows how to
/// free the memory correctly.
virtual HostBuffer toHostBuffer(std::size_t numElements) const {
auto prec = getPrecision();
if (prec == precision::fp32) {
auto *ptr = new std::complex<float>[numElements];
toHost(ptr, numElements);
return {ptr,
[](void *p) { delete[] static_cast<std::complex<float> *>(p); }};
}
auto *ptr = new std::complex<double>[numElements];
toHost(ptr, numElements);
return {ptr,
[](void *p) { delete[] static_cast<std::complex<double> *>(p); }};
}

/// @brief Destructor
virtual ~SimulationState() {}
};
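Because a HostBuffer bundles the pointer with the deleter that matches its allocation (new[] in the default path above, cudaMallocHost in the GPU overrides below), callers can wrap it in standard RAII without knowing which allocator was used. A minimal sketch, assuming nothing beyond the struct shown above; the wrapper and helper names are hypothetical:

#include <complex>
#include <functional>
#include <memory>

// Owns a HostBuffer's memory and runs its deleter at scope exit.
using OwnedHostBuffer = std::unique_ptr<void, std::function<void(void *)>>;

inline OwnedHostBuffer own(cudaq::SimulationState::HostBuffer buf) {
  return OwnedHostBuffer(buf.data, std::move(buf.deleter));
}

// Usage (illustrative):
//   auto owned = own(someState.toHostBuffer(numElements));
//   auto *amps = static_cast<std::complex<double> *>(owned.get()); // fp64 assumed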
6 changes: 6 additions & 0 deletions runtime/cudaq/qis/state.h
@@ -143,6 +143,12 @@ class state {
internal->toHost(hostPtr, numElements);
}

/// @brief Allocate a host buffer and copy state data from device.
/// GPU backends use pinned memory for optimal transfer speed.
SimulationState::HostBuffer toHostBuffer(std::size_t numElements) const {
return internal->toHostBuffer(numElements);
}

/// @brief Dump the state to standard out
void dump() const;

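End-to-end usage sketch for the new wrapper (not part of the diff): the kernel is illustrative, cudaq::get_state is the existing entry point for obtaining a state, and fp64 simulation is assumed.

#include <complex>
#include <cstddef>
#include <cstdio>
#include <cudaq.h>

int main() {
  // Prepare a 2-qubit Bell state on whichever backend is active.
  auto bell = []() __qpu__ {
    cudaq::qvector q(2);
    h(q[0]);
    x<cudaq::ctrl>(q[0], q[1]);
  };
  auto s = cudaq::get_state(bell);

  std::size_t numElements = 4; // 2 qubits -> 4 amplitudes
  auto buf = s.toHostBuffer(numElements);
  auto *amps = static_cast<std::complex<double> *>(buf.data); // fp64 assumed
  std::printf("|00> amplitude: (%f, %f)\n", amps[0].real(), amps[0].imag());

  if (buf.deleter)
    buf.deleter(buf.data); // no-op for backends that keep ownership (e.g. the
                           // cached pinned buffer below), frees otherwise
  return 0;
}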
22 changes: 22 additions & 0 deletions runtime/nvqir/cudensitymat/CuDensityMatState.cpp
@@ -228,6 +228,28 @@ void CuDensityMatState::toHost(std::complex<float> *userData,
"double-precision array.");
}

// Allocate a pinned host buffer and copy state data into it.
SimulationState::HostBuffer
CuDensityMatState::toHostBuffer(std::size_t numElements) const {
if (numElements != dimension)
throw std::runtime_error(
fmt::format("toHostBuffer: element count mismatch: provided {}, "
"expected {}.",
numElements, dimension));
std::size_t numBytes = numElements * sizeof(std::complex<double>);
void *pinnedPtr = nullptr;
auto err = cudaMallocHost(&pinnedPtr, numBytes);
if (err != cudaSuccess)
return SimulationState::toHostBuffer(numElements);
auto cpyErr =
cudaMemcpy(pinnedPtr, devicePtr, numBytes, cudaMemcpyDeviceToHost);
if (cpyErr != cudaSuccess) {
cudaFreeHost(pinnedPtr);
HANDLE_CUDA_ERROR(cpyErr);
}
return {pinnedPtr, [](void *p) { cudaFreeHost(p); }};
}

// Free the device data.
void CuDensityMatState::destroyState() {
if (cudmState) {
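The allocate-pinned-or-fall-back logic above (and the equivalent logic in CuStateVecState.h below) follows a common CUDA pattern: try cudaMallocHost and quietly degrade to pageable memory if pinning fails. A standalone sketch of that pattern, with hypothetical helper names:

#include <cstddef>
#include <cstdlib>
#include <cuda_runtime_api.h>

// Try page-locked (pinned) host memory; fall back to malloc if the request
// fails, e.g. when the OS pinned-memory limit is exhausted. The flag tells
// the caller which free function to use.
struct HostAlloc {
  void *ptr = nullptr;
  bool pinned = false;
};

inline HostAlloc allocateHost(std::size_t numBytes) {
  HostAlloc out;
  if (cudaMallocHost(&out.ptr, numBytes) == cudaSuccess) {
    out.pinned = true;
    return out;
  }
  out.ptr = std::malloc(numBytes); // pageable fallback
  out.pinned = false;
  return out;
}

inline void freeHost(const HostAlloc &a) {
  if (a.pinned)
    cudaFreeHost(a.ptr);
  else
    std::free(a.ptr);
}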
3 changes: 3 additions & 0 deletions runtime/nvqir/cudensitymat/CuDensityMatState.h
@@ -116,6 +116,9 @@ class CuDensityMatState : public cudaq::SimulationState {
// Copy the state device data to the user-provided host data pointer.
void toHost(std::complex<float> *userData,
std::size_t numElements) const override;
// Allocate a pinned host buffer and copy state data into it.
HostBuffer toHostBuffer(std::size_t numElements) const override;

// Free the device data.
void destroyState() override;

53 changes: 52 additions & 1 deletion runtime/nvqir/custatevec/CuStateVecState.h
@@ -56,6 +56,12 @@ class CusvState : public cudaq::SimulationState {
/// @brief Flag indicating ownership of the state data.
bool ownsDevicePtr = true;

/// @brief Cached pinned host buffer for device-to-host transfers.
/// Allocated once on the first toHostBuffer call and reused on subsequent
/// calls, avoiding repeated cudaMallocHost/cudaFreeHost overhead that
/// dominates for large state vectors (>= 64 MiB).
mutable void *cachedPinnedPtr = nullptr;

/// @brief Check that we are currently
/// using the correct CUDA device, set it
/// to the correct one if not
@@ -284,8 +290,46 @@
return;
}

/// @brief Free the device data.
/// @brief Copy state data into a pinned host buffer and return it.
///
/// The pinned buffer is allocated once on the first call and reused on
/// subsequent calls. This eliminates repeated cudaMallocHost/cudaFreeHost
/// overhead, which dominates transfer time for large state vectors (>= 64
/// MiB) and negates the DMA bandwidth benefit of pinned memory.
///
/// The returned HostBuffer uses a no-op deleter because this object owns
/// the pinned allocation and frees it in destroyState(). np.array() copies
/// the data synchronously, so the buffer is never in use across two
/// concurrent calls on the same state object.
HostBuffer toHostBuffer(std::size_t numElements) const override {
if (numElements != size)
throw std::runtime_error(
"[custatevec-state] toHostBuffer: invalid number of elements.");
checkAndSetDevice();
const std::size_t numBytes = numElements * sizeof(std::complex<ScalarType>);
// Allocate pinned buffer on first call; reuse it on all subsequent calls.
if (cachedPinnedPtr == nullptr) {
auto err = cudaMallocHost(&cachedPinnedPtr, numBytes);
if (err != cudaSuccess) {
// Fallback to pageable allocation if pinning fails (e.g., the OS
// pinned-memory limit is exhausted).
return SimulationState::toHostBuffer(numElements);
}
}
auto cpyErr = cudaMemcpy(cachedPinnedPtr, devicePtr, numBytes,
cudaMemcpyDeviceToHost);
if (cpyErr != cudaSuccess)
HANDLE_CUDA_ERROR(cpyErr);
// No-op deleter: this state object owns the pinned buffer.
return {cachedPinnedPtr, [](void *) {}};
}

/// @brief Free the device data and the cached pinned host buffer.
void destroyState() override {
if (cachedPinnedPtr) {
cudaFreeHost(cachedPinnedPtr);
cachedPinnedPtr = nullptr;
}
if (!ownsDevicePtr)
return;

@@ -302,6 +346,13 @@

HANDLE_CUDA_ERROR(cudaFree(devicePtr));
}

~CusvState() override {
if (cachedPinnedPtr) {
cudaFreeHost(cachedPinnedPtr);
cachedPinnedPtr = nullptr;
}
}
};

} // namespace cudaq
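To illustrate the performance claims in the comments above (pinned memory gives faster DMA transfers, but cudaMallocHost is expensive enough that the allocation should be cached and amortized), here is a standalone timing sketch. Sizes, output format, and the main() harness are illustrative, not measurements or code from the PR.

// Standalone sketch: time a device-to-host copy into pageable vs. pinned
// memory. Reusing one pinned allocation across many copies (as the cached
// cachedPinnedPtr above does) amortizes the cudaMallocHost cost.
#include <chrono>
#include <complex>
#include <cstddef>
#include <cstdio>
#include <cuda_runtime_api.h>

int main() {
  const std::size_t n = 1ull << 22; // 4M complex<double> amplitudes = 64 MiB
  const std::size_t bytes = n * sizeof(std::complex<double>);
  void *device = nullptr;
  cudaMalloc(&device, bytes);

  // Pageable destination.
  auto *pageable = new std::complex<double>[n];
  auto t0 = std::chrono::steady_clock::now();
  cudaMemcpy(pageable, device, bytes, cudaMemcpyDeviceToHost);
  auto t1 = std::chrono::steady_clock::now();

  // Pinned destination, allocated once and reusable for later copies.
  void *pinned = nullptr;
  cudaMallocHost(&pinned, bytes);
  auto t2 = std::chrono::steady_clock::now();
  cudaMemcpy(pinned, device, bytes, cudaMemcpyDeviceToHost);
  auto t3 = std::chrono::steady_clock::now();

  // A real benchmark would add warm-up iterations and average several runs.
  std::printf("pageable copy: %lld us, pinned copy: %lld us\n",
              (long long)std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count(),
              (long long)std::chrono::duration_cast<std::chrono::microseconds>(t3 - t2).count());

  cudaFreeHost(pinned);
  delete[] pageable;
  cudaFree(device);
  return 0;
}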