From 3ce42b7eaa7f773789fbbde204242a51d301cb23 Mon Sep 17 00:00:00 2001 From: mdzurick Date: Mon, 6 Apr 2026 19:12:43 +0000 Subject: [PATCH 1/2] Use pinned host memory for GPU state-to-NumPy conversion (#2797) - Fix performance bottleneck in `np.array(cudaq.get_state(kernel))` for GPU-backed states by using pinned (`cudaMallocHost`) host memory instead of pageable (`new[]`) for the device-to-host transfer in the pybind11 buffer protocol handler - Add `SimulationState::toHostBuffer()` virtual method so GPU backends can provide optimized host allocation, with a pageable fallback for non-GPU backends and for systems that cannot pin the requested amount of memory - Fix `to_cupy()` passing a hardcoded 1024-byte size to `UnownedMemory` instead of the actual buffer size Signed-off-by: mdzurick --- docs/sphinx/api/languages/cpp_api.rst | 3 +++ python/cudaq/runtime/state.py | 3 ++- python/runtime/cudaq/algorithms/py_state.cpp | 25 ++++--------------- runtime/common/SimulationState.h | 25 +++++++++++++++++++ runtime/cudaq/qis/state.h | 6 +++++ .../nvqir/cudensitymat/CuDensityMatState.cpp | 22 ++++++++++++++++ .../nvqir/cudensitymat/CuDensityMatState.h | 3 +++ runtime/nvqir/custatevec/CuStateVecState.h | 25 +++++++++++++++++++ 8 files changed, 91 insertions(+), 21 deletions(-) diff --git a/docs/sphinx/api/languages/cpp_api.rst b/docs/sphinx/api/languages/cpp_api.rst index ba842449820..1eb2776bc66 100644 --- a/docs/sphinx/api/languages/cpp_api.rst +++ b/docs/sphinx/api/languages/cpp_api.rst @@ -137,6 +137,9 @@ Common .. doxygenstruct:: cudaq::SimulationState::Tensor :members: +.. doxygenstruct:: cudaq::SimulationState::HostBuffer + :members: + .. doxygenenum:: cudaq::SimulationState::precision .. 
doxygenenum:: cudaq::simulation_precision diff --git a/python/cudaq/runtime/state.py b/python/cudaq/runtime/state.py index 4c642a599f3..b70f8a2404f 100644 --- a/python/cudaq/runtime/state.py +++ b/python/cudaq/runtime/state.py @@ -97,7 +97,8 @@ def to_cupy(state, dtype=None): arrays = [] for tensor in state.getTensors(): - mem = cp.cuda.UnownedMemory(tensor.data(), 1024, owner=None) + total_bytes = tensor.get_num_elements() * tensor.get_element_size() + mem = cp.cuda.UnownedMemory(tensor.data(), total_bytes, owner=None) memptr = cp.cuda.MemoryPointer(mem, offset=0) arrays.append(cp.ndarray(tensor.extents, dtype=dtype, memptr=memptr)) return arrays diff --git a/python/runtime/cudaq/algorithms/py_state.cpp b/python/runtime/cudaq/algorithms/py_state.cpp index d7e4b9aa372..4919a3454f6 100644 --- a/python/runtime/cudaq/algorithms/py_state.cpp +++ b/python/runtime/cudaq/algorithms/py_state.cpp @@ -353,28 +353,13 @@ void cudaq::bindPyState(py::module &mod, LinkedLibraryHolder &holder) { auto stateVector = self.get_tensor(); auto precision = self.get_precision(); if (self.is_on_gpu()) { - // This is device data, transfer to host, which gives us - // ownership of a new data pointer on host. Store it globally + // This is device data, transfer to host. GPU backends use pinned + // host memory for faster DMA transfers. Store the buffer globally // here so we ensure that it gets cleaned up. 
auto numElements = stateVector.get_num_elements(); - if (precision == SimulationState::precision::fp32) { - auto *hostData = new std::complex<float>[numElements]; - self.to_host(hostData, numElements); - dataPtr = reinterpret_cast<void *>(hostData); - } else { - auto *hostData = new std::complex<double>[numElements]; - self.to_host(hostData, numElements); - dataPtr = reinterpret_cast<void *>(hostData); - } - hostDataFromDevice.emplace_back(dataPtr, [precision](void *data) { - CUDAQ_INFO("freeing data that was copied from GPU device for " - "compatibility with NumPy"); - // Use delete[] to match new[] allocation (not free()) - if (precision == SimulationState::precision::fp32) - delete[] static_cast<std::complex<float> *>(data); - else - delete[] static_cast<std::complex<double> *>(data); - }); + auto hostBuf = self.toHostBuffer(numElements); + dataPtr = hostBuf.data; + hostDataFromDevice.emplace_back(dataPtr, std::move(hostBuf.deleter)); } else { dataPtr = self.get_tensor().data; } diff --git a/runtime/common/SimulationState.h b/runtime/common/SimulationState.h index 1fdf45ad23d..8ec4638120a 100644 --- a/runtime/common/SimulationState.h +++ b/runtime/common/SimulationState.h @@ -12,6 +12,7 @@ #include <cassert> #include <complex> #include <cstring> +#include <functional> #include <map> #include <memory> #include <vector> @@ -256,6 +257,30 @@ class SimulationState { "SimulationState::toHost complex64 not implemented."); } + /// @brief A host-side buffer with a custom cleanup function. + struct HostBuffer { + void *data = nullptr; + std::function<void(void *)> deleter; + }; + + /// @brief Allocate a host buffer and copy all state data into it. + /// GPU-backed states should override to use pinned memory for faster + /// device-to-host transfers. Returns a HostBuffer that knows how to + /// free the memory correctly.
+ virtual HostBuffer toHostBuffer(std::size_t numElements) const { + auto prec = getPrecision(); + if (prec == precision::fp32) { + auto *ptr = new std::complex<float>[numElements]; + toHost(ptr, numElements); + return {ptr, + [](void *p) { delete[] static_cast<std::complex<float> *>(p); }}; + } + auto *ptr = new std::complex<double>[numElements]; + toHost(ptr, numElements); + return {ptr, + [](void *p) { delete[] static_cast<std::complex<double> *>(p); }}; + } + /// @brief Destructor virtual ~SimulationState() {} }; diff --git a/runtime/cudaq/qis/state.h b/runtime/cudaq/qis/state.h index 3723c80be0c..819e49c90de 100644 --- a/runtime/cudaq/qis/state.h +++ b/runtime/cudaq/qis/state.h @@ -143,6 +143,12 @@ class state { internal->toHost(hostPtr, numElements); } + /// @brief Allocate a host buffer and copy state data from device. + /// GPU backends use pinned memory for optimal transfer speed. + SimulationState::HostBuffer toHostBuffer(std::size_t numElements) const { + return internal->toHostBuffer(numElements); + } + /// @brief Dump the state to standard out void dump() const; diff --git a/runtime/nvqir/cudensitymat/CuDensityMatState.cpp b/runtime/nvqir/cudensitymat/CuDensityMatState.cpp index 53469e13f8a..1e1596d6832 100644 --- a/runtime/nvqir/cudensitymat/CuDensityMatState.cpp +++ b/runtime/nvqir/cudensitymat/CuDensityMatState.cpp @@ -228,6 +228,28 @@ void CuDensityMatState::toHost(std::complex<double> *userData, "double-precision array."); } +// Allocate a pinned host buffer and copy state data into it.
+SimulationState::HostBuffer +CuDensityMatState::toHostBuffer(std::size_t numElements) const { + if (numElements != dimension) + throw std::runtime_error( + fmt::format("toHostBuffer: element count mismatch: provided {}, " + "expected {}.", + numElements, dimension)); + std::size_t numBytes = numElements * sizeof(std::complex<double>); + void *pinnedPtr = nullptr; + auto err = cudaMallocHost(&pinnedPtr, numBytes); + if (err != cudaSuccess) + return SimulationState::toHostBuffer(numElements); + auto cpyErr = + cudaMemcpy(pinnedPtr, devicePtr, numBytes, cudaMemcpyDeviceToHost); + if (cpyErr != cudaSuccess) { + cudaFreeHost(pinnedPtr); + HANDLE_CUDA_ERROR(cpyErr); + } + return {pinnedPtr, [](void *p) { cudaFreeHost(p); }}; +} + // Free the device data. void CuDensityMatState::destroyState() { if (cudmState) { diff --git a/runtime/nvqir/cudensitymat/CuDensityMatState.h b/runtime/nvqir/cudensitymat/CuDensityMatState.h index e8e9117638f..fdc398f4016 100644 --- a/runtime/nvqir/cudensitymat/CuDensityMatState.h +++ b/runtime/nvqir/cudensitymat/CuDensityMatState.h @@ -116,6 +116,9 @@ class CuDensityMatState : public cudaq::SimulationState { // Copy the state device data to the user-provided host data pointer. void toHost(std::complex<double> *userData, std::size_t numElements) const override; + // Allocate a pinned host buffer and copy state data into it. + HostBuffer toHostBuffer(std::size_t numElements) const override; + // Free the device data. void destroyState() override; diff --git a/runtime/nvqir/custatevec/CuStateVecState.h b/runtime/nvqir/custatevec/CuStateVecState.h index a70bb2a8a4d..2559eb1b2da 100644 --- a/runtime/nvqir/custatevec/CuStateVecState.h +++ b/runtime/nvqir/custatevec/CuStateVecState.h @@ -284,6 +284,31 @@ class CusvState : public cudaq::SimulationState { return; } + /// @brief Allocate a pinned host buffer and copy state data into it.
+ /// Pinned memory enables direct DMA transfers, significantly improving + /// bandwidth compared to non-pinned memory for large device-to-host copies. + HostBuffer toHostBuffer(std::size_t numElements) const override { + if (numElements != size) + throw std::runtime_error( + "[custatevec-state] toHostBuffer: invalid number of elements."); + checkAndSetDevice(); + std::size_t numBytes = numElements * sizeof(std::complex<ScalarType>); + void *pinnedPtr = nullptr; + auto err = cudaMallocHost(&pinnedPtr, numBytes); + if (err != cudaSuccess) { + // Fallback to non-pinned allocation if pinning fails (e.g., very large + // states that exceed the OS limit for pinned memory). + return SimulationState::toHostBuffer(numElements); + } + auto cpyErr = + cudaMemcpy(pinnedPtr, devicePtr, numBytes, cudaMemcpyDeviceToHost); + if (cpyErr != cudaSuccess) { + cudaFreeHost(pinnedPtr); + HANDLE_CUDA_ERROR(cpyErr); + } + return {pinnedPtr, [](void *p) { cudaFreeHost(p); }}; + } + /// @brief Free the device data. void destroyState() override { if (!ownsDevicePtr) From 80b17cff0613ecd4746b69994e757229d31fe9df Mon Sep 17 00:00:00 2001 From: mdzurick Date: Mon, 13 Apr 2026 13:05:45 +0000 Subject: [PATCH 2/2] reuse state data Signed-off-by: mdzurick --- runtime/nvqir/custatevec/CuStateVecState.h | 60 ++++++++++++++++------ 1 file changed, 43 insertions(+), 17 deletions(-) diff --git a/runtime/nvqir/custatevec/CuStateVecState.h b/runtime/nvqir/custatevec/CuStateVecState.h index 2559eb1b2da..5ece98c13bc 100644 --- a/runtime/nvqir/custatevec/CuStateVecState.h +++ b/runtime/nvqir/custatevec/CuStateVecState.h @@ -56,6 +56,12 @@ class CusvState : public cudaq::SimulationState { /// @brief Flag indicating ownership of the state data. bool ownsDevicePtr = true; + /// @brief Cached pinned host buffer for device-to-host transfers.
+ /// Allocated once on the first toHostBuffer call and reused on subsequent + /// calls, avoiding repeated cudaMallocHost/cudaFreeHost overhead that + /// dominates for large state vectors (>= 64 MiB). + mutable void *cachedPinnedPtr = nullptr; + /// @brief Check that we are currently /// using the correct CUDA device, set it /// to the correct one if not @@ -284,33 +290,46 @@ class CusvState : public cudaq::SimulationState { return; } - /// @brief Allocate a pinned host buffer and copy state data into it. - /// Pinned memory enables direct DMA transfers, significantly improving - /// bandwidth compared to non-pinned memory for large device-to-host copies. + /// @brief Copy state data into a pinned host buffer and return it. + /// + /// The pinned buffer is allocated once on the first call and reused on + /// subsequent calls. This eliminates repeated cudaMallocHost/cudaFreeHost + /// overhead, which dominates transfer time for large state vectors (>= 64 + /// MiB) and negates the DMA bandwidth benefit of pinned memory. + /// + /// The returned HostBuffer uses a no-op deleter because this object owns + /// the pinned allocation and frees it in destroyState(). np.array() copies + /// the data synchronously, so the buffer is never in use across two + /// concurrent calls on the same state object. HostBuffer toHostBuffer(std::size_t numElements) const override { if (numElements != size) throw std::runtime_error( "[custatevec-state] toHostBuffer: invalid number of elements."); checkAndSetDevice(); - std::size_t numBytes = numElements * sizeof(std::complex<ScalarType>); - void *pinnedPtr = nullptr; - auto err = cudaMallocHost(&pinnedPtr, numBytes); - if (err != cudaSuccess) { - // Fallback to non-pinned allocation if pinning fails (e.g., very large - // states that exceed the OS limit for pinned memory).
- return SimulationState::toHostBuffer(numElements); + const std::size_t numBytes = numElements * sizeof(std::complex<ScalarType>); + // Allocate pinned buffer on first call; reuse it on all subsequent calls. + if (cachedPinnedPtr == nullptr) { + auto err = cudaMallocHost(&cachedPinnedPtr, numBytes); + if (err != cudaSuccess) { + // Fallback to pageable allocation if pinning fails (e.g., the OS + // pinned-memory limit is exhausted). + return SimulationState::toHostBuffer(numElements); + } } - auto cpyErr = - cudaMemcpy(pinnedPtr, devicePtr, numBytes, cudaMemcpyDeviceToHost); - if (cpyErr != cudaSuccess) { - cudaFreeHost(pinnedPtr); + auto cpyErr = cudaMemcpy(cachedPinnedPtr, devicePtr, numBytes, + cudaMemcpyDeviceToHost); + if (cpyErr != cudaSuccess) HANDLE_CUDA_ERROR(cpyErr); - } - return {pinnedPtr, [](void *p) { cudaFreeHost(p); }}; + // No-op deleter: this state object owns the pinned buffer. + return {cachedPinnedPtr, [](void *) {}}; } - /// @brief Free the device data. + /// @brief Free the device data and the cached pinned host buffer. void destroyState() override { + if (cachedPinnedPtr) { + cudaFreeHost(cachedPinnedPtr); + cachedPinnedPtr = nullptr; + } if (!ownsDevicePtr) return; @@ -327,6 +346,13 @@ class CusvState : public cudaq::SimulationState { HANDLE_CUDA_ERROR(cudaFree(devicePtr)); } + + ~CusvState() override { + if (cachedPinnedPtr) { + cudaFreeHost(cachedPinnedPtr); + cachedPinnedPtr = nullptr; + } + } }; } // namespace cudaq