Fix dense --> sparse conversion in get_decoder to avoid redundant copies (#589)

bmhowe23 · web-flow · commit 80bc70c6738b · 2026-06-08T10:35:48.000-07:00
## Description

Replace the `pcmToTensor`-based `make_sparse_from_dense` helper with a
direct scan of the raw numpy buffer. This makes use of the new
`sparse_binary_matrix` type. This avoids an intermediate dense tensor
copy, so peak memory is just the caller's dense array plus the sparse
output. The inner loop always runs in the contiguous memory direction,
so both layouts are cache-friendly.

Update the Fortran-order test to reflect that F-order input is now
handled correctly rather than rejected. To that end, I removed the
`test_decoder_initialization_with_error()` test and replaced it with a
new test that verifies F-order input works correctly.

## Runtime / performance impact

If the user is providing dense PCMs, this allows them to use larger
dense PCMs before running out of memory on their system by avoiding
unnecessary allocations and copies.

Signed-off-by: Ben Howe <bhowe@nvidia.com>
diff --git a/libs/qec/python/bindings/py_decoder.cpp b/libs/qec/python/bindings/py_decoder.cpp
@@ -90,6 +90,54 @@ sparse_binary_matrix_from_py_dict(const nb::dict &d) {
       "Sparse H dict layout must be \"nested_csc\" or \"nested_csr\".");
 }
 
+/// Convert a dense 2-D NumPy uint8 array to sparse_binary_matrix without an
+/// intermediate dense tensor. nanobind reports strides in elements (== bytes
+/// for uint8_t); offsets use signed arithmetic so C-order, F-order and
+/// negatively strided views all work. Throws std::invalid_argument if not 2-D.
+static sparse_binary_matrix
+make_sparse_from_dense(const nb::ndarray<nb::numpy, uint8_t> &arr) {
+  if (arr.ndim() != 2)
+    throw std::invalid_argument("H must be a 2-D uint8 array");
+  using index_t = sparse_binary_matrix::index_type;
+  const auto num_rows = static_cast<std::ptrdiff_t>(arr.shape(0));
+  const auto num_cols = static_cast<std::ptrdiff_t>(arr.shape(1));
+  const auto rs = static_cast<std::ptrdiff_t>(arr.stride(0)); // row step
+  const auto cs = static_cast<std::ptrdiff_t>(arr.stride(1)); // col step
+  const uint8_t *base = static_cast<const uint8_t *>(arr.data());
+  std::vector<index_t> ptr, idx;
+
+  // Keep the smaller-magnitude stride innermost so the scan walks memory
+  // sequentially: row-major input builds CSR, column-major input builds CSC.
+  if ((cs < 0 ? -cs : cs) <= (rs < 0 ? -rs : rs)) {
+    ptr.reserve(static_cast<std::size_t>(num_rows) + 1);
+    ptr.push_back(0);
+    for (std::ptrdiff_t i = 0; i < num_rows; ++i) {
+      const uint8_t *row = base + i * rs; // signed offset: rs may be negative
+      for (std::ptrdiff_t j = 0; j < num_cols; ++j) {
+        if (row[j * cs])
+          idx.push_back(static_cast<index_t>(j));
+      }
+      ptr.push_back(static_cast<index_t>(idx.size()));
+    }
+    return sparse_binary_matrix::from_csr(static_cast<index_t>(num_rows),
+                                          static_cast<index_t>(num_cols),
+                                          std::move(ptr), std::move(idx));
+  }
+  ptr.reserve(static_cast<std::size_t>(num_cols) + 1);
+  ptr.push_back(0);
+  for (std::ptrdiff_t j = 0; j < num_cols; ++j) {
+    const uint8_t *col = base + j * cs; // signed offset: cs may be negative
+    for (std::ptrdiff_t i = 0; i < num_rows; ++i) {
+      if (col[i * rs])
+        idx.push_back(static_cast<index_t>(i));
+    }
+    ptr.push_back(static_cast<index_t>(idx.size()));
+  }
+  return sparse_binary_matrix::from_csc(static_cast<index_t>(num_rows),
+                                        static_cast<index_t>(num_cols),
+                                        std::move(ptr), std::move(idx));
+}
+
 class PyDecoder : public decoder {
 public:
   NB_TRAMPOLINE(decoder, 1);
@@ -724,13 +772,6 @@ void bindDecoder(nb::module_ &mod) {
 
         cudaq::qec::sparse_binary_matrix H_sparse;
 
-        auto make_sparse_from_dense =
-            [](const nb::ndarray<nb::numpy, uint8_t> &arr) {
-              auto tensor_H = cudaqx::pcmToTensor(arr);
-              return cudaq::qec::sparse_binary_matrix(
-                  tensor_H, cudaq::qec::sparse_binary_matrix_layout::csc);
-            };
-
         if (nb::isinstance<nb::dict>(H))
           H_sparse = sparse_binary_matrix_from_py_dict(nb::cast<nb::dict>(H));
         else
diff --git a/libs/qec/python/tests/test_decoder.py b/libs/qec/python/tests/test_decoder.py
@@ -29,11 +29,18 @@ def test_decoder_initialization():
     assert hasattr(decoder, 'decode')
 
 
-def test_decoder_initialization_with_error():
-    # We do not support column-major order (Fortran order)
-    H_bad = np.zeros((10, 20), dtype=np.uint8, order='F')
-    with pytest.raises(RuntimeError) as e:
-        decoder = qec.get_decoder('single_error_lut_example', H_bad)
+def test_decoder_initialization_with_fortran_order():
+    # Fortran-order (column-major) arrays are now handled via stride-aware
+    # scanning and should work correctly.
+    H_f = np.eye(10, 20, dtype=np.uint8, order='F')
+    H_c = np.ascontiguousarray(H_f)
+    decoder_f = qec.get_decoder('single_error_lut_example', H_f)
+    decoder_c = qec.get_decoder('single_error_lut_example', H_c)
+    syndrome = np.zeros(H_f.shape[0], dtype=np.uint8)
+    r_f = decoder_f.decode(syndrome)
+    r_c = decoder_c.decode(syndrome)
+    assert r_f.converged == r_c.converged
+    assert list(r_f.result) == list(r_c.result)
 
 
 def test_decoder_api():