Skip to content

Commit e937431

Browse files
vedika-saravananbmhowe23github-actions[bot]actions-useranjbur
authored
Sparse parity-check matrix support for decoders (#550)
Continuation of [#440](#440). Closes [#379](#379). Introduces `cudaq::qec::sparse_binary_matrix` (CSC/CSR, `uint32_t` indices) as the decoder PCM type, so large PCMs no longer have to be materialized as a dense `cudaqx::tensor`. The sparse path is wired through PCM generation, round slicing, decoder construction, and Python bindings. `sliding_window` canonicalizes once and reads `H.ptr() / H.indices()` directly per window. ## Breaking changes - **Decoder constructor** now takes `const sparse_binary_matrix &H`. Dense `cudaqx::tensor<uint8_t>` callers still compile via implicit conversion, but the extension-point template changed — **out-of-tree plugins must be recompiled**. - **`generate_random_pcm` throws above `k_max_dense_pcm_elements` (400M)**. Use `generate_random_pcm_sparse` for larger matrices. - **`single_error_lut_example`** now uses column `qErr` of H directly instead of XOR-of-other-columns (the old form was only correct when every row of H had even weight). Example plugin only; production `single_error_lut` was already correct. ## Follow-up PRs - Sparse-aware PCM utilities: migrate sort_pcm_columns, simplify_pcm, reorder_pcm_columns, shuffle_pcm_columns, pcm_extend_to_n_rounds, pcm_to_sparse_*, pcm_from_sparse_*, and detector_error_model::canonicalize_for_rounds to sparse_binary_matrix so the full #379 scale pipeline avoids any dense cudaqx::tensor<uint8_t> allocation. - Docs: Sphinx entries for the new type/functions and the 400M dense cap. Note to reviewers: Quick context on why it's larger than the ~1000-line guideline: it's a foundational type-system change rather than an additive feature. The new sparse_binary_matrix type, the decoder constructor breaking change, all the plugin updates, the sparse generator, and the Python bindings all have to land together for the #379 fix to actually be usable end-to-end. Splitting it would create intermediate states where the breaking change has landed but users still can't construct a >50k-mechanism PCM which would mean shipping the API churn without the benefit. --------- Signed-off-by: Ben Howe <bhowe@nvidia.com> Signed-off-by: vedika-saravanan <vsaravanan@nvidia.com> Signed-off-by: Angela Burton <angelab@nvidia.com> Co-authored-by: Ben Howe <bhowe@nvidia.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: GitHub Action <action@github.com> Co-authored-by: Angela Burton <angelab@nvidia.com>
1 parent a0f54ba commit e937431

24 files changed

Lines changed: 2329 additions & 271 deletions

libs/qec/include/cudaq/qec/decoder.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/****************************************************************-*- C++ -*-****
2-
* Copyright (c) 2024 - 2025 NVIDIA Corporation & Affiliates. *
2+
* Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. *
33
* All rights reserved. *
44
* *
55
* This source code and the accompanying materials are made available under *
@@ -11,6 +11,7 @@
1111
#include "cuda-qx/core/extension_point.h"
1212
#include "cuda-qx/core/heterogeneous_map.h"
1313
#include "cuda-qx/core/tensor.h"
14+
#include "sparse_binary_matrix.h"
1415
#include <future>
1516
#include <optional>
1617
#include <vector>
@@ -121,7 +122,8 @@ class async_decoder_result {
121122
/// arbitrary constructor parameters that can be unique to each specific
122123
/// decoder.
123124
class decoder
124-
: public cudaqx::extension_point<decoder, const cudaqx::tensor<uint8_t> &,
125+
: public cudaqx::extension_point<decoder,
126+
const cudaq::qec::sparse_binary_matrix &,
125127
const cudaqx::heterogeneous_map &> {
126128
private:
127129
struct rt_impl;
@@ -134,11 +136,9 @@ class decoder
134136
decoder() = delete;
135137

136138
/// @brief Constructor
137-
/// @param H Decoder's parity check matrix represented as a tensor. The tensor
138-
/// is required be rank 2 and must be of dimensions \p syndrome_size x
139-
/// \p block_size.
140-
/// will use the same \p H.
141-
decoder(const cudaqx::tensor<uint8_t> &H);
139+
/// @param H Decoder's parity check matrix. Taken by value so rvalue
140+
/// arguments are moved into the base member.
141+
decoder(cudaq::qec::sparse_binary_matrix H);
142142

143143
/// @brief Decode a single syndrome
144144
/// @param syndrome A vector of syndrome measurements where the floating point
@@ -174,7 +174,7 @@ class decoder
174174

175175
/// @brief This `get` overload supports default values.
176176
static std::unique_ptr<decoder>
177-
get(const std::string &name, const cudaqx::tensor<uint8_t> &H,
177+
get(const std::string &name, const cudaq::qec::sparse_binary_matrix &H,
178178
const cudaqx::heterogeneous_map &param_map = cudaqx::heterogeneous_map());
179179

180180
std::size_t get_block_size() { return block_size; }
@@ -273,7 +273,7 @@ class decoder
273273
std::size_t syndrome_size = 0;
274274

275275
/// @brief The decoder's parity check matrix
276-
cudaqx::tensor<uint8_t> H;
276+
sparse_binary_matrix H;
277277

278278
/// @brief The decoder's observable matrix in sparse format
279279
std::vector<std::vector<uint32_t>> O_sparse;
@@ -425,6 +425,6 @@ inline void convert_vec_hard_to_soft(const std::vector<std::vector<t_hard>> &in,
425425
}
426426

427427
std::unique_ptr<decoder>
428-
get_decoder(const std::string &name, const cudaqx::tensor<uint8_t> &H,
428+
get_decoder(const std::string &name, const cudaq::qec::sparse_binary_matrix &H,
429429
const cudaqx::heterogeneous_map options = {});
430430
} // namespace cudaq::qec

libs/qec/include/cudaq/qec/pcm_utils.h

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/****************************************************************-*- C++ -*-****
2-
* Copyright (c) 2025 NVIDIA Corporation & Affiliates. *
2+
* Copyright (c) 2025 - 2026 NVIDIA Corporation & Affiliates. *
33
* All rights reserved. *
44
* *
55
* This source code and the accompanying materials are made available under *
@@ -8,6 +8,7 @@
88
#pragma once
99

1010
#include "cuda-qx/core/tensor.h"
11+
#include "sparse_binary_matrix.h"
1112
#include <limits>
1213
#include <random>
1314

@@ -111,6 +112,14 @@ get_sorted_pcm_column_indices(const cudaqx::tensor<uint8_t> &pcm,
111112
bool pcm_is_sorted(const cudaqx::tensor<uint8_t> &pcm,
112113
std::uint32_t num_syndromes_per_round = 0);
113114

115+
/// @brief Check if a PCM is sorted.
116+
/// @param sparse_pcm The sparse PCM to check (in the same format as
117+
/// dense_to_sparse())
118+
/// @param num_syndromes_per_round The number of syndromes per round.
119+
/// @return True if the PCM is sorted, false otherwise.
120+
bool pcm_is_sorted(const std::vector<std::vector<std::uint32_t>> &sparse_pcm,
121+
std::uint32_t num_syndromes_per_round = 0);
122+
114123
/// @brief Reorder the columns of a PCM according to the given column order.
115124
/// Note: this may return a subset of the columns in the original PCM if the
116125
/// \p column_order does not contain all of the columns in the original PCM.
@@ -165,7 +174,53 @@ get_pcm_for_rounds(const cudaqx::tensor<uint8_t> &pcm,
165174
bool straddle_start_round = false,
166175
bool straddle_end_round = false);
167176

177+
/// @brief Same semantics as the overload taking a dense tensor \p pcm, but
178+
/// reads from \p pcm as ``sparse_binary_matrix`` so the full dense PCM is not
179+
/// required (only the returned sub-matrix is dense). Parameter meanings match
180+
/// the dense overload.
181+
///
182+
/// @param pcm_is_canonical If true, the caller asserts \p pcm has
183+
/// sorted-unique per-group indices (i.e. is the output of `canonicalize_pcm`
184+
/// or was constructed canonically). In that case the per-call
185+
/// canonicalization step is skipped — useful for callers like
186+
/// `sliding_window` that canonicalize once at construction and then call
187+
/// this function in a per-window loop on the same matrix. Default `false`
188+
/// preserves the original "canonicalize on entry" behavior.
189+
///
190+
/// @warning When \p pcm_is_canonical is `true` the precondition is *not*
191+
/// checked. `select_pcm_columns_for_round_range` reads `.front()` / `.back()`
192+
/// of each CSC column to derive first/last round; those are only the true
193+
/// min/max row if the column list is sorted-unique. Passing a non-canonical
194+
/// PCM with this flag (e.g. a raw DEM decomposition, a hand-built sparse
195+
/// matrix with duplicate or unsorted indices, or a canonical CSR that the
196+
/// caller forgot to re-canonicalize after conversion) silently produces
197+
/// wrong round assignments. If unsure, leave the flag `false`.
198+
/// @return A tuple with the new PCM with the columns in the range [start_round,
199+
/// end_round], the first column included, and the last column included.
200+
std::tuple<cudaqx::tensor<uint8_t>, std::uint32_t, std::uint32_t>
201+
get_pcm_for_rounds(const sparse_binary_matrix &pcm,
202+
std::uint32_t num_syndromes_per_round,
203+
std::uint32_t start_round, std::uint32_t end_round,
204+
bool straddle_start_round = false,
205+
bool straddle_end_round = false,
206+
bool pcm_is_canonical = false);
207+
208+
/// @brief Upper bound on \f$\texttt{rows} \times \texttt{cols}\f$ for the
209+
/// dense `generate_random_pcm` path. Above this, the call throws and the
210+
/// caller should switch to `generate_random_pcm_sparse`. Exposed as a named
211+
/// symbol so tests and users can reference it without grepping the source.
212+
inline constexpr std::size_t k_max_dense_pcm_elements =
213+
static_cast<std::size_t>(400u) * 1024u * 1024u;
214+
168215
/// @brief Generate a random PCM with the given parameters.
216+
///
217+
/// The PCM has shape `(n_rounds * n_syndromes_per_round) × (n_rounds *
218+
/// n_errs_per_round)`. If the product
219+
/// \f$n\_rounds^2 \times n\_syndromes\_per\_round \times n\_errs\_per\_round\f$
220+
/// exceeds `k_max_dense_pcm_elements`
221+
/// (\f$400 \times 1024 \times 1024 \approx 4.19 \times 10^8\f$ entries),
222+
/// throws `std::invalid_argument`; use `generate_random_pcm_sparse` for
223+
/// matrices that large without a dense allocation.
169224
/// @param n_rounds The number of rounds in the PCM.
170225
/// @param n_errs_per_round The number of errors per round in the PCM.
171226
/// @param n_syndromes_per_round The number of syndromes per round in the PCM.
@@ -178,6 +233,27 @@ cudaqx::tensor<uint8_t> generate_random_pcm(std::size_t n_rounds,
178233
std::size_t n_syndromes_per_round,
179234
int weight, std::mt19937_64 &&rng);
180235

236+
/// @brief Same distribution as generate_random_pcm, but constructs a CSC
237+
/// sparse_binary_matrix directly without allocating a dense rank-2 tensor.
238+
/// Intended for large PCMs whose dense form would exceed
239+
/// k_max_dense_pcm_elements.
240+
sparse_binary_matrix
241+
generate_random_pcm_sparse(std::size_t n_rounds, std::size_t n_errs_per_round,
242+
std::size_t n_syndromes_per_round, int weight,
243+
std::mt19937_64 &&rng);
244+
245+
/// @brief Return a GF(2)-canonical copy of \p pcm: each column (CSC) or row
246+
/// (CSR) has its indices sorted ascending, and duplicate indices are XOR-
247+
/// merged — an index that appears \p k times is kept iff \p k is odd. The
248+
/// output has the same layout as the input and is idempotent under further
249+
/// `canonicalize_pcm` calls.
250+
///
251+
/// Use this when a PCM source, such as a DEM decomposition, may legitimately
252+
/// emit duplicate indices within a column/row and the caller wants to apply
253+
/// GF(2) duplicate-collapse semantics before passing the matrix to consumers
254+
/// that require at-most-one entry per row per column.
255+
sparse_binary_matrix canonicalize_pcm(const sparse_binary_matrix &pcm);
256+
181257
/// @brief Randomly permute the columns of a PCM.
182258
/// @param pcm The PCM to permute.
183259
/// @param rng The random number generator to use (e.g.
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/****************************************************************-*- C++ -*-****
2+
* Copyright (c) 2026 NVIDIA Corporation & Affiliates. *
3+
* All rights reserved. *
4+
* *
5+
* This source code and the accompanying materials are made available under *
6+
* the terms of the Apache License 2.0 which accompanies this distribution. *
7+
******************************************************************************/
8+
#pragma once
9+
10+
#include "cuda-qx/core/tensor.h"
11+
#include <cstdint>
12+
#include <vector>
13+
14+
namespace cudaq::qec {
15+
16+
/// @brief Storage layout for the sparse PCM: Compressed Sparse Column (CSC)
17+
/// or Compressed Sparse Row (CSR). All non-zero entries are assumed to be 1;
18+
/// values are not stored.
19+
enum class sparse_binary_matrix_layout { csc, csr };
20+
21+
/// @brief Sparse parity-check matrix in either CSC or CSR form.
22+
///
23+
/// Input index lists are stored as given: not required to be sorted or
24+
/// GF(2)-unique. Consumers that require cuSPARSE-style compressed groups can
25+
/// call `validate_sorted_unique_indices`; consumers that need GF(2)-collapsed
26+
/// per-group indices can call `cudaq::qec::canonicalize_pcm` on entry.
27+
///
28+
/// `index_type` is `uint32_t`, so each dimension and `nnz` must fit in
29+
/// `~4×10^9`.
30+
class sparse_binary_matrix {
31+
public:
32+
using index_type = std::uint32_t;
33+
34+
/// @brief Construct a sparse PCM in CSC form.
35+
/// @param num_rows Number of rows.
36+
/// @param num_cols Number of columns.
37+
/// @param col_ptrs Column pointer array (length num_cols + 1); column j has
38+
/// indices in \p row_indices[col_ptrs[j] .. col_ptrs[j+1]-1].
39+
/// @param row_indices Row indices of non-zeros (length nnz).
40+
static sparse_binary_matrix from_csc(index_type num_rows, index_type num_cols,
41+
std::vector<index_type> col_ptrs,
42+
std::vector<index_type> row_indices);
43+
44+
/// @brief Construct a sparse PCM in CSR form.
45+
/// @param num_rows Number of rows.
46+
/// @param num_cols Number of columns.
47+
/// @param row_ptrs Row pointer array (length num_rows + 1); row i has
48+
/// indices in \p col_indices[row_ptrs[i] .. row_ptrs[i+1]-1].
49+
/// @param col_indices Column indices of non-zeros (length nnz).
50+
static sparse_binary_matrix from_csr(index_type num_rows, index_type num_cols,
51+
std::vector<index_type> row_ptrs,
52+
std::vector<index_type> col_indices);
53+
54+
/// @brief Construct from nested CSC: \p nested[j] is the list of row indices
55+
/// for column j; \p nested.size() must equal \p num_cols.
56+
static sparse_binary_matrix
57+
from_nested_csc(index_type num_rows, index_type num_cols,
58+
const std::vector<std::vector<index_type>> &nested);
59+
60+
/// @brief Construct from nested CSR: \p nested[i] is the list of column
61+
/// indices for row i; \p nested.size() must equal \p num_rows.
62+
static sparse_binary_matrix
63+
from_nested_csr(index_type num_rows, index_type num_cols,
64+
const std::vector<std::vector<index_type>> &nested);
65+
66+
/// @brief Construct from a rank-2 dense PCM (any non-zero treated as 1).
67+
/// Intentionally not `explicit` so call sites that take
68+
/// `sparse_binary_matrix` accept a dense `cudaqx::tensor` unchanged.
69+
sparse_binary_matrix(
70+
const cudaqx::tensor<std::uint8_t> &dense,
71+
sparse_binary_matrix_layout layout = sparse_binary_matrix_layout::csc);
72+
73+
sparse_binary_matrix() = default;
74+
sparse_binary_matrix(const sparse_binary_matrix &) = default;
75+
sparse_binary_matrix(sparse_binary_matrix &&) noexcept = default;
76+
sparse_binary_matrix &operator=(const sparse_binary_matrix &) = default;
77+
sparse_binary_matrix &operator=(sparse_binary_matrix &&) noexcept = default;
78+
79+
sparse_binary_matrix_layout layout() const { return layout_; }
80+
index_type num_rows() const { return num_rows_; }
81+
index_type num_cols() const { return num_cols_; }
82+
index_type num_nnz() const {
83+
return static_cast<index_type>(indices_.size());
84+
}
85+
86+
/// @brief For CSC: ptr has length num_cols+1; for CSR: ptr has length
87+
/// num_rows+1.
88+
const std::vector<index_type> &ptr() const { return ptr_; }
89+
/// @brief For CSC: row indices; for CSR: column indices.
90+
const std::vector<index_type> &indices() const { return indices_; }
91+
92+
/// @brief Throw if each compressed column/row does not have strictly
93+
/// increasing indices. This rejects duplicate entries in the stored layout.
94+
void validate_sorted_unique_indices(
95+
const char *context = "sparse_binary_matrix") const;
96+
97+
/// @brief Return a copy of this matrix in CSC layout. No-op if already CSC.
98+
sparse_binary_matrix to_csc() const;
99+
100+
/// @brief Return a copy of this matrix in CSR layout. No-op if already CSR.
101+
sparse_binary_matrix to_csr() const;
102+
103+
/// @brief Convert to a dense PCM tensor (rows x columns). Non-zero entries
104+
/// are set to 1.
105+
cudaqx::tensor<std::uint8_t> to_dense() const;
106+
107+
/// @brief Nested CSC: outer vector has size num_cols; inner vector for
108+
/// column j lists row indices of non-zeros in that column.
109+
std::vector<std::vector<index_type>> to_nested_csc() const;
110+
111+
/// @brief Nested CSR: outer vector has size num_rows; inner vector for row i
112+
/// lists column indices of non-zeros in that row.
113+
std::vector<std::vector<index_type>> to_nested_csr() const;
114+
115+
private:
116+
sparse_binary_matrix(sparse_binary_matrix_layout layout, index_type num_rows,
117+
index_type num_cols, std::vector<index_type> ptr,
118+
std::vector<index_type> indices);
119+
120+
sparse_binary_matrix_layout layout_ = sparse_binary_matrix_layout::csc;
121+
index_type num_rows_ = 0;
122+
index_type num_cols_ = 0;
123+
std::vector<index_type> ptr_;
124+
std::vector<index_type> indices_;
125+
};
126+
127+
} // namespace cudaq::qec

libs/qec/lib/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# ============================================================================ #
2-
# Copyright (c) 2024 - 2025 NVIDIA Corporation & Affiliates. #
2+
# Copyright (c) 2024 - 2026 NVIDIA Corporation & Affiliates. #
33
# All rights reserved. #
44
# #
55
# This source code and the accompanying materials are made available under #
@@ -43,6 +43,7 @@ set(QEC_SOURCES
4343
experiments.cpp
4444
pcm_utils.cpp
4545
plugin_loader.cpp
46+
sparse_binary_matrix.cpp
4647
stabilizer_utils.cpp
4748
decoders/lut.cpp
4849
decoders/sliding_window.cpp

libs/qec/lib/decoder.cpp

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*******************************************************************************
2-
* Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. *
2+
* Copyright (c) 2022 - 2026 NVIDIA Corporation & Affiliates. *
33
* All rights reserved. *
44
* *
55
* This source code and the accompanying materials are made available under *
@@ -17,8 +17,10 @@
1717
#include <filesystem>
1818
#include <vector>
1919

20-
INSTANTIATE_REGISTRY(cudaq::qec::decoder, const cudaqx::tensor<uint8_t> &)
21-
INSTANTIATE_REGISTRY(cudaq::qec::decoder, const cudaqx::tensor<uint8_t> &,
20+
INSTANTIATE_REGISTRY(cudaq::qec::decoder,
21+
const cudaq::qec::sparse_binary_matrix &)
22+
INSTANTIATE_REGISTRY(cudaq::qec::decoder,
23+
const cudaq::qec::sparse_binary_matrix &,
2224
const cudaqx::heterogeneous_map &)
2325

2426
// Include decoder implementations AFTER registry instantiation
@@ -70,12 +72,11 @@ struct decoder::rt_impl {
7072

7173
void decoder::rt_impl_deleter::operator()(rt_impl *p) const { delete p; }
7274

73-
decoder::decoder(const cudaqx::tensor<uint8_t> &H)
74-
: H(H), pimpl(std::unique_ptr<rt_impl, rt_impl_deleter>(new rt_impl())) {
75-
const auto H_shape = H.shape();
76-
assert(H_shape.size() == 2 && "H tensor must be of rank 2");
77-
syndrome_size = H_shape[0];
78-
block_size = H_shape[1];
75+
decoder::decoder(cudaq::qec::sparse_binary_matrix H)
76+
: H(std::move(H)),
77+
pimpl(std::unique_ptr<rt_impl, rt_impl_deleter>(new rt_impl())) {
78+
syndrome_size = this->H.num_rows();
79+
block_size = this->H.num_cols();
7980
reset_decoder();
8081
pimpl->persistent_detector_buffer.resize(this->syndrome_size);
8182
pimpl->persistent_soft_detector_buffer.resize(this->syndrome_size);
@@ -130,7 +131,7 @@ decoder::decode_async(const std::vector<float_t> &syndrome) {
130131
}
131132

132133
std::unique_ptr<decoder>
133-
decoder::get(const std::string &name, const cudaqx::tensor<uint8_t> &H,
134+
decoder::get(const std::string &name, const cudaq::qec::sparse_binary_matrix &H,
134135
const cudaqx::heterogeneous_map &param_map) {
135136
auto [mutex, registry] = get_registry();
136137
std::lock_guard<std::recursive_mutex> lock(mutex);
@@ -479,7 +480,7 @@ void decoder::reset_decoder() {
479480
}
480481

481482
std::unique_ptr<decoder> get_decoder(const std::string &name,
482-
const cudaqx::tensor<uint8_t> &H,
483+
const cudaq::qec::sparse_binary_matrix &H,
483484
const cudaqx::heterogeneous_map options) {
484485
return decoder::get(name, H, options);
485486
}

0 commit comments

Comments
 (0)