diff --git a/tts-cpp/CMakeLists.txt b/tts-cpp/CMakeLists.txt
index d359335a066..a835eb5ee2d 100644
--- a/tts-cpp/CMakeLists.txt
+++ b/tts-cpp/CMakeLists.txt
@@ -857,6 +857,40 @@ if (TTS_CPP_BUILD_TESTS)
     tts_cpp_apply_ccache(test-supertonic-vector-estimator-backward)
     tts_cpp_register_test(test-supertonic-vector-estimator-backward LABEL "unit")
 
+    # QVAC-20983 — analytic backward of the vocoder (latent unpack, denorm,
+    # causal conv1d, causal depthwise, channel layer norm, gelu, per-channel-gamma
+    # convnext, affine batch norm, leaky-relu head, full chain d(loss)/d(latent)).
+    # The "transposed convolution" upsampling is a fixed reshape+permute (the
+    # latent unpack), so its backward is a pure permutation. Every analytic
+    # gradient is gradchecked against finite differences via the Task 2 harness;
+    # the test is model-free, so it ALWAYS runs on a fresh checkout (no-skip policy).
+    add_executable(test-supertonic-vocoder-backward
+        test/test_supertonic_vocoder_backward.cpp
+        src/supertonic_vocoder_backward.cpp
+        src/supertonic_vector_estimator_backward.cpp  # provides the ve_grad conv1x1 / layer_norm / gelu the vocoder backward calls
+        src/voiceclone_gradcheck.cpp)  # presumably the Task 2 finite-difference harness named in the comment above
+    target_include_directories(test-supertonic-vocoder-backward PRIVATE src)  # no target_link_libraries: built from these sources only
+    tts_cpp_apply_ccache(test-supertonic-vocoder-backward)
+    tts_cpp_register_test(test-supertonic-vocoder-backward LABEL "unit")
+
+    # QVAC-20983 — forward-parity guard for the vocoder backward.  The gradcheck
+    # above is self-referential (analytic backward vs the in-file reference
+    # forward); this builds a synthetic CPU-backed `supertonic_model` from
+    # deterministic weights and asserts the reference forward
+    # (`VocoderBackward::forward`) matches the PRODUCTION
+    # `supertonic_vocoder_forward_cpu` on the same buffers.  Catches any drift
+    # between the two forwards (per-channel gamma layout, dilation schedule,
+    # weight index order) the gradcheck cannot see.  Model-free (weights
+    # synthesized in-memory) so it always runs in the `unit` tier.
+    add_executable(test-supertonic-vocoder-backward-parity
+        test/test_supertonic_vocoder_backward_parity.cpp
+        src/supertonic_vocoder_backward.cpp  # NOTE(review): compiled here AND tts-cpp is linked below; confirm tts-cpp does not also build these two sources
+        src/supertonic_vector_estimator_backward.cpp)
+    target_link_libraries(test-supertonic-vocoder-backward-parity PRIVATE tts-cpp ggml)  # production forward comes from the library
+    target_include_directories(test-supertonic-vocoder-backward-parity PRIVATE ggml/include src include)
+    tts_cpp_apply_ccache(test-supertonic-vocoder-backward-parity)
+    tts_cpp_register_test(test-supertonic-vocoder-backward-parity LABEL "unit")
+
     # Engine-level streaming-callback contract test for the per-sentence
     # segmentation path (Fix #2): monotonic global chunk_index, single final
     # is_last, result.pcm == concat(callbacks), accumulated stats.  Gated on
diff --git a/tts-cpp/docs/voiceclone-backward-vocoder.md b/tts-cpp/docs/voiceclone-backward-vocoder.md
new file mode 100644
index 00000000000..af33fd1b245
--- /dev/null
+++ b/tts-cpp/docs/voiceclone-backward-vocoder.md
@@ -0,0 +1,122 @@
+# Voice-clone backward — vocoder (QVAC-20983)
+
+Scope for ticket *"6. GGML backward pass: vocoder"*: make the Supertonic vocoder
+differentiable for enrollment; the ticket calls out the **transposed convolution**
+as the main risk op. This doc records the op × backend gap for the vocoder path and
+the CPU-fallback behavior, and is committed alongside the deliverable: an
+analytic, gradchecked C++ backward of the full vocoder
+(`src/supertonic_vocoder_backward.{h,cpp}`).
+
+It is the vocoder counterpart of `voiceclone-backward-gap-matrix.md` (text
+encoder); the *why analytic* rationale and the Task-2 gradcheck contract are
+shared.
+
+## Why the gap exists
+
+Voice cloning optimizes only `style_ttl` (model weights frozen). The vocoder
+maps the CFM latent to a waveform, so the gradient enrollment needs from it is
+`d(loss)/d(latent)` — the audio-loss gradient backpropagated through the frozen
+vocoder into the latent, which then flows back through the vector estimator and
+text encoder to `style_ttl`.
+
+A fully GGML-native backward (the on-device goal) needs every op on the vocoder
+path to have a case in `ggml_compute_backward` (`ggml/src/ggml.c`) **and** a CPU
+kernel for the ops the backward expands into. Several are missing, and the
+vocoder additionally leans on custom `ggml_supertonic_*` ops that only run on the
+CPU backend.
+
+## The "transposed convolution" risk, resolved
+
+The ticket flags the transposed convolution as the main risk. In the Supertonic
+vocoder there is **no `ggml_conv_transpose_*` op**: the time upsampling (factor
+`ttl_chunk_compress_factor`) is realized as a fixed `reshape + permute + cont`
+(the latent unpack, `build_supertonic_vocoder_cache`), i.e. a pure permutation
+`x[(t*factor+r)*C_latent + c] = latent[(c*factor+r)*latent_len + t]`. Its
+backward is the transpose gather (`latent_unpack_backward`), with no kernel risk
+— the feared conv-transpose backward does not arise here.
+
+## Forward ops on the vocoder path
+
+Source: `src/supertonic_vocoder.cpp`.
+
+| Forward op | Where (forward) |
+| --- | --- |
+| reshape/permute/cont | latent unpack (the notional "transposed conv" upsample) |
+| `ggml_scale`, `ggml_mul`, `ggml_add` (broadcast) | denorm, BN affine, residuals, per-channel gamma |
+| `ggml_im2col` + `ggml_mul_mat` | causal conv1d (embed, pw1/pw2, head1/head2) |
+| custom causal depthwise (`ggml_custom` / `ggml_supertonic_depthwise_1d_causal_ct`) | ConvNeXt depthwise |
+| `ggml_norm` (+ `ggml_supertonic_layer_norm_channel*`) | ConvNeXt channel layer norm |
+| `ggml_gelu_erf` (+ `ggml_supertonic_bias_gelu*`) | ConvNeXt FFN |
+| leaky-relu lowering (`leaky_relu_portable_ggml`) | head PReLU |
+| `ggml_supertonic_edge_pad_1d`, `convnext_block_fused` | fused CPU/Metal fast paths |
+
+## Gap matrix
+
+Legend: **OK** = implemented; **MISSING** = aborts / not implemented.
+
+"Graph backward" = a case in `ggml_compute_backward` (`ggml/src/ggml.c`),
+backend-agnostic. "CPU bwd kernel" = the kernels the backward expands into exist
+for the CPU backend, which is the only backend enrollment needs. GPU columns are
+out of scope for Phase 2 (enrollment runs on CPU).
+
+| Op | Graph backward (ggml.c) | CPU bwd kernel | CUDA / Metal / Vulkan / OpenCL |
+| --- | --- | --- | --- |
+| `RESHAPE`/`PERMUTE`/`CONT` (latent unpack) | OK | OK | out of scope |
+| `SCALE` / `ADD` / `MUL` (denorm, BN, gamma) | OK | OK | out of scope |
+| `MUL_MAT` / `IM2COL` (conv1d) | OK (`mul_mat`/`im2col_back`) | OK | out of scope |
+| `NORM` (channel layer norm) | **MISSING** | — | — |
+| `GELU_ERF` (unary) | **MISSING** | — | — |
+| leaky-relu / PReLU | partial (`STEP`/`RELU` only) | — | — |
+| custom `ggml_supertonic_*` ops | **MISSING** | — | CPU-only forward (see below) |
+
+Confirmed against the `ggml_compute_backward` switch: `NORM`, `GELU`/`GELU_ERF`
+fall through to `GGML_ABORT`; the custom ops have no backward at all. This
+mirrors the text-encoder matrix — the blocking gaps are again `NORM` and the
+elementwise activation (`GELU_ERF` here, which the vector estimator also uses).
+
+## CPU fallback behavior (enrollment)
+
+Two layers of CPU-only behavior matter for enrollment:
+
+1. **Forward custom ops are CPU-only.** `GGML_OP_CUSTOM` is rejected on every GPU
+   backend (CUDA / Metal / Vulkan / OpenCL), so the vocoder's custom causal
+   depthwise, fused ConvNeXt block, edge-pad and `_ct` fused ops only execute on
+   the CPU backend. On GPU backends the forward already falls back to the
+   pure-GGML `im2col + mul_mat` / granular-op chain (see
+   `supertonic_use_cpu_custom_ops()` / `supertonic_use_fused_supertonic_ops()`
+   guards in `supertonic_vocoder.cpp`).
+2. **The backward runs on CPU, analytically.** Because `NORM`, `GELU_ERF` and the
+   custom ops have no GGML backward, the enrollment gradient cannot be produced
+   by `ggml`'s autodiff on any backend today. The differentiable vocoder is
+   therefore provided as the analytic C++ backward in this PR, which runs on the
+   CPU (the enrollment target). **Every backend must fall back to CPU for the
+   vocoder backward during enrollment.** This is acceptable: enrollment is a
+   one-time, offline optimization loop, not the realtime synthesis path (which
+   keeps its GPU fast paths unchanged).
+
+## Solution shipped in this PR
+
+The `VocoderBackward` class (`src/supertonic_vocoder_backward.{h,cpp}`) owns the
+frozen weights and caches per-call activations as state: `forward(latent)` runs
+the chain and `backward(d_wav)` consumes the cached activations to return
+`d(loss)/d(latent)`. It is model-free and validated component-wise against
+central finite differences via the Task-2 gradcheck harness
+(`test/test_supertonic_vocoder_backward.cpp`, always-on `unit` tier). The
+stateless math primitives are exposed as `const` member functions, each
+gradchecked individually:
+
+- `denorm_backward_input` — latent denormalization
+- `conv1d_causal_backward_input` — full causal conv1d (embed / head1)
+- `depthwise_causal_backward_input` — causal depthwise conv1d
+- `batch_norm_backward_input` — affine BN at inference
+- `leaky_relu_backward` — head PReLU
+- `latent_unpack_backward` — the "transposed conv" upsample (permutation)
+- `convnext_backward_input` — full per-channel-gamma ConvNeXt block
+- `VocoderBackward::backward` — the whole chain → `d(loss)/d(latent)`
+
+Channel layer norm, erf-GELU and pointwise (1x1) convs are shared with the
+vector-estimator backward (`tts_cpp::ve_grad`), since the math is identical.
+
+This is mathematically exact, runs on CPU, and serves as the reference oracle for
+the per-stage gradcheck once the GGML-native ops (`NORM`, `GELU_ERF`, custom-op
+backward / lowering) are implemented.
diff --git a/tts-cpp/src/supertonic_vocoder.cpp b/tts-cpp/src/supertonic_vocoder.cpp
index d7f51c420c7..dfb255591cb 100644
--- a/tts-cpp/src/supertonic_vocoder.cpp
+++ b/tts-cpp/src/supertonic_vocoder.cpp
@@ -453,12 +453,10 @@ ggml_tensor * convnext_block_ggml(ggml_context * ctx,
         y = conv1d_causal_ggml(ctx, y, w.pw1_w, w.pw1_b);
         y = ggml_gelu_erf(ctx, y);
     }
-    // NOTE: the vector_estimator's `ggml_supertonic_pw2_residual` op
-    // expects `gamma` to be `[C]` (per-channel scale); the vocoder
-    // however stores `gamma` as a `[1]` scalar (single learnable
-    // scale per ConvNeXt block).  The shapes are incompatible, so we
-    // keep the unfused chain here.  A vocoder-specific fused op with
-    // scalar gamma is possible but the win would be tiny (~10
+    // NOTE: `gamma` is the per-channel `[C]` residual scale (same shape as the
+    // vector_estimator's), broadcast over time by `repeat_like` below.  We keep
+    // the unfused `mul + add` tail rather than the vector_estimator's
+    // `ggml_supertonic_pw2_residual` fused op because the win would be tiny (~10
     // dispatches × ~40μs = 0.4 ms).
     y = conv1d_causal_ggml(ctx, y, w.pw2_w, w.pw2_b);
     y = ggml_mul(ctx, y, repeat_like(ctx, w.gamma, y));
@@ -488,10 +486,10 @@ ggml_tensor * pointwise_matmul_ct_voc(ggml_context * ctx,
 // Vocoder ConvNeXt differs from vector_estimator's: (1) depthwise is
 // **causal** (left-only pad) rather than symmetric edge-clamp — handled
 // by the `_causal_ct` variant of the fused depthwise kernel (port-v14).
-// (2) `gamma` is a scalar `[1]`, not per-channel, so the `pw2_residual_ct`
-// fused op doesn't fit — unfused scalar `mul + add` tail.  (3) `norm_g` /
-// `norm_b` ship as `[1, C]` (same flatten-needed quirk as vector_estimator's
-// `.gamma`).
+// (2) the per-channel `[C]` `gamma` residual scale is applied with an
+// unfused `mul + add` tail (the `pw2_residual_ct` fused op isn't wired up
+// here).  (3) `norm_g` / `norm_b` ship as `[1, C]` (same flatten-needed
+// quirk as vector_estimator's `.gamma`).
 //
 // Caller: `SUPERTONIC_DISABLE_CT_VOCODER=1` reverts to legacy
 // `convnext_block_ggml`.
@@ -515,7 +513,7 @@ ggml_tensor * convnext_block_ggml_ct(ggml_context * ctx,
     y_ct = pointwise_matmul_ct_voc(ctx, y_ct, w.pw1_w, /*bias=*/nullptr);
     y_ct = ggml_supertonic_bias_gelu_ct(ctx, y_ct, flatten_1d(w.pw1_b));
     y_ct = pointwise_matmul_ct_voc(ctx, y_ct, w.pw2_w, flatten_1d(w.pw2_b));
-    // Scalar gamma multiply (broadcasts in any layout).
+    // Per-channel `[C]` gamma multiply (broadcasts over time in any layout).
     y_ct = ggml_mul(ctx, y_ct, repeat_like(ctx, w.gamma, y_ct));
     return ggml_add(ctx, residual, y_ct);
 }
@@ -630,7 +628,7 @@ void build_supertonic_vocoder_cache(vocoder_graph_cache & cache,
     ggml_set_name(x, "vocoder_embed");
     // Phase B2 follow-up: route the 10-block ConvNeXt chain through the
     // `[C, T]` variant on Metal.  Each block runs depthwise (causal_ct) +
-    // layer_norm + pw1 + bias_gelu + pw2 + scalar gamma + residual add
+    // layer_norm + pw1 + bias_gelu + pw2 + per-channel gamma + residual add
     // entirely on `[C, T]` — no intra-block permutes.  The single
     // `[T, C] -> [C, T]` permute happens once before the chain and the
     // single reverse permute once after.  Override:
diff --git a/tts-cpp/src/supertonic_vocoder_backward.cpp b/tts-cpp/src/supertonic_vocoder_backward.cpp
new file mode 100644
index 00000000000..2b2cfea5175
--- /dev/null
+++ b/tts-cpp/src/supertonic_vocoder_backward.cpp
@@ -0,0 +1,305 @@
+#include "supertonic_vocoder_backward.h"
+
+#include "supertonic_vector_estimator_backward.h"  // shared conv1x1 / layer_norm / gelu
+
+#include <cmath>
+#include <cstddef>
+#include <stdexcept>
+#include <utility>
+
+namespace tts_cpp {
+namespace voc_grad {
+
+namespace {
+
+inline int causal_src_index(int t, int k, int dilation, int pad_left) {  // input time index read by tap k at output step t
+    int st = t + k * dilation - pad_left;  // unclamped source position; negative means the tap lands in the left pad
+    return st < 0 ? 0 : st;  // replicate ("causal") left padding: pad reads alias frame 0; there is no upper clamp
+}
+
+}  // namespace
+
+VocoderBackward::VocoderBackward(VocoderWeights weights) : weights_(std::move(weights)) {}  // by-value sink: weights are moved into weights_
+
+// --- denorm -----------------------------------------------------------------
+
+std::vector<double> VocoderBackward::denorm_forward(const std::vector<double> & x, int L, int C,
+                                                    double normalizer_scale, const std::vector<double> & std,
+                                                    const std::vector<double> & mean) const {
+    std::vector<double> y((std::size_t) L * C);  // latent denormalization; the result holds L * C elements
+    const double inv = 1.0 / normalizer_scale;  // hoisted out of the loops; normalizer_scale == 0 is not guarded
+    for (int t = 0; t < L; ++t) {
+        for (int c = 0; c < C; ++c) {
+            const std::size_t i = (std::size_t) t * C + c;  // element (t, c) of a time-major [L, C] buffer
+            y[i] = x[i] * inv * std[(std::size_t) c] + mean[(std::size_t) c];  // per-channel affine: x / scale * std[c] + mean[c]
+        }
+    }
+    return y;
+}
+
+std::vector<double> VocoderBackward::denorm_backward_input(const std::vector<double> & d_y, int L, int C,
+                                                           double normalizer_scale,
+                                                           const std::vector<double> & std) const {
+    std::vector<double> d_x((std::size_t) L * C);  // gradient w.r.t. the denorm input, indexed like d_y
+    const double inv = 1.0 / normalizer_scale;  // same reciprocal the forward multiplies by
+    for (int t = 0; t < L; ++t) {
+        for (int c = 0; c < C; ++c) {
+            const std::size_t i = (std::size_t) t * C + c;  // element (t, c) of a time-major [L, C] buffer
+            d_x[i] = d_y[i] * std[(std::size_t) c] * inv;  // dy/dx = std[c] / normalizer_scale; the additive mean drops out
+        }
+    }
+    return d_x;
+}
+
+// --- full causal conv1d -----------------------------------------------------
+
+std::vector<double> VocoderBackward::conv1d_causal_forward(const std::vector<double> & x, int L, int IC,
+                                                           int OC, int K, const std::vector<double> & w,
+                                                           const std::vector<double> & b) const {
+    std::vector<double> y((std::size_t) L * OC);  // full (all-input-channel) causal conv; output is time-major [L, OC]
+    const int pad_left = K - 1;  // dilation is fixed at 1 below, so the left pad is K - 1 frames
+    const bool has_bias = !b.empty();  // an empty b means "no bias term"
+    for (int t = 0; t < L; ++t) {
+        for (int oc = 0; oc < OC; ++oc) {
+            double sum = has_bias ? b[(std::size_t) oc] : 0.0;
+            for (int ic = 0; ic < IC; ++ic) {
+                const std::size_t wbase = ((std::size_t) oc * IC + ic) * K;  // weights are indexed as w[oc][ic][k]
+                for (int k = 0; k < K; ++k) {
+                    const int st = causal_src_index(t, k, 1, pad_left);  // source frame; pad positions clamp to frame 0
+                    sum += w[wbase + k] * x[(std::size_t) st * IC + ic];  // x is read as time-major [L, IC]
+                }
+            }
+            y[(std::size_t) t * OC + oc] = sum;
+        }
+    }
+    return y;
+}
+
+std::vector<double> VocoderBackward::conv1d_causal_backward_input(const std::vector<double> & d_y, int L,
+                                                                  int IC, int OC, int K,
+                                                                  const std::vector<double> & w) const {
+    std::vector<double> d_x((std::size_t) L * IC, 0.0);  // zero-initialized because the loop below accumulates with +=
+    const int pad_left = K - 1;  // must match conv1d_causal_forward (dilation 1)
+    for (int t = 0; t < L; ++t) {
+        for (int oc = 0; oc < OC; ++oc) {
+            const double g = d_y[(std::size_t) t * OC + oc];  // upstream gradient for output element (t, oc)
+            for (int ic = 0; ic < IC; ++ic) {
+                const std::size_t wbase = ((std::size_t) oc * IC + ic) * K;  // same w[oc][ic][k] indexing as the forward
+                for (int k = 0; k < K; ++k) {
+                    const int st = causal_src_index(t, k, 1, pad_left);  // the frame the forward read for this tap
+                    d_x[(std::size_t) st * IC + ic] += g * w[wbase + k];  // scatter-add; clamped pad taps all land on frame 0
+                }
+            }
+        }
+    }
+    return d_x;
+}
+
+// --- causal depthwise conv1d ------------------------------------------------
+
+std::vector<double> VocoderBackward::depthwise_causal_forward(const std::vector<double> & x, int L, int C,
+                                                              int K, int dilation,
+                                                              const std::vector<double> & w,
+                                                              const std::vector<double> & b) const {
+    std::vector<double> y((std::size_t) L * C);  // depthwise: each channel is filtered only with its own K taps
+    const int pad_left = (K - 1) * dilation;  // left pad sized so tap K-1 reads frame t itself
+    for (int t = 0; t < L; ++t) {
+        for (int c = 0; c < C; ++c) {
+            double sum = b[(std::size_t) c];  // bias is read unconditionally (no empty-b case, unlike conv1d_causal_forward)
+            const std::size_t wbase = (std::size_t) c * K;  // weights are indexed as w[c][k]
+            for (int k = 0; k < K; ++k) {
+                const int st = causal_src_index(t, k, dilation, pad_left);  // source frame; pad positions clamp to frame 0
+                sum += w[wbase + k] * x[(std::size_t) st * C + c];
+            }
+            y[(std::size_t) t * C + c] = sum;
+        }
+    }
+    return y;
+}
+
+std::vector<double> VocoderBackward::depthwise_causal_backward_input(const std::vector<double> & d_y, int L,
+                                                                     int C, int K, int dilation,
+                                                                     const std::vector<double> & w) const {
+    std::vector<double> d_x((std::size_t) L * C, 0.0);  // zero-initialized because the loop below accumulates with +=
+    const int pad_left = (K - 1) * dilation;  // must match depthwise_causal_forward
+    for (int t = 0; t < L; ++t) {
+        for (int c = 0; c < C; ++c) {
+            const double g = d_y[(std::size_t) t * C + c];  // upstream gradient for output element (t, c)
+            const std::size_t wbase = (std::size_t) c * K;  // same w[c][k] indexing as the forward
+            for (int k = 0; k < K; ++k) {
+                const int st = causal_src_index(t, k, dilation, pad_left);  // the frame the forward read for this tap
+                d_x[(std::size_t) st * C + c] += g * w[wbase + k];  // scatter-add; clamped pad taps all land on frame 0
+            }
+        }
+    }
+    return d_x;
+}
+
+// --- affine batch norm (inference) ------------------------------------------
+
+std::vector<double> VocoderBackward::batch_norm_forward(const std::vector<double> & x, int L, int C,
+                                                        const std::vector<double> & gamma,
+                                                        const std::vector<double> & beta,
+                                                        const std::vector<double> & running_mean,
+                                                        const std::vector<double> & running_var,
+                                                        double eps) const {
+    std::vector<double> y((std::size_t) L * C);  // normalizes with the supplied running statistics, not statistics of x
+    for (int t = 0; t < L; ++t) {
+        for (int c = 0; c < C; ++c) {
+            const std::size_t i = (std::size_t) t * C + c;  // element (t, c) of a time-major [L, C] buffer
+            const double inv = 1.0 / std::sqrt(running_var[(std::size_t) c] + eps);  // depends on c only; recomputed per element
+            y[i] = (x[i] - running_mean[(std::size_t) c]) * inv * gamma[(std::size_t) c] + beta[(std::size_t) c];
+        }
+    }
+    return y;
+}
+
+std::vector<double> VocoderBackward::batch_norm_backward_input(const std::vector<double> & d_y, int L, int C,
+                                                               const std::vector<double> & gamma,
+                                                               const std::vector<double> & running_var,
+                                                               double eps) const {
+    std::vector<double> d_x((std::size_t) L * C);  // the statistics are inputs, not functions of x, so the map is a per-channel scale
+    for (int t = 0; t < L; ++t) {
+        for (int c = 0; c < C; ++c) {
+            const std::size_t i = (std::size_t) t * C + c;  // element (t, c) of a time-major [L, C] buffer
+            const double inv = 1.0 / std::sqrt(running_var[(std::size_t) c] + eps);  // same factor as batch_norm_forward
+            d_x[i] = d_y[i] * gamma[(std::size_t) c] * inv;  // dy/dx = gamma[c] / sqrt(var[c] + eps); beta and the mean drop out
+        }
+    }
+    return d_x;
+}
+
+// --- leaky relu / prelu -----------------------------------------------------
+
+std::vector<double> VocoderBackward::leaky_relu_forward(const std::vector<double> & x, double slope) const {
+    std::vector<double> y(x.size());  // elementwise, so the layout of x does not matter
+    for (std::size_t i = 0; i < x.size(); ++i) y[i] = x[i] >= 0.0 ? x[i] : slope * x[i];  // identity for x >= 0, slope * x below 0
+    return y;
+}
+
+std::vector<double> VocoderBackward::leaky_relu_backward(const std::vector<double> & x,
+                                                         const std::vector<double> & d_y, double slope) const {
+    std::vector<double> d_x(x.size());  // x is the forward INPUT (pre-activation); d_y is read up to x.size() without a size check
+    for (std::size_t i = 0; i < x.size(); ++i) d_x[i] = d_y[i] * (x[i] >= 0.0 ? 1.0 : slope);  // x == 0 takes the slope-1 branch, as in the forward
+    return d_x;
+}
+
+// --- latent unpack ----------------------------------------------------------
+
+std::vector<double> VocoderBackward::latent_unpack_forward(const std::vector<double> & latent, int latent_len,
+                                                           int C_latent, int factor) const {
+    const int T0 = latent_len * factor;  // unpacked time length: each latent frame expands to `factor` frames
+    std::vector<double> x((std::size_t) T0 * C_latent);  // output is time-major [T0, C_latent]
+    for (int c = 0; c < C_latent; ++c) {
+        for (int t = 0; t < latent_len; ++t) {
+            for (int r = 0; r < factor; ++r) {
+                const int src_c = c * factor + r;  // packed latent channel that holds sub-frame r of output channel c
+                x[(std::size_t) (t * factor + r) * C_latent + c] =
+                    latent[(std::size_t) src_c * latent_len + t];  // latent is read as channel-major [C_latent * factor, latent_len]
+            }
+        }
+    }
+    return x;
+}
+
+std::vector<double> VocoderBackward::latent_unpack_backward(const std::vector<double> & d_x, int latent_len,
+                                                            int C_latent, int factor) const {
+    const int latent_channels = C_latent * factor;  // channel count of the packed latent
+    std::vector<double> d_latent((std::size_t) latent_channels * latent_len);  // same channel-major layout the forward reads
+    for (int c = 0; c < C_latent; ++c) {
+        for (int t = 0; t < latent_len; ++t) {
+            for (int r = 0; r < factor; ++r) {
+                const int src_c = c * factor + r;  // same (c, r) -> packed channel map as latent_unpack_forward
+                d_latent[(std::size_t) src_c * latent_len + t] =
+                    d_x[(std::size_t) (t * factor + r) * C_latent + c];  // plain assignment: the map is one-to-one, nothing accumulates
+            }
+        }
+    }
+    return d_latent;
+}
+
+// --- vocoder ConvNeXt block -------------------------------------------------
+
+std::vector<double> VocoderBackward::convnext_forward(const VocConvNextWeights & w,
+                                                      const std::vector<double> & x, int L,
+                                                      VocConvNextActivations & acts) const {
+    acts.dw_out = depthwise_causal_forward(x, L, w.C, w.K, w.dilation, w.dw_w, w.dw_b);  // cached: layer-norm backward needs its input
+    const std::vector<double> ln = ve_grad::layer_norm_forward(acts.dw_out, L, w.C, w.ln_gamma, w.ln_beta);
+    acts.z1 = ve_grad::conv1x1_forward(ln, L, w.C, w.hidden, w.pw1_w, w.pw1_b);  // cached: gelu backward needs the pre-activation
+    const std::vector<double> g  = ve_grad::gelu_forward(acts.z1);
+    const std::vector<double> z2 = ve_grad::conv1x1_forward(g, L, w.hidden, w.C, w.pw2_w, w.pw2_b);  // pointwise C -> hidden -> C
+
+    std::vector<double> out((std::size_t) L * w.C);
+    for (std::size_t i = 0; i < out.size(); ++i) {
+        const std::size_t c = i % (std::size_t) w.C;  // [L, C] time-major, gamma is per-channel
+        out[i] = x[i] + w.gamma[c] * z2[i];  // residual connection with a per-channel scale on the branch
+    }
+    return out;
+}
+
+std::vector<double> VocoderBackward::convnext_backward_input(const VocConvNextWeights & w,
+                                                             const VocConvNextActivations & acts,
+                                                             const std::vector<double> & d_out, int L) const {
+    std::vector<double> d_z2((std::size_t) L * w.C);  // gradient entering the branch, after undoing the gamma scale
+    for (std::size_t i = 0; i < d_z2.size(); ++i) {
+        const std::size_t c = i % (std::size_t) w.C;  // [L, C] time-major, gamma is per-channel
+        d_z2[i] = w.gamma[c] * d_out[i];
+    }
+
+    const std::vector<double> d_g  = ve_grad::conv1x1_backward_input(d_z2, L, w.hidden, w.C, w.pw2_w);  // branch stages undone in reverse forward order
+    const std::vector<double> d_z1 = ve_grad::gelu_backward(acts.z1, d_g);  // uses the cached pre-activation z1
+    const std::vector<double> d_ln = ve_grad::conv1x1_backward_input(d_z1, L, w.C, w.hidden, w.pw1_w);
+    const std::vector<double> d_dw = ve_grad::layer_norm_backward_input(acts.dw_out, L, w.C, w.ln_gamma, d_ln);  // uses the cached layer-norm input
+    const std::vector<double> d_x_dw =
+        depthwise_causal_backward_input(d_dw, L, w.C, w.K, w.dilation, w.dw_w);
+
+    std::vector<double> d_x((std::size_t) L * w.C);
+    for (std::size_t i = 0; i < d_x.size(); ++i) d_x[i] = d_out[i] + d_x_dw[i];  // residual path
+    return d_x;
+}
+
+// --- full vocoder -----------------------------------------------------------
+
+std::vector<double> VocoderBackward::forward(const std::vector<double> & latent) {
+    const VocoderWeights & w = weights_;
+    const int T0 = w.latent_len * w.factor;  // time length after the unpack; every later stage runs at T0 frames
+
+    std::vector<double> x = latent_unpack_forward(latent, w.latent_len, w.C_latent, w.factor);  // latent.size() is not validated here
+    x = denorm_forward(x, T0, w.C_latent, w.normalizer_scale, w.latent_std, w.latent_mean);
+    x = conv1d_causal_forward(x, T0, w.C_latent, w.C, w.K_embed, w.embed_w, w.embed_b);  // embed conv: C_latent -> C channels
+
+    block_acts_.assign(w.convnext.size(), VocConvNextActivations{});  // the per-block cache is reset on every call
+    for (std::size_t i = 0; i < w.convnext.size(); ++i) {
+        x = convnext_forward(w.convnext[i], x, T0, block_acts_[i]);  // fills block_acts_[i] for backward()
+    }
+
+    x = batch_norm_forward(x, T0, w.C, w.bn_gamma, w.bn_beta, w.bn_running_mean, w.bn_running_var);  // eps omitted: presumably defaulted in the declaration — confirm
+
+    head1_out_ = conv1d_causal_forward(x, T0, w.C, w.Hh, w.K_head1, w.head1_w, w.head1_b);  // pre-activation cached: leaky_relu_backward reads its sign
+    const std::vector<double> p = leaky_relu_forward(head1_out_, w.prelu_slope);
+    return ve_grad::conv1x1_forward(p, T0, w.Hh, w.OUT, w.head2_w, /*b=*/{});  // head2 is called with an empty bias
+}
+
+std::vector<double> VocoderBackward::backward(const std::vector<double> & d_wav) const {
+    if (head1_out_.empty()) {  // head1_out_ is set by forward(); empty means there are no cached activations to use
+        throw std::logic_error("VocoderBackward::backward called before forward (no cached activations)");
+    }
+    const VocoderWeights & w = weights_;
+    const int T0 = w.latent_len * w.factor;  // same time length forward() ran at
+
+    std::vector<double> d_p = ve_grad::conv1x1_backward_input(d_wav, T0, w.Hh, w.OUT, w.head2_w);  // stages are undone in reverse forward order; d_wav.size() is not validated
+    std::vector<double> d_x = leaky_relu_backward(head1_out_, d_p, w.prelu_slope);
+    d_x = conv1d_causal_backward_input(d_x, T0, w.C, w.Hh, w.K_head1, w.head1_w);
+    d_x = batch_norm_backward_input(d_x, T0, w.C, w.bn_gamma, w.bn_running_var);  // eps omitted, mirroring the forward call
+
+    for (std::size_t i = w.convnext.size(); i-- > 0;) {  // walks size-1 down to 0 without unsigned wrap-around
+        d_x = convnext_backward_input(w.convnext[i], block_acts_[i], d_x, T0);
+    }
+
+    d_x = conv1d_causal_backward_input(d_x, T0, w.C_latent, w.C, w.K_embed, w.embed_w);
+    d_x = denorm_backward_input(d_x, T0, w.C_latent, w.normalizer_scale, w.latent_std);
+    return latent_unpack_backward(d_x, w.latent_len, w.C_latent, w.factor);  // gradient w.r.t. the latent passed to forward()
+}
+
+}  // namespace voc_grad
+}  // namespace tts_cpp
diff --git a/tts-cpp/src/supertonic_vocoder_backward.h b/tts-cpp/src/supertonic_vocoder_backward.h
new file mode 100644
index 00000000000..b407ad77531
--- /dev/null
+++ b/tts-cpp/src/supertonic_vocoder_backward.h
@@ -0,0 +1,210 @@
+#pragma once
+
+// Analytic backward pass for the Supertonic vocoder — voice-clone roadmap,
+// ticket "GGML backward pass: vocoder" (QVAC-20983).
+//
+// Scope: the vocoder maps a latent (the CFM output) to a waveform. For
+// gradient-based voice cloning only `style_ttl` is optimized, so the gradient
+// this class produces is `d(loss)/d(latent)` — the signal the on-device
+// enrollment loop backprops from the audio loss down through the vocoder into
+// the latent, which then flows back through the vector estimator and text
+// encoder to `style_ttl`. Model weights are frozen, so the backward returns the
+// input gradient only.
+//
+// Why analytic (not ggml autodiff): the vocoder forward uses ops whose backward
+// is not implemented in the vendored ggml (`ggml_norm` layer norm, `ggml_gelu_*`,
+// the custom causal depthwise op, and the leaky-relu/prelu lowering). The
+// "transposed convolution" the upsampling notionally needs is realized as a
+// fixed reshape+permute (the latent unpack), so its backward is a pure
+// permutation rather than a conv-transpose kernel.
+//
+// `VocoderBackward` owns the frozen weights and caches the per-call activations
+// as state: `forward(latent)` runs the chain and stores the activations needed
+// by `backward(d_wav)`. The math is computed in double for a well-conditioned
+// reference and validated component-wise against central finite differences by
+// the voiceclone gradcheck harness (Task 2 / QVAC-20979). The class has no
+// dependency on `supertonic_model`; a thin adapter binds the real GGUF weights
+// into `VocoderWeights` elsewhere.
+//
+// Pointwise (1x1) convs, channel layer norm and erf-GELU are shared with the
+// vector-estimator backward (`tts_cpp::ve_grad`) since the math is identical;
+// the class adds the vocoder-specific causal convs, affine batch norm,
+// leaky-relu, per-channel-gamma ConvNeXt block, latent unpack and the full chain.
+
+#include <vector>
+
+namespace tts_cpp {
+namespace voc_grad {
+
+// --- Plain data holders ------------------------------------------------------
+
+// Vocoder ConvNeXt block weights (matches `convnext_block`). Differs from the
+// vector-estimator block in that the depthwise conv is *causal* (left pad);
+// `gamma` is the per-channel `[C]` residual scale, as in the production model.
+struct VocConvNextWeights {
+    std::vector<double> dw_w;      // depthwise [C * K]
+    std::vector<double> dw_b;      // [C]
+    std::vector<double> ln_gamma;  // [C]
+    std::vector<double> ln_beta;   // [C]
+    std::vector<double> pw1_w;     // [hidden * C]
+    std::vector<double> pw1_b;     // [hidden]
+    std::vector<double> pw2_w;     // [C * hidden]
+    std::vector<double> pw2_b;     // [C]
+    std::vector<double> gamma;     // [C], per-channel residual scale
+    int C        = 0;  // channel count the [C]-sized tensors above refer to
+    int hidden   = 0;  // width between pw1 and pw2
+    int K        = 0;  // depthwise taps (dw_w holds C * K values)
+    int dilation = 1;  // block dilation; defaults to 1
+};
+
+// Intermediate results a ConvNeXt forward writes out so the matching backward can read them back.
+struct VocConvNextActivations {
+    std::vector<double> dw_out;  // [L, C], depthwise output (input to layer norm)
+    std::vector<double> z1;      // [L, hidden], pwconv1 output (input to gelu)
+};
+
+// Full vocoder weights (matches `supertonic_vocoder_forward_cpu`).
+struct VocoderWeights {
+    int latent_len    = 0;  // L in the packed latent
+    int C_latent      = 0;  // unpacked latent channels
+    int factor        = 0;  // time upsample factor (latent_channels = C_latent * factor)
+    int C             = 0;  // embed output channels (ConvNeXt width)
+    double normalizer_scale = 1.0;    // denorm divides the latent by this before applying std/mean
+    std::vector<double> latent_mean;  // [C_latent]
+    std::vector<double> latent_std;   // [C_latent]
+
+    std::vector<double> embed_w;  // [C * C_latent * K_embed] (OC=C, IC=C_latent)
+    std::vector<double> embed_b;  // [C]
+    int K_embed = 0;              // taps of the embed causal conv
+
+    std::vector<VocConvNextWeights> convnext;  // ConvNeXt chain (carries dilations)
+
+    std::vector<double> bn_gamma;         // [C]
+    std::vector<double> bn_beta;          // [C]
+    std::vector<double> bn_running_mean;  // [C]
+    std::vector<double> bn_running_var;   // [C]
+
+    std::vector<double> head1_w;  // [Hh * C * K_head1]
+    std::vector<double> head1_b;  // [Hh]
+    int K_head1 = 0;              // taps of the head1 causal conv
+    int Hh      = 0;  // head1 output channels
+    double prelu_slope = 0.0;     // negative-side slope of the head leaky-relu
+
+    std::vector<double> head2_w;  // [OUT * Hh] pointwise (K=1), no bias
+    int OUT = 0;                  // waveform output channels
+};
+
+// --- Vocoder backward --------------------------------------------------------
+//
+// Stateful: construct with the frozen weights, call `forward(latent)` (which
+// caches the activations), then `backward(d_wav)` (which consumes them). Only
+// the construction + forward/backward surface is public; the stateless math
+// primitives are private implementation details. The gradcheck self-tests reach
+// them through a `friend` so each primitive is still validated individually
+// against finite differences without widening the public API.
+class VocoderBackward {
+public:
+    explicit VocoderBackward(VocoderWeights weights);
+
+    const VocoderWeights & weights() const { return weights_; }
+
+    // Forward: `latent` is channel-major [latent_channels, latent_len]. Runs the
+    // chain, caches the activations as state and returns the waveform, time-major
+    // [T0, OUT] with T0 = latent_len * factor.
+    std::vector<double> forward(const std::vector<double> & latent);
+
+    // Backward: maps d_wav [T0, OUT] to d_latent in the forward's channel-major
+    // [latent_channels, latent_len] layout, using the activations cached by the most
+    // recent `forward`; throws std::logic_error when no activations are cached yet.
+    std::vector<double> backward(const std::vector<double> & d_wav) const;
+
+private:
+    // Grants the gradcheck self-tests access to the private primitives below.
+    friend struct VocoderBackwardTester;
+
+    // The primitives below are pure: they read no member state, so they are
+    // marked `const` (callable from `backward`). They are private helpers of the
+    // forward/backward chain, individually gradchecked through the friend tester.
+
+    // --- Latent denormalization (matches the vocoder "denorm" stage) ---------
+    // y[t, c] = (x[t, c] / normalizer_scale) * std[c] + mean[c]. `x` is time-major
+    // [L, C]; `std`/`mean` are per-channel [C] (`std` is the parameter, not the namespace).
+    std::vector<double> denorm_forward(const std::vector<double> & x, int L, int C, double normalizer_scale,
+                                       const std::vector<double> & std,
+                                       const std::vector<double> & mean) const;
+
+    std::vector<double> denorm_backward_input(const std::vector<double> & d_y, int L, int C,
+                                              double normalizer_scale, const std::vector<double> & std) const;
+
+    // --- Full causal conv1d, IC -> OC, K taps (matches `conv1d_causal`) ------
+    // Left replicate ("causal") padding by K-1. Weight is ONNX row-major
+    // [OC, IC, K] with raw index ((oc * IC + ic) * K + k); bias is optional
+    // ([OC] or empty). `x` is time-major [L, IC]; output is [L, OC].
+    std::vector<double> conv1d_causal_forward(const std::vector<double> & x, int L, int IC, int OC, int K,
+                                              const std::vector<double> & w,
+                                              const std::vector<double> & b) const;
+
+    std::vector<double> conv1d_causal_backward_input(const std::vector<double> & d_y, int L, int IC, int OC,
+                                                     int K, const std::vector<double> & w) const;
+
+    // --- Causal depthwise conv1d (matches `depthwise_conv1d_causal`) ---------
+    // Left replicate padding by (K-1)*dilation. Weight is [C, K] with raw index
+    // c * K + k; bias is per-channel [C]. `x` is time-major [L, C].
+    std::vector<double> depthwise_causal_forward(const std::vector<double> & x, int L, int C, int K,
+                                                 int dilation, const std::vector<double> & w,
+                                                 const std::vector<double> & b) const;
+
+    std::vector<double> depthwise_causal_backward_input(const std::vector<double> & d_y, int L, int C, int K,
+                                                        int dilation, const std::vector<double> & w) const;
+
+    // --- Affine batch norm at inference (matches `batch_norm_channel`) -------
+    // y[t, c] = (x[t, c] - running_mean[c]) / sqrt(running_var[c] + eps) *
+    //           gamma[c] + beta[c]. Per-channel [C]; constants at inference, so
+    // the backward into the input is a per-channel scale.
+    std::vector<double> batch_norm_forward(const std::vector<double> & x, int L, int C,
+                                           const std::vector<double> & gamma, const std::vector<double> & beta,
+                                           const std::vector<double> & running_mean,
+                                           const std::vector<double> & running_var, double eps = 1e-5) const;
+
+    std::vector<double> batch_norm_backward_input(const std::vector<double> & d_y, int L, int C,
+                                                  const std::vector<double> & gamma,
+                                                  const std::vector<double> & running_var,
+                                                  double eps = 1e-5) const;
+
+    // --- Leaky-relu / prelu with a scalar negative slope (head prelu) --------
+    // y = x >= 0 ? x : slope * x, elementwise.
+    std::vector<double> leaky_relu_forward(const std::vector<double> & x, double slope) const;
+
+    std::vector<double> leaky_relu_backward(const std::vector<double> & x, const std::vector<double> & d_y,
+                                            double slope) const;
+
+    // --- Latent unpack (the notional "transposed conv" upsampling) -----------
+    // The latent ships channel-major [latent_channels, latent_len] (raw index
+    // (c*factor + r) * latent_len + t) and is unpacked to the time-major
+    // activation [T0, C_latent] with T0 = latent_len * factor and
+    // latent_channels = C_latent * factor, via
+    //   x[(t*factor + r) * C_latent + c] = latent[(c*factor + r)*latent_len + t].
+    // This is a pure permutation; its backward is the transpose gather.
+    std::vector<double> latent_unpack_forward(const std::vector<double> & latent, int latent_len, int C_latent,
+                                              int factor) const;
+
+    std::vector<double> latent_unpack_backward(const std::vector<double> & d_x, int latent_len, int C_latent,
+                                               int factor) const;
+
+    // --- Vocoder ConvNeXt block (matches `convnext_block`) -------------------
+    // out = x + gamma * pwconv2(gelu(pwconv1(layer_norm(depthwise_causal(x))))).
+    // pwconv1/pwconv2 are 1x1 out-major-weight convs (shared `ve_grad::conv1x1`).
+    std::vector<double> convnext_forward(const VocConvNextWeights & w, const std::vector<double> & x, int L,
+                                         VocConvNextActivations & acts) const;
+
+    std::vector<double> convnext_backward_input(const VocConvNextWeights & w,
+                                                const VocConvNextActivations & acts,
+                                                const std::vector<double> & d_out, int L) const;
+
+    VocoderWeights weights_;
+    std::vector<VocConvNextActivations> block_acts_;  // per ConvNeXt block
+    std::vector<double> head1_out_;                   // [T0, Hh], the prelu input
+};
+
+}  // namespace voc_grad
+}  // namespace tts_cpp
diff --git a/tts-cpp/test/test_supertonic_vocoder_backward.cpp b/tts-cpp/test/test_supertonic_vocoder_backward.cpp
new file mode 100644
index 00000000000..51c6a86a21b
--- /dev/null
+++ b/tts-cpp/test/test_supertonic_vocoder_backward.cpp
@@ -0,0 +1,291 @@
+// Gradcheck self-tests for the Supertonic vocoder backward (voice-clone ticket
+// "GGML backward pass: vocoder", QVAC-20983). Pure host logic, model-free: every
+// analytic gradient is checked component-wise against a central finite-difference
+// numeric gradient of the matching forward. Runs in the always-on `unit` ctest
+// tier.
+//
+// Standalone build (one command; join the lines below with spaces):
+//   g++ -std=c++17 -I src test/test_supertonic_vocoder_backward.cpp
+//       src/supertonic_vocoder_backward.cpp
+//       src/supertonic_vector_estimator_backward.cpp
+//       src/voiceclone_gradcheck.cpp -o /tmp/t && /tmp/t
+
+#include "supertonic_vocoder_backward.h"
+#include "voiceclone_gradcheck.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdio>
+#include <exception>
+#include <vector>
+
+using namespace tts_cpp::voc_grad;
+using tts_cpp::voiceclone::compare_gradients;
+using tts_cpp::voiceclone::finite_diff_gradient;
+using tts_cpp::voiceclone::GradcheckReport;
+using tts_cpp::voiceclone::ScalarLossFn;
+
+namespace {
+
+int g_failures = 0;  // failures recorded so far
+int g_checks   = 0;  // checks recorded so far
+// CHECK(cond, fmt, ...): counts one check; when cond is false, counts a failure and prints file:line + message.
+#define CHECK(cond, ...) do {                                       \
+    ++g_checks;                                                     \
+    if (!(cond)) {                                                  \
+        ++g_failures;                                               \
+        std::fprintf(stderr, "FAIL %s:%d  ", __FILE__, __LINE__);   \
+        std::fprintf(stderr, __VA_ARGS__);                          \
+        std::fprintf(stderr, "\n");                                 \
+    }                                                               \
+} while (0)
+
+double sample(int i, double phase) {
+    return 0.8 * std::sin(phase + 0.9 * i);  // deterministic, always within [-0.8, 0.8]
+}
+
+std::vector<double> make_vector(int n, double phase) {
+    std::vector<double> out(static_cast<std::size_t>(n));  // element k holds sample(k, phase)
+    for (std::size_t k = 0; k < out.size(); ++k) out[k] = sample(static_cast<int>(k), phase);
+    return out;
+}
+
+// Variance-like test data (running_var): every element lies in [0.58, 1.22], hence strictly positive.
+std::vector<double> make_positive(int n, double phase) {
+    std::vector<double> out(static_cast<std::size_t>(n));
+    for (std::size_t k = 0; k < out.size(); ++k) out[k] = (1.0 + sample(static_cast<int>(k), phase)) * 0.4 + 0.5;
+    return out;
+}
+
+double dot(const std::vector<double> & a, const std::vector<double> & b) {
+    double total = 0.0;  // summed left to right over a's length; b is read at the same offsets
+    for (auto it = a.begin(); it != a.end(); ++it) total += *it * b[static_cast<std::size_t>(it - a.begin())];
+    return total;
+}
+
+void report_check(const char * name, const GradcheckReport & r) {  // one CHECK per report, labelled `name`
+    CHECK(r.passed, "%s: gradcheck failed (max_abs=%.3e max_rel=%.3e worst=%zu)",
+          name, r.max_abs_err, r.max_rel_err, r.worst_index);
+}
+
+// ---------------------------------------------------------------------------
+
+VocConvNextWeights make_block(int C, int hidden, int K, int dilation, double phase) {
+    VocConvNextWeights blk;  // each tensor is filled at its own offset (+0.1 ... +0.9) from `phase`
+    blk.dilation = dilation;
+    blk.K        = K;
+    blk.hidden   = hidden;
+    blk.C        = C;
+    blk.dw_w     = make_vector(K * C, 0.1 + phase);
+    blk.dw_b     = make_vector(C, 0.2 + phase);
+    blk.ln_gamma = make_vector(C, 0.3 + phase);
+    blk.ln_beta  = make_vector(C, 0.4 + phase);
+    blk.pw1_w    = make_vector(C * hidden, 0.5 + phase);
+    blk.pw1_b    = make_vector(hidden, 0.6 + phase);
+    blk.pw2_w    = make_vector(hidden * C, 0.7 + phase);
+    blk.pw2_b    = make_vector(C, 0.8 + phase);
+    blk.gamma    = make_vector(C, 0.9 + phase);  // per-channel residual scale
+    return blk;
+}
+
+VocoderWeights make_vocoder() {
+    // Tiny but complete topology: latent_channels = 6 over 2 frames, i.e. T0 = 4 unpacked steps.
+    VocoderWeights voc;
+    voc.latent_len       = 2;
+    voc.C_latent         = 3;
+    voc.factor           = 2;
+    voc.C                = 5;
+    voc.normalizer_scale = 1.7;
+    voc.latent_mean      = make_vector(voc.C_latent, 0.2);
+    voc.latent_std       = make_vector(voc.C_latent, 0.5);
+    // Embed conv.
+    voc.K_embed = 3;
+    voc.embed_w = make_vector(voc.C * voc.C_latent * voc.K_embed, 0.9);
+    voc.embed_b = make_vector(voc.C, 1.0);
+    // Three ConvNeXt blocks exercising the production dilation variety (1/2/4).
+    const int kHidden = 7;
+    const int kDwTaps = 3;
+    voc.convnext.push_back(make_block(voc.C, kHidden, kDwTaps, /*dilation=*/1, /*phase=*/1.0));
+    voc.convnext.push_back(make_block(voc.C, kHidden, kDwTaps, /*dilation=*/2, /*phase=*/2.0));
+    voc.convnext.push_back(make_block(voc.C, kHidden, kDwTaps, /*dilation=*/4, /*phase=*/3.0));
+    // Batch norm; the running variance is drawn from the strictly positive generator.
+    voc.bn_gamma        = make_vector(voc.C, 0.4);
+    voc.bn_beta         = make_vector(voc.C, 0.7);
+    voc.bn_running_mean = make_vector(voc.C, 0.1);
+    voc.bn_running_var  = make_positive(voc.C, 0.5);
+    // Head: head1 conv, scalar prelu slope, then the bias-free head2.
+    voc.Hh          = 6;
+    voc.K_head1     = 3;
+    voc.head1_w     = make_vector(voc.Hh * voc.C * voc.K_head1, 0.3);
+    voc.head1_b     = make_vector(voc.Hh, 0.45);
+    voc.prelu_slope = 0.1;
+    voc.OUT         = 1;
+    voc.head2_w     = make_vector(voc.OUT * voc.Hh, 0.65);
+    return voc;
+}
+
+}  // namespace
+
+namespace tts_cpp {
+namespace voc_grad {
+
+// Friend of VocoderBackward: validates the private math primitives individually
+// (and the full forward/backward chain) against finite differences. Declared a
+// friend so the gradchecks reach the primitives without widening the public API.
+struct VocoderBackwardTester {
+    // The primitives never look at the stored weights, so an instance built from
+    // empty weights is enough for the friend tester to call them.
+    static VocoderBackward op() { return VocoderBackward{VocoderWeights{}}; }
+
+    static void test_denorm_backward() {
+        // Loss is dot(upstream, denorm(x)); its x-gradient must equal denorm_backward_input(upstream).
+        const int L = 4, C = 3;
+        const double scale = 1.7;
+        const std::vector<double> stddev   = make_vector(C, 0.4);
+        const std::vector<double> mean     = make_vector(C, 1.2);
+        const std::vector<double> upstream = make_vector(L * C, 2.0);
+        const std::vector<double> point    = make_vector(L * C, 0.7);
+        const VocoderBackward ops = op();
+        const ScalarLossFn loss = [&](const std::vector<double> & x) {
+            return dot(upstream, ops.denorm_forward(x, L, C, scale, stddev, mean));
+        };
+        const std::vector<double> grad = ops.denorm_backward_input(upstream, L, C, scale, stddev);
+        report_check("denorm_backward_input", compare_gradients(finite_diff_gradient(loss, point), grad));
+    }
+
+    static void test_conv1d_causal_backward() {
+        // Full causal conv, 4 -> 5 channels, 3 taps, with a bias.
+        const int L = 6, IC = 4, OC = 5, K = 3;
+        const std::vector<double> kernel   = make_vector(OC * IC * K, 0.3);
+        const std::vector<double> bias     = make_vector(OC, 1.1);
+        const std::vector<double> upstream = make_vector(L * OC, 2.0);
+        const std::vector<double> point    = make_vector(L * IC, 0.7);
+        const VocoderBackward ops = op();
+        const ScalarLossFn loss = [&](const std::vector<double> & x) {
+            return dot(upstream, ops.conv1d_causal_forward(x, L, IC, OC, K, kernel, bias));
+        };
+        const std::vector<double> grad = ops.conv1d_causal_backward_input(upstream, L, IC, OC, K, kernel);
+        report_check("conv1d_causal_backward_input", compare_gradients(finite_diff_gradient(loss, point), grad));
+    }
+
+    static void test_depthwise_causal_backward() {
+        // Causal depthwise conv checked at dilation 2 rather than the default 1.
+        const int L = 7, C = 4, K = 3, dilation = 2;
+        const std::vector<double> kernel   = make_vector(C * K, 0.4);
+        const std::vector<double> bias     = make_vector(C, 0.9);
+        const std::vector<double> upstream = make_vector(L * C, 1.6);
+        const std::vector<double> point    = make_vector(L * C, 0.5);
+        const VocoderBackward ops = op();
+        const ScalarLossFn loss = [&](const std::vector<double> & x) {
+            return dot(upstream, ops.depthwise_causal_forward(x, L, C, K, dilation, kernel, bias));
+        };
+        const std::vector<double> grad = ops.depthwise_causal_backward_input(upstream, L, C, K, dilation, kernel);
+        report_check("depthwise_causal_backward_input", compare_gradients(finite_diff_gradient(loss, point), grad));
+    }
+
+    static void test_batch_norm_backward() {
+        // Inference batch norm; the running variance comes from make_positive so it stays positive.
+        const int L = 5, C = 4;
+        const std::vector<double> scale_g  = make_vector(C, 0.5);
+        const std::vector<double> shift_b  = make_vector(C, 1.7);
+        const std::vector<double> run_mean = make_vector(C, 0.3);
+        const std::vector<double> run_var  = make_positive(C, 0.8);
+        const std::vector<double> upstream = make_vector(L * C, 2.3);
+        const std::vector<double> point    = make_vector(L * C, 0.2);
+        const VocoderBackward ops = op();
+        const ScalarLossFn loss = [&](const std::vector<double> & x) {
+            return dot(upstream, ops.batch_norm_forward(x, L, C, scale_g, shift_b, run_mean, run_var));
+        };
+        const std::vector<double> grad = ops.batch_norm_backward_input(upstream, L, C, scale_g, run_var);
+        report_check("batch_norm_backward_input", compare_gradients(finite_diff_gradient(loss, point), grad));
+    }
+
+    static void test_leaky_relu_backward() {
+        // The kink at 0 is measure-zero for these samples; a central difference is exact away
+        // from it, so the numeric and analytic slopes can be compared directly.
+        const int n = 16;
+        const double slope = 0.1;
+        const std::vector<double> upstream = make_vector(n, 1.3);
+        const std::vector<double> point    = make_vector(n, 0.6);
+        const VocoderBackward ops = op();
+        const ScalarLossFn loss = [&](const std::vector<double> & x) {
+            return dot(upstream, ops.leaky_relu_forward(x, slope));
+        };
+        const std::vector<double> grad = ops.leaky_relu_backward(point, upstream, slope);
+        report_check("leaky_relu_backward", compare_gradients(finite_diff_gradient(loss, point), grad));
+    }
+
+    static void test_latent_unpack_backward() {
+        // Differentiates the unpack with respect to the packed, channel-major latent.
+        const int latent_len = 3, C_latent = 4, factor = 2;
+        const int steps    = latent_len * factor;
+        const int channels = C_latent * factor;
+        const std::vector<double> upstream = make_vector(steps * C_latent, 1.4);
+        const std::vector<double> point    = make_vector(channels * latent_len, 0.5);
+        const VocoderBackward ops = op();
+        const ScalarLossFn loss = [&](const std::vector<double> & latent) {
+            return dot(upstream, ops.latent_unpack_forward(latent, latent_len, C_latent, factor));
+        };
+        const std::vector<double> grad = ops.latent_unpack_backward(upstream, latent_len, C_latent, factor);
+        report_check("latent_unpack_backward", compare_gradients(finite_diff_gradient(loss, point), grad));
+    }
+
+    static void test_convnext_backward() {
+        // One full ConvNeXt block (dilation 2), differentiated with respect to its input.
+        const int L = 6, C = 5, hidden = 7, K = 3, dilation = 2;
+        const VocConvNextWeights block = make_block(C, hidden, K, dilation, 0.0);
+        const std::vector<double> upstream = make_vector(L * C, 1.1);
+        const std::vector<double> point    = make_vector(L * C, 0.3);
+        const VocoderBackward ops = op();
+        VocConvNextActivations cached;
+        ops.convnext_forward(block, point, L, cached);  // run only to fill `cached`
+        const std::vector<double> grad = ops.convnext_backward_input(block, cached, upstream, L);
+        // The numeric side re-runs the forward with throwaway activations on every evaluation.
+        const ScalarLossFn loss = [&](const std::vector<double> & x) {
+            VocConvNextActivations scratch;
+            return dot(upstream, ops.convnext_forward(block, x, L, scratch));
+        };
+        report_check("voc_convnext d_x", compare_gradients(finite_diff_gradient(loss, point), grad));
+    }
+
+    static void test_vocoder_backward() {
+        // End to end: gradient of dot(upstream, wav) with respect to the latent, through the whole chain.
+        const VocoderWeights weights = make_vocoder();
+        const int steps    = weights.latent_len * weights.factor;
+        const int channels = weights.C_latent * weights.factor;
+        const std::vector<double> upstream = make_vector(steps * weights.OUT, 1.2);
+        const std::vector<double> point    = make_vector(channels * weights.latent_len, 0.4);
+        VocoderBackward model(weights);
+        model.forward(point);  // fills the cache that backward() reads
+        const std::vector<double> grad = model.backward(upstream);
+        // Each numeric evaluation uses a fresh instance, leaving `model`'s cache untouched.
+        const ScalarLossFn loss = [&](const std::vector<double> & latent) {
+            VocoderBackward local(weights);
+            return dot(upstream, local.forward(latent));
+        };
+        report_check("vocoder d_latent", compare_gradients(finite_diff_gradient(loss, point), grad));
+    }
+};
+
+}  // namespace voc_grad
+}  // namespace tts_cpp
+
+int main() {  // runs every gradcheck; exit status is 0 only when no failure was recorded
+    using tts_cpp::voc_grad::VocoderBackwardTester;
+    try {
+        VocoderBackwardTester::test_denorm_backward();
+        VocoderBackwardTester::test_conv1d_causal_backward();
+        VocoderBackwardTester::test_depthwise_causal_backward();
+        VocoderBackwardTester::test_batch_norm_backward();
+        VocoderBackwardTester::test_leaky_relu_backward();
+        VocoderBackwardTester::test_latent_unpack_backward();
+        VocoderBackwardTester::test_convnext_backward();
+        VocoderBackwardTester::test_vocoder_backward();
+    } catch (const std::exception & e) {
+        ++g_checks;  // count the aborted run as one failed check so the passed tally stays exact
+        ++g_failures;
+        std::fprintf(stderr, "FAIL uncaught exception: %s\n", e.what());
+    }
+    std::fprintf(stderr, "\n%s: %d/%d checks passed\n", g_failures == 0 ? "PASS" : "FAIL", g_checks - g_failures, g_checks);
+    return g_failures == 0 ? 0 : 1;
+}
diff --git a/tts-cpp/test/test_supertonic_vocoder_backward_parity.cpp b/tts-cpp/test/test_supertonic_vocoder_backward_parity.cpp
new file mode 100644
index 00000000000..2fe6c96b27e
--- /dev/null
+++ b/tts-cpp/test/test_supertonic_vocoder_backward_parity.cpp
@@ -0,0 +1,381 @@
+// Forward-parity test for the Supertonic vocoder backward (voice-clone ticket
+// "GGML backward pass: vocoder", QVAC-20983).
+//
+// Why this test exists
+// --------------------
+// `test_supertonic_vocoder_backward.cpp` gradchecks the analytic backward
+// against the *in-file* `VocoderBackward::forward`. That proves the backward is
+// the correct derivative of that forward, but it is self-referential: if
+// `VocoderBackward::forward` itself drifted from the production vocoder, the
+// gradcheck would still pass while the gradients flow through the wrong
+// function (this is exactly how a `gamma` dimensionality bug slipped past it).
+//
+// This test closes that gap. It builds a synthetic `supertonic_model` on a CPU
+// backend with deterministic weights, feeds the *identical* raw weight buffers
+// to both `supertonic_vocoder_forward_cpu` (production) and
+// `VocoderBackward::forward` (the backward's reference forward), and asserts the
+// two waveforms match. Any divergence between the reference forward and the
+// production forward — wrong gamma layout, wrong dilation schedule, swapped
+// weight index order — fails here.
+//
+// Model-free: weights are synthesized in-memory, so it always runs in the
+// always-on `unit` ctest tier (no GGUF, no fixtures).
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml-cpu.h"
+
+#include "supertonic_internal.h"
+#include "supertonic_vocoder_backward.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdio>
+#include <stdexcept>
+#include <vector>
+
+using namespace tts_cpp::supertonic::detail;
+using tts_cpp::voc_grad::VocConvNextWeights;
+using tts_cpp::voc_grad::VocoderBackward;
+using tts_cpp::voc_grad::VocoderWeights;
+
+namespace {
+
+int g_failures = 0;  // failures recorded so far
+int g_checks   = 0;  // checks recorded so far
+// CHECK(cond, fmt, ...): counts one check; when cond is false, counts a failure and prints file:line + message.
+#define CHECK(cond, ...) do {                                  \
+    ++g_checks;                                                \
+    if (!(cond)) {                                             \
+        ++g_failures;                                          \
+        std::fprintf(stderr, "FAIL %s:%d  ", __FILE__, __LINE__); \
+        std::fprintf(stderr, __VA_ARGS__);                     \
+        std::fprintf(stderr, "\n");                            \
+    }                                                          \
+} while (0)
+
+// Production ConvNeXt dilation schedule (`convnext_block` in supertonic_vocoder.cpp):
+// kDilations[i] is the dilation given to block i. The 10-block count is fixed by the model.
+constexpr int kNumBlocks = 10;
+constexpr int kDilations[kNumBlocks] = {1, 2, 4, 1, 2, 4, 1, 1, 1, 1};
+
+// Topology kept tiny (microsecond runtime) but structurally faithful: real
+// dilation variety, per-channel gamma, the full denorm -> embed -> 10x convnext
+// -> batch-norm -> head1 -> prelu -> head2 chain.
+struct VocoderDims {
+    int C_latent  = 4;   // unpacked latent channels
+    int factor    = 2;   // latent_channels = C_latent * factor = 8
+    int latent_len = 5;  // T0 = latent_len * factor = 10
+    int C         = 8;   // ConvNeXt width
+    int K_embed   = 3;   // embed conv taps
+    int hidden    = 16;  // ConvNeXt pointwise hidden width
+    int K_dw      = 3;   // depthwise conv taps
+    int Hh        = 4;   // head1 output channels
+    int K_head1   = 3;   // head1 conv taps
+    int OUT       = 1;   // waveform channels
+    // Derived sizes, computed from the fields above.
+    int latent_channels() const { return C_latent * factor; }
+    int T0() const { return latent_len * factor; }
+};
+
+// Deterministic, bounded weight generator. Small magnitudes keep the 10-block
+// residual chain well-scaled so float (production) vs double (reference)
+// rounding stays in the sub-1e-3 band; the values still vary per element so the
+// test is not degenerate.
+float gen_value(int index, double phase, double scale) {
+    return static_cast<float>(std::sin(phase + 0.7 * index) * scale);  // computed in double, then narrowed
+}
+
+std::vector<float> gen_buffer(int n, double phase, double scale = 0.25) {
+    std::vector<float> out(static_cast<std::size_t>(n));  // element k holds gen_value(k, phase, scale)
+    for (std::size_t k = 0; k < out.size(); ++k) out[k] = gen_value(static_cast<int>(k), phase, scale);
+    return out;
+}
+
+// Batch-norm running variance: a unit-amplitude gen_value mapped into [0.5, 1.1], so always positive.
+std::vector<float> gen_positive(int n, double phase) {
+    std::vector<float> out(static_cast<std::size_t>(n));
+    for (std::size_t k = 0; k < out.size(); ++k) out[k] = (1.0f + gen_value(static_cast<int>(k), phase, 1.0)) * 0.3f + 0.5f;
+    return out;
+}
+
+struct BlockBuffers {  // raw float buffers for one ConvNeXt block
+    std::vector<float> dw_w, dw_b, norm_g, norm_b, pw1_w, pw1_b, pw2_w, pw2_b, gamma;
+    int dilation = 1;  // dilation assigned to this block; defaults to 1
+};
+
+// All vocoder weight buffers in their raw (production / ggml-linear) layout.
+// The same buffers feed both forwards, so any layout mismatch is a real bug,
+// not a test artifact.
+struct VocoderBuffers {
+    VocoderDims dims;  // sizes every buffer below is generated from
+    std::vector<float> normalizer_scale, latent_mean, latent_std, embed_w, embed_b;
+    std::vector<BlockBuffers> blocks;  // one entry per ConvNeXt block
+    std::vector<float> final_g, final_b, final_mean, final_var;  // final norm: affine + running stats
+    std::vector<float> head1_w, head1_b, head_prelu, head2_w;
+    std::vector<float> latent;  // input latent, latent_channels * latent_len values
+};
+
+BlockBuffers gen_block(const VocoderDims & d, int dilation, double phase) {
+    BlockBuffers blk;  // each tensor is generated at its own offset (+0.1 ... +0.9) from `phase`
+    blk.dilation = dilation;
+    blk.dw_w     = gen_buffer(d.K_dw * d.C, 0.1 + phase);
+    blk.dw_b     = gen_buffer(d.C, 0.2 + phase);
+    blk.norm_g   = gen_buffer(d.C, 0.3 + phase);
+    blk.norm_b   = gen_buffer(d.C, 0.4 + phase);
+    blk.pw1_w    = gen_buffer(d.C * d.hidden, 0.5 + phase);
+    blk.pw1_b    = gen_buffer(d.hidden, 0.6 + phase);
+    blk.pw2_w    = gen_buffer(d.hidden * d.C, 0.7 + phase);
+    blk.pw2_b    = gen_buffer(d.C, 0.8 + phase);
+    blk.gamma    = gen_buffer(d.C, 0.9 + phase, /*scale=*/0.15);  // per-channel residual scale
+    return blk;
+}
+
+VocoderBuffers gen_vocoder_buffers() {
+    VocoderBuffers bufs;
+    const VocoderDims & d = bufs.dims;
+    // Denorm constants and the embed conv.
+    bufs.normalizer_scale = {1.7f};
+    bufs.latent_mean      = gen_buffer(d.C_latent, 0.2);
+    bufs.latent_std       = gen_buffer(d.C_latent, 0.5);
+    bufs.embed_w          = gen_buffer(d.C * d.C_latent * d.K_embed, 0.9);
+    bufs.embed_b          = gen_buffer(d.C, 1.0);
+    // One buffer set per ConvNeXt block; block k uses dilation kDilations[k] and phase 1.0 + 0.31 * k.
+    for (int k = 0; k < kNumBlocks; ++k) {
+        bufs.blocks.push_back(gen_block(d, kDilations[k], 1.0 + 0.31 * k));
+    }
+    // Final norm: affine parameters plus running statistics (variance kept positive).
+    bufs.final_g    = gen_buffer(d.C, 0.4);
+    bufs.final_b    = gen_buffer(d.C, 0.7);
+    bufs.final_mean = gen_buffer(d.C, 0.1);
+    bufs.final_var  = gen_positive(d.C, 0.5);
+    // Head: head1 conv, single prelu slope, head2.
+    bufs.head1_w    = gen_buffer(d.Hh * d.C * d.K_head1, 0.3);
+    bufs.head1_b    = gen_buffer(d.Hh, 0.45);
+    bufs.head_prelu = {0.1f};
+    bufs.head2_w    = gen_buffer(d.OUT * d.Hh, 0.65);
+    // Input latent, generated at a larger amplitude (0.6) than the weights.
+    bufs.latent = gen_buffer(d.latent_channels() * d.latent_len, 1.3, 0.6);
+    return bufs;
+}
+
+// --- production-side model assembly -----------------------------------------
+
+// Owns the ggml resources backing the synthetic model so the test can release
+// them deterministically (free_supertonic_model is avoided — this model never
+// went through load_supertonic_gguf and carries no scheduler / source map).
+struct GgmlModelArena {
+    ggml_backend_t backend = nullptr;         // freed last
+    ggml_context * ctx = nullptr;             // freed second
+    ggml_backend_buffer_t buffer = nullptr;   // freed first
+    // NOTE(review): copy operations are implicit, so copying an arena would free the same handles twice.
+    ~GgmlModelArena() {
+        if (buffer) ggml_backend_buffer_free(buffer);
+        if (ctx) ggml_free(ctx);
+        if (backend) ggml_backend_free(backend);
+    }
+};
+
+void set_tensor(ggml_tensor * t, const std::vector<float> & data) {
+    const bool count_matches = static_cast<std::size_t>(ggml_nelements(t)) == data.size();
+    if (!count_matches) throw std::runtime_error("tensor element count mismatch while uploading weights");
+    const std::size_t byte_count = sizeof(float) * data.size();
+    ggml_backend_tensor_set(t, data.data(), /*offset=*/0, byte_count);
+}
+
+void build_block_tensors(ggml_context * ctx, const VocoderDims & d, supertonic_vocoder_convnext_weights & w) {
+    const auto vec_f32 = [ctx](int64_t n) { return ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n); };  // 1-D F32
+    w.dw_w   = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d.K_dw, 1, d.C);    // ne = {K_dw, 1, C}
+    w.dw_b   = vec_f32(d.C);
+    w.norm_g = vec_f32(d.C);
+    w.norm_b = vec_f32(d.C);
+    w.pw1_w  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, d.C, d.hidden);  // ne = {1, C, hidden}
+    w.pw1_b  = vec_f32(d.hidden);
+    w.pw2_w  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, d.hidden, d.C);  // ne = {1, hidden, C}
+    w.pw2_b  = vec_f32(d.C);
+    w.gamma  = vec_f32(d.C);
+}
+
+void build_vocoder_tensors(ggml_context * ctx, const VocoderDims & d, supertonic_model & model) {
+    auto & voc = model.vocoder;  // every tensor created below is F32
+    voc.normalizer_scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+    voc.latent_mean      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C_latent);
+    voc.latent_std       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C_latent);
+    voc.embed_w          = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d.K_embed, d.C_latent, d.C);
+    voc.embed_b          = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C);
+    for (int blk = 0; blk < kNumBlocks; ++blk) build_block_tensors(ctx, d, voc.convnext[(std::size_t) blk]);
+    voc.final_norm_g            = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C);  // final norm: C each
+    voc.final_norm_b            = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C);
+    voc.final_norm_running_mean = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C);
+    voc.final_norm_running_var  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C);
+    voc.head1_w    = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d.K_head1, d.C, d.Hh);  // head
+    voc.head1_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.Hh);
+    voc.head_prelu = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+    voc.head2_w    = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, d.Hh, d.OUT);
+}
+
+void upload_vocoder_weights(const VocoderBuffers & vb, supertonic_model & model) {
+    const auto & voc = model.vocoder;
+    set_tensor(voc.normalizer_scale, vb.normalizer_scale);  // latent normalizer + embedding
+    set_tensor(voc.latent_mean, vb.latent_mean);
+    set_tensor(voc.latent_std, vb.latent_std);
+    set_tensor(voc.embed_w, vb.embed_w);
+    set_tensor(voc.embed_b, vb.embed_b);
+    for (std::size_t blk = 0; blk < (std::size_t) kNumBlocks; ++blk) {  // per-block weights
+        const auto & src = vb.blocks[blk];
+        const auto & dst = voc.convnext[blk];
+        set_tensor(dst.dw_w, src.dw_w);
+        set_tensor(dst.dw_b, src.dw_b);
+        set_tensor(dst.norm_g, src.norm_g);
+        set_tensor(dst.norm_b, src.norm_b);
+        set_tensor(dst.pw1_w, src.pw1_w);
+        set_tensor(dst.pw1_b, src.pw1_b);
+        set_tensor(dst.pw2_w, src.pw2_w);
+        set_tensor(dst.pw2_b, src.pw2_b);
+        set_tensor(dst.gamma, src.gamma);
+    }
+    set_tensor(voc.final_norm_g, vb.final_g);  // final norm
+    set_tensor(voc.final_norm_b, vb.final_b);
+    set_tensor(voc.final_norm_running_mean, vb.final_mean);
+    set_tensor(voc.final_norm_running_var, vb.final_var);
+    set_tensor(voc.head1_w, vb.head1_w);  // head
+    set_tensor(voc.head1_b, vb.head1_b);
+    set_tensor(voc.head_prelu, vb.head_prelu);
+    set_tensor(voc.head2_w, vb.head2_w);
+}
+
+// Assembles a CPU-backed `supertonic_model` around the synthetic vocoder buffers.
+// Every ggml handle created here is stored in `arena`, which owns it for teardown.
+void build_synthetic_model(const VocoderBuffers & vb, supertonic_model & model, GgmlModelArena & arena) {
+    const VocoderDims & dims = vb.dims;
+    model.hparams.latent_dim = dims.C_latent;
+    model.hparams.ttl_chunk_compress_factor = dims.factor;
+    model.hparams.latent_channels = dims.latent_channels();
+
+    arena.backend = ggml_backend_cpu_init();
+    if (arena.backend == nullptr) throw std::runtime_error("ggml_backend_cpu_init failed");
+    // The context is created with no_alloc = true and sized for tensor overhead only.
+    constexpr int kMaxTensors = 256;  // 5 + 10*9 + 4 + 4 = 103 tensors, padded.
+    ggml_init_params ctx_params{};
+    ctx_params.mem_size   = ggml_tensor_overhead() * kMaxTensors;
+    ctx_params.mem_buffer = nullptr;
+    ctx_params.no_alloc   = true;
+    arena.ctx = ggml_init(ctx_params);
+    if (arena.ctx == nullptr) throw std::runtime_error("ggml_init failed");
+
+    build_vocoder_tensors(arena.ctx, dims, model);
+    arena.buffer = ggml_backend_alloc_ctx_tensors(arena.ctx, arena.backend);
+    if (arena.buffer == nullptr) throw std::runtime_error("ggml_backend_alloc_ctx_tensors failed");
+    upload_vocoder_weights(vb, model);
+    model.backend = arena.backend;
+    model.backend_is_cpu = true;
+}
+
+// --- reference-side weight assembly -----------------------------------------
+
+std::vector<double> to_double(const std::vector<float> & v) {
+    return {v.cbegin(), v.cend()};  // iterator-range construction widens each float to double
+}
+
+VocoderWeights build_reference_weights(const VocoderBuffers & vb) {
+    const VocoderDims & dims = vb.dims;
+    VocoderWeights ref;
+    ref.latent_len = dims.latent_len;
+    ref.C_latent   = dims.C_latent;
+    ref.factor     = dims.factor;
+    ref.C          = dims.C;
+    ref.normalizer_scale = vb.normalizer_scale.front();
+    ref.latent_mean = to_double(vb.latent_mean);
+    ref.latent_std  = to_double(vb.latent_std);
+    ref.K_embed = dims.K_embed;
+    ref.embed_w = to_double(vb.embed_w);
+    ref.embed_b = to_double(vb.embed_b);
+    // One reference block per synthetic block; every float buffer is widened to double.
+    for (std::size_t blk = 0; blk < (std::size_t) kNumBlocks; ++blk) {
+        const BlockBuffers & src = vb.blocks[blk];
+        VocConvNextWeights dst;
+        dst.C = dims.C;
+        dst.hidden = dims.hidden;
+        dst.K = dims.K_dw;
+        dst.dilation = src.dilation;
+        dst.dw_w = to_double(src.dw_w);
+        dst.dw_b = to_double(src.dw_b);
+        dst.ln_gamma = to_double(src.norm_g);
+        dst.ln_beta = to_double(src.norm_b);
+        dst.pw1_w = to_double(src.pw1_w);
+        dst.pw1_b = to_double(src.pw1_b);
+        dst.pw2_w = to_double(src.pw2_w);
+        dst.pw2_b = to_double(src.pw2_b);
+        dst.gamma = to_double(src.gamma);
+        ref.convnext.push_back(std::move(dst));
+    }
+    // Final norm (bn_*) followed by the head1 / PReLU slope / head2 parameters.
+    ref.bn_gamma        = to_double(vb.final_g);
+    ref.bn_beta         = to_double(vb.final_b);
+    ref.bn_running_mean = to_double(vb.final_mean);
+    ref.bn_running_var  = to_double(vb.final_var);
+    ref.Hh = dims.Hh;
+    ref.K_head1 = dims.K_head1;
+    ref.head1_w = to_double(vb.head1_w);
+    ref.head1_b = to_double(vb.head1_b);
+    ref.prelu_slope = vb.head_prelu.front();
+    ref.OUT = dims.OUT;
+    ref.head2_w = to_double(vb.head2_w);
+    return ref;
+}
+
+// --- the parity check --------------------------------------------------------
+
+// Runs the production CPU vocoder and the reference forward on the same synthetic buffers.
+void test_forward_parity() {
+    const VocoderBuffers vb = gen_vocoder_buffers();
+    const VocoderDims & d = vb.dims;
+    supertonic_model model;
+    GgmlModelArena arena;
+    build_synthetic_model(vb, model, arena);
+
+    std::vector<float> wav_prod;
+    std::string error;
+    const bool ok = supertonic_vocoder_forward_cpu(model, vb.latent.data(), d.latent_len, wav_prod, &error);
+    CHECK(ok, "supertonic_vocoder_forward_cpu failed: %s", error.c_str());
+    if (!ok) return;
+
+    VocoderBackward backward(build_reference_weights(vb));
+    const std::vector<double> wav_ref = backward.forward(to_double(vb.latent));
+    const std::size_t expected = (std::size_t) d.T0() * d.OUT;
+    CHECK(wav_prod.size() == expected, "production wav size %zu != expected %zu", wav_prod.size(), expected);
+    CHECK(wav_ref.size() == expected, "reference wav size %zu != expected %zu", wav_ref.size(), expected);
+    if (wav_prod.size() != expected || wav_ref.size() != expected) return;
+    // NaN-sticky max: std::max(x, NaN) returns x, which would let a NaN sample pass unnoticed.
+    double max_abs = 0.0;
+    double max_mag = 0.0;
+    for (std::size_t i = 0; i < expected; ++i) {
+        const double err = std::fabs((double) wav_prod[i] - wav_ref[i]);
+        if (std::isnan(err) || err > max_abs) max_abs = err;  // once NaN, max_abs stays NaN
+        max_mag = std::max(max_mag, std::fabs(wav_ref[i]));
+    }
+    // float production vs double reference over a 10-block chain; observed error
+    // is ~2e-8, so this tight bar is a meaningful parity check (~500x margin)
+    // while the bug it guards (per-channel gamma modeled as scalar, wrong
+    // dilation, swapped weight index) shifts the output by O(output magnitude).
+    constexpr double kAbsTol = 1e-5;
+    std::fprintf(stderr, "[vocoder forward parity] max_abs_err=%.3e max_ref_mag=%.3e atol=%.0e\n",
+                 max_abs, max_mag, kAbsTol);
+    CHECK(max_abs <= kAbsTol, "forward parity exceeded tolerance: max_abs=%.3e > %.0e", max_abs, kAbsTol);
+}
+
+}  // namespace
+
+int main() {
+    try {
+        test_forward_parity();
+    } catch (const std::exception & ex) {  // an escaped exception is recorded as a failure
+        ++g_failures;
+        std::fprintf(stderr, "FAIL uncaught exception: %s\n", ex.what());
+    }
+    const char * verdict = g_failures == 0 ? "PASS" : "FAIL";
+    std::fprintf(stderr, "\n%s: %d/%d checks passed\n", verdict, g_checks - g_failures, g_checks);
+    return g_failures == 0 ? 0 : 1;
+}