diff --git a/tts-cpp/CMakeLists.txt b/tts-cpp/CMakeLists.txt
index d359335a066..7b7ece8bb8a 100644
--- a/tts-cpp/CMakeLists.txt
+++ b/tts-cpp/CMakeLists.txt
@@ -857,6 +857,35 @@ if (TTS_CPP_BUILD_TESTS)
     tts_cpp_apply_ccache(test-supertonic-vector-estimator-backward)
     tts_cpp_register_test(test-supertonic-vector-estimator-backward LABEL "unit")
 
+    # QVAC-20984 — analytic backward of the CAMPPlus speaker encoder (FCM Conv2d
+    # head + residual blocks, TDNN, CAM dense-TDNN blocks with context-attention
+    # gating and dense concat, statistics pooling, dense head). Model-free: every
+    # analytic input gradient is gradchecked against finite differences via the
+    # Task 2 gradcheck harness (src/voiceclone_gradcheck.cpp), so it ALWAYS runs on
+    # a fresh checkout (no-skip policy, no model/fixtures needed).
+    add_executable(test-campplus-backward
+        test/test_campplus_backward.cpp
+        src/campplus_backward.cpp
+        src/voiceclone_gradcheck.cpp)
+    target_include_directories(test-campplus-backward PRIVATE src)
+    tts_cpp_apply_ccache(test-campplus-backward)
+    tts_cpp_register_test(test-campplus-backward LABEL "unit")
+
+    # Forward-parity: the analytic double forward must match the production scalar
+    # CAMPPlus forward (campplus_embed_cpu) on synthetic weights, anchoring the
+    # gradcheck to the real model. Links campplus.cpp -> ggml.
+    add_executable(test-campplus-backward-parity
+        test/test_campplus_backward_parity.cpp
+        src/campplus_backward.cpp
+        src/campplus.cpp)
+    target_link_libraries(test-campplus-backward-parity PRIVATE ggml)
+    target_include_directories(test-campplus-backward-parity PRIVATE ggml/include src)
+    if (OpenMP_CXX_FOUND)
+        target_link_libraries(test-campplus-backward-parity PRIVATE OpenMP::OpenMP_CXX)
+    endif()
+    tts_cpp_apply_ccache(test-campplus-backward-parity)
+    tts_cpp_register_test(test-campplus-backward-parity LABEL "unit")
+
     # Engine-level streaming-callback contract test for the per-sentence
     # segmentation path (Fix #2): monotonic global chunk_index, single final
     # is_last, result.pcm == concat(callbacks), accumulated stats.  Gated on
diff --git a/tts-cpp/docs/voiceclone-backward-campplus.md b/tts-cpp/docs/voiceclone-backward-campplus.md
new file mode 100644
index 00000000000..1c4c56570d6
--- /dev/null
+++ b/tts-cpp/docs/voiceclone-backward-campplus.md
@@ -0,0 +1,143 @@
+# Voice-clone backward — CAMPPlus speaker encoder (op × backend gap matrix)
+
+Scope for ticket *"GGML backward pass: CAMPPlus speaker encoder"* (QVAC-20984).
+This doc scopes the work to make the CAMPPlus speaker encoder **differentiable in
+GGML** on the CPU path used for enrollment, and records which backward ops are
+still missing in the vendored `ggml`.
+
+It is committed alongside the interim deliverable of this PR: an analytic,
+gradchecked C++ backward of the whole CAMPPlus chain
+(`src/campplus_backward.{h,cpp}`). See
+[Interim vs Phase-2](#interim-solution-shipped-in-this-pr) for how the two
+relate.
+
+## Why the gap exists
+
+In the enrollment loop CAMPPlus provides the **speaker-similarity loss** between
+the target-WAV embedding (constant, forward-only) and the generated-audio
+embedding. Only the generated-audio path needs gradients, so the gradient we
+need is `d(loss)/d(fbank)` — the input gradient with the model weights frozen.
+The fbank is differentiated further back to the waveform by a separate stage;
+this module stops at the CAMPPlus input.
+
+A fully GGML-native backward (the Phase-2 goal, needed by the on-device
+enrollment loop) requires every op on the forward graph to have a backward in
+`ggml_compute_backward` (`ggml/src/ggml.c`) **and** a CPU kernel for the ops the
+backward expands into. Several are missing today.
+
+## Forward ops on the CAMPPlus path
+
+Source: `src/campplus_forward.inc` (the GGML graph) and `src/campplus.cpp` (the
+scalar CPU reference `campplus_embed_cpu`).
+
+| Forward op | Where (forward) |
+| --- | --- |
+| `ggml_conv_2d` / `ggml_im2col` + `ggml_mul_mat` | FCM Conv2d head + residual blocks |
+| `conv1d_f32` (`ggml_im2col` + `ggml_mul_mat`) | TDNN, linear1, linear_local, cam linear1/2, transits, dense |
+| `ggml_mul` / `ggml_add` (broadcast) | pre-fused BN (scale/shift), bias adds, residuals |
+| `ggml_relu` | every nonlinear1/2, transit, out_nonlinear, FCM |
+| `ggml_sigmoid` | CAMLayer context gate |
+| `ggml_mean` | CAMLayer global context, stats-pool mean + variance |
+| `ggml_sum_rows` | CAMLayer seg-pool reduction |
+| `ggml_pad` / `ggml_repeat` | CAMLayer seg-pool reshape + broadcast |
+| `ggml_sqrt` | stats-pool std |
+| `ggml_concat` | dense concat (CAMDenseTDNN), stats-pool mean‖std |
+| `ggml_cont`/`reshape`/`view` | layout shuffles, FCM (32,10,T)→(320,T) flatten |
+
+## Gap matrix
+
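+Legend: **OK** = implemented; **MISSING** = aborts / not implemented; **n/a** = not on the enrollment path;
+**—** = not assessed, because the graph backward itself is missing; **out of scope** = backend not needed for Phase 2.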
+
+"Graph backward" = a case in `ggml_compute_backward` (`ggml/src/ggml.c`). It is
+backend-agnostic: if it aborts, no backend can differentiate the op. "CPU bwd
+kernel" = the kernels the backward expands into exist for the CPU backend
+(`ggml-cpu`), the only backend enrollment needs in Phase 2. GPU columns are out
+of scope for Phase 2 (enrollment runs on CPU) and tracked only for visibility.
+
+| Op | Graph backward (ggml.c) | CPU bwd kernel | CUDA / Metal / Vulkan / OpenCL |
+| --- | --- | --- | --- |
+| `MUL_MAT` | OK | OK (`out_prod`/`mul_mat`) | out of scope |
+| `ADD` / `MUL` | OK | OK | out of scope |
+| `CONT`/`RESHAPE`/`VIEW`/`PERMUTE` | OK | OK | out of scope |
+| `IM2COL` | OK (`im2col_back`) | OK | out of scope |
+| `RELU` (unary) | OK | OK | out of scope |
+| `SIGMOID` (unary) | **MISSING** | — | — |
+| `MEAN` | **MISSING** | — | — |
+| `SUM_ROWS` | **MISSING** | — | — |
+| `SQRT` (unary) | **MISSING** | — | — |
+| `PAD` | **MISSING** | — | — |
+| `REPEAT` | **MISSING** | — | — |
+| `CONCAT` | **MISSING** | — | — |
+
+Confirmed against the `ggml_compute_backward` switch: handled ops include `ADD`,
+`MUL`, `SCALE`, `CPY`, `CONT`, `RESHAPE`, `PERMUTE`, `TRANSPOSE`, `GET_ROWS`,
+`DIAG_MASK_INF`, `RMS_NORM`, `MUL_MAT`, `SOFT_MAX`, `IM2COL`, and a subset of
+`UNARY` (`ABS`, `SGN`, `NEG`, `STEP`, `RELU`, `SILU`, `EXP`, `EXPM1`,
+`SOFTPLUS`). `SIGMOID`, `SQRT`, `MEAN`, `SUM_ROWS`, `PAD`, `REPEAT`, and `CONCAT`
+fall through to `GGML_ABORT`.
+
+## Remaining Phase-2 work items
+
+To reach a fully GGML-native, on-device backward of CAMPPlus:
+
+1. **`SIGMOID` backward** — add `s*(1-s)` to the `UNARY` switch + CPU kernel
+   (needed by the CAMLayer gate).
+2. **`SQRT` backward** — add `1/(2*sqrt(x))` to the `UNARY` switch + CPU kernel
+   (stats-pool std).
+3. **`MEAN` / `SUM_ROWS` backward** — broadcast the upstream grad back over the
+   reduced axis (`1/N` for mean) + CPU kernels.
+4. **`PAD` / `REPEAT` backward** — slice off the padding / sum over the repeated
+   axis (`ggml_repeat_back` already exists; wire it into `ggml_compute_backward`).
+5. **`CONCAT` backward** — slice-and-route the grad to each input (dense concat
+   and stats-pool concat).
+6. **Per-stage gradcheck** — wire each lowered stage into the Task 2 harness;
+   the analytic backward from this PR is the reference oracle.
+
+Alternatively, the seg-pool / stats-pool subgraphs can be lowered to
+`mul_mat`-based reductions (which already have backward), avoiding new kernels for
+`MEAN`/`SUM_ROWS`/`REPEAT`.
+
+## Interim solution shipped in this PR
+
+Because the gaps above block a GGML-native backward today, this PR ships an
+**analytic C++ backward** of the whole CAMPPlus chain, validated component-wise
+against finite differences via the Task 2 gradcheck harness
+(`src/voiceclone_gradcheck.{h,cpp}`):
+
+- `conv1d_backward_input` / `conv2d_backward_input` — transpose-conv input grad
+  (stride / pad / dilation aware)
+- `bn_backward_input` — pre-fused affine BN (per-channel scale)
+- `relu_backward` / `sigmoid_backward` — pointwise nonlinearities
+- `mean_T_backward` / `seg_pool_backward` — CAMLayer context reductions
+- `stats_pool_backward_input` — mean + unbiased std pooling
+- `fcm_resblock_backward` — Conv2d residual block (with optional shortcut)
+- `cam_layer_backward` — CAMDenseTDNN layer (gate + dense-concat split)
+- `CampplusBackward::backward` — full chain → `d(loss)/d(fbank)`
+
+It mirrors the layout and conventions of `campplus_embed_cpu` exactly. Two tests
+guard it (both in the always-on `unit` ctest tier, model-free):
+
+- `test-campplus-backward` — gradchecks every primitive and the full chain
+  against central finite differences.
+- `test-campplus-backward-parity` — asserts the analytic double forward matches
+  the production scalar forward (`campplus_embed_cpu`) on synthetic weights
+  (multi-layer CAM blocks, 2/3/2, so the dense-concat accumulation is exercised),
+  anchoring the gradcheck's relevance to the real model.
+
+The scalar CPU forward is the path every `campplus_embed` caller uses today
+(production `main.cpp`, `test-campplus`, `test-voice-embedding` all pass
+`backend==nullptr`), and `test-campplus` / `test-voice-embedding` validate it
+against the Python reference embedding. So the trust chain is complete:
+Python → `campplus_embed_cpu` → analytic forward → gradchecked backward. The
+`campplus_embed_ggml` graph path is not wired to any caller yet; when it is, it
+gets its own fixture parity against the CPU/Python path.
+
+This is mathematically exact, runs on CPU (the enrollment target), and serves as
+the **reference oracle** for the per-stage gradcheck once the GGML-native ops in
+the work items above are implemented.
+
+> Note: `campplus_embed_cpu`'s `fcm_forward` hardcodes the input feature
+> dimension to 80 (the production fbank width), so the production scalar path is
+> only self-consistent at `feat_dim=80`; the parity test uses that. The analytic
+> backward derives the FCM feature-axis sizes from `feat_dim` (the 32 FCM channels stay fixed), so it is not tied to an 80-bin fbank.
diff --git a/tts-cpp/src/campplus_backward.cpp b/tts-cpp/src/campplus_backward.cpp
new file mode 100644
index 00000000000..82096e1c102
--- /dev/null
+++ b/tts-cpp/src/campplus_backward.cpp
@@ -0,0 +1,702 @@
+#include "campplus_backward.h"
+
+#include <cmath>
+#include <cstddef>
+#include <stdexcept>
+#include <utility>
+
+namespace tts_cpp {
+namespace cp_grad {
+
+namespace {
+
+// Output length along one axis of a zero-padded, strided, dilated convolution.
+int conv_out_len(int L_in, int k, int stride, int pad, int dilation) {
+    return 1 + (L_in + 2 * pad - 1 - dilation * (k - 1)) / stride;
+}
+
+// Sums each of the C rows (T contiguous values) of a channel-major (C, T) buffer.
+std::vector<double> row_sum_T(const std::vector<double> & x, int C, int T) {
+    std::vector<double> sums((std::size_t) C, 0.0);
+    for (std::size_t c = 0, i = 0; c < sums.size(); ++c)
+        for (int t = 0; t < T; ++t, ++i) sums[c] += x[i];
+    return sums;
+}
+
+// a[i] += b[i] for every i in a. b may be longer than a (its tail is ignored), but
+// a shorter b throws std::invalid_argument instead of being read out of bounds.
+void add_in_place(std::vector<double> & a, const std::vector<double> & b) {
+    if (b.size() < a.size()) throw std::invalid_argument("add_in_place: b is shorter than a");
+    for (std::size_t i = 0; i < a.size(); ++i) a[i] += b[i];
+}
+
+}  // namespace
+
+CampplusBackward::CampplusBackward(CpWeights weights) : weights_(std::move(weights)) {}  // by-value sink: the argument is moved into weights_
+
+// --- elementwise / pooling primitives ---------------------------------------
+
+// Pre-fused affine BN on a channel-major (C, T) buffer: y[c, t] = x[c, t] * scale[c] + shift[c].
+std::vector<double> CampplusBackward::bn_forward(const std::vector<double> & x, int C, int T,
+                                                 const std::vector<double> & scale,
+                                                 const std::vector<double> & shift) {
+    std::vector<double> y(x.size());
+    std::size_t i = 0;  // flat index; equals c * T + t inside the inner loop
+    for (int c = 0; c < C; ++c) {
+        const double gain = scale[(std::size_t) c], offset = shift[(std::size_t) c];
+        for (int t = 0; t < T; ++t, ++i) y[i] = x[i] * gain + offset;
+    }
+    return y;
+}
+
+// Input gradient of the pre-fused affine BN: d_x[c, t] = d_y[c, t] * scale[c] (shift drops out).
+std::vector<double> CampplusBackward::bn_backward_input(const std::vector<double> & d_y, int C, int T,
+                                                        const std::vector<double> & scale) {
+    std::vector<double> d_x(d_y.size());
+    std::size_t i = 0;  // flat index; equals c * T + t inside the inner loop
+    for (int c = 0; c < C; ++c) {
+        for (int t = 0; t < T; ++t, ++i) d_x[i] = d_y[i] * scale[(std::size_t) c];
+    }
+    return d_x;
+}
+
+std::vector<double> CampplusBackward::relu_forward(const std::vector<double> & x) {
+    std::vector<double> y(x.size(), 0.0);  // ReLU: anything not strictly positive stays 0
+    for (std::size_t i = 0; i < x.size(); ++i) if (x[i] > 0.0) y[i] = x[i];
+    return y;
+}
+
+std::vector<double> CampplusBackward::relu_backward(const std::vector<double> & relu_in,
+                                                    const std::vector<double> & d_y) {
+    std::vector<double> d_x(d_y.size(), 0.0);  // gradient is blocked wherever relu_in is not > 0
+    for (std::size_t i = 0; i < d_y.size(); ++i) if (relu_in[i] > 0.0) d_x[i] = d_y[i];
+    return d_x;
+}
+
+std::vector<double> CampplusBackward::sigmoid_backward(const std::vector<double> & s,
+                                                       const std::vector<double> & d_y) {
+    std::vector<double> d_x(d_y);  // start from d_y, then scale each entry by s * (1 - s)
+    for (std::size_t i = 0; i < d_x.size(); ++i) d_x[i] = d_x[i] * s[i] * (1.0 - s[i]);
+    return d_x;
+}
+
+// --- conv1d -----------------------------------------------------------------
+// y[co, to] = b[co] + sum_{ci, kk} w[(co*C_in+ci)*k + kk] * x[ci, to*stride + kk*dilation - pad]
+// (valid taps only; zero padding). Channel-major (C, T).
+
+std::vector<double> CampplusBackward::conv1d_forward(const std::vector<double> & x, int C_in, int T_in,
+                                                     const std::vector<double> & w,
+                                                     const std::vector<double> & b, int C_out, int k,
+                                                     int stride, int pad, int dilation, int T_out) {
+    // Direct evaluation of the formula above. An empty b means "no bias"; taps that
+    // fall in the zero padding are skipped. Accumulation order is ci, then kk.
+    std::vector<double> y((std::size_t) C_out * T_out);
+    const std::size_t filt = (std::size_t) C_in * k;  // weights per output channel
+    for (int co = 0; co < C_out; ++co) {
+        const double init = b.empty() ? 0.0 : b[(std::size_t) co];
+        const double * w_co = w.data() + (std::size_t) co * filt;
+        for (int to = 0; to < T_out; ++to) {
+            double acc = init;
+            for (int ci = 0; ci < C_in; ++ci) {
+                const double * x_row = x.data() + (std::size_t) ci * T_in;
+                for (int kk = 0; kk < k; ++kk) {
+                    const int ti = to * stride - pad + kk * dilation;
+                    if (ti < 0 || ti >= T_in) continue;
+                    acc += w_co[(std::size_t) ci * k + kk] * x_row[ti];
+                }
+            }
+            y[(std::size_t) co * T_out + to] = acc;
+        }
+    }
+    return y;
+}
+
+std::vector<double> CampplusBackward::conv1d_backward_input(const std::vector<double> & d_y, int C_in,
+                                                            int T_in, const std::vector<double> & w,
+                                                            int C_out, int k, int stride, int pad,
+                                                            int dilation, int T_out) {
+    // Scatter form of conv1d_forward's adjoint: every output gradient is pushed back
+    // onto the input taps it read, using the same valid-tap test as the forward pass.
+    std::vector<double> d_x((std::size_t) C_in * T_in, 0.0);
+    const std::size_t filt = (std::size_t) C_in * k;  // weights per output channel
+    for (int co = 0; co < C_out; ++co) {
+        const double * w_co = w.data() + (std::size_t) co * filt;
+        for (int to = 0; to < T_out; ++to) {
+            const double g = d_y[(std::size_t) co * T_out + to];
+            if (g == 0.0) continue;  // exact zeros are skipped, as nothing needs scattering
+            for (int ci = 0; ci < C_in; ++ci) {
+                double * dx_row = d_x.data() + (std::size_t) ci * T_in;
+                for (int kk = 0; kk < k; ++kk) {
+                    const int ti = to * stride - pad + kk * dilation;
+                    if (ti >= 0 && ti < T_in) dx_row[ti] += g * w_co[(std::size_t) ci * k + kk];
+                }
+            }
+        }
+    }
+    return d_x;
+}
+
+// --- conv2d -----------------------------------------------------------------
+// Channel-major (C, H, W); weight (C_out, C_in, kH, kW) row-major.
+
+std::vector<double> CampplusBackward::conv2d_forward(const std::vector<double> & x, int C_in, int H, int W,
+                                                     const std::vector<double> & w,
+                                                     const std::vector<double> & b, int C_out, int kH,
+                                                     int kW, int sH, int sW, int pH, int pW, int H_out,
+                                                     int W_out) {
+    // Zero-padded, undilated conv; an empty b means "no bias". Accumulation order: ci, kh, kw.
+    std::vector<double> y((std::size_t) C_out * H_out * W_out);
+    const std::size_t in_plane = (std::size_t) H * W, taps = (std::size_t) kH * kW;
+    std::size_t o = 0;  // flat output index; the loops visit y in storage order
+    for (int co = 0; co < C_out; ++co) {
+        const double init = b.empty() ? 0.0 : b[(std::size_t) co];
+        const double * w_co = w.data() + (std::size_t) co * C_in * taps;
+        for (int ho = 0; ho < H_out; ++ho) {
+            for (int wo = 0; wo < W_out; ++wo, ++o) {
+                double acc = init;
+                for (int ci = 0; ci < C_in; ++ci) {
+                    const double * x_c = x.data() + (std::size_t) ci * in_plane;
+                    const double * w_c = w_co + (std::size_t) ci * taps;
+                    for (int kh = 0; kh < kH; ++kh) {
+                        const int hi = ho * sH - pH + kh;
+                        if (hi < 0 || hi >= H) continue;  // row lies in the padding
+                        for (int kw = 0; kw < kW; ++kw) {
+                            const int wi = wo * sW - pW + kw;
+                            if (wi < 0 || wi >= W) continue;  // column lies in the padding
+                            acc += w_c[(std::size_t) kh * kW + kw] * x_c[(std::size_t) hi * W + wi];
+                        }
+                    }
+                }
+                y[o] = acc;
+            }
+        }
+    }
+    return y;
+}
+
+std::vector<double> CampplusBackward::conv2d_backward_input(const std::vector<double> & d_y, int C_in, int H,
+                                                            int W, const std::vector<double> & w, int C_out,
+                                                            int kH, int kW, int sH, int sW, int pH, int pW,
+                                                            int H_out, int W_out) {
+    // Scatter form of conv2d_forward's adjoint, with the same in-image tap test.
+    std::vector<double> d_x((std::size_t) C_in * H * W, 0.0);
+    std::size_t o = 0;  // flat index into d_y; the loops visit it in storage order
+    for (int co = 0; co < C_out; ++co) {
+        const double * w_co = w.data() + (std::size_t) co * C_in * kH * kW;
+        for (int ho = 0; ho < H_out; ++ho) {
+            for (int wo = 0; wo < W_out; ++wo, ++o) {
+                const double g = d_y[o];
+                if (g == 0.0) continue;  // exact zeros are skipped; ++o still runs
+                for (int ci = 0; ci < C_in; ++ci) {
+                    double * dx_c = d_x.data() + (std::size_t) ci * H * W;
+                    const double * w_c = w_co + (std::size_t) ci * kH * kW;
+                    for (int kh = 0; kh < kH; ++kh) {
+                        const int hi = ho * sH - pH + kh;
+                        if (hi < 0 || hi >= H) continue;
+                        for (int kw = 0; kw < kW; ++kw) {
+                            const int wi = wo * sW - pW + kw;
+                            if (wi < 0 || wi >= W) continue;
+                            dx_c[(std::size_t) hi * W + wi] += g * w_c[(std::size_t) kh * kW + kw];
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return d_x;
+}
+
+// --- mean over time ---------------------------------------------------------
+
+// Per-channel average over the time axis of a channel-major (C, T) buffer.
+std::vector<double> CampplusBackward::mean_T_forward(const std::vector<double> & x, int C, int T) {
+    std::vector<double> m((std::size_t) C, 0.0);
+    std::size_t i = 0;  // flat index; equals c * T + t inside the inner loop
+    for (int c = 0; c < C; ++c) {
+        for (int t = 0; t < T; ++t, ++i) m[(std::size_t) c] += x[i];
+        m[(std::size_t) c] = m[(std::size_t) c] / (double) T;
+    }
+    return m;
+}
+
+// Adjoint of mean_T_forward: every time step of channel c receives d_m[c] * (1 / T).
+std::vector<double> CampplusBackward::mean_T_backward(const std::vector<double> & d_m, int C, int T) {
+    std::vector<double> d_x((std::size_t) C * T);
+    std::size_t i = 0;  // flat index; equals c * T + t inside the inner loop
+    for (int c = 0; c < C; ++c) {
+        const double g = d_m[(std::size_t) c] * (1.0 / (double) T);
+        for (int t = 0; t < T; ++t, ++i) d_x[i] = g;
+    }
+    return d_x;
+}
+
+// --- segment pooling (ceil mode, true-count last bin), expanded to (C, T) ----
+
+std::vector<double> CampplusBackward::seg_pool_forward(const std::vector<double> & x, int C, int T,
+                                                       int seg_len) {
+    // Replaces each value by the average of its seg_len-wide time bin; the last bin
+    // is averaged over its true (possibly shorter) length. Throws if seg_len <= 0.
+    if (seg_len <= 0) throw std::invalid_argument("seg_pool_forward: seg_len must be positive");
+    std::vector<double> out((std::size_t) C * T);
+    for (int c = 0; c < C; ++c) {
+        const double * row = x.data() + (std::size_t) c * T;
+        double * dst = out.data() + (std::size_t) c * T;
+        for (int t0 = 0; t0 < T; t0 += seg_len) {
+            const int t1 = (T < t0 + seg_len) ? T : t0 + seg_len;
+            double acc = 0.0;
+            for (int t = t0; t < t1; ++t) acc += row[t];
+            const double avg = acc / (double) (t1 - t0);  // t1 > t0 always holds here
+            for (int t = t0; t < t1; ++t) dst[t] = avg;
+        }
+    }
+    return out;
+}
+
+std::vector<double> CampplusBackward::seg_pool_backward(const std::vector<double> & d_out, int C, int T,
+                                                        int seg_len) {
+    // Adjoint of seg_pool_forward: every input in a time bin receives the bin's summed
+    // upstream gradient divided by the bin's true length. Throws if seg_len <= 0.
+    if (seg_len <= 0) throw std::invalid_argument("seg_pool_backward: seg_len must be positive");
+    std::vector<double> d_x((std::size_t) C * T);
+    for (int c = 0; c < C; ++c) {
+        const double * d_row = d_out.data() + (std::size_t) c * T;
+        double * dx_row = d_x.data() + (std::size_t) c * T;
+        for (int t0 = 0; t0 < T; t0 += seg_len) {
+            const int t1 = (T < t0 + seg_len) ? T : t0 + seg_len;
+            double acc = 0.0;
+            for (int t = t0; t < t1; ++t) acc += d_row[t];
+            const double g = acc / (double) (t1 - t0);  // t1 > t0 always holds here
+            for (int t = t0; t < t1; ++t) dx_row[t] = g;
+        }
+    }
+    return d_x;
+}
+
+// --- statistics pooling (mean + unbiased std) -------------------------------
+
+std::vector<double> CampplusBackward::stats_pool_forward(const std::vector<double> & x, int C, int T,
+                                                         std::vector<double> & mean_out,
+                                                         std::vector<double> & std_out) {
+    // Per-channel mean and standard deviation over time. The variance divisor is
+    // T - 1, or 1 when T <= 1. Both statistics are stored in mean_out / std_out and
+    // also returned concatenated as [mean (C values) ; std (C values)].
+    std::vector<double> out((std::size_t) 2 * C);
+    mean_out.assign((std::size_t) C, 0.0);
+    std_out.assign((std::size_t) C, 0.0);
+    const double denom = (double) (T > 1 ? T - 1 : 1);
+    for (int c = 0; c < C; ++c) {
+        const double * row = x.data() + (std::size_t) c * T;
+        double mean = 0.0;
+        for (int t = 0; t < T; ++t) mean += row[t];
+        mean /= (double) T;
+        double ss = 0.0;  // sum of squared deviations from the mean
+        for (int t = 0; t < T; ++t) {
+            const double dev = row[t] - mean;
+            ss += dev * dev;
+        }
+        out[(std::size_t) c] = mean_out[(std::size_t) c] = mean;
+        out[(std::size_t) C + c] = std_out[(std::size_t) c] = std::sqrt(ss / denom);
+    }
+    return out;
+}
+
+std::vector<double> CampplusBackward::stats_pool_backward_input(const std::vector<double> & d_out,
+                                                               const std::vector<double> & x, int C, int T,
+                                                               const std::vector<double> & mean,
+                                                               const std::vector<double> & std_) {
+    // Adjoint of stats_pool_forward; d_out is [d_mean (C values) ; d_std (C values)].
+    // d_x[c, t] = d_mean[c] / T + d_std[c] * (x[c, t] - mean[c]) / (denom * std[c]),
+    // with denom = T - 1 (1 when T <= 1). The std term's dependence on the mean
+    // drops out because sum_t (x - mean) = 0.
+    std::vector<double> d_x((std::size_t) C * T);
+    const double inv_T = 1.0 / (double) T;
+    const double denom = (double) (T > 1 ? T - 1 : 1);
+    std::size_t i = 0;  // flat index into x / d_x; equals c * T + t in the inner loop
+    for (int c = 0; c < C; ++c) {
+        const double d_mean = d_out[(std::size_t) c];
+        const double d_std = d_out[(std::size_t) C + c];
+        const double sd = std_[(std::size_t) c];
+        const double m = mean[(std::size_t) c];
+        // A zero std would divide by zero; such channels get no std contribution.
+        const double std_coeff = sd > 0.0 ? d_std / (denom * sd) : 0.0;
+        for (int t = 0; t < T; ++t, ++i) d_x[i] = d_mean * inv_T + std_coeff * (x[i] - m);
+    }
+    return d_x;
+}
+
+// --- FCM residual block ------------------------------------------------------
+
+std::vector<double> CampplusBackward::fcm_resblock_forward(const CpResBlock & blk,
+                                                           const std::vector<double> & x, int C_in, int H,
+                                                           int W, int & H_out, int & W_out,
+                                                           CpResBlockActs & acts) const {
+    // FCM residual block: relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut(x)) with 3x3, pad-1 convs.
+    // Only conv1 and the 1x1 projection shortcut apply the height stride blk.stride_h.
+    const int planes = blk.conv1.C_out, sH = blk.stride_h;
+    H_out = conv_out_len(H, 3, sH, 1, 1);
+    W_out = conv_out_len(W, 3, 1, 1, 1);
+    acts.H_in = H; acts.W_in = W; acts.H_out = H_out; acts.W_out = W_out;
+
+    std::vector<double> res = conv2d_forward(x, C_in, H, W, blk.conv1.w, {}, planes, 3, 3, sH, 1, 1, 1,
+                                             H_out, W_out);
+    res = bn_forward(res, planes, H_out * W_out, blk.bn1.scale, blk.bn1.shift);
+    acts.relu1_in = res;  // pre-ReLU values, cached for the backward ReLU mask
+    res = relu_forward(res);
+
+    res = conv2d_forward(res, planes, H_out, W_out, blk.conv2.w, {}, planes, 3, 3, 1, 1,
+                         1, 1, H_out, W_out);
+    res = bn_forward(res, planes, H_out * W_out, blk.bn2.scale, blk.bn2.shift);
+
+    // Add the shortcut branch: 1x1 projection + BN, or x itself (identity).
+    if (blk.has_shortcut) {
+        std::vector<double> sc = conv2d_forward(x, C_in, H, W, blk.sc.w, {}, planes, 1, 1, sH, 1, 0, 0, H_out, W_out);
+        sc = bn_forward(sc, planes, H_out * W_out, blk.sc_bn.scale, blk.sc_bn.shift);
+        for (std::size_t i = 0; i < res.size(); ++i) res[i] += sc[i];
+    } else {
+        if (x.size() != res.size()) {  // identity add is only defined for a shape-preserving block
+            throw std::invalid_argument("fcm_resblock_forward: identity shortcut needs matching input/output size");
+        }
+        for (std::size_t i = 0; i < res.size(); ++i) res[i] += x[i];
+    }
+    acts.relu_out_in = res;  // pre-ReLU sum, cached for the backward ReLU mask
+    return relu_forward(res);
+}
+
+std::vector<double> CampplusBackward::fcm_resblock_backward(const CpResBlock & blk,
+                                                            const CpResBlockActs & acts,
+                                                            const std::vector<double> & d_out,
+                                                            int C_in) const {
+    // Adjoint of fcm_resblock_forward: maps d(loss)/d(block output) in d_out to
+    // d(loss)/d(x), replaying the ReLU inputs and spatial sizes cached in acts.
+    const int planes = blk.conv1.C_out, sH = blk.stride_h;
+    const int H = acts.H_in, W = acts.W_in, Ho = acts.H_out, Wo = acts.W_out;
+    const int HoWo = Ho * Wo;
+
+    // Through the output ReLU: gradient of the pre-activation sum (main + shortcut).
+    const std::vector<double> d_sum = relu_backward(acts.relu_out_in, d_out);
+
+    // Main branch in reverse: bn2 -> conv2 -> relu1 -> bn1 -> conv1.
+    std::vector<double> g = bn_backward_input(d_sum, planes, HoWo, blk.bn2.scale);
+    g = conv2d_backward_input(g, planes, Ho, Wo, blk.conv2.w, planes, 3, 3, 1,
+                              1, 1, 1, Ho, Wo);
+    g = relu_backward(acts.relu1_in, g);
+    g = bn_backward_input(g, planes, HoWo, blk.bn1.scale);
+    std::vector<double> d_x = conv2d_backward_input(g, C_in, H, W, blk.conv1.w, planes, 3, 3, sH, 1,
+                                                    1, 1, Ho, Wo);
+
+    // Shortcut branch: identity passes d_sum straight through; projection goes sc_bn -> 1x1 conv.
+    if (!blk.has_shortcut) {
+        add_in_place(d_x, d_sum);
+        return d_x;
+    }
+    g = bn_backward_input(d_sum, planes, HoWo, blk.sc_bn.scale);
+    g = conv2d_backward_input(g, C_in, H, W, blk.sc.w, planes, 1, 1, sH,
+                              1, 0, 0, Ho, Wo);
+    add_in_place(d_x, g);
+    return d_x;
+}
+
+// --- FCM ---------------------------------------------------------------------
+
+std::vector<double> CampplusBackward::fcm_forward(const std::vector<double> & fbank_ct, int T, int & T_out,
+                                                  CpFcmActs & acts) const {
+    // FCM head: conv1 + BN + ReLU, the layer1 / layer2 residual stacks, then conv2
+    // (stride 2 on the feature axis) + BN + ReLU. fbank_ct is read as one (F, T) plane
+    // with F = weights_.feat_dim; acts caches what fcm_backward replays.
+    const CpFcm & head = weights_.head;
+    const int F = weights_.feat_dim;
+    acts.T = T;
+
+    // Stem: 1 -> 32 channels, 3x3, stride 1, pad 1.
+    int H = conv_out_len(F, 3, 1, 1, 1), W = conv_out_len(T, 3, 1, 1, 1);
+    std::vector<double> x = bn_forward(conv2d_forward(fbank_ct, 1, F, T, head.conv1.w, {}, 32, 3, 3, 1, 1, 1, 1, H, W),
+                                       32, H * W, head.bn1.scale, head.bn1.shift);
+    acts.conv1_relu_in = x;
+    x = relu_forward(x);
+
+    // Residual stacks: each block reports its own output size, which feeds the next one.
+    acts.layer1.assign(head.layer1.size(), CpResBlockActs{});
+    for (std::size_t i = 0; i < head.layer1.size(); ++i) {
+        int Hn = 0, Wn = 0;
+        x = fcm_resblock_forward(head.layer1[i], x, 32, H, W, Hn, Wn, acts.layer1[i]);
+        H = Hn; W = Wn;
+    }
+    acts.layer2.assign(head.layer2.size(), CpResBlockActs{});
+    for (std::size_t i = 0; i < head.layer2.size(); ++i) {
+        int Hn = 0, Wn = 0;
+        x = fcm_resblock_forward(head.layer2[i], x, 32, H, W, Hn, Wn, acts.layer2[i]);
+        H = Hn; W = Wn;
+    }
+
+    // Tail: 32 -> 32 channels, 3x3, stride (sH=2, sW=1), pad 1.
+    const int H2 = conv_out_len(H, 3, 2, 1, 1), W2 = conv_out_len(W, 3, 1, 1, 1);
+    std::vector<double> y = bn_forward(conv2d_forward(x, 32, H, W, head.conv2.w, {}, 32, 3, 3, 2, 1, 1, 1, H2, W2),
+                                       32, H2 * W2, head.bn2.scale, head.bn2.shift);
+    acts.conv2_relu_in = y;
+    acts.H_after = H2;
+    T_out = W2;  // == T (sW=1 throughout)
+    // The (32, H2, W2) result doubles as (32*H2, W2) channel-major: identical memory.
+    return relu_forward(y);
+}
+
+std::vector<double> CampplusBackward::fcm_backward(const std::vector<double> & d_out,
+                                                   const CpFcmActs & acts) const {
+    // Adjoint of fcm_forward: maps d(loss)/d(FCM output) to d(loss)/d(fbank), an (F, T) plane.
+    const CpFcm & f = weights_.head;
+    const int F = weights_.feat_dim;
+    const int T = acts.T, H_after = acts.H_after;
+
+    // Height entering conv2 = output height of the last residual block that ran. Either
+    // stack may be empty; with none, it is conv1's output height F (k3, s1, p1 keeps F).
+    const int H_l2 = !acts.layer2.empty() ? acts.layer2.back().H_out
+                   : !acts.layer1.empty() ? acts.layer1.back().H_out : F;
+
+    // conv2: input (32, H_l2, T), output (32, H_after, T)
+    std::vector<double> d = relu_backward(acts.conv2_relu_in, d_out);  // (32, H_after*T)
+    d = bn_backward_input(d, 32, H_after * T, f.bn2.scale);
+    d = conv2d_backward_input(d, 32, H_l2, T, f.conv2.w, 32, 3, 3, 2, 1, 1, 1, H_after, T);
+
+    for (std::size_t i = f.layer2.size(); i-- > 0;) d = fcm_resblock_backward(f.layer2[i], acts.layer2[i], d, 32);
+    for (std::size_t i = f.layer1.size(); i-- > 0;) d = fcm_resblock_backward(f.layer1[i], acts.layer1[i], d, 32);
+
+    // conv1: input (1, F, T), output (32, F, T)
+    d = relu_backward(acts.conv1_relu_in, d);
+    d = bn_backward_input(d, 32, F * T, f.bn1.scale);
+    return conv2d_backward_input(d, 1, F, T, f.conv1.w, 32, 3, 3, 1, 1, 1, 1, F, T);  // (F, T)
+}
+
+// --- CAMDenseTDNN layer ------------------------------------------------------
+
+std::vector<double> CampplusBackward::cam_layer_forward(const CpCamLayer & L,
+                                                        const std::vector<double> & x_in, int C_in, int T,
+                                                        int growth, int kernel_size, int dilation,
+                                                        int bn_channels, int seg_pool_len,
+                                                        CpCamLayerActs & acts) const {
+    // One CAM dense-TDNN layer on a channel-major (C_in, T) input. Returns (C_in + growth, T):
+    // x_in stacked on top of the gated local-conv output. `acts` receives C_in, the three
+    // pre-ReLU maps, y_local and the gate, which are the values cam_layer_backward reads
+    // (it takes C_in as an argument rather than from the cache).
+    const std::size_t in_elems = (std::size_t) C_in * T;
+    acts.C_in = C_in;
+
+    // nonlinear1 (BN + ReLU on x_in), then linear1: 1x1 conv (C_in -> bn_channels), no bias.
+    acts.relu1_in = bn_forward(x_in, C_in, T, L.bn1.scale, L.bn1.shift);
+    const std::vector<double> a1 = relu_forward(acts.relu1_in);
+    const std::vector<double> lin1 =
+        conv1d_forward(a1, C_in, T, L.linear1.w, {}, bn_channels, 1, 1, 0, 1, T);
+
+    // nonlinear2 (BN + ReLU); z is the CAMLayer input used by both the local and context paths.
+    acts.relu2_in = bn_forward(lin1, bn_channels, T, L.bn2.scale, L.bn2.shift);
+    const std::vector<double> z = relu_forward(acts.relu2_in);
+
+    // linear_local: conv with pad = (kernel_size - 1) / 2 * dilation and output length T.
+    const int pad = (kernel_size - 1) / 2 * dilation;
+    acts.y_local = conv1d_forward(z, bn_channels, T, L.loc.w, {}, growth, kernel_size, 1, pad, dilation, T);
+
+    // context[c, t] = seg_pool(z)[c, t] + mean_T(z)[c]
+    const std::vector<double> chan_mean = mean_T_forward(z, bn_channels, T);
+    std::vector<double> context = seg_pool_forward(z, bn_channels, T, seg_pool_len);
+    for (int c = 0; c < bn_channels; ++c) {
+        const std::size_t base = (std::size_t) c * T;
+        for (int t = 0; t < T; ++t) context[base + t] += chan_mean[(std::size_t) c];
+    }
+
+    // cam linear1: 1x1 conv (bn_channels -> mid) + bias; the pre-ReLU value is cached.
+    const int mid = L.cam1.C_out;
+    acts.h1_in = conv1d_forward(context, bn_channels, T, L.cam1.w, L.cam1.b, mid, 1, 1, 0, 1, T);
+    const std::vector<double> h1 = relu_forward(acts.h1_in);
+
+    // cam linear2: 1x1 conv (mid -> growth) + bias, then an elementwise sigmoid gives the gate.
+    acts.gate = conv1d_forward(h1, mid, T, L.cam2.w, L.cam2.b, growth, 1, 1, 0, 1, T);
+    for (double & g : acts.gate) g = 1.0 / (1.0 + std::exp(-g));
+
+    // Dense concat: elements [0, C_in * T) copy x_in, the remainder holds y_local * gate.
+    std::vector<double> out((std::size_t) (C_in + growth) * T);
+    for (std::size_t i = 0; i < in_elems; ++i) out[i] = x_in[i];
+    for (std::size_t i = 0; i < acts.y_local.size(); ++i) {
+        out[in_elems + i] = acts.y_local[i] * acts.gate[i];
+    }
+    return out;
+}
+
+std::vector<double> CampplusBackward::cam_layer_backward(const CpCamLayer & L, const CpCamLayerActs & acts,
+                                                         const std::vector<double> & d_out, int C_in, int T,
+                                                         int growth, int kernel_size, int dilation,
+                                                         int bn_channels, int seg_pool_len) const {
+    // Input-gradient of cam_layer_forward: d_out is (C_in + growth, T), the result is (C_in, T).
+    const std::size_t n_skip = (std::size_t) C_in * T;    // elements in the identity (x_in) rows
+    const std::size_t n_cam = (std::size_t) growth * T;   // elements in the cam_out rows
+    const int mid = L.cam1.C_out;
+    const int pad = (kernel_size - 1) / 2 * dilation;
+
+    // Undo the dense concat: the leading rows flow straight to x_in, the rest belong to cam_out.
+    std::vector<double> d_x(n_skip);
+    for (std::size_t i = 0; i < n_skip; ++i) d_x[i] = d_out[i];
+
+    // cam_out = y_local * gate (elementwise product rule).
+    std::vector<double> g_local(n_cam);
+    std::vector<double> g_gate(n_cam);
+    for (std::size_t i = 0; i < n_cam; ++i) {
+        const double up = d_out[n_skip + i];
+        g_local[i] = up * acts.gate[i];
+        g_gate[i] = up * acts.y_local[i];
+    }
+
+    // Gate path: sigmoid -> cam linear2 -> ReLU -> cam linear1, ending at d(context).
+    const std::vector<double> g_pre_sig = sigmoid_backward(acts.gate, g_gate);
+    const std::vector<double> g_h1 = conv1d_backward_input(g_pre_sig, mid, T, L.cam2.w, growth, 1, 1, 0, 1, T);
+    const std::vector<double> g_h1_pre = relu_backward(acts.h1_in, g_h1);
+    const std::vector<double> g_ctx =
+        conv1d_backward_input(g_h1_pre, bn_channels, T, L.cam1.w, mid, 1, 1, 0, 1, T);
+
+    // context = seg_pool(z) + mean_T(z): both terms feed d(z); the local conv adds a third.
+    std::vector<double> g_z = seg_pool_backward(g_ctx, bn_channels, T, seg_pool_len);
+    add_in_place(g_z, mean_T_backward(row_sum_T(g_ctx, bn_channels, T), bn_channels, T));
+    add_in_place(g_z, conv1d_backward_input(g_local, bn_channels, T, L.loc.w, growth, kernel_size, 1, pad,
+                                            dilation, T));
+
+    // z = relu(bn2(linear1(relu(bn1(x_in))))): unwind the chain back to x_in.
+    std::vector<double> g = relu_backward(acts.relu2_in, g_z);
+    g = bn_backward_input(g, bn_channels, T, L.bn2.scale);
+    g = conv1d_backward_input(g, C_in, T, L.linear1.w, bn_channels, 1, 1, 0, 1, T);
+    g = relu_backward(acts.relu1_in, g);
+    g = bn_backward_input(g, C_in, T, L.bn1.scale);
+
+    // Identity path plus branch path.
+    add_in_place(d_x, g);
+    return d_x;
+}
+
+// --- full chain --------------------------------------------------------------
+
+std::vector<double> CampplusBackward::forward(const std::vector<double> & fbank_t_by_c, int T) {
+    // Runs the whole chain on a row-major (T, feat_dim) fbank and caches in acts_ the values
+    // backward() reads. Throws std::invalid_argument, before touching the previous cache, when
+    // T or feat_dim is not positive or the buffer holds fewer than T * feat_dim values.
+    const CpWeights & w = weights_;
+    const int F = w.feat_dim;
+    if (T <= 0 || F <= 0 || fbank_t_by_c.size() < (std::size_t) T * (std::size_t) F) {
+        throw std::invalid_argument("CampplusBackward::forward: need T > 0 and T * feat_dim fbank values");
+    }
+    acts_ = CpActs{};
+    acts_.T = T;
+    // transpose (T, F) -> (F, T) channel-major
+    std::vector<double> fbank_ct((std::size_t) F * T);
+    for (int t = 0; t < T; ++t)
+        for (int c = 0; c < F; ++c)
+            fbank_ct[(std::size_t) c * T + t] = fbank_t_by_c[(std::size_t) t * F + c];
+
+    int T_after_fcm = 0;
+    std::vector<double> x = fcm_forward(fbank_ct, T, T_after_fcm, acts_.fcm);
+    const int fcm_out_ch = 32 * acts_.fcm.H_after;
+
+    // tdnn: Conv1d(fcm_out -> init_channels, k5, s2, p2) + BN + ReLU
+    const int init_C = w.tdnn.C_out;
+    const int T_cam = conv_out_len(T_after_fcm, 5, 2, 2, 1);
+    acts_.T_cam = T_cam;
+    x = conv1d_forward(x, fcm_out_ch, T_after_fcm, w.tdnn.w, {}, init_C, 5, 2, 2, 1, T_cam);
+    x = bn_forward(x, init_C, T_cam, w.tdnn_bn.scale, w.tdnn_bn.shift);
+    acts_.tdnn_relu_in = x;
+    x = relu_forward(x);
+
+    // One dense block followed by its transit; C_cur is the running channel count of x.
+    int C_cur = init_C;
+    auto run_block = [&](const CpCamBlock & blk, const CpTransit & tr, std::vector<CpCamLayerActs> & bacts,
+                         std::vector<double> & tr_relu_in, int & tr_Cin) {
+        bacts.assign(blk.layers.size(), CpCamLayerActs{});
+        for (std::size_t i = 0; i < blk.layers.size(); ++i) {
+            x = cam_layer_forward(blk.layers[i], x, C_cur, T_cam, blk.growth, blk.kernel_size, blk.dilation,
+                                  blk.bn_channels, w.seg_pool_len, bacts[i]);
+            C_cur += blk.growth;  // each layer appends `growth` channels (dense concat)
+        }
+        // transit: BN + ReLU + 1x1 conv (C_cur -> tr.linear.C_out channels)
+        tr_Cin = C_cur;
+        x = bn_forward(x, C_cur, T_cam, tr.bn.scale, tr.bn.shift);
+        tr_relu_in = x;
+        x = relu_forward(x);
+        const int C_out = tr.linear.C_out;
+        x = conv1d_forward(x, C_cur, T_cam, tr.linear.w, {}, C_out, 1, 1, 0, 1, T_cam);
+        C_cur = C_out;
+    };
+    run_block(w.block1, w.transit1, acts_.block1, acts_.tr1_relu_in, acts_.tr1_Cin);
+    run_block(w.block2, w.transit2, acts_.block2, acts_.tr2_relu_in, acts_.tr2_Cin);
+    run_block(w.block3, w.transit3, acts_.block3, acts_.tr3_relu_in, acts_.tr3_Cin);
+    acts_.final_ch = C_cur;
+
+    // out_nonlinear: BN + ReLU; the post-ReLU map is cached for the stats-pool backward
+    x = bn_forward(x, C_cur, T_cam, w.out_bn.scale, w.out_bn.shift);
+    acts_.out_relu_in = x;
+    x = relu_forward(x);
+    acts_.stats_x = x;
+
+    // stats pool -> (2*final), then dense: 1x1 conv (2*final -> E) + BN(affine-less)
+    std::vector<double> stats = stats_pool_forward(x, C_cur, T_cam, acts_.stats_mean, acts_.stats_std);
+    const int E = w.embedding_size;
+    std::vector<double> emb = conv1d_forward(stats, 2 * C_cur, 1, w.dense.w, {}, E, 1, 1, 0, 1, 1);
+    return bn_forward(emb, E, 1, w.dense_bn.scale, w.dense_bn.shift);
+}
+
+std::vector<double> CampplusBackward::backward(const std::vector<double> & d_emb) const {
+    // d(loss)/d(embedding) -> d(loss)/d(fbank), row-major (T, feat_dim), from the latest forward() cache.
+    // Throws std::logic_error with no cache, std::invalid_argument if d_emb has < embedding_size values.
+    if (acts_.stats_x.empty()) {
+        throw std::logic_error("CampplusBackward::backward called before forward (no cached activations)");
+    }
+    const CpWeights & w = weights_;
+    const int T = acts_.T;
+    const int T_cam = acts_.T_cam;
+    const int final_ch = acts_.final_ch;
+    const int E = w.embedding_size;
+    if (d_emb.size() < (std::size_t) E) {
+        throw std::invalid_argument("CampplusBackward::backward: d_emb is shorter than embedding_size");
+    }
+
+    // dense: emb = bn(conv1d(stats)), then stats pool back to the (final, T_cam) map
+    std::vector<double> d = bn_backward_input(d_emb, E, 1, w.dense_bn.scale);
+    std::vector<double> d_stats = conv1d_backward_input(d, 2 * final_ch, 1, w.dense.w, E, 1, 1, 0, 1, 1);
+    std::vector<double> d_x = stats_pool_backward_input(d_stats, acts_.stats_x, final_ch, T_cam,
+                                                        acts_.stats_mean, acts_.stats_std);
+
+    // out_nonlinear: relu(bn(prev))
+    d_x = relu_backward(acts_.out_relu_in, d_x);
+    d_x = bn_backward_input(d_x, final_ch, T_cam, w.out_bn.scale);
+
+    auto run_block_backward = [&](const CpCamBlock & blk, const CpTransit & tr,
+                                  const std::vector<CpCamLayerActs> & bacts,
+                                  const std::vector<double> & tr_relu_in, int tr_Cin) {
+        // transit: x = conv1d(relu(bn(prev)))
+        d_x = conv1d_backward_input(d_x, tr_Cin, T_cam, tr.linear.w, tr.linear.C_out, 1, 1, 0, 1, T_cam);
+        d_x = relu_backward(tr_relu_in, d_x);
+        d_x = bn_backward_input(d_x, tr_Cin, T_cam, tr.bn.scale);
+        // block layers in reverse; each layer's input width is `growth` channels narrower
+        int C_in = tr_Cin;
+        for (std::size_t i = blk.layers.size(); i-- > 0;) {
+            C_in -= blk.growth;
+            d_x = cam_layer_backward(blk.layers[i], bacts[i], d_x, C_in, T_cam, blk.growth, blk.kernel_size,
+                                     blk.dilation, blk.bn_channels, w.seg_pool_len);
+        }
+    };
+    run_block_backward(w.block3, w.transit3, acts_.block3, acts_.tr3_relu_in, acts_.tr3_Cin);
+    run_block_backward(w.block2, w.transit2, acts_.block2, acts_.tr2_relu_in, acts_.tr2_Cin);
+    run_block_backward(w.block1, w.transit1, acts_.block1, acts_.tr1_relu_in, acts_.tr1_Cin);
+
+    // tdnn: relu(bn(conv1d(fcm_out))); conv input length is T (fcm_forward notes its T_out == T)
+    const int init_C = w.tdnn.C_out;
+    const int fcm_out_ch = 32 * acts_.fcm.H_after;
+    d_x = relu_backward(acts_.tdnn_relu_in, d_x);
+    d_x = bn_backward_input(d_x, init_C, T_cam, w.tdnn_bn.scale);
+    std::vector<double> d_fcm_out = conv1d_backward_input(d_x, fcm_out_ch, T, w.tdnn.w, init_C, 5, 2, 2, 1,
+                                                          T_cam);
+
+    // fcm -> d_fbank_ct (F, T), then transpose (F, T) -> (T, F)
+    std::vector<double> d_fbank_ct = fcm_backward(d_fcm_out, acts_.fcm);
+    const int F = w.feat_dim;
+    std::vector<double> d_fbank((std::size_t) T * F);
+    for (int c = 0; c < F; ++c)
+        for (int t = 0; t < T; ++t)
+            d_fbank[(std::size_t) t * F + c] = d_fbank_ct[(std::size_t) c * T + t];
+    return d_fbank;
+}
+
+}  // namespace cp_grad
+}  // namespace tts_cpp
diff --git a/tts-cpp/src/campplus_backward.h b/tts-cpp/src/campplus_backward.h
new file mode 100644
index 00000000000..5eeba330270
--- /dev/null
+++ b/tts-cpp/src/campplus_backward.h
@@ -0,0 +1,262 @@
+#pragma once
+
+// Analytic backward pass for the CAMPPlus speaker encoder — voice-clone roadmap,
+// ticket "GGML backward pass: CAMPPlus speaker encoder" (QVAC-20984).
+//
+// Scope: CAMPPlus maps an 80-channel Kaldi-fbank spectrogram to a 192-d speaker
+// embedding. In the enrollment loop it provides the speaker-similarity loss
+// between the (constant) target-WAV embedding and the generated-audio embedding.
+// Only the generated-audio path needs gradients, so the gradient this class
+// produces is `d(loss)/d(fbank)` — the input gradient with the model weights
+// frozen. The fbank itself is differentiated further back to the waveform by a
+// separate stage; this module stops at the CAMPPlus input.
+//
+// Why analytic (not ggml autodiff): the CAMPPlus forward leans on ops whose
+// backward is not implemented in the vendored ggml (im2col-based conv1d/conv2d,
+// `ggml_mean`, `ggml_sqrt`, `ggml_sigmoid`, `ggml_pad`, the seg-pool reshape /
+// sum_rows / repeat chain, ...). The math here is the standard conv / batch-norm
+// (pre-fused affine) / pooling / gating backward, computed in double for a
+// well-conditioned reference and validated component-wise against central finite
+// differences by the voiceclone gradcheck harness (Task 2 / QVAC-20979).
+//
+// Layout convention (mirrors `campplus_embed_cpu` in campplus.cpp):
+//   1-D feature map: channel-major (C, T), access x[c * T + t].
+//   2-D feature map: channel-major (C, H, W), access x[c * H * W + h * W + w].
+// The public fbank in/out uses the (T, feat_dim) row-major layout of the public
+// `campplus_embed` API; the transpose to/from channel-major happens internally.
+//
+// `CampplusBackward` owns the frozen weights and caches the per-call activations
+// as state: `forward(fbank)` runs the chain and stores the activations needed by
+// `backward(d_emb)`. The class has no dependency on the ggml graph or the GGUF
+// loader; a thin adapter binds the real weights into `CpWeights` elsewhere.
+
+#include <vector>
+
+namespace tts_cpp {
+namespace cp_grad {
+
+// --- Plain data holders (double mirror of campplus.h structs) ---------------
+
+// Conv weight in PyTorch row-major layout: Conv1d (C_out, C_in, k) flattened as
+// ((co * C_in) + ci) * k + kk; Conv2d (C_out, C_in, kH, kW). `b` empty => no bias.
+struct CpConv {
+    std::vector<double> w;           // flattened kernel (layout described above)
+    std::vector<double> b;           // bias; empty => no bias
+    int C_out = 0, C_in = 0;         // output / input channel counts
+    int k = 0;                       // Conv1d kernel size
+    int kH = 0, kW = 0;              // Conv2d kernel height / width
+    int stride = 1, pad = 0, dilation = 1;          // Conv1d geometry
+    int stride_h = 1, stride_w = 1, pad_h = 0, pad_w = 0;  // Conv2d geometry, per axis
+};
+
+// Pre-fused affine batch norm: y[c] = x[c] * scale[c] + shift[c]. Frozen at
+// inference, so the input-gradient is d_x[c] = d_y[c] * scale[c]; shift drops out.
+struct CpBn {
+    std::vector<double> scale;       // [C] per-channel multiplier
+    std::vector<double> shift;       // [C] per-channel offset (forward only)
+};
+
+// FCM BasicResBlock: conv1 + bn1 + relu + conv2 + bn2 (+ optional shortcut),
+// residual add then relu. Conv2d, stride only on H.
+struct CpResBlock {
+    CpConv conv1;  CpBn bn1;         // first conv + BN (ReLU follows)
+    CpConv conv2;  CpBn bn2;         // second conv + BN (summed with the shortcut, then ReLU)
+    CpConv sc;     CpBn sc_bn;       // shortcut; sc.w empty => identity
+    int stride_h = 1;                // stride along H
+    bool has_shortcut = false;       // NOTE(review): duplicates "sc.w empty"? confirm which one is tested
+};
+
+struct CpFcm {                       // FCM head; fcm_backward unwinds conv2, layer2, layer1, conv1
+    CpConv conv1;  CpBn bn1;         // 3x3 conv, 1 -> 32 channels, stride 1, + BN (+ ReLU)
+    std::vector<CpResBlock> layer1;  // 2 blocks, first stride 2
+    std::vector<CpResBlock> layer2;  // 2 blocks, first stride 2
+    CpConv conv2;  CpBn bn2;         // 3x3 conv, 32 -> 32 channels, stride (sH=2, sW=1), + BN (+ ReLU)
+};
+
+// CAMDenseTDNNLayer: bn1+relu -> linear1(1x1) -> bn2+relu -> CAMLayer, then
+// dense concat [x_in, cam_out], where cam_out = linear_local(z) * sigmoid gate.
+struct CpCamLayer {
+    CpBn bn1;                        // nonlinear1 BN on the layer input (ReLU follows)
+    CpConv linear1;                  // 1x1 (C_in -> bn_channels), no bias
+    CpBn bn2;                        // nonlinear2 BN on linear1's output (ReLU follows)
+    CpConv loc;                      // linear_local (bn_channels -> growth, k, dil), no bias
+    CpConv cam1;                     // 1x1 (bn_channels -> cam1.C_out, nominally bn_channels/2), bias, ReLU
+    CpConv cam2;                     // 1x1 (cam1.C_out -> growth), bias, sigmoid => gate
+};
+
+struct CpCamBlock {                  // one dense block: a stack of CpCamLayer sharing these hyper-parameters
+    int num_layers = 0;              // forward()/backward() iterate layers.size(), not this field
+    int kernel_size = 3;             // linear_local kernel size
+    int dilation = 1;                // linear_local dilation
+    int growth = 32;                 // channels each layer appends via the dense concat
+    int bn_channels = 128;           // linear1 output channels (CAMLayer input width)
+    int C_in = 0;                    // channels entering layer 0 (forward() tracks the running count itself)
+    std::vector<CpCamLayer> layers;  // applied in index order
+};
+
+struct CpTransit {                   // applied after each dense block: BN + ReLU + 1x1 conv
+    CpBn bn;                         // BN on the block output (ReLU follows)
+    CpConv linear;                   // 1x1, no bias; linear.C_out sets the next channel count
+};
+
+struct CpWeights {
+    int feat_dim       = 80;         // F: fbank channels per frame
+    int embedding_size = 192;        // E: length of the returned embedding
+    int seg_pool_len   = 100;        // seg_len handed to seg_pool_forward / seg_pool_backward
+
+    CpFcm head;                      // FCM Conv2d head, runs first
+
+    CpConv tdnn;  CpBn tdnn_bn;      // Conv1d (fcm_out -> init_channels, k=5, s=2, p=2)
+
+    CpCamBlock block1;  CpTransit transit1;  // each dense block is followed by its transit
+    CpCamBlock block2;  CpTransit transit2;
+    CpCamBlock block3;  CpTransit transit3;
+
+    CpBn out_bn;                     // out_nonlinear BN
+    CpConv dense;                    // 1x1 (final*2 -> embedding)
+    CpBn dense_bn;                   // affine-less BN (scale = 1/sqrt(var+eps))
+};
+
+// --- Activation caches -------------------------------------------------------
+
+// ReLU is recovered from the cached pre-activation (relu input). Conv / BN
+// input-gradients need only the frozen weights, so the input tensors are not
+// cached; only the values the nonlinearities and poolings need are kept.
+struct CpResBlockActs {              // per-call cache for one FCM BasicResBlock
+    std::vector<double> relu1_in;    // bn1(conv1(x)) pre-relu, (planes, Ho*Wo)
+    std::vector<double> relu_out_in; // (conv2 path + shortcut) pre-final-relu
+    int H_in = 0, W_in = 0;          // block input dims (for conv backward)
+    int H_out = 0, W_out = 0;        // block output dims; fcm_backward reads the last layer2 H_out
+};
+
+struct CpCamLayerActs {              // per-call cache for one CAM dense-TDNN layer
+    std::vector<double> relu1_in;    // bn1(x_in) pre-relu, (C_in, T)
+    std::vector<double> relu2_in;    // bn2(linear1(.)) pre-relu, (bn_channels, T)
+    std::vector<double> y_local;     // linear_local output, (growth, T)
+    std::vector<double> h1_in;       // cam1(context)+b pre-relu, (cam1.C_out, T)
+    std::vector<double> gate;        // sigmoid output, (growth, T)
+    int C_in = 0;                    // layer input channels
+};
+
+struct CpFcmActs {                              // per-call cache for the FCM head
+    std::vector<double> conv1_relu_in;          // (32, feat_dim*T) pre-relu; feat_dim = 80 by default
+    std::vector<CpResBlockActs> layer1;         // indexed like CpFcm::layer1
+    std::vector<CpResBlockActs> layer2;         // indexed like CpFcm::layer2
+    std::vector<double> conv2_relu_in;          // (32, H_after*T) pre-relu
+    int T = 0;                                  // FCM width (== input T)
+    int H_after = 0;                            // height after conv2 (10 for feat_dim = 80)
+};
+
+struct CpActs {                                 // everything forward() caches for backward()
+    int T = 0;                                  // input frames
+    CpFcmActs fcm;                              // FCM head cache
+    std::vector<double> tdnn_relu_in;           // (init_channels, T_cam) pre-relu
+    int T_cam = 0;                              // frames after the stride-2 tdnn conv
+    std::vector<CpCamLayerActs> block1, block2, block3;  // one entry per layer of each dense block
+    std::vector<double> tr1_relu_in, tr2_relu_in, tr3_relu_in;  // pre-relu of each transit BN
+    int tr1_Cin = 0, tr2_Cin = 0, tr3_Cin = 0;  // channels entering each transit
+    std::vector<double> out_relu_in;            // out_nonlinear BN pre-relu, (final, T_cam)
+    std::vector<double> stats_x;                // out_nonlinear output (final, T_cam) post-relu; empty => no forward yet
+    std::vector<double> stats_mean;             // (final)
+    std::vector<double> stats_std;              // (final)
+    int final_ch = 0;                           // channels after transit3 ("final")
+};
+
+// --- CAMPPlus backward -------------------------------------------------------
+//
+// Stateful: construct with the frozen weights, call `forward(fbank, T)` (caches
+// activations), then `backward(d_emb)` (consumes them). The stateless math
+// primitives are private; the gradcheck self-tests reach them through a friend
+// tester so each is validated individually against finite differences.
+class CampplusBackward {
+public:
+    explicit CampplusBackward(CpWeights weights);  // takes the frozen weights by value
+
+    const CpWeights & weights() const { return weights_; }  // read-only view of the frozen weights
+
+    // Forward: `fbank_t_by_c` is row-major (T, feat_dim). Runs the chain, caches
+    // activations and returns the raw embedding (embedding_size values, 192 by default).
+    std::vector<double> forward(const std::vector<double> & fbank_t_by_c, int T);
+
+    // Backward: from d_emb (embedding_size) return d_fbank in row-major (T, feat_dim).
+    // Uses the most recent forward cache; throws std::logic_error when there is none.
+    std::vector<double> backward(const std::vector<double> & d_emb) const;
+
+private:
+    friend struct CampplusBackwardTester;  // test-only access to the private primitives
+
+    // --- elementwise / pooling primitives (channel-major (C, T)) -------------
+    static std::vector<double> bn_forward(const std::vector<double> & x, int C, int T,
+                                          const std::vector<double> & scale,
+                                          const std::vector<double> & shift);
+    static std::vector<double> bn_backward_input(const std::vector<double> & d_y, int C, int T,
+                                                 const std::vector<double> & scale);
+
+    static std::vector<double> relu_forward(const std::vector<double> & x);
+    // d_x = d_y * (relu_in > 0)
+    static std::vector<double> relu_backward(const std::vector<double> & relu_in,
+                                             const std::vector<double> & d_y);
+
+    // d_x = d_y * s * (1 - s), s = sigmoid output (cached)
+    static std::vector<double> sigmoid_backward(const std::vector<double> & s,
+                                                const std::vector<double> & d_y);
+
+    static std::vector<double> conv1d_forward(const std::vector<double> & x, int C_in, int T_in,
+                                              const std::vector<double> & w, const std::vector<double> & b,
+                                              int C_out, int k, int stride, int pad, int dilation,
+                                              int T_out);  // callers pass `{}` as b for "no bias"
+    static std::vector<double> conv1d_backward_input(const std::vector<double> & d_y, int C_in, int T_in,
+                                                     const std::vector<double> & w, int C_out, int k,
+                                                     int stride, int pad, int dilation, int T_out);
+
+    static std::vector<double> conv2d_forward(const std::vector<double> & x, int C_in, int H, int W,
+                                              const std::vector<double> & w, const std::vector<double> & b,
+                                              int C_out, int kH, int kW, int sH, int sW, int pH, int pW,
+                                              int H_out, int W_out);  // callers pass `{}` as b for "no bias"
+    static std::vector<double> conv2d_backward_input(const std::vector<double> & d_y, int C_in, int H, int W,
+                                                     const std::vector<double> & w, int C_out, int kH, int kW,
+                                                     int sH, int sW, int pH, int pW, int H_out, int W_out);
+
+    // mean over T (per channel): m[c] = mean_t x[c, t]
+    static std::vector<double> mean_T_forward(const std::vector<double> & x, int C, int T);
+    static std::vector<double> mean_T_backward(const std::vector<double> & d_m, int C, int T);
+
+    // seg-pool then expand back to (C, T): each ceil-mode bin of seg_len holds
+    // the average of its members and is tiled across them.
+    static std::vector<double> seg_pool_forward(const std::vector<double> & x, int C, int T, int seg_len);
+    static std::vector<double> seg_pool_backward(const std::vector<double> & d_out, int C, int T, int seg_len);
+
+    // stats pool: (C, T) -> (2C) = concat(mean, unbiased std).
+    static std::vector<double> stats_pool_forward(const std::vector<double> & x, int C, int T,
+                                                  std::vector<double> & mean_out,
+                                                  std::vector<double> & std_out);
+    static std::vector<double> stats_pool_backward_input(const std::vector<double> & d_out,
+                                                         const std::vector<double> & x, int C, int T,
+                                                         const std::vector<double> & mean,
+                                                         const std::vector<double> & std_);
+
+    // --- module forward/backward -------------------------------------------
+    std::vector<double> fcm_resblock_forward(const CpResBlock & blk, const std::vector<double> & x,
+                                             int C_in, int H, int W, int & H_out, int & W_out,
+                                             CpResBlockActs & acts) const;
+    std::vector<double> fcm_resblock_backward(const CpResBlock & blk, const CpResBlockActs & acts,
+                                              const std::vector<double> & d_out, int C_in) const;
+
+    std::vector<double> fcm_forward(const std::vector<double> & fbank_ct, int T, int & T_out,
+                                    CpFcmActs & acts) const;
+    std::vector<double> fcm_backward(const std::vector<double> & d_out, const CpFcmActs & acts) const;
+
+    std::vector<double> cam_layer_forward(const CpCamLayer & L, const std::vector<double> & x_in, int C_in,
+                                          int T, int growth, int kernel_size, int dilation, int bn_channels,
+                                          int seg_pool_len, CpCamLayerActs & acts) const;
+    std::vector<double> cam_layer_backward(const CpCamLayer & L, const CpCamLayerActs & acts,
+                                           const std::vector<double> & d_out, int C_in, int T, int growth,
+                                           int kernel_size, int dilation, int bn_channels,
+                                           int seg_pool_len) const;
+
+    CpWeights weights_;  // frozen model weights
+    mutable CpActs acts_;  // forward() cache; NOTE(review): forward() is non-const, so `mutable` may be redundant - confirm
+};
+
+}  // namespace cp_grad
+}  // namespace tts_cpp
diff --git a/tts-cpp/test/test_campplus_backward.cpp b/tts-cpp/test/test_campplus_backward.cpp
new file mode 100644
index 00000000000..c46d52e0720
--- /dev/null
+++ b/tts-cpp/test/test_campplus_backward.cpp
@@ -0,0 +1,499 @@
+// Gradcheck self-tests for the CAMPPlus speaker-encoder backward (voice-clone
+// ticket "GGML backward pass: CAMPPlus speaker encoder", QVAC-20984).  Pure host
+// logic, model-free: every analytic input-gradient is checked component-wise
+// against a central finite-difference numeric gradient of the matching forward,
+// using the Task 2 gradcheck harness.  Runs in the always-on `unit` ctest tier.
+//
+// Standalone build (single line):
+//   g++ -std=c++17 -I src test/test_campplus_backward.cpp src/campplus_backward.cpp src/voiceclone_gradcheck.cpp -o /tmp/t && /tmp/t
+
+#include "campplus_backward.h"
+#include "voiceclone_gradcheck.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cstdio>
+#include <exception>
+#include <stdexcept>
+#include <vector>
+
+using namespace tts_cpp::cp_grad;
+using tts_cpp::voiceclone::ScalarLossFn;
+using tts_cpp::voiceclone::compare_gradients;
+using tts_cpp::voiceclone::finite_diff_gradient;
+using tts_cpp::voiceclone::GradcheckReport;
+
+// Friend accessor: exposes CampplusBackward's private primitives to the tests.
+namespace tts_cpp {
+namespace cp_grad {
+struct CampplusBackwardTester {
+    using CB  = CampplusBackward;
+    using Vec = std::vector<double>;
+
+    // ---- element-wise primitives (static members of CampplusBackward) ----
+    static Vec bn_forward(const Vec & input, int C, int T, const Vec & scale, const Vec & shift) {
+        return CB::bn_forward(input, C, T, scale, shift);
+    }
+    static Vec bn_backward_input(const Vec & grad_out, int C, int T, const Vec & scale) {
+        return CB::bn_backward_input(grad_out, C, T, scale);
+    }
+    static Vec relu_forward(const Vec & input) {
+        return CB::relu_forward(input);
+    }
+    static Vec relu_backward(const Vec & input, const Vec & grad_out) {
+        return CB::relu_backward(input, grad_out);
+    }
+    // First argument is the sigmoid OUTPUT (the tests pass sigmoid(x0)), not the pre-activation.
+    static Vec sigmoid_backward(const Vec & sig_out, const Vec & grad_out) {
+        return CB::sigmoid_backward(sig_out, grad_out);
+    }
+
+    // ---- convolutions ----
+    static Vec conv1d_forward(const Vec & input, int C_in, int T_in, const Vec & weight, const Vec & bias,
+                              int C_out, int kernel, int stride, int pad, int dilation, int T_out) {
+        return CB::conv1d_forward(input, C_in, T_in, weight, bias, C_out, kernel, stride, pad, dilation,
+                                  T_out);
+    }
+    static Vec conv1d_backward_input(const Vec & grad_out, int C_in, int T_in, const Vec & weight,
+                                     int C_out, int kernel, int stride, int pad, int dilation,
+                                     int T_out) {
+        return CB::conv1d_backward_input(grad_out, C_in, T_in, weight, C_out, kernel, stride, pad,
+                                         dilation, T_out);
+    }
+    static Vec conv2d_forward(const Vec & input, int C_in, int H, int W, const Vec & weight,
+                              const Vec & bias, int C_out, int kH, int kW, int sH, int sW, int pH, int pW,
+                              int H_out, int W_out) {
+        return CB::conv2d_forward(input, C_in, H, W, weight, bias, C_out, kH, kW, sH, sW, pH, pW, H_out,
+                                  W_out);
+    }
+    static Vec conv2d_backward_input(const Vec & grad_out, int C_in, int H, int W, const Vec & weight,
+                                     int C_out, int kH, int kW, int sH, int sW, int pH, int pW,
+                                     int H_out, int W_out) {
+        return CB::conv2d_backward_input(grad_out, C_in, H, W, weight, C_out, kH, kW, sH, sW, pH, pW,
+                                         H_out, W_out);
+    }
+
+    // ---- pooling ----
+    static Vec mean_T_forward(const Vec & input, int C, int T) {
+        return CB::mean_T_forward(input, C, T);
+    }
+    static Vec mean_T_backward(const Vec & grad_out, int C, int T) {
+        return CB::mean_T_backward(grad_out, C, T);
+    }
+    static Vec seg_pool_forward(const Vec & input, int C, int T, int seg_len) {
+        return CB::seg_pool_forward(input, C, T, seg_len);
+    }
+    static Vec seg_pool_backward(const Vec & grad_out, int C, int T, int seg_len) {
+        return CB::seg_pool_backward(grad_out, C, T, seg_len);
+    }
+    // `mean_out` / `std_out` are output parameters filled by the forward.
+    static Vec stats_pool_forward(const Vec & input, int C, int T, Vec & mean_out, Vec & std_out) {
+        return CB::stats_pool_forward(input, C, T, mean_out, std_out);
+    }
+    static Vec stats_pool_backward_input(const Vec & grad_out, const Vec & input, int C, int T,
+                                         const Vec & mean, const Vec & stddev) {
+        return CB::stats_pool_backward_input(grad_out, input, C, T, mean, stddev);
+    }
+
+    // ---- module-level passes (const member functions, need an instance) ----
+    static Vec resblock_forward(const CB & model, const CpResBlock & blk, const Vec & input, int C_in,
+                                int H, int W, int & H_out, int & W_out, CpResBlockActs & acts) {
+        return model.fcm_resblock_forward(blk, input, C_in, H, W, H_out, W_out, acts);
+    }
+    static Vec resblock_backward(const CB & model, const CpResBlock & blk, const CpResBlockActs & acts,
+                                 const Vec & grad_out, int C_in) {
+        return model.fcm_resblock_backward(blk, acts, grad_out, C_in);
+    }
+    static Vec cam_layer_forward(const CB & model, const CpCamLayer & layer, const Vec & input, int C_in,
+                                 int T, int growth, int kernel, int dilation, int bn_channels,
+                                 int seg_len, CpCamLayerActs & acts) {
+        return model.cam_layer_forward(layer, input, C_in, T, growth, kernel, dilation, bn_channels,
+                                       seg_len, acts);
+    }
+    static Vec cam_layer_backward(const CB & model, const CpCamLayer & layer, const CpCamLayerActs & acts,
+                                  const Vec & grad_out, int C_in, int T, int growth, int kernel,
+                                  int dilation, int bn_channels, int seg_len) {
+        return model.cam_layer_backward(layer, acts, grad_out, C_in, T, growth, kernel, dilation,
+                                        bn_channels, seg_len);
+    }
+};
+}  // namespace cp_grad
+}  // namespace tts_cpp
+
+namespace {
+
+using Tester = tts_cpp::cp_grad::CampplusBackwardTester;
+
+// Running tallies shared by every CHECK: number of failed checks and total checks evaluated.
+int g_failures = 0;
+int g_checks = 0;
+
+// CHECK(cond, fmt, ...): counts one check; when `cond` is false it also counts a
+// failure and prints "FAIL <file>:<line>  ", the printf-style message and a newline
+// to stderr. The do { } while (0) wrapper makes it usable as a single statement.
+// A failed check does not abort or return -- execution simply continues.
+#define CHECK(cond, ...) do {                                            \
+    ++g_checks;                                                          \
+    if (!(cond)) {                                                       \
+        ++g_failures;                                                    \
+        fprintf(stderr, "FAIL %s:%d  ", __FILE__, __LINE__);            \
+        fprintf(stderr, __VA_ARGS__);                                    \
+        fprintf(stderr, "\n");                                          \
+    }                                                                    \
+} while (0)
+
+// Deterministic pseudo-data generator: a sine of (0.9 * i + phase), scaled into [-0.8, 0.8].
+double sample(int i, double phase) {
+    const double angle = i * 0.9 + phase;
+    return std::sin(angle) * 0.8;
+}
+
+// Builds an n-element vector whose i-th entry is sample(i, phase).
+std::vector<double> make_vector(int n, double phase) {
+    std::vector<double> out;
+    out.reserve((std::size_t) n);
+    for (int idx = 0; idx < n; ++idx) {
+        out.push_back(sample(idx, phase));
+    }
+    return out;
+}
+
+// Like make_vector, but every entry is pushed at least 0.3 away from zero (keeping
+// the sign of the underlying sample, with an exact zero treated as positive). A
+// +-eps finite-difference step therefore never crosses the ReLU kink, so the
+// central difference agrees with the analytic 0/1 mask.
+std::vector<double> make_relu_input(int n, double phase) {
+    std::vector<double> out((std::size_t) n);
+    for (int idx = 0; idx < n; ++idx) {
+        const double raw = sample(idx, phase);
+        const double magnitude = 0.3 + std::fabs(raw);
+        const double sign_source = (raw == 0.0) ? 1.0 : raw;
+        out[idx] = std::copysign(magnitude, sign_source);
+    }
+    return out;
+}
+
+// Inner product sum_i a[i] * b[i], accumulated left to right in double.
+//
+// Throws std::invalid_argument when the operands differ in length. Every caller
+// pairs a coefficient vector with a forward output; without this check a shape
+// mismatch would either read past the end of `b` (undefined behaviour) or silently
+// ignore its tail, letting a broken forward pass the gradcheck. main() catches
+// std::exception, so a mismatch is reported as a test failure.
+double dot(const std::vector<double> & a, const std::vector<double> & b) {
+    if (a.size() != b.size()) {
+        throw std::invalid_argument("dot: operand size mismatch");
+    }
+    double acc = 0.0;
+    for (std::size_t i = 0; i < a.size(); ++i) acc += a[i] * b[i];
+    return acc;
+}
+
+// Logistic function 1 / (1 + e^-x); reference forward for the sigmoid gradcheck.
+double sigmoid(double x) {
+    const double decay = std::exp(-x);
+    return 1.0 / (1.0 + decay);
+}
+
+// Turns a gradcheck report into one CHECK: passes when `r.passed` is set, otherwise
+// logs the check name together with the worst absolute/relative error and its index.
+void report_check(const char * name, const GradcheckReport & r) {
+    const bool ok = r.passed;
+    CHECK(ok, "%s: gradcheck failed (max_abs=%.3e max_rel=%.3e worst=%zu)", name, r.max_abs_err,
+          r.max_rel_err, r.worst_index);
+}
+
+// --- primitive gradchecks ---------------------------------------------------
+
+// Gradcheck of bn_backward_input: the loss is <coeffs, bn_forward(x)>, so the analytic
+// input gradient is the backward applied to `coeffs` as the upstream gradient.
+void test_bn_backward() {
+    const int C = 4;
+    const int T = 5;
+    const std::vector<double> scale = make_vector(C, 0.2);
+    const std::vector<double> shift = make_vector(C, 1.0);
+    const std::vector<double> upstream = make_vector(C * T, 2.0);
+    const std::vector<double> x0 = make_vector(C * T, 0.7);
+
+    const ScalarLossFn loss = [&](const std::vector<double> & x) {
+        const std::vector<double> y = Tester::bn_forward(x, C, T, scale, shift);
+        return dot(upstream, y);
+    };
+
+    const auto numeric = finite_diff_gradient(loss, x0);
+    const auto analytic = Tester::bn_backward_input(upstream, C, T, scale);
+    report_check("bn_backward_input", compare_gradients(numeric, analytic));
+}
+
+// Gradcheck of relu_backward at a point kept away from the kink (see make_relu_input).
+void test_relu_backward() {
+    const int count = 20;
+    const std::vector<double> upstream = make_vector(count, 1.3);
+    const std::vector<double> x0 = make_relu_input(count, 0.25);
+
+    const ScalarLossFn loss = [&](const std::vector<double> & x) {
+        const std::vector<double> y = Tester::relu_forward(x);
+        return dot(upstream, y);
+    };
+
+    const auto numeric = finite_diff_gradient(loss, x0);
+    const auto analytic = Tester::relu_backward(x0, upstream);
+    report_check("relu_backward", compare_gradients(numeric, analytic));
+}
+
+// Gradcheck of sigmoid_backward. The forward is the local reference sigmoid(); the
+// backward is handed the sigmoid OUTPUT at x0 plus the upstream coefficients.
+void test_sigmoid_backward() {
+    const int count = 16;
+    const std::vector<double> upstream = make_vector(count, 1.1);
+    const std::vector<double> x0 = make_vector(count, 0.4);
+
+    // Element-wise sigmoid, shared by the loss and by the saved forward output.
+    const auto sigmoid_all = [](const std::vector<double> & x) {
+        std::vector<double> y(x.size());
+        for (std::size_t i = 0; i < x.size(); ++i) y[i] = sigmoid(x[i]);
+        return y;
+    };
+
+    const ScalarLossFn loss = [&](const std::vector<double> & x) {
+        return dot(upstream, sigmoid_all(x));
+    };
+
+    const std::vector<double> s0 = sigmoid_all(x0);
+    const auto numeric = finite_diff_gradient(loss, x0);
+    const auto analytic = Tester::sigmoid_backward(s0, upstream);
+    report_check("sigmoid_backward", compare_gradients(numeric, analytic));
+}
+
+// Gradcheck of conv1d_backward_input on a strided, padded, dilated convolution
+// (stride 2, pad 1, dilation 2) so all three index mappings are exercised at once.
+void test_conv1d_backward() {
+    const int Ci = 3, Ti = 7, Co = 4, k = 3, stride = 2, pad = 1, dilation = 2;
+    // Standard output length: floor((Ti + 2*pad - effective_kernel) / stride) + 1.
+    const int effective_kernel = dilation * (k - 1) + 1;
+    const int To = (Ti + 2 * pad - effective_kernel) / stride + 1;
+
+    const std::vector<double> weight = make_vector(Co * Ci * k, 0.3);
+    const std::vector<double> bias = make_vector(Co, 1.1);
+    const std::vector<double> upstream = make_vector(Co * To, 2.0);
+    const std::vector<double> x0 = make_vector(Ci * Ti, 0.7);
+
+    const ScalarLossFn loss = [&](const std::vector<double> & x) {
+        const std::vector<double> y =
+            Tester::conv1d_forward(x, Ci, Ti, weight, bias, Co, k, stride, pad, dilation, To);
+        return dot(upstream, y);
+    };
+
+    const auto numeric = finite_diff_gradient(loss, x0);
+    const auto analytic =
+        Tester::conv1d_backward_input(upstream, Ci, Ti, weight, Co, k, stride, pad, dilation, To);
+    report_check("conv1d_backward_input", compare_gradients(numeric, analytic));
+}
+
+// Gradcheck of conv2d_backward_input with different strides per axis (2 along H,
+// 1 along W) and padding 1 on both axes.
+void test_conv2d_backward() {
+    const int Ci = 2, H = 5, W = 4, Co = 3, kH = 3, kW = 3, sH = 2, sW = 1, pH = 1, pW = 1;
+    // Undilated output size per axis: floor((in + 2*pad - kernel) / stride) + 1.
+    const int Ho = (H + 2 * pH - kH) / sH + 1;
+    const int Wo = (W + 2 * pW - kW) / sW + 1;
+
+    const std::vector<double> weight = make_vector(Co * Ci * kH * kW, 0.3);
+    const std::vector<double> bias = make_vector(Co, 0.9);
+    const std::vector<double> upstream = make_vector(Co * Ho * Wo, 1.4);
+    const std::vector<double> x0 = make_vector(Ci * H * W, 0.5);
+
+    const ScalarLossFn loss = [&](const std::vector<double> & x) {
+        const std::vector<double> y =
+            Tester::conv2d_forward(x, Ci, H, W, weight, bias, Co, kH, kW, sH, sW, pH, pW, Ho, Wo);
+        return dot(upstream, y);
+    };
+
+    const auto numeric = finite_diff_gradient(loss, x0);
+    const auto analytic =
+        Tester::conv2d_backward_input(upstream, Ci, H, W, weight, Co, kH, kW, sH, sW, pH, pW, Ho, Wo);
+    report_check("conv2d_backward_input", compare_gradients(numeric, analytic));
+}
+
+// Gradcheck of mean_T_backward; the upstream gradient has one entry per channel.
+void test_mean_T_backward() {
+    const int C = 4;
+    const int T = 6;
+    const std::vector<double> upstream = make_vector(C, 1.7);
+    const std::vector<double> x0 = make_vector(C * T, 0.3);
+
+    const ScalarLossFn loss = [&](const std::vector<double> & x) {
+        const std::vector<double> y = Tester::mean_T_forward(x, C, T);
+        return dot(upstream, y);
+    };
+
+    const auto numeric = finite_diff_gradient(loss, x0);
+    const auto analytic = Tester::mean_T_backward(upstream, C, T);
+    report_check("mean_T_backward", compare_gradients(numeric, analytic));
+}
+
+// Gradcheck of seg_pool_backward. T = 7 with segment length 3 gives bins of
+// 3, 3 and 1 frames, so the partial last segment is covered.
+void test_seg_pool_backward() {
+    const int C = 3;
+    const int T = 7;
+    const int seg = 3;
+    const std::vector<double> upstream = make_vector(C * T, 1.2);
+    const std::vector<double> x0 = make_vector(C * T, 0.6);
+
+    const ScalarLossFn loss = [&](const std::vector<double> & x) {
+        const std::vector<double> y = Tester::seg_pool_forward(x, C, T, seg);
+        return dot(upstream, y);
+    };
+
+    const auto numeric = finite_diff_gradient(loss, x0);
+    const auto analytic = Tester::seg_pool_backward(upstream, C, T, seg);
+    report_check("seg_pool_backward", compare_gradients(numeric, analytic));
+}
+
+// Gradcheck of stats_pool_backward_input. The forward is run once at x0 only to
+// capture the per-channel mean / std the backward needs; the upstream gradient has
+// 2*C entries to match the forward output.
+void test_stats_pool_backward() {
+    const int C = 4;
+    const int T = 6;
+    const std::vector<double> upstream = make_vector(2 * C, 1.5);
+    const std::vector<double> x0 = make_vector(C * T, 0.4);
+
+    std::vector<double> mean0;
+    std::vector<double> std0;
+    Tester::stats_pool_forward(x0, C, T, mean0, std0);
+
+    const ScalarLossFn loss = [&](const std::vector<double> & x) {
+        std::vector<double> scratch_mean;
+        std::vector<double> scratch_std;
+        return dot(upstream, Tester::stats_pool_forward(x, C, T, scratch_mean, scratch_std));
+    };
+
+    const auto numeric = finite_diff_gradient(loss, x0);
+    const auto analytic = Tester::stats_pool_backward_input(upstream, x0, C, T, mean0, std0);
+    report_check("stats_pool_backward_input", compare_gradients(numeric, analytic));
+}
+
+// --- module gradchecks ------------------------------------------------------
+
+// Synthetic folded BatchNorm: per-channel scale in [0.5, 0.74] and shift in
+// [1.04, 1.36]. The clearly positive shift is meant to keep the ReLU that follows
+// in its active (locally linear) region at the evaluation point, so a +-eps
+// finite-difference step does not cross the kink and the central difference
+// matches the analytic mask. The mask == 0 branch is exercised by the dedicated
+// relu unit test instead.
+CpBn make_bn(int C, double phase) {
+    const std::size_t channels = (std::size_t) C;
+    CpBn bn;
+    bn.scale.resize(channels);
+    bn.shift.resize(channels);
+    for (int ch = 0; ch < C; ++ch) {
+        const double scale_wobble = std::fabs(sample(ch, phase));
+        const double shift_wobble = sample(ch, phase + 1.0);
+        bn.scale[(std::size_t) ch] = 0.5 + 0.3 * scale_wobble;  // strictly positive
+        bn.shift[(std::size_t) ch] = 1.2 + 0.2 * shift_wobble;  // strictly positive
+    }
+    return bn;
+}
+
+// Synthetic 1-D convolution with geometry (k, stride, pad, dil). Weights are the
+// sample() sequence scaled by 0.1 so the BN shift dominates and the downstream
+// ReLU stays active; the optional bias is left unscaled.
+CpConv make_conv1d(int Co, int Ci, int k, int stride, int pad, int dil, double phase, bool bias) {
+    CpConv conv;
+    conv.C_out = Co;
+    conv.C_in = Ci;
+    conv.k = k;
+    conv.stride = stride;
+    conv.pad = pad;
+    conv.dilation = dil;
+
+    conv.w = make_vector(Co * Ci * k, phase);
+    for (std::size_t i = 0; i < conv.w.size(); ++i) {
+        conv.w[i] *= 0.1;
+    }
+    if (bias) {
+        conv.b = make_vector(Co, phase + 0.5);
+    }
+    return conv;
+}
+
+// Synthetic bias-free 2-D convolution with per-axis kernel / stride / padding.
+// Weights are the sample() sequence scaled by 0.1.
+CpConv make_conv2d(int Co, int Ci, int kH, int kW, int sH, int sW, int pH, int pW, double phase) {
+    CpConv conv;
+    conv.C_out = Co;
+    conv.C_in = Ci;
+    conv.kH = kH;
+    conv.kW = kW;
+    conv.stride_h = sH;
+    conv.stride_w = sW;
+    conv.pad_h = pH;
+    conv.pad_w = pW;
+
+    conv.w = make_vector(Co * Ci * kH * kW, phase);
+    for (std::size_t i = 0; i < conv.w.size(); ++i) {
+        conv.w[i] *= 0.1;
+    }
+    return conv;
+}
+
+// Gradcheck of one FCM residual block w.r.t. its input. `shortcut == true` builds the
+// strided variant (stride 2 along H) with a 1x1 conv + BN shortcut branch;
+// `shortcut == false` builds the stride-1 block without one.
+void test_resblock_backward(bool shortcut) {
+    const int Ci = 4, H = 6, W = 5;
+    const int stride = shortcut ? 2 : 1;
+    const char * label = shortcut ? "fcm_resblock(shortcut) d_x" : "fcm_resblock(identity) d_x";
+
+    CpResBlock blk;
+    blk.stride_h = stride;
+    blk.has_shortcut = shortcut;
+    blk.conv1 = make_conv2d(Ci, Ci, 3, 3, stride, 1, 1, 1, 0.2);
+    blk.bn1 = make_bn(Ci, 0.3);
+    blk.conv2 = make_conv2d(Ci, Ci, 3, 3, 1, 1, 1, 1, 0.4);
+    blk.bn2 = make_bn(Ci, 0.5);
+    if (shortcut) {
+        blk.sc = make_conv2d(Ci, Ci, 1, 1, stride, 1, 0, 0, 0.6);
+        blk.sc_bn = make_bn(Ci, 0.7);
+    }
+
+    // The block is passed explicitly, so the model itself can carry empty weights.
+    const CampplusBackward cb{CpWeights{}};
+    const std::vector<double> x0 = make_vector(Ci * H * W, 0.35);
+
+    // One forward at x0 records the activations and reveals the output size.
+    int Ho = 0;
+    int Wo = 0;
+    CpResBlockActs acts0;
+    Tester::resblock_forward(cb, blk, x0, Ci, H, W, Ho, Wo, acts0);
+
+    const std::vector<double> upstream = make_vector(Ci * Ho * Wo, 1.1);
+    const std::vector<double> analytic = Tester::resblock_backward(cb, blk, acts0, upstream, Ci);
+
+    const ScalarLossFn loss = [&](const std::vector<double> & x) {
+        int h_out = 0;
+        int w_out = 0;
+        CpResBlockActs scratch;
+        return dot(upstream, Tester::resblock_forward(cb, blk, x, Ci, H, W, h_out, w_out, scratch));
+    };
+
+    const auto numeric = finite_diff_gradient(loss, x0);
+    report_check(label, compare_gradients(numeric, analytic));
+}
+
+// Gradcheck of one CAM dense-TDNN layer w.r.t. its input. The upstream gradient has
+// (Ci + growth) * T entries, matching the size the test expects from the forward.
+void test_cam_layer_backward() {
+    const int Ci = 6, T = 9, growth = 4, k = 3, dil = 2, bn_ch = 8, seg = 4;
+    const int local_pad = (k - 1) / 2 * dil;
+
+    CpCamLayer layer;
+    layer.bn1 = make_bn(Ci, 0.2);
+    layer.linear1 = make_conv1d(bn_ch, Ci, 1, 1, 0, 1, 0.3, false);
+    layer.bn2 = make_bn(bn_ch, 0.4);
+    layer.loc = make_conv1d(growth, bn_ch, k, 1, local_pad, dil, 0.5, false);
+    layer.cam1 = make_conv1d(bn_ch / 2, bn_ch, 1, 1, 0, 1, 0.6, true);
+    layer.cam2 = make_conv1d(growth, bn_ch / 2, 1, 1, 0, 1, 0.7, true);
+
+    // The layer is passed explicitly, so the model itself can carry empty weights.
+    const CampplusBackward cb{CpWeights{}};
+    const std::vector<double> x0 = make_vector(Ci * T, 0.3);
+
+    CpCamLayerActs acts0;
+    Tester::cam_layer_forward(cb, layer, x0, Ci, T, growth, k, dil, bn_ch, seg, acts0);
+
+    const std::vector<double> upstream = make_vector((Ci + growth) * T, 1.0);
+    const std::vector<double> analytic =
+        Tester::cam_layer_backward(cb, layer, acts0, upstream, Ci, T, growth, k, dil, bn_ch, seg);
+
+    const ScalarLossFn loss = [&](const std::vector<double> & x) {
+        CpCamLayerActs scratch;
+        return dot(upstream,
+                   Tester::cam_layer_forward(cb, layer, x, Ci, T, growth, k, dil, bn_ch, seg, scratch));
+    };
+
+    const auto numeric = finite_diff_gradient(loss, x0);
+    report_check("cam_dense_tdnn_layer d_x", compare_gradients(numeric, analytic));
+}
+
+// --- full chain -------------------------------------------------------------
+
+// Synthetic CAM dense block with `num_layers` layers. Layer i is built for
+// C_in + i * growth input channels, and each layer draws its weights from a phase
+// offset by its index.
+CpCamBlock make_block(int num_layers, int dilation, int C_in, int growth, int bn_channels, int k,
+                      double phase) {
+    CpCamBlock block;
+    block.num_layers = num_layers;
+    block.kernel_size = k;
+    block.dilation = dilation;
+    block.growth = growth;
+    block.bn_channels = bn_channels;
+    block.C_in = C_in;
+    block.layers.resize((std::size_t) num_layers);
+
+    const int local_pad = (k - 1) / 2 * dilation;
+    const int squeeze = bn_channels / 2;
+    for (int idx = 0; idx < num_layers; ++idx) {
+        const int layer_in = C_in + idx * growth;
+        const double base = phase + idx;
+        CpCamLayer & layer = block.layers[(std::size_t) idx];
+        layer.bn1 = make_bn(layer_in, base + 0.1);
+        layer.linear1 = make_conv1d(bn_channels, layer_in, 1, 1, 0, 1, base + 0.2, false);
+        layer.bn2 = make_bn(bn_channels, base + 0.3);
+        layer.loc = make_conv1d(growth, bn_channels, k, 1, local_pad, dilation, base + 0.4, false);
+        layer.cam1 = make_conv1d(squeeze, bn_channels, 1, 1, 0, 1, base + 0.5, true);
+        layer.cam2 = make_conv1d(growth, squeeze, 1, 1, 0, 1, base + 0.6, true);
+    }
+    return block;
+}
+
+// Synthetic transit layer: BN over C_in channels followed by a bias-free 1x1
+// convolution that halves the channel count (C_in -> C_in / 2).
+CpTransit make_transit(int C_in, double phase) {
+    const int halved = C_in / 2;
+    CpTransit transit;
+    transit.bn = make_bn(C_in, phase);
+    transit.linear = make_conv1d(halved, C_in, 1, 1, 0, 1, phase + 0.5, false);
+    return transit;
+}
+
+// Synthetic FCM residual block with Ci channels in and out: two 3x3 convs (the first
+// strided along H by `stride`), each followed by BN, plus an optional 1x1 strided
+// conv + BN shortcut. Sub-module phases step by 0.1 from `phase`.
+CpResBlock make_resblock(int Ci, int stride, bool shortcut, double phase) {
+    CpResBlock block;
+    block.stride_h = stride;
+    block.has_shortcut = shortcut;
+
+    block.conv1 = make_conv2d(Ci, Ci, 3, 3, stride, 1, 1, 1, phase);
+    block.bn1 = make_bn(Ci, phase + 0.1);
+    block.conv2 = make_conv2d(Ci, Ci, 3, 3, 1, 1, 1, 1, phase + 0.2);
+    block.bn2 = make_bn(Ci, phase + 0.3);
+
+    if (!shortcut) {
+        return block;
+    }
+    block.sc = make_conv2d(Ci, Ci, 1, 1, stride, 1, 0, 0, phase + 0.4);
+    block.sc_bn = make_bn(Ci, phase + 0.5);
+    return block;
+}
+
+// Builds a miniature but structurally complete CAMPPlus weight set for the
+// full-chain gradcheck: FCM head -> TDNN -> three CAM blocks (2 / 2 / 1 layers),
+// each followed by a channel-halving transit -> output BN -> dense head.
+CpWeights make_tiny_weights() {
+    const int feat_dim = 8;  // three stride-2 stages along H shrink it by 8 -> H_after = 1
+    const int init_C = 8;
+    const int growth = 4;
+    const int bn_channels = 8;
+    const int k = 3;
+
+    CpWeights weights;
+    weights.feat_dim = feat_dim;
+    weights.seg_pool_len = 4;
+    weights.embedding_size = 4;
+
+    // FCM head: stem conv, two residual stages (strided block + plain block), strided exit conv.
+    auto & head = weights.head;
+    head.conv1 = make_conv2d(32, 1, 3, 3, 1, 1, 1, 1, 0.1);
+    head.bn1 = make_bn(32, 0.2);
+    head.layer1 = {make_resblock(32, 2, true, 1.0), make_resblock(32, 1, false, 2.0)};
+    head.layer2 = {make_resblock(32, 2, true, 3.0), make_resblock(32, 1, false, 4.0)};
+    head.conv2 = make_conv2d(32, 32, 3, 3, 2, 1, 1, 1, 5.0);
+    head.bn2 = make_bn(32, 5.5);
+
+    // TDNN input width = 32 channels * H_after (1).
+    const int fcm_out = 32;
+    weights.tdnn = make_conv1d(init_C, fcm_out, 5, 2, 2, 1, 6.0, false);
+    weights.tdnn_bn = make_bn(init_C, 6.5);
+
+    // Block 1: 2 layers, dilation 1.  8 -> 16 channels, transit halves to 8.
+    weights.block1 = make_block(2, 1, init_C, growth, bn_channels, k, 10.0);
+    const int block1_out = init_C + 2 * growth;
+    weights.transit1 = make_transit(block1_out, 20.0);
+
+    // Block 2: 2 layers, dilation 2.  8 -> 16 channels, transit halves to 8.
+    const int block2_in = block1_out / 2;
+    weights.block2 = make_block(2, 2, block2_in, growth, bn_channels, k, 30.0);
+    const int block2_out = block2_in + 2 * growth;
+    weights.transit2 = make_transit(block2_out, 40.0);
+
+    // Block 3: 1 layer, dilation 2.  8 -> 12 channels, transit halves to 6.
+    const int block3_in = block2_out / 2;
+    weights.block3 = make_block(1, 2, block3_in, growth, bn_channels, k, 50.0);
+    const int block3_out = block3_in + 1 * growth;
+    weights.transit3 = make_transit(block3_out, 60.0);
+
+    // The dense head takes 2 * final_ch inputs.
+    const int final_ch = block3_out / 2;
+    weights.out_bn = make_bn(final_ch, 70.0);
+    weights.dense = make_conv1d(weights.embedding_size, final_ch * 2, 1, 1, 0, 1, 80.0, false);
+    weights.dense_bn = make_bn(weights.embedding_size, 85.0);
+    return weights;
+}
+
+// End-to-end gradcheck: d(<coeffs, embedding>) / d(fbank) from CampplusBackward::backward
+// against central finite differences of CampplusBackward::forward on the tiny topology.
+void test_full_chain_backward() {
+    const int T = 12;
+    const CpWeights weights = make_tiny_weights();
+    const std::vector<double> fbank0 = make_vector(T * weights.feat_dim, 0.3);
+
+    // Analytic side: one forward at fbank0, then backward with `upstream` as d(loss)/d(embedding).
+    CampplusBackward model{weights};
+    const std::vector<double> embedding = model.forward(fbank0, T);
+    const std::vector<double> upstream = make_vector((int) embedding.size(), 1.0);
+    const std::vector<double> analytic = model.backward(upstream);
+
+    // Numeric side: every probe runs on its own model instance, leaving `model` untouched.
+    const ScalarLossFn loss = [&](const std::vector<double> & fbank) {
+        CampplusBackward probe{weights};
+        return dot(upstream, probe.forward(fbank, T));
+    };
+
+    const auto numeric = finite_diff_gradient(loss, fbank0);
+    report_check("campplus full-chain d_fbank", compare_gradients(numeric, analytic));
+}
+
+}  // namespace
+
+// Runs every gradcheck in sequence, prints a PASS/FAIL summary to stderr and returns
+// 0 only when no check failed.
+//
+// An exception escaping a test aborts the remaining tests and counts as one failure.
+// The catch-all handler matters: without it a thrown type not derived from
+// std::exception would call std::terminate and the summary line would never print.
+int main() {
+    try {
+        test_bn_backward();
+        test_relu_backward();
+        test_sigmoid_backward();
+        test_conv1d_backward();
+        test_conv2d_backward();
+        test_mean_T_backward();
+        test_seg_pool_backward();
+        test_stats_pool_backward();
+        test_resblock_backward(/*shortcut=*/false);
+        test_resblock_backward(/*shortcut=*/true);
+        test_cam_layer_backward();
+        test_full_chain_backward();
+    } catch (const std::exception & e) {
+        ++g_failures;
+        fprintf(stderr, "FAIL uncaught exception: %s\n", e.what());
+    } catch (...) {
+        ++g_failures;
+        fprintf(stderr, "FAIL uncaught non-standard exception\n");
+    }
+    fprintf(stderr, "\n%s: %d/%d checks passed\n", g_failures == 0 ? "PASS" : "FAIL",
+            g_checks - g_failures, g_checks);
+    return g_failures == 0 ? 0 : 1;
+}
diff --git a/tts-cpp/test/test_campplus_backward_parity.cpp b/tts-cpp/test/test_campplus_backward_parity.cpp
new file mode 100644
index 00000000000..07c90ef3654
--- /dev/null
+++ b/tts-cpp/test/test_campplus_backward_parity.cpp
@@ -0,0 +1,340 @@
+// Forward-parity check for the CAMPPlus backward module (QVAC-20984).
+//
+// The gradcheck self-test (test_campplus_backward.cpp) validates the analytic
+// backward against finite differences of the SAME double forward. That proves
+// the backward is the exact derivative of `CampplusBackward::forward`, but not
+// that this forward matches the model CAMPPlus actually runs. This test closes
+// that gap: it feeds identical synthetic weights and the same fbank to the
+// production scalar forward (`campplus_embed` with backend==nullptr, i.e.
+// `campplus_embed_cpu`) and to `CampplusBackward::forward`, and asserts the two
+// embeddings agree (8-d in the synthetic topology below, see `Topo::embedding`;
+// the production embedding is 192-d). Any drift in layout, dilation schedule,
+// seg-pool geometry, stats-pool variance convention or per-channel scaling would
+// surface here, so the gradcheck's relevance is anchored to the real forward.
+//
+// `campplus_embed_cpu` hardcodes growth=32 and bn_channels=128, so the synthetic
+// topology below uses those values.
+//
+// Trust chain: the scalar CPU forward is what every `campplus_embed` caller in
+// the repo actually uses (production `main.cpp`, `test-campplus`,
+// `test-voice-embedding` all pass backend==nullptr), and `test-campplus` /
+// `test-voice-embedding` validate it against the Python reference embedding. So
+// anchoring this parity to `campplus_embed_cpu` ties the analytic forward (and
+// therefore the gradchecked backward) to the real model: Python -> CPU forward
+// -> analytic forward -> backward. The `campplus_embed_ggml` graph path is not
+// exercised by any caller today; if it is wired up later it gets its own
+// fixture parity against the CPU/Python path.
+//
+// Built via CMake (links campplus.cpp -> ggml). Runs in the `unit` ctest tier.
+
+#include "campplus.h"
+#include "campplus_backward.h"
+
+#include <cmath>
+#include <cstdio>
+#include <vector>
+
+using namespace tts_cpp::cp_grad;
+
+namespace {
+
+int g_failures = 0;
+
+// Deterministic pseudo-data generator: a sine of (0.7 * i + phase), scaled into [-0.5, 0.5].
+double sample(int i, double phase) {
+    const double angle = i * 0.7 + phase;
+    return std::sin(angle) * 0.5;
+}
+
+// Builds an n-element float vector whose i-th entry is sample(i, phase) narrowed to float.
+std::vector<float> gen_f(int n, double phase) {
+    std::vector<float> out;
+    out.reserve((std::size_t) n);
+    for (int idx = 0; idx < n; ++idx) {
+        out.push_back((float) sample(idx, phase));
+    }
+    return out;
+}
+
+// Element-wise float -> double copy (exact: every float is representable as a double).
+std::vector<double> widen(const std::vector<float> & v) {
+    std::vector<double> out;
+    out.reserve(v.size());
+    for (const float value : v) {
+        out.push_back((double) value);
+    }
+    return out;
+}
+
+// --- synthetic float weight builders ----------------------------------------
+
+// Synthetic weights are shrunk by this factor so activations stay bounded through
+// the deep ReLU stack. Real CAMPPlus weights are BN-normalized; unscaled synthetic
+// ones would grow exponentially across the ~10 conv layers and overflow float.
+constexpr double kWeightScale = 0.1;
+
+// Synthetic bias-free 2-D convolution in the production `campplus_conv` layout:
+// per-axis kernel / stride / padding, dilation 1 on both axes, is_2d set.
+campplus_conv mk_conv2d(int Co, int Ci, int kH, int kW, int sH, int sW, int pH, int pW, double phase) {
+    campplus_conv conv;
+    conv.is_2d = true;
+    conv.C_out = Co;
+    conv.C_in = Ci;
+    conv.kH = kH;
+    conv.kW = kW;
+    conv.stride_h = sH;
+    conv.stride_w = sW;
+    conv.pad_h = pH;
+    conv.pad_w = pW;
+    conv.dilation_h = 1;
+    conv.dilation_w = 1;
+
+    conv.w = gen_f(Co * Ci * kH * kW, phase);
+    const float shrink = (float) kWeightScale;
+    for (std::size_t i = 0; i < conv.w.size(); ++i) {
+        conv.w[i] *= shrink;
+    }
+    return conv;
+}
+
+// Synthetic 1-D convolution in the production `campplus_conv` layout. The 1-D
+// geometry is stored in the *_w fields (stride_w / pad_w / dilation_w) with is_2d
+// cleared. Weights are scaled by kWeightScale; the optional bias is not.
+campplus_conv mk_conv1d(int Co, int Ci, int k, int stride, int pad, int dil, double phase, bool bias) {
+    campplus_conv conv;
+    conv.is_2d = false;
+    conv.C_out = Co;
+    conv.C_in = Ci;
+    conv.k = k;
+    conv.stride_w = stride;
+    conv.pad_w = pad;
+    conv.dilation_w = dil;
+
+    conv.w = gen_f(Co * Ci * k, phase);
+    const float shrink = (float) kWeightScale;
+    for (std::size_t i = 0; i < conv.w.size(); ++i) {
+        conv.w[i] *= shrink;
+    }
+    if (bias) {
+        conv.b = gen_f(Co, phase + 0.5);
+    }
+    return conv;
+}
+
+// Synthetic folded BatchNorm with positive-biased parameters: scale in [0.5, 0.65]
+// and shift in [0.45, 0.75]. The bias lets signal propagate through the ReLU stack
+// so the embedding is non-degenerate; a zero-mean BN would let the ReLUs collapse
+// everything to the final bias and make the parity comparison vacuous.
+campplus_bn mk_bn(int C, double phase) {
+    const std::size_t channels = (std::size_t) C;
+    campplus_bn bn;
+    bn.scale.resize(channels);
+    bn.shift.resize(channels);
+    for (int ch = 0; ch < C; ++ch) {
+        const double scale_value = 0.5 + 0.3 * std::fabs(sample(ch, phase));
+        const double shift_value = 0.6 + 0.3 * sample(ch, phase + 1.0);
+        bn.scale[(std::size_t) ch] = (float) scale_value;
+        bn.shift[(std::size_t) ch] = (float) shift_value;
+    }
+    return bn;
+}
+
+// Synthetic production-layout FCM residual block: two 3x3 convs (the first strided
+// along H) each followed by BN. When `shortcut` is set, a 1x1 strided conv + BN
+// shortcut is added; otherwise the shortcut members are left default-constructed.
+campplus_res_block mk_resblock(int Ci, int stride, bool shortcut, double phase) {
+    campplus_res_block block;
+    block.stride_h = stride;
+
+    block.conv1 = mk_conv2d(Ci, Ci, 3, 3, stride, 1, 1, 1, phase);
+    block.bn1 = mk_bn(Ci, phase + 0.1);
+    block.conv2 = mk_conv2d(Ci, Ci, 3, 3, 1, 1, 1, 1, phase + 0.2);
+    block.bn2 = mk_bn(Ci, phase + 0.3);
+
+    if (!shortcut) {
+        return block;
+    }
+    block.shortcut_conv = mk_conv2d(Ci, Ci, 1, 1, stride, 1, 0, 0, phase + 0.4);
+    block.shortcut_bn = mk_bn(Ci, phase + 0.5);
+    return block;
+}
+
+// Synthetic production-layout CAM dense block. Layer i is built for
+// C_in + i * growth input channels; the local conv uses padding
+// (kernel_size - 1) / 2 * dilation.
+campplus_cam_block mk_cam_block(int num_layers, int kernel_size, int dilation, int C_in, int growth,
+                                int bn_channels, double phase) {
+    campplus_cam_block block;
+    block.num_layers = num_layers;
+    block.kernel_size = kernel_size;
+    block.dilation = dilation;
+    block.layers.resize((std::size_t) num_layers);
+
+    const int local_pad = (kernel_size - 1) / 2 * dilation;
+    const int squeeze = bn_channels / 2;
+    for (int idx = 0; idx < num_layers; ++idx) {
+        const int layer_in = C_in + idx * growth;
+        const double base = phase + idx;
+        campplus_cam_dense_tdnn_layer & layer = block.layers[(std::size_t) idx];
+        layer.bn1 = mk_bn(layer_in, base + 0.1);
+        layer.linear1 = mk_conv1d(bn_channels, layer_in, 1, 1, 0, 1, base + 0.2, false);
+        layer.bn2 = mk_bn(bn_channels, base + 0.3);
+        layer.cam_linear_local =
+            mk_conv1d(growth, bn_channels, kernel_size, 1, local_pad, dilation, base + 0.4, false);
+        layer.cam_linear1 = mk_conv1d(squeeze, bn_channels, 1, 1, 0, 1, base + 0.5, true);
+        layer.cam_linear2 = mk_conv1d(growth, squeeze, 1, 1, 0, 1, base + 0.6, true);
+    }
+    return block;
+}
+
+// Synthetic production-layout transit layer: BN over C_in channels followed by a
+// bias-free 1x1 convolution that halves the channel count.
+campplus_transit mk_transit(int C_in, double phase) {
+    const int halved = C_in / 2;
+    campplus_transit transit;
+    transit.bn = mk_bn(C_in, phase);
+    transit.linear = mk_conv1d(halved, C_in, 1, 1, 0, 1, phase + 0.5, false);
+    return transit;
+}
+
+// Dimensions of the synthetic parity topology.
+struct Topo {
+    // fcm_forward inside campplus_embed_cpu hardcodes F=80 (H: 80->40->20->10), so
+    // the production CPU path is only self-consistent at feat_dim=80 and the parity
+    // check has to use it. fcm_out = 32 * (80/8) = 320.
+    int feat_dim{80};
+    int init_C{32};
+    int growth{32};        // hardcoded in campplus_embed_cpu
+    int bn_channels{128};  // hardcoded in campplus_embed_cpu
+    int kernel_size{3};
+    int embedding{8};
+    int seg_pool_len{5};
+};
+
+// Assembles the synthetic production-layout weight set described by `d`:
+// FCM head -> TDNN -> three CAM blocks (2 / 3 / 2 layers), each followed by a
+// channel-halving transit -> output BN -> dense head.
+campplus_weights build_weights(const Topo & d) {
+    campplus_weights out;
+    out.feat_dim = d.feat_dim;
+    out.embedding_size = d.embedding;
+    out.seg_pool_len = d.seg_pool_len;
+    out.sample_rate = 16000;
+
+    // FCM head.
+    auto & head = out.head;
+    head.conv1 = mk_conv2d(32, 1, 3, 3, 1, 1, 1, 1, 0.1);
+    head.bn1 = mk_bn(32, 0.2);
+    head.layer1 = {mk_resblock(32, 2, true, 1.0), mk_resblock(32, 1, false, 2.0)};
+    head.layer2 = {mk_resblock(32, 2, true, 3.0), mk_resblock(32, 1, false, 4.0)};
+    head.conv2 = mk_conv2d(32, 32, 3, 3, 2, 1, 1, 1, 5.0);
+    head.bn2 = mk_bn(32, 5.5);
+
+    // TDNN over the flattened FCM output (320 channels at feat_dim=80).
+    const int fcm_out = 32 * (d.feat_dim / 8);
+    out.tdnn_linear = mk_conv1d(d.init_C, fcm_out, 5, 2, 2, 1, 6.0, false);
+    out.tdnn_bn = mk_bn(d.init_C, 6.5);
+
+    // Multi-layer CAM blocks (2/3/2): the dense-concat accumulation (layer i enters
+    // with C_in + i*growth channels) is thereby anchored to production, not only to
+    // the self-referential full-chain gradcheck.
+    const int layers1 = 2;
+    const int layers2 = 3;
+    const int layers3 = 2;
+
+    out.block1 = mk_cam_block(layers1, d.kernel_size, 1, d.init_C, d.growth, d.bn_channels, 10.0);
+    const int block1_out = d.init_C + layers1 * d.growth;
+    out.transit1 = mk_transit(block1_out, 20.0);
+
+    const int block2_in = block1_out / 2;
+    out.block2 = mk_cam_block(layers2, d.kernel_size, 2, block2_in, d.growth, d.bn_channels, 30.0);
+    const int block2_out = block2_in + layers2 * d.growth;
+    out.transit2 = mk_transit(block2_out, 40.0);
+
+    const int block3_in = block2_out / 2;
+    out.block3 = mk_cam_block(layers3, d.kernel_size, 2, block3_in, d.growth, d.bn_channels, 50.0);
+    const int block3_out = block3_in + layers3 * d.growth;
+    out.transit3 = mk_transit(block3_out, 60.0);
+
+    // The dense head takes 2 * final_ch inputs.
+    const int final_ch = block3_out / 2;
+    out.out_nonlinear_bn = mk_bn(final_ch, 70.0);
+    out.dense_linear = mk_conv1d(d.embedding, final_ch * 2, 1, 1, 0, 1, 80.0, false);
+    out.dense_bn = mk_bn(d.embedding, 85.0);
+    return out;
+}
+
+// --- float -> double weight conversion (campplus_weights -> CpWeights) -------
+
+// Converts a production float conv into the double-precision CpConv. The 1-D fields
+// (stride / pad / dilation) are taken from the production *_w fields, and the 2-D
+// per-axis stride / padding are copied as-is.
+CpConv to_cp_conv(const campplus_conv & c) {
+    CpConv out;
+
+    // Parameters, widened to double.
+    out.w = widen(c.w);
+    out.b = widen(c.b);
+
+    // Channel counts and kernel extents.
+    out.C_out = c.C_out;
+    out.C_in = c.C_in;
+    out.k = c.k;
+    out.kH = c.kH;
+    out.kW = c.kW;
+
+    // 1-D geometry.
+    out.stride = c.stride_w;
+    out.pad = c.pad_w;
+    out.dilation = c.dilation_w;
+
+    // 2-D geometry.
+    out.stride_h = c.stride_h;
+    out.stride_w = c.stride_w;
+    out.pad_h = c.pad_h;
+    out.pad_w = c.pad_w;
+    return out;
+}
+
+// Widens the scale and shift vectors of a campplus_bn into a CpBn.
+CpBn to_cp_bn(const campplus_bn & b) {
+    CpBn bn;
+    bn.scale = widen(b.scale); bn.shift = widen(b.shift);
+    return bn;
+}
+
+// Converts a residual block; the shortcut conv/bn is converted only when shortcut_conv.w is non-empty.
+CpResBlock to_cp_resblock(const campplus_res_block & b) {
+    CpResBlock rb;
+    rb.stride_h = b.stride_h; rb.has_shortcut = !b.shortcut_conv.w.empty();
+    rb.conv1 = to_cp_conv(b.conv1); rb.conv2 = to_cp_conv(b.conv2);
+    rb.bn1 = to_cp_bn(b.bn1); rb.bn2 = to_cp_bn(b.bn2);
+    if (rb.has_shortcut) { rb.sc = to_cp_conv(b.shortcut_conv); rb.sc_bn = to_cp_bn(b.shortcut_bn); }
+    return rb;
+}
+
+// Converts one CAM block. num_layers, kernel_size and dilation are read from `b`; C_in, growth
+// and bn_channels are supplied by the caller and stored unchanged. Layer order is preserved.
+CpCamBlock to_cp_block(const campplus_cam_block & b, int C_in, int growth, int bn_channels) {
+    CpCamBlock blk;
+    blk.C_in = C_in;
+    blk.growth = growth;
+    blk.bn_channels = bn_channels;
+    blk.num_layers = b.num_layers;
+    blk.kernel_size = b.kernel_size;
+    blk.dilation = b.dilation;
+    blk.layers.resize(b.layers.size());
+    auto out = blk.layers.begin();  // walks the destination in lock-step with the source
+    for (const campplus_cam_dense_tdnn_layer & src : b.layers) {
+        CpCamLayer & layer = *out++;
+        layer.bn1 = to_cp_bn(src.bn1); layer.bn2 = to_cp_bn(src.bn2);
+        layer.linear1 = to_cp_conv(src.linear1);
+        layer.loc = to_cp_conv(src.cam_linear_local);
+        layer.cam1 = to_cp_conv(src.cam_linear1); layer.cam2 = to_cp_conv(src.cam_linear2);
+    }
+    return blk;
+}
+
+// Converts a transit layer: its linear conv via to_cp_conv and its bn via to_cp_bn.
+CpTransit to_cp_transit(const campplus_transit & t) {
+    CpTransit tr;
+    tr.linear = to_cp_conv(t.linear); tr.bn = to_cp_bn(t.bn);
+    return tr;
+}
+
+// Converts the float production weights into the double CpWeights used by the analytic forward.
+// Input widths of block2/block3 are recomputed from the topology: the previous block's input
+// plus num_layers * growth, halved with integer division.
+CpWeights to_cp_weights(const campplus_weights & w, const Topo & d) {
+    CpWeights cw;
+    cw.feat_dim = w.feat_dim; cw.embedding_size = w.embedding_size; cw.seg_pool_len = w.seg_pool_len;
+
+    cw.head.conv1 = to_cp_conv(w.head.conv1);
+    cw.head.bn1 = to_cp_bn(w.head.bn1);
+    for (const auto & rb : w.head.layer1) { cw.head.layer1.push_back(to_cp_resblock(rb)); }
+    for (const auto & rb : w.head.layer2) { cw.head.layer2.push_back(to_cp_resblock(rb)); }
+    cw.head.conv2 = to_cp_conv(w.head.conv2);
+    cw.head.bn2 = to_cp_bn(w.head.bn2);
+
+    cw.tdnn = to_cp_conv(w.tdnn_linear);
+    cw.tdnn_bn = to_cp_bn(w.tdnn_bn);
+
+    // block1 enters with init_C channels; the later blocks enter with half of what precedes them.
+    const int b2_in = (d.init_C + w.block1.num_layers * d.growth) / 2;
+    const int b3_in = (b2_in + w.block2.num_layers * d.growth) / 2;
+    cw.block1 = to_cp_block(w.block1, d.init_C, d.growth, d.bn_channels); cw.transit1 = to_cp_transit(w.transit1);
+    cw.block2 = to_cp_block(w.block2, b2_in, d.growth, d.bn_channels); cw.transit2 = to_cp_transit(w.transit2);
+    cw.block3 = to_cp_block(w.block3, b3_in, d.growth, d.bn_channels); cw.transit3 = to_cp_transit(w.transit3);
+
+    cw.out_bn = to_cp_bn(w.out_nonlinear_bn);
+    cw.dense = to_cp_conv(w.dense_linear);
+    cw.dense_bn = to_cp_bn(w.dense_bn);
+    return cw;
+}
+
+}  // namespace
+
+// Forward-parity driver: feeds one synthetic fbank through campplus_embed (float) and
+// CampplusBackward::forward (double) and compares the embeddings; exits 0 only if g_failures is 0.
+int main() {
+    const Topo d;
+    const int T = 16;
+
+    const campplus_weights w_f = build_weights(d);
+    const CpWeights w_d = to_cp_weights(w_f, d);
+    const std::vector<float> fbank_f = gen_f(T * d.feat_dim, 0.3);
+
+    std::vector<float> prod;
+    if (!campplus_embed(fbank_f, T, w_f, /*backend=*/nullptr, prod)) {
+        fprintf(stderr, "FAIL campplus_embed (cpu path) returned false\n");
+        return 1;
+    }
+
+    CampplusBackward backward(w_d);
+    const std::vector<double> ref = backward.forward(widen(fbank_f), T);
+    const bool sizes_agree = prod.size() == ref.size() && static_cast<int>(prod.size()) == d.embedding;
+    if (!sizes_agree) {
+        fprintf(stderr, "FAIL embedding size mismatch: prod=%zu ref=%zu expected=%d\n", prod.size(),
+                ref.size(), d.embedding);
+        return 1;
+    }
+
+    double max_abs = 0.0, max_rel = 0.0;
+    for (std::size_t i = 0; i < ref.size(); ++i) {
+        const double a = static_cast<double>(prod[i]);
+        const double b = ref[i];
+        if (std::isfinite(a) && std::isfinite(b)) {
+            const double abs_err = std::fabs(a - b);
+            const double rel_err = abs_err / (std::fabs(b) + 1e-6);
+            max_abs = abs_err > max_abs ? abs_err : max_abs;
+            max_rel = rel_err > max_rel ? rel_err : max_rel;
+        } else {
+            ++g_failures;
+            fprintf(stderr, "FAIL non-finite embedding at %zu: prod=%g ref=%g\n", i, a, b);
+        }
+    }
+
+    // Production runs in float and the reference in double, so the only expected gap is float
+    // rounding accumulated through the chain (the bn_channels=128 reductions dominate). The
+    // measured gap is ~3e-8; 1e-4 is generous for that yet far below what a layout, convention
+    // or wiring mismatch produces.
+    constexpr double kAbsTol = 1e-4;
+    if (max_abs > kAbsTol) {
+        ++g_failures;
+        fprintf(stderr, "FAIL forward parity exceeded tolerance: max_abs=%.3e max_rel=%.3e\n", max_abs,
+                max_rel);
+    }
+
+    const bool passed = g_failures == 0;
+    fprintf(stderr, "%s: forward parity max_abs=%.3e max_rel=%.3e (emb[0]=%.6f ref[0]=%.6f)\n",
+            passed ? "PASS" : "FAIL", max_abs, max_rel, static_cast<double>(prod[0]), ref[0]);
+    return passed ? 0 : 1;
+}