diff --git a/tts-cpp/CMakeLists.txt b/tts-cpp/CMakeLists.txt index d359335a066..7b7ece8bb8a 100644 --- a/tts-cpp/CMakeLists.txt +++ b/tts-cpp/CMakeLists.txt @@ -857,6 +857,35 @@ if (TTS_CPP_BUILD_TESTS) tts_cpp_apply_ccache(test-supertonic-vector-estimator-backward) tts_cpp_register_test(test-supertonic-vector-estimator-backward LABEL "unit") + # QVAC-20984 — analytic backward of the CAMPPlus speaker encoder (FCM Conv2d + # head + residual blocks, TDNN, CAM dense-TDNN blocks with context-attention + # gating and dense concat, statistics pooling, dense head). Model-free: every + # analytic input-gradient is gradchecked against finite differences via the + # Task 2 harness, so it ALWAYS runs on a fresh checkout (no-skip policy, no + # model/fixtures needed). + add_executable(test-campplus-backward + test/test_campplus_backward.cpp + src/campplus_backward.cpp + src/voiceclone_gradcheck.cpp) + target_include_directories(test-campplus-backward PRIVATE src) + tts_cpp_apply_ccache(test-campplus-backward) + tts_cpp_register_test(test-campplus-backward LABEL "unit") + + # Forward-parity: the analytic double forward must match the production scalar + # CAMPPlus forward (campplus_embed_cpu) on synthetic weights, anchoring the + # gradcheck to the real model. Links campplus.cpp -> ggml. + add_executable(test-campplus-backward-parity + test/test_campplus_backward_parity.cpp + src/campplus_backward.cpp + src/campplus.cpp) + target_link_libraries(test-campplus-backward-parity PRIVATE ggml) + target_include_directories(test-campplus-backward-parity PRIVATE ggml/include src) + if (OpenMP_CXX_FOUND) + target_link_libraries(test-campplus-backward-parity PRIVATE OpenMP::OpenMP_CXX) + endif() + tts_cpp_apply_ccache(test-campplus-backward-parity) + tts_cpp_register_test(test-campplus-backward-parity LABEL "unit") + # Engine-level streaming-callback contract test for the per-sentence # segmentation path (Fix #2): monotonic global chunk_index, single final # is_last, result.pcm == concat(callbacks), accumulated stats. Gated on diff --git a/tts-cpp/docs/voiceclone-backward-campplus.md b/tts-cpp/docs/voiceclone-backward-campplus.md new file mode 100644 index 00000000000..1c4c56570d6 --- /dev/null +++ b/tts-cpp/docs/voiceclone-backward-campplus.md @@ -0,0 +1,143 @@ +# Voice-clone backward — CAMPPlus speaker encoder (op × backend gap matrix) + +Scope for ticket *"GGML backward pass: CAMPPlus speaker encoder"* (QVAC-20984). +This doc scopes the work to make the CAMPPlus speaker encoder **differentiable in +GGML** on the CPU path used for enrollment, and records which backward ops are +still missing in the vendored `ggml`. + +It is committed alongside the interim deliverable of this PR: an analytic, +gradchecked C++ backward of the whole CAMPPlus chain +(`src/campplus_backward.{h,cpp}`). See +[Interim vs Phase-2](#interim-solution-shipped-in-this-pr) for how the two +relate. + +## Why the gap exists + +In the enrollment loop CAMPPlus provides the **speaker-similarity loss** between +the target-WAV embedding (constant, forward-only) and the generated-audio +embedding. Only the generated-audio path needs gradients, so the gradient we +need is `d(loss)/d(fbank)` — the input gradient with the model weights frozen. +The fbank is differentiated further back to the waveform by a separate stage; +this module stops at the CAMPPlus input. + +A fully GGML-native backward (the Phase-2 goal, needed by the on-device +enrollment loop) requires every op on the forward graph to have a backward in +`ggml_compute_backward` (`ggml/src/ggml.c`) **and** a CPU kernel for the ops the +backward expands into. Several are missing today. + +## Forward ops on the CAMPPlus path + +Source: `src/campplus_forward.inc` (the GGML graph) and `src/campplus.cpp` (the +scalar CPU reference `campplus_embed_cpu`). + +| Forward op | Where (forward) | +| --- | --- | +| `ggml_conv_2d` / `ggml_im2col` + `ggml_mul_mat` | FCM Conv2d head + residual blocks | +| `conv1d_f32` (`ggml_im2col` + `ggml_mul_mat`) | TDNN, linear1, linear_local, cam linear1/2, transits, dense | +| `ggml_mul` / `ggml_add` (broadcast) | pre-fused BN (scale/shift), bias adds, residuals | +| `ggml_relu` | every nonlinear1/2, transit, out_nonlinear, FCM | +| `ggml_sigmoid` | CAMLayer context gate | +| `ggml_mean` | CAMLayer global context, stats-pool mean + variance | +| `ggml_sum_rows` | CAMLayer seg-pool reduction | +| `ggml_pad` / `ggml_repeat` | CAMLayer seg-pool reshape + broadcast | +| `ggml_sqrt` | stats-pool std | +| `ggml_concat` | dense concat (CAMDenseTDNN), stats-pool mean‖std | +| `ggml_cont`/`reshape`/`view` | layout shuffles, FCM (32,10,T)→(320,T) flatten | + +## Gap matrix + +Legend: **OK** = implemented; **MISSING** = aborts / not implemented; **n/a** = +not on the enrollment path. + +"Graph backward" = a case in `ggml_compute_backward` (`ggml/src/ggml.c`). It is +backend-agnostic: if it aborts, no backend can differentiate the op. "CPU bwd +kernel" = the kernels the backward expands into exist for the CPU backend +(`ggml-cpu`), the only backend enrollment needs in Phase 2. GPU columns are out +of scope for Phase 2 (enrollment runs on CPU) and tracked only for visibility. + +| Op | Graph backward (ggml.c) | CPU bwd kernel | CUDA / Metal / Vulkan / OpenCL | +| --- | --- | --- | --- | +| `MUL_MAT` | OK | OK (`out_prod`/`mul_mat`) | out of scope | +| `ADD` / `MUL` | OK | OK | out of scope | +| `CONT`/`RESHAPE`/`VIEW`/`PERMUTE` | OK | OK | out of scope | +| `IM2COL` | OK (`im2col_back`) | OK | out of scope | +| `RELU` (unary) | OK | OK | out of scope | +| `SIGMOID` (unary) | **MISSING** | — | — | +| `MEAN` | **MISSING** | — | — | +| `SUM_ROWS` | **MISSING** | — | — | +| `SQRT` (unary) | **MISSING** | — | — | +| `PAD` | **MISSING** | — | — | +| `REPEAT` | **MISSING** | — | — | +| `CONCAT` | **MISSING** | — | — | + +Confirmed against the `ggml_compute_backward` switch: handled ops include `ADD`, +`MUL`, `SCALE`, `CPY`, `CONT`, `RESHAPE`, `PERMUTE`, `TRANSPOSE`, `GET_ROWS`, +`DIAG_MASK_INF`, `RMS_NORM`, `MUL_MAT`, `SOFT_MAX`, `IM2COL`, and a subset of +`UNARY` (`ABS`, `SGN`, `NEG`, `STEP`, `RELU`, `SILU`, `EXP`, `EXPM1`, +`SOFTPLUS`). `SIGMOID`, `SQRT`, `MEAN`, `SUM_ROWS`, `PAD`, `REPEAT`, and `CONCAT` +fall through to `GGML_ABORT`. + +## Remaining Phase-2 work items + +To reach a fully GGML-native, on-device backward of CAMPPlus: + +1. **`SIGMOID` backward** — add `s*(1-s)` to the `UNARY` switch + CPU kernel + (needed by the CAMLayer gate). +2. **`SQRT` backward** — add `1/(2*sqrt(x))` to the `UNARY` switch + CPU kernel + (stats-pool std). +3. **`MEAN` / `SUM_ROWS` backward** — broadcast the upstream grad back over the + reduced axis (`1/N` for mean) + CPU kernels. +4. **`PAD` / `REPEAT` backward** — slice off the padding / sum over the repeated + axis (`ggml_repeat_back` already exists; wire it into `ggml_compute_backward`). +5. **`CONCAT` backward** — slice-and-route the grad to each input (dense concat + and stats-pool concat). +6. **Per-stage gradcheck** — wire each lowered stage into the Task 2 harness; + the analytic backward from this PR is the reference oracle. + +Alternatively, the seg-pool / stats-pool subgraphs can be lowered to +`mul_mat`-based reductions (which already have backward), avoiding new kernels for +`MEAN`/`SUM_ROWS`/`REPEAT`. + +## Interim solution shipped in this PR + +Because the gaps above block a GGML-native backward today, this PR ships an +**analytic C++ backward** of the whole CAMPPlus chain, validated component-wise +against finite differences via the Task 2 gradcheck harness +(`src/voiceclone_gradcheck.{h,cpp}`): + +- `conv1d_backward_input` / `conv2d_backward_input` — transpose-conv input grad + (stride / pad / dilation aware) +- `bn_backward_input` — pre-fused affine BN (per-channel scale) +- `relu_backward` / `sigmoid_backward` — pointwise nonlinearities +- `mean_T_backward` / `seg_pool_backward` — CAMLayer context reductions +- `stats_pool_backward_input` — mean + unbiased std pooling +- `fcm_resblock_backward` — Conv2d residual block (with optional shortcut) +- `cam_layer_backward` — CAMDenseTDNN layer (gate + dense-concat split) +- `CampplusBackward::backward` — full chain → `d(loss)/d(fbank)` + +It mirrors the layout and conventions of `campplus_embed_cpu` exactly. Two tests +guard it (both in the always-on `unit` ctest tier, model-free): + +- `test-campplus-backward` — gradchecks every primitive and the full chain + against central finite differences. +- `test-campplus-backward-parity` — asserts the analytic double forward matches + the production scalar forward (`campplus_embed_cpu`) on synthetic weights + (multi-layer CAM blocks, 2/3/2, so the dense-concat accumulation is exercised), + anchoring the gradcheck's relevance to the real model. + +The scalar CPU forward is the path every `campplus_embed` caller uses today +(production `main.cpp`, `test-campplus`, `test-voice-embedding` all pass +`backend==nullptr`), and `test-campplus` / `test-voice-embedding` validate it +against the Python reference embedding. So the trust chain is complete: +Python → `campplus_embed_cpu` → analytic forward → gradchecked backward. The +`campplus_embed_ggml` graph path is not wired to any caller yet; when it is, it +gets its own fixture parity against the CPU/Python path. + +This is mathematically exact, runs on CPU (the enrollment target), and serves as +the **reference oracle** for the per-stage gradcheck once the GGML-native ops in +the work items above are implemented. + +> Note: `campplus_embed_cpu`'s `fcm_forward` hardcodes the input feature +> dimension to 80 (the production fbank width), so the production scalar path is +> only self-consistent at `feat_dim=80`; the parity test uses that. The analytic +> backward derives every dimension from `feat_dim`, so it is geometry-agnostic. diff --git a/tts-cpp/src/campplus_backward.cpp b/tts-cpp/src/campplus_backward.cpp new file mode 100644 index 00000000000..82096e1c102 --- /dev/null +++ b/tts-cpp/src/campplus_backward.cpp @@ -0,0 +1,702 @@ +#include "campplus_backward.h" + +#include +#include +#include +#include + +namespace tts_cpp { +namespace cp_grad { + +namespace { + +int conv_out_len(int L_in, int k, int stride, int pad, int dilation) { + return (L_in + 2 * pad - dilation * (k - 1) - 1) / stride + 1; +} + +// Per-channel sum over the time axis of a channel-major (C, T) buffer. +std::vector row_sum_T(const std::vector & x, int C, int T) { + std::vector s((std::size_t) C, 0.0); + for (int c = 0; c < C; ++c) { + const double * row = x.data() + (std::size_t) c * T; + double acc = 0.0; + for (int t = 0; t < T; ++t) acc += row[t]; + s[(std::size_t) c] = acc; + } + return s; +} + +void add_in_place(std::vector & a, const std::vector & b) { + for (std::size_t i = 0; i < a.size(); ++i) a[i] += b[i]; +} + +} // namespace + +CampplusBackward::CampplusBackward(CpWeights weights) : weights_(std::move(weights)) {} + +// --- elementwise / pooling primitives --------------------------------------- + +std::vector CampplusBackward::bn_forward(const std::vector & x, int C, int T, + const std::vector & scale, + const std::vector & shift) { + std::vector y(x.size()); + for (int c = 0; c < C; ++c) { + const double s = scale[(std::size_t) c]; + const double b = shift[(std::size_t) c]; + const std::size_t base = (std::size_t) c * T; + for (int t = 0; t < T; ++t) y[base + t] = x[base + t] * s + b; + } + return y; +} + +std::vector CampplusBackward::bn_backward_input(const std::vector & d_y, int C, int T, + const std::vector & scale) { + std::vector d_x(d_y.size()); + for (int c = 0; c < C; ++c) { + const double s = scale[(std::size_t) c]; + const std::size_t base = (std::size_t) c * T; + for (int t = 0; t < T; ++t) d_x[base + t] = d_y[base + t] * s; + } + return d_x; +} + +std::vector CampplusBackward::relu_forward(const std::vector & x) { + std::vector y(x.size()); + for (std::size_t i = 0; i < x.size(); ++i) y[i] = x[i] > 0.0 ? x[i] : 0.0; + return y; +} + +std::vector CampplusBackward::relu_backward(const std::vector & relu_in, + const std::vector & d_y) { + std::vector d_x(d_y.size()); + for (std::size_t i = 0; i < d_y.size(); ++i) d_x[i] = relu_in[i] > 0.0 ? d_y[i] : 0.0; + return d_x; +} + +std::vector CampplusBackward::sigmoid_backward(const std::vector & s, + const std::vector & d_y) { + std::vector d_x(d_y.size()); + for (std::size_t i = 0; i < d_y.size(); ++i) d_x[i] = d_y[i] * s[i] * (1.0 - s[i]); + return d_x; +} + +// --- conv1d ----------------------------------------------------------------- +// y[co, to] = b[co] + sum_{ci, kk} w[(co*C_in+ci)*k + kk] * x[ci, to*stride + kk*dilation - pad] +// (valid taps only; zero padding). Channel-major (C, T). + +std::vector CampplusBackward::conv1d_forward(const std::vector & x, int C_in, int T_in, + const std::vector & w, + const std::vector & b, int C_out, int k, + int stride, int pad, int dilation, int T_out) { + std::vector y((std::size_t) C_out * T_out); + const bool has_bias = !b.empty(); + for (int co = 0; co < C_out; ++co) { + const double bias = has_bias ? b[(std::size_t) co] : 0.0; + const std::size_t w_co = (std::size_t) co * C_in * k; + double * y_row = y.data() + (std::size_t) co * T_out; + for (int to = 0; to < T_out; ++to) { + double acc = bias; + const int base_t = to * stride - pad; + for (int ci = 0; ci < C_in; ++ci) { + const double * x_row = x.data() + (std::size_t) ci * T_in; + const double * w_row = w.data() + w_co + (std::size_t) ci * k; + for (int kk = 0; kk < k; ++kk) { + const int ti = base_t + kk * dilation; + if (ti >= 0 && ti < T_in) acc += w_row[kk] * x_row[ti]; + } + } + y_row[to] = acc; + } + } + return y; +} + +std::vector CampplusBackward::conv1d_backward_input(const std::vector & d_y, int C_in, + int T_in, const std::vector & w, + int C_out, int k, int stride, int pad, + int dilation, int T_out) { + std::vector d_x((std::size_t) C_in * T_in, 0.0); + for (int co = 0; co < C_out; ++co) { + const std::size_t w_co = (std::size_t) co * C_in * k; + const double * d_row = d_y.data() + (std::size_t) co * T_out; + for (int to = 0; to < T_out; ++to) { + const double g = d_row[to]; + if (g == 0.0) continue; + const int base_t = to * stride - pad; + for (int ci = 0; ci < C_in; ++ci) { + double * dx_row = d_x.data() + (std::size_t) ci * T_in; + const double * w_row = w.data() + w_co + (std::size_t) ci * k; + for (int kk = 0; kk < k; ++kk) { + const int ti = base_t + kk * dilation; + if (ti >= 0 && ti < T_in) dx_row[ti] += g * w_row[kk]; + } + } + } + } + return d_x; +} + +// --- conv2d ----------------------------------------------------------------- +// Channel-major (C, H, W); weight (C_out, C_in, kH, kW) row-major. + +std::vector CampplusBackward::conv2d_forward(const std::vector & x, int C_in, int H, int W, + const std::vector & w, + const std::vector & b, int C_out, int kH, + int kW, int sH, int sW, int pH, int pW, int H_out, + int W_out) { + std::vector y((std::size_t) C_out * H_out * W_out); + const bool has_bias = !b.empty(); + for (int co = 0; co < C_out; ++co) { + const double bias = has_bias ? b[(std::size_t) co] : 0.0; + const std::size_t w_co = (std::size_t) co * C_in * kH * kW; + for (int ho = 0; ho < H_out; ++ho) { + for (int wo = 0; wo < W_out; ++wo) { + double acc = bias; + const int base_h = ho * sH - pH; + const int base_w = wo * sW - pW; + for (int ci = 0; ci < C_in; ++ci) { + const double * x_c = x.data() + (std::size_t) ci * H * W; + const double * w_c = w.data() + w_co + (std::size_t) ci * kH * kW; + for (int kh = 0; kh < kH; ++kh) { + const int hi = base_h + kh; + if (hi < 0 || hi >= H) continue; + for (int kw = 0; kw < kW; ++kw) { + const int wi = base_w + kw; + if (wi < 0 || wi >= W) continue; + acc += w_c[(std::size_t) kh * kW + kw] * x_c[(std::size_t) hi * W + wi]; + } + } + } + y[(std::size_t) co * H_out * W_out + (std::size_t) ho * W_out + wo] = acc; + } + } + } + return y; +} + +std::vector CampplusBackward::conv2d_backward_input(const std::vector & d_y, int C_in, int H, + int W, const std::vector & w, int C_out, + int kH, int kW, int sH, int sW, int pH, int pW, + int H_out, int W_out) { + std::vector d_x((std::size_t) C_in * H * W, 0.0); + for (int co = 0; co < C_out; ++co) { + const std::size_t w_co = (std::size_t) co * C_in * kH * kW; + for (int ho = 0; ho < H_out; ++ho) { + for (int wo = 0; wo < W_out; ++wo) { + const double g = d_y[(std::size_t) co * H_out * W_out + (std::size_t) ho * W_out + wo]; + if (g == 0.0) continue; + const int base_h = ho * sH - pH; + const int base_w = wo * sW - pW; + for (int ci = 0; ci < C_in; ++ci) { + double * dx_c = d_x.data() + (std::size_t) ci * H * W; + const double * w_c = w.data() + w_co + (std::size_t) ci * kH * kW; + for (int kh = 0; kh < kH; ++kh) { + const int hi = base_h + kh; + if (hi < 0 || hi >= H) continue; + for (int kw = 0; kw < kW; ++kw) { + const int wi = base_w + kw; + if (wi < 0 || wi >= W) continue; + dx_c[(std::size_t) hi * W + wi] += g * w_c[(std::size_t) kh * kW + kw]; + } + } + } + } + } + } + return d_x; +} + +// --- mean over time --------------------------------------------------------- + +std::vector CampplusBackward::mean_T_forward(const std::vector & x, int C, int T) { + std::vector m((std::size_t) C); + for (int c = 0; c < C; ++c) { + const double * row = x.data() + (std::size_t) c * T; + double acc = 0.0; + for (int t = 0; t < T; ++t) acc += row[t]; + m[(std::size_t) c] = acc / (double) T; + } + return m; +} + +std::vector CampplusBackward::mean_T_backward(const std::vector & d_m, int C, int T) { + std::vector d_x((std::size_t) C * T); + const double inv = 1.0 / (double) T; + for (int c = 0; c < C; ++c) { + const double g = d_m[(std::size_t) c] * inv; + double * row = d_x.data() + (std::size_t) c * T; + for (int t = 0; t < T; ++t) row[t] = g; + } + return d_x; +} + +// --- segment pooling (ceil mode, true-count last bin), expanded to (C, T) ---- + +std::vector CampplusBackward::seg_pool_forward(const std::vector & x, int C, int T, + int seg_len) { + std::vector out((std::size_t) C * T); + const int S = (T + seg_len - 1) / seg_len; + for (int c = 0; c < C; ++c) { + const double * row = x.data() + (std::size_t) c * T; + double * dst = out.data() + (std::size_t) c * T; + for (int s = 0; s < S; ++s) { + const int t0 = s * seg_len; + const int t1 = (T < t0 + seg_len) ? T : t0 + seg_len; + const int n = t1 - t0; + double acc = 0.0; + for (int t = t0; t < t1; ++t) acc += row[t]; + const double avg = acc / (double) (n > 0 ? n : 1); + for (int t = t0; t < t1; ++t) dst[t] = avg; + } + } + return out; +} + +std::vector CampplusBackward::seg_pool_backward(const std::vector & d_out, int C, int T, + int seg_len) { + std::vector d_x((std::size_t) C * T); + const int S = (T + seg_len - 1) / seg_len; + for (int c = 0; c < C; ++c) { + const double * d_row = d_out.data() + (std::size_t) c * T; + double * dx_row = d_x.data() + (std::size_t) c * T; + for (int s = 0; s < S; ++s) { + const int t0 = s * seg_len; + const int t1 = (T < t0 + seg_len) ? T : t0 + seg_len; + const int n = t1 - t0; + double acc = 0.0; + for (int t = t0; t < t1; ++t) acc += d_row[t]; + const double g = acc / (double) (n > 0 ? n : 1); + for (int t = t0; t < t1; ++t) dx_row[t] = g; + } + } + return d_x; +} + +// --- statistics pooling (mean + unbiased std) ------------------------------- + +std::vector CampplusBackward::stats_pool_forward(const std::vector & x, int C, int T, + std::vector & mean_out, + std::vector & std_out) { + std::vector out((std::size_t) 2 * C); + mean_out.assign((std::size_t) C, 0.0); + std_out.assign((std::size_t) C, 0.0); + const double denom = (double) (T > 1 ? T - 1 : 1); + for (int c = 0; c < C; ++c) { + const double * row = x.data() + (std::size_t) c * T; + double sum = 0.0; + for (int t = 0; t < T; ++t) sum += row[t]; + const double mean = sum / (double) T; + double sq = 0.0; + for (int t = 0; t < T; ++t) { + const double d = row[t] - mean; + sq += d * d; + } + const double sd = std::sqrt(sq / denom); + mean_out[(std::size_t) c] = mean; + std_out[(std::size_t) c] = sd; + out[(std::size_t) c] = mean; + out[(std::size_t) C + c] = sd; + } + return out; +} + +std::vector CampplusBackward::stats_pool_backward_input(const std::vector & d_out, + const std::vector & x, int C, int T, + const std::vector & mean, + const std::vector & std_) { + std::vector d_x((std::size_t) C * T); + const double inv_T = 1.0 / (double) T; + const double denom = (double) (T > 1 ? T - 1 : 1); + for (int c = 0; c < C; ++c) { + const double d_mean = d_out[(std::size_t) c]; + const double d_std = d_out[(std::size_t) C + c]; + const double sd = std_[(std::size_t) c]; + const double m = mean[(std::size_t) c]; + // d_std/d_x[c,t] = d_std * (x - mean) / ((T-1) * std); the mean-coupling + // term vanishes because sum_t (x - mean) = 0. + const double std_coeff = sd > 0.0 ? d_std / (denom * sd) : 0.0; + const double * row = x.data() + (std::size_t) c * T; + double * dx_row = d_x.data() + (std::size_t) c * T; + for (int t = 0; t < T; ++t) { + dx_row[t] = d_mean * inv_T + std_coeff * (row[t] - m); + } + } + return d_x; +} + +// --- FCM residual block ------------------------------------------------------ + +std::vector CampplusBackward::fcm_resblock_forward(const CpResBlock & blk, + const std::vector & x, int C_in, int H, + int W, int & H_out, int & W_out, + CpResBlockActs & acts) const { + const int planes = blk.conv1.C_out; + const int sH = blk.stride_h; + H_out = conv_out_len(H, 3, sH, 1, 1); + W_out = conv_out_len(W, 3, 1, 1, 1); + acts.H_in = H; acts.W_in = W; acts.H_out = H_out; acts.W_out = W_out; + + std::vector t1 = conv2d_forward(x, C_in, H, W, blk.conv1.w, {}, planes, 3, 3, sH, 1, 1, 1, + H_out, W_out); + t1 = bn_forward(t1, planes, H_out * W_out, blk.bn1.scale, blk.bn1.shift); + acts.relu1_in = t1; + t1 = relu_forward(t1); + + std::vector t2 = conv2d_forward(t1, planes, H_out, W_out, blk.conv2.w, {}, planes, 3, 3, 1, 1, + 1, 1, H_out, W_out); + t2 = bn_forward(t2, planes, H_out * W_out, blk.bn2.scale, blk.bn2.shift); + + std::vector sc; + if (blk.has_shortcut) { + sc = conv2d_forward(x, C_in, H, W, blk.sc.w, {}, planes, 1, 1, sH, 1, 0, 0, H_out, W_out); + sc = bn_forward(sc, planes, H_out * W_out, blk.sc_bn.scale, blk.sc_bn.shift); + } + + std::vector y((std::size_t) planes * H_out * W_out); + if (sc.empty()) { + for (std::size_t i = 0; i < y.size(); ++i) y[i] = t2[i] + x[i]; + } else { + for (std::size_t i = 0; i < y.size(); ++i) y[i] = t2[i] + sc[i]; + } + acts.relu_out_in = y; + return relu_forward(y); +} + +std::vector CampplusBackward::fcm_resblock_backward(const CpResBlock & blk, + const CpResBlockActs & acts, + const std::vector & d_out, + int C_in) const { + const int planes = blk.conv1.C_out; + const int sH = blk.stride_h; + const int H = acts.H_in, W = acts.W_in, Ho = acts.H_out, Wo = acts.W_out; + + // y = relu(t2 + sc) + std::vector d_pre = relu_backward(acts.relu_out_in, d_out); // d(t2+sc) + + // t2 = bn2(conv2(t1)) + std::vector d_t2_bn = bn_backward_input(d_pre, planes, Ho * Wo, blk.bn2.scale); + std::vector d_t1 = conv2d_backward_input(d_t2_bn, planes, Ho, Wo, blk.conv2.w, planes, 3, 3, 1, + 1, 1, 1, Ho, Wo); + // t1 = relu(bn1(conv1(x))) + std::vector d_t1_relu = relu_backward(acts.relu1_in, d_t1); + std::vector d_t1_bn = bn_backward_input(d_t1_relu, planes, Ho * Wo, blk.bn1.scale); + std::vector d_x = conv2d_backward_input(d_t1_bn, C_in, H, W, blk.conv1.w, planes, 3, 3, sH, 1, + 1, 1, Ho, Wo); + + // shortcut path + if (blk.has_shortcut) { + std::vector d_sc_bn = bn_backward_input(d_pre, planes, Ho * Wo, blk.sc_bn.scale); + std::vector d_x_sc = conv2d_backward_input(d_sc_bn, C_in, H, W, blk.sc.w, planes, 1, 1, sH, + 1, 0, 0, Ho, Wo); + add_in_place(d_x, d_x_sc); + } else { + // identity shortcut: y += x (shape-preserving block) + add_in_place(d_x, d_pre); + } + return d_x; +} + +// --- FCM --------------------------------------------------------------------- + +std::vector CampplusBackward::fcm_forward(const std::vector & fbank_ct, int T, int & T_out, + CpFcmActs & acts) const { + const CpFcm & f = weights_.head; + const int F = weights_.feat_dim; + acts.T = T; + + // conv1: (1 -> 32, k3, s1, p1) + int H = conv_out_len(F, 3, 1, 1, 1); + int W = conv_out_len(T, 3, 1, 1, 1); + std::vector x = conv2d_forward(fbank_ct, 1, F, T, f.conv1.w, {}, 32, 3, 3, 1, 1, 1, 1, H, W); + x = bn_forward(x, 32, H * W, f.bn1.scale, f.bn1.shift); + acts.conv1_relu_in = x; + x = relu_forward(x); + + acts.layer1.assign(f.layer1.size(), CpResBlockActs{}); + for (std::size_t i = 0; i < f.layer1.size(); ++i) { + int Hn, Wn; + x = fcm_resblock_forward(f.layer1[i], x, 32, H, W, Hn, Wn, acts.layer1[i]); + H = Hn; W = Wn; + } + acts.layer2.assign(f.layer2.size(), CpResBlockActs{}); + for (std::size_t i = 0; i < f.layer2.size(); ++i) { + int Hn, Wn; + x = fcm_resblock_forward(f.layer2[i], x, 32, H, W, Hn, Wn, acts.layer2[i]); + H = Hn; W = Wn; + } + + // conv2: (32 -> 32, k3, s(sH=2, sW=1), p1) + const int H2 = conv_out_len(H, 3, 2, 1, 1); + const int W2 = conv_out_len(W, 3, 1, 1, 1); + std::vector y = conv2d_forward(x, 32, H, W, f.conv2.w, {}, 32, 3, 3, 2, 1, 1, 1, H2, W2); + y = bn_forward(y, 32, H2 * W2, f.bn2.scale, f.bn2.shift); + acts.conv2_relu_in = y; + y = relu_forward(y); + + acts.H_after = H2; + T_out = W2; // == T (sW=1 throughout) + // (32, H2, W2) reinterpreted as (32*H2, W2) channel-major — identical memory. + return y; +} + +std::vector CampplusBackward::fcm_backward(const std::vector & d_out, + const CpFcmActs & acts) const { + const CpFcm & f = weights_.head; + const int F = weights_.feat_dim; + const int T = acts.T; + const int H_after = acts.H_after; + + // conv2: input (32, H_l2, T), output (32, H_after, T) + const int H_l2 = acts.layer2.back().H_out; + std::vector d = relu_backward(acts.conv2_relu_in, d_out); // (32, H_after*T) + d = bn_backward_input(d, 32, H_after * T, f.bn2.scale); + d = conv2d_backward_input(d, 32, H_l2, T, f.conv2.w, 32, 3, 3, 2, 1, 1, 1, H_after, T); + + for (std::size_t i = f.layer2.size(); i-- > 0;) { + d = fcm_resblock_backward(f.layer2[i], acts.layer2[i], d, 32); + } + for (std::size_t i = f.layer1.size(); i-- > 0;) { + d = fcm_resblock_backward(f.layer1[i], acts.layer1[i], d, 32); + } + + // conv1: input (1, F, T), output (32, F, T) + d = relu_backward(acts.conv1_relu_in, d); + d = bn_backward_input(d, 32, F * T, f.bn1.scale); + return conv2d_backward_input(d, 1, F, T, f.conv1.w, 32, 3, 3, 1, 1, 1, 1, F, T); // (F, T) +} + +// --- CAMDenseTDNN layer ------------------------------------------------------ + +std::vector CampplusBackward::cam_layer_forward(const CpCamLayer & L, + const std::vector & x_in, int C_in, int T, + int growth, int kernel_size, int dilation, + int bn_channels, int seg_pool_len, + CpCamLayerActs & acts) const { + acts.C_in = C_in; + + // nonlinear1 = BN + ReLU on x_in + std::vector y = bn_forward(x_in, C_in, T, L.bn1.scale, L.bn1.shift); + acts.relu1_in = y; + y = relu_forward(y); + + // linear1: 1x1 conv (C_in -> bn_channels) + std::vector z = conv1d_forward(y, C_in, T, L.linear1.w, {}, bn_channels, 1, 1, 0, 1, T); + + // nonlinear2 = BN + ReLU + z = bn_forward(z, bn_channels, T, L.bn2.scale, L.bn2.shift); + acts.relu2_in = z; + z = relu_forward(z); // CAMLayer input + + // linear_local + const int pad = (kernel_size - 1) / 2 * dilation; + acts.y_local = conv1d_forward(z, bn_channels, T, L.loc.w, {}, growth, kernel_size, 1, pad, dilation, T); + + // context = mean_T(z) + seg_pool(z) + const std::vector mean_ctx = mean_T_forward(z, bn_channels, T); + std::vector context = seg_pool_forward(z, bn_channels, T, seg_pool_len); + for (int c = 0; c < bn_channels; ++c) { + const double m = mean_ctx[(std::size_t) c]; + double * row = context.data() + (std::size_t) c * T; + for (int t = 0; t < T; ++t) row[t] += m; + } + + // cam linear1: 1x1 (bn_channels -> bn_channels/2) + bias, ReLU + const int mid = L.cam1.C_out; + std::vector h1 = conv1d_forward(context, bn_channels, T, L.cam1.w, L.cam1.b, mid, 1, 1, 0, 1, T); + acts.h1_in = h1; + h1 = relu_forward(h1); + + // cam linear2: 1x1 (bn_channels/2 -> growth) + bias, sigmoid + std::vector gate = conv1d_forward(h1, mid, T, L.cam2.w, L.cam2.b, growth, 1, 1, 0, 1, T); + for (std::size_t i = 0; i < gate.size(); ++i) gate[i] = 1.0 / (1.0 + std::exp(-gate[i])); + acts.gate = gate; + + // cam_out = y_local * gate, then dense concat [x_in; cam_out] + std::vector out((std::size_t) (C_in + growth) * T); + for (std::size_t i = 0; i < (std::size_t) C_in * T; ++i) out[i] = x_in[i]; + double * cam_dst = out.data() + (std::size_t) C_in * T; + for (std::size_t i = 0; i < acts.y_local.size(); ++i) cam_dst[i] = acts.y_local[i] * gate[i]; + return out; +} + +std::vector CampplusBackward::cam_layer_backward(const CpCamLayer & L, const CpCamLayerActs & acts, + const std::vector & d_out, int C_in, int T, + int growth, int kernel_size, int dilation, + int bn_channels, int seg_pool_len) const { + const int mid = L.cam1.C_out; + const int pad = (kernel_size - 1) / 2 * dilation; + + // split dense concat + std::vector d_x((std::size_t) C_in * T); + for (std::size_t i = 0; i < d_x.size(); ++i) d_x[i] = d_out[i]; // direct identity path + const double * d_cam = d_out.data() + (std::size_t) C_in * T; // (growth, T) + + // cam_out = y_local * gate + std::vector d_y_local((std::size_t) growth * T); + std::vector d_gate((std::size_t) growth * T); + for (std::size_t i = 0; i < d_y_local.size(); ++i) { + d_y_local[i] = d_cam[i] * acts.gate[i]; + d_gate[i] = d_cam[i] * acts.y_local[i]; + } + + // gate = sigmoid(conv2(h1)+b) + std::vector d_g_pre = sigmoid_backward(acts.gate, d_gate); + std::vector d_h1 = conv1d_backward_input(d_g_pre, mid, T, L.cam2.w, growth, 1, 1, 0, 1, T); + // h1 = relu(cam1(context)+b) + std::vector d_h1_pre = relu_backward(acts.h1_in, d_h1); + std::vector d_context = conv1d_backward_input(d_h1_pre, bn_channels, T, L.cam1.w, mid, 1, 1, 0, + 1, T); + + // context = seg_pool(z) + mean_T(z) (broadcast) + std::vector d_mean = row_sum_T(d_context, bn_channels, T); + std::vector d_z = seg_pool_backward(d_context, bn_channels, T, seg_pool_len); + add_in_place(d_z, mean_T_backward(d_mean, bn_channels, T)); + + // y_local = conv_local(z) + add_in_place(d_z, conv1d_backward_input(d_y_local, bn_channels, T, L.loc.w, growth, kernel_size, 1, pad, + dilation, T)); + + // z = relu(bn2(linear1(relu(bn1(x_in))))) + std::vector d_z_relu = relu_backward(acts.relu2_in, d_z); + std::vector d_lin1 = bn_backward_input(d_z_relu, bn_channels, T, L.bn2.scale); + std::vector d_y = conv1d_backward_input(d_lin1, C_in, T, L.linear1.w, bn_channels, 1, 1, 0, 1, T); + std::vector d_y_relu = relu_backward(acts.relu1_in, d_y); + std::vector d_x_branch = bn_backward_input(d_y_relu, C_in, T, L.bn1.scale); + + add_in_place(d_x, d_x_branch); + return d_x; +} + +// --- full chain -------------------------------------------------------------- + +std::vector CampplusBackward::forward(const std::vector & fbank_t_by_c, int T) { + const CpWeights & w = weights_; + const int F = w.feat_dim; + acts_ = CpActs{}; + acts_.T = T; + + // transpose (T, F) -> (F, T) channel-major + std::vector fbank_ct((std::size_t) F * T); + for (int t = 0; t < T; ++t) + for (int c = 0; c < F; ++c) + fbank_ct[(std::size_t) c * T + t] = fbank_t_by_c[(std::size_t) t * F + c]; + + int T_after_fcm = 0; + std::vector x = fcm_forward(fbank_ct, T, T_after_fcm, acts_.fcm); + const int fcm_out_ch = 32 * acts_.fcm.H_after; + + // tdnn: Conv1d(fcm_out -> init_channels, k5, s2, p2) + BN + ReLU + const int init_C = w.tdnn.C_out; + const int T_cam = conv_out_len(T_after_fcm, 5, 2, 2, 1); + acts_.T_cam = T_cam; + x = conv1d_forward(x, fcm_out_ch, T_after_fcm, w.tdnn.w, {}, init_C, 5, 2, 2, 1, T_cam); + x = bn_forward(x, init_C, T_cam, w.tdnn_bn.scale, w.tdnn_bn.shift); + acts_.tdnn_relu_in = x; + x = relu_forward(x); + + int C_cur = init_C; + + auto run_block = [&](const CpCamBlock & blk, const CpTransit & tr, std::vector & bacts, + std::vector & tr_relu_in, int & tr_Cin) { + bacts.assign(blk.layers.size(), CpCamLayerActs{}); + for (std::size_t i = 0; i < blk.layers.size(); ++i) { + x = cam_layer_forward(blk.layers[i], x, C_cur, T_cam, blk.growth, blk.kernel_size, blk.dilation, + blk.bn_channels, w.seg_pool_len, bacts[i]); + C_cur += blk.growth; + } + // transit: BN + ReLU + 1x1 conv (halves channels) + tr_Cin = C_cur; + x = bn_forward(x, C_cur, T_cam, tr.bn.scale, tr.bn.shift); + tr_relu_in = x; + x = relu_forward(x); + const int C_out = tr.linear.C_out; + x = conv1d_forward(x, C_cur, T_cam, tr.linear.w, {}, C_out, 1, 1, 0, 1, T_cam); + C_cur = C_out; + }; + + run_block(w.block1, w.transit1, acts_.block1, acts_.tr1_relu_in, acts_.tr1_Cin); + run_block(w.block2, w.transit2, acts_.block2, acts_.tr2_relu_in, acts_.tr2_Cin); + run_block(w.block3, w.transit3, acts_.block3, acts_.tr3_relu_in, acts_.tr3_Cin); + + acts_.final_ch = C_cur; + + // out_nonlinear: BN + ReLU + x = bn_forward(x, C_cur, T_cam, w.out_bn.scale, w.out_bn.shift); + acts_.out_relu_in = x; + x = relu_forward(x); + acts_.stats_x = x; + + // stats pool -> (2*final) + std::vector stats = stats_pool_forward(x, C_cur, T_cam, acts_.stats_mean, acts_.stats_std); + + // dense: 1x1 conv (2*final -> E) + BN(affine-less) + const int E = w.embedding_size; + std::vector emb = conv1d_forward(stats, 2 * C_cur, 1, w.dense.w, {}, E, 1, 1, 0, 1, 1); + emb = bn_forward(emb, E, 1, w.dense_bn.scale, w.dense_bn.shift); + return emb; +} + +std::vector CampplusBackward::backward(const std::vector & d_emb) const { + if (acts_.stats_x.empty()) { + throw std::logic_error("CampplusBackward::backward called before forward (no cached activations)"); + } + const CpWeights & w = weights_; + const int T = acts_.T; + const int T_cam = acts_.T_cam; + const int final_ch = acts_.final_ch; + const int E = w.embedding_size; + + // dense: emb = bn(conv1d(stats)) + std::vector d = bn_backward_input(d_emb, E, 1, w.dense_bn.scale); + std::vector d_stats = conv1d_backward_input(d, 2 * final_ch, 1, w.dense.w, E, 1, 1, 0, 1, 1); + + // stats pool + std::vector d_x = stats_pool_backward_input(d_stats, acts_.stats_x, final_ch, T_cam, + acts_.stats_mean, acts_.stats_std); + + // out_nonlinear: relu(bn(prev)) + d_x = relu_backward(acts_.out_relu_in, d_x); + d_x = bn_backward_input(d_x, final_ch, T_cam, w.out_bn.scale); + + auto run_block_backward = [&](const CpCamBlock & blk, const CpTransit & tr, + const std::vector & bacts, + const std::vector & tr_relu_in, int tr_Cin) { + // transit: x = conv1d(relu(bn(prev))) + d_x = conv1d_backward_input(d_x, tr_Cin, T_cam, tr.linear.w, tr.linear.C_out, 1, 1, 0, 1, T_cam); + d_x = relu_backward(tr_relu_in, d_x); + d_x = bn_backward_input(d_x, tr_Cin, T_cam, tr.bn.scale); + // block layers in reverse + int C_in = tr_Cin; + for (std::size_t i = blk.layers.size(); i-- > 0;) { + C_in -= blk.growth; + d_x = cam_layer_backward(blk.layers[i], bacts[i], d_x, C_in, T_cam, blk.growth, blk.kernel_size, + blk.dilation, blk.bn_channels, w.seg_pool_len); + } + }; + + run_block_backward(w.block3, w.transit3, acts_.block3, acts_.tr3_relu_in, acts_.tr3_Cin); + run_block_backward(w.block2, w.transit2, acts_.block2, acts_.tr2_relu_in, acts_.tr2_Cin); + run_block_backward(w.block1, w.transit1, acts_.block1, acts_.tr1_relu_in, acts_.tr1_Cin); + + // tdnn: relu(bn(conv1d(fcm_out))) + const int init_C = w.tdnn.C_out; + const int fcm_out_ch = 32 * acts_.fcm.H_after; + d_x = relu_backward(acts_.tdnn_relu_in, d_x); + d_x = bn_backward_input(d_x, init_C, T_cam, w.tdnn_bn.scale); + std::vector d_fcm_out = conv1d_backward_input(d_x, fcm_out_ch, T, w.tdnn.w, init_C, 5, 2, 2, 1, + T_cam); + + // fcm -> d_fbank_ct (F, T) + std::vector d_fbank_ct = fcm_backward(d_fcm_out, acts_.fcm); + + // transpose (F, T) -> (T, F) + const int F = w.feat_dim; + std::vector d_fbank((std::size_t) T * F); + for (int c = 0; c < F; ++c) + for (int t = 0; t < T; ++t) + d_fbank[(std::size_t) t * F + c] = d_fbank_ct[(std::size_t) c * T + t]; + return d_fbank; +} + +} // namespace cp_grad +} // namespace tts_cpp diff --git a/tts-cpp/src/campplus_backward.h b/tts-cpp/src/campplus_backward.h new file mode 100644 index 00000000000..5eeba330270 --- /dev/null +++ b/tts-cpp/src/campplus_backward.h @@ -0,0 +1,262 @@ +#pragma once + +// Analytic backward pass for the CAMPPlus speaker encoder — voice-clone roadmap, +// ticket "GGML backward pass: CAMPPlus speaker encoder" (QVAC-20984). +// +// Scope: CAMPPlus maps an 80-channel Kaldi-fbank spectrogram to a 192-d speaker +// embedding. In the enrollment loop it provides the speaker-similarity loss +// between the (constant) target-WAV embedding and the generated-audio embedding. +// Only the generated-audio path needs gradients, so the gradient this class +// produces is `d(loss)/d(fbank)` — the input gradient with the model weights +// frozen. The fbank itself is differentiated further back to the waveform by a +// separate stage; this module stops at the CAMPPlus input. +// +// Why analytic (not ggml autodiff): the CAMPPlus forward leans on ops whose +// backward is not implemented in the vendored ggml (im2col-based conv1d/conv2d, +// `ggml_mean`, `ggml_sqrt`, `ggml_sigmoid`, `ggml_pad`, the seg-pool reshape / +// sum_rows / repeat chain, ...). The math here is the standard conv / batch-norm +// (pre-fused affine) / pooling / gating backward, computed in double for a +// well-conditioned reference and validated component-wise against central finite +// differences by the voiceclone gradcheck harness (Task 2 / QVAC-20979). +// +// Layout convention (mirrors `campplus_embed_cpu` in campplus.cpp): +// 1-D feature map: channel-major (C, T), access x[c * T + t]. +// 2-D feature map: channel-major (C, H, W), access x[c * H * W + h * W + w]. +// The public fbank in/out uses the (T, feat_dim) row-major layout of the public +// `campplus_embed` API; the transpose to/from channel-major happens internally. +// +// `CampplusBackward` owns the frozen weights and caches the per-call activations +// as state: `forward(fbank)` runs the chain and stores the activations needed by +// `backward(d_emb)`. The class has no dependency on the ggml graph or the GGUF +// loader; a thin adapter binds the real weights into `CpWeights` elsewhere. + +#include + +namespace tts_cpp { +namespace cp_grad { + +// --- Plain data holders (double mirror of campplus.h structs) --------------- + +// Conv weight in PyTorch row-major layout: Conv1d (C_out, C_in, k) flattened as +// ((co * C_in) + ci) * k + kk; Conv2d (C_out, C_in, kH, kW). `b` empty => no bias. +struct CpConv { + std::vector w; + std::vector b; + int C_out = 0, C_in = 0; + int k = 0; // Conv1d kernel + int kH = 0, kW = 0; // Conv2d kernel + int stride = 1, pad = 0, dilation = 1; // Conv1d + int stride_h = 1, stride_w = 1, pad_h = 0, pad_w = 0; // Conv2d +}; + +// Pre-fused affine batch norm: y[c] = x[c] * scale[c] + shift[c]. Frozen at +// inference, so the input-gradient is a per-channel scale. +struct CpBn { + std::vector scale; // [C] + std::vector shift; // [C] +}; + +// FCM BasicResBlock: conv1 + bn1 + relu + conv2 + bn2 (+ optional shortcut), +// residual add then relu. Conv2d, stride only on H. +struct CpResBlock { + CpConv conv1; CpBn bn1; + CpConv conv2; CpBn bn2; + CpConv sc; CpBn sc_bn; // shortcut; sc.w empty => identity + int stride_h = 1; + bool has_shortcut = false; +}; + +struct CpFcm { + CpConv conv1; CpBn bn1; + std::vector layer1; // 2 blocks, first stride 2 + std::vector layer2; // 2 blocks, first stride 2 + CpConv conv2; CpBn bn2; // stride (sH=2, sW=1) +}; + +// CAMDenseTDNNLayer: bn1+relu -> linear1(1x1) -> bn2+relu -> CAMLayer, then +// dense concat [x_in, cam_out]. +struct CpCamLayer { + CpBn bn1; + CpConv linear1; // 1x1 (C_in -> bn_channels) + CpBn bn2; + CpConv loc; // linear_local (bn_channels -> growth, k, dil), no bias + CpConv cam1; // 1x1 (bn_channels -> bn_channels/2), bias + CpConv cam2; // 1x1 (bn_channels/2 -> growth), bias +}; + +struct CpCamBlock { + int num_layers = 0; + int kernel_size = 3; + int dilation = 1; + int growth = 32; + int bn_channels = 128; + int C_in = 0; // channels entering layer 0 + std::vector layers; +}; + +struct CpTransit { + CpBn bn; + CpConv linear; // 1x1 +}; + +struct CpWeights { + int feat_dim = 80; + int embedding_size = 192; + int seg_pool_len = 100; + + CpFcm head; + + CpConv tdnn; CpBn tdnn_bn; // Conv1d (fcm_out -> init_channels, k=5, s=2, p=2) + + CpCamBlock block1; CpTransit transit1; + CpCamBlock block2; CpTransit transit2; + CpCamBlock block3; CpTransit transit3; + + CpBn out_bn; // out_nonlinear BN + CpConv dense; // 1x1 (final*2 -> embedding) + CpBn dense_bn; // affine-less BN (scale = 1/sqrt(var+eps)) +}; + +// --- Activation caches ------------------------------------------------------- + +// ReLU is recovered from the cached pre-activation (relu input). Conv / BN +// input-gradients need only the frozen weights, so the input tensors are not +// cached; only the values the nonlinearities and poolings need are kept. +struct CpResBlockActs { + std::vector relu1_in; // bn1(conv1(x)) pre-relu, (planes, Ho*Wo) + std::vector relu_out_in; // (conv2 path + shortcut) pre-final-relu + int H_in = 0, W_in = 0; // block input dims (for conv backward) + int H_out = 0, W_out = 0; // block output dims +}; + +struct CpCamLayerActs { + std::vector relu1_in; // bn1(x_in) pre-relu, (C_in, T) + std::vector relu2_in; // bn2(linear1(.)) pre-relu, (bn_channels, T) + std::vector y_local; // linear_local output, (growth, T) + std::vector h1_in; // cam1(context)+b pre-relu, (bn_channels/2, T) + std::vector gate; // sigmoid output, (growth, T) + int C_in = 0; // layer input channels +}; + +struct CpFcmActs { + std::vector conv1_relu_in; // (32, 80*T) pre-relu + std::vector layer1; + std::vector layer2; + std::vector conv2_relu_in; // (32, 10*T) pre-relu + int T = 0; // FCM width (== input T) + int H_after = 0; // 10 +}; + +struct CpActs { + int T = 0; // input frames + CpFcmActs fcm; + std::vector tdnn_relu_in; // (init_channels, T_cam) pre-relu + int T_cam = 0; + std::vector block1, block2, block3; + std::vector tr1_relu_in, tr2_relu_in, tr3_relu_in; // pre-relu of each transit BN + int tr1_Cin = 0, tr2_Cin = 0, tr3_Cin = 0; + std::vector out_relu_in; // out_nonlinear BN pre-relu, (final, T_cam) + std::vector stats_x; // out_nonlinear output (final, T_cam) post-relu + std::vector stats_mean; // (final) + std::vector stats_std; // (final) + int final_ch = 0; +}; + +// --- CAMPPlus backward ------------------------------------------------------- +// +// Stateful: construct with the frozen weights, call `forward(fbank, T)` (caches +// activations), then `backward(d_emb)` (consumes them). The stateless math +// primitives are private; the gradcheck self-tests reach them through a friend +// tester so each is validated individually against finite differences. +class CampplusBackward { +public: + explicit CampplusBackward(CpWeights weights); + + const CpWeights & weights() const { return weights_; } + + // Forward: `fbank_t_by_c` is row-major (T, feat_dim). Runs the chain, caches + // activations and returns the raw 192-d embedding. + std::vector forward(const std::vector & fbank_t_by_c, int T); + + // Backward: from d_emb (embedding_size) return d_fbank in the (T, feat_dim) + // row-major layout the forward consumes. Uses the most recent forward cache. + std::vector backward(const std::vector & d_emb) const; + +private: + friend struct CampplusBackwardTester; + + // --- elementwise / pooling primitives (channel-major (C, T)) ------------- + static std::vector bn_forward(const std::vector & x, int C, int T, + const std::vector & scale, + const std::vector & shift); + static std::vector bn_backward_input(const std::vector & d_y, int C, int T, + const std::vector & scale); + + static std::vector relu_forward(const std::vector & x); + // d_x = d_y * (relu_in > 0) + static std::vector relu_backward(const std::vector & relu_in, + const std::vector & d_y); + + // d_x = d_y * s * (1 - s), s = sigmoid output (cached) + static std::vector sigmoid_backward(const std::vector & s, + const std::vector & d_y); + + static std::vector conv1d_forward(const std::vector & x, int C_in, int T_in, + const std::vector & w, const std::vector & b, + int C_out, int k, int stride, int pad, int dilation, + int T_out); + static std::vector conv1d_backward_input(const std::vector & d_y, int C_in, int T_in, + const std::vector & w, int C_out, int k, + int stride, int pad, int dilation, int T_out); + + static std::vector conv2d_forward(const std::vector & x, int C_in, int H, int W, + const std::vector & w, const std::vector & b, + int C_out, int kH, int kW, int sH, int sW, int pH, int pW, + int H_out, int W_out); + static std::vector conv2d_backward_input(const std::vector & d_y, int C_in, int H, int W, + const std::vector & w, int C_out, int kH, int kW, + int sH, int sW, int pH, int pW, int H_out, int W_out); + + // mean over T (per channel): m[c] = mean_t x[c, t] + static std::vector mean_T_forward(const std::vector & x, int C, int T); + static std::vector mean_T_backward(const std::vector & d_m, int C, int T); + + // seg-pool then expand back to (C, T): each ceil-mode bin of seg_len holds + // the average of its members and is tiled across them. + static std::vector seg_pool_forward(const std::vector & x, int C, int T, int seg_len); + static std::vector seg_pool_backward(const std::vector & d_out, int C, int T, int seg_len); + + // stats pool: (C, T) -> (2C) = concat(mean, unbiased std). + static std::vector stats_pool_forward(const std::vector & x, int C, int T, + std::vector & mean_out, + std::vector & std_out); + static std::vector stats_pool_backward_input(const std::vector & d_out, + const std::vector & x, int C, int T, + const std::vector & mean, + const std::vector & std_); + + // --- module forward/backward ------------------------------------------- + std::vector fcm_resblock_forward(const CpResBlock & blk, const std::vector & x, + int C_in, int H, int W, int & H_out, int & W_out, + CpResBlockActs & acts) const; + std::vector fcm_resblock_backward(const CpResBlock & blk, const CpResBlockActs & acts, + const std::vector & d_out, int C_in) const; + + std::vector fcm_forward(const std::vector & fbank_ct, int T, int & T_out, + CpFcmActs & acts) const; + std::vector fcm_backward(const std::vector & d_out, const CpFcmActs & acts) const; + + std::vector cam_layer_forward(const CpCamLayer & L, const std::vector & x_in, int C_in, + int T, int growth, int kernel_size, int dilation, int bn_channels, + int seg_pool_len, CpCamLayerActs & acts) const; + std::vector cam_layer_backward(const CpCamLayer & L, const CpCamLayerActs & acts, + const std::vector & d_out, int C_in, int T, int growth, + int kernel_size, int dilation, int bn_channels, + int seg_pool_len) const; + + CpWeights weights_; + mutable CpActs acts_; +}; + +} // namespace cp_grad +} // namespace tts_cpp diff --git a/tts-cpp/test/test_campplus_backward.cpp b/tts-cpp/test/test_campplus_backward.cpp new file mode 100644 index 00000000000..c46d52e0720 --- /dev/null +++ b/tts-cpp/test/test_campplus_backward.cpp @@ -0,0 +1,499 @@ +// Gradcheck self-tests for the CAMPPlus speaker-encoder backward (voice-clone +// ticket "GGML backward pass: CAMPPlus speaker encoder", QVAC-20984). Pure host +// logic, model-free: every analytic input-gradient is checked component-wise +// against a central finite-difference numeric gradient of the matching forward, +// using the Task 2 gradcheck harness. Runs in the always-on `unit` ctest tier. +// +// Standalone build (single line): +// g++ -std=c++17 -I src test/test_campplus_backward.cpp src/campplus_backward.cpp src/voiceclone_gradcheck.cpp -o /tmp/t && /tmp/t + +#include "campplus_backward.h" +#include "voiceclone_gradcheck.h" + +#include +#include +#include + +using namespace tts_cpp::cp_grad; +using tts_cpp::voiceclone::ScalarLossFn; +using tts_cpp::voiceclone::compare_gradients; +using tts_cpp::voiceclone::finite_diff_gradient; +using tts_cpp::voiceclone::GradcheckReport; + +// Friend accessor: exposes CampplusBackward's private primitives to the tests. +namespace tts_cpp { +namespace cp_grad { +struct CampplusBackwardTester { + using CB = CampplusBackward; + + static std::vector bn_forward(const std::vector & x, int C, int T, + const std::vector & s, const std::vector & b) { + return CB::bn_forward(x, C, T, s, b); + } + static std::vector bn_backward_input(const std::vector & d, int C, int T, + const std::vector & s) { + return CB::bn_backward_input(d, C, T, s); + } + static std::vector relu_forward(const std::vector & x) { return CB::relu_forward(x); } + static std::vector relu_backward(const std::vector & in, const std::vector & d) { + return CB::relu_backward(in, d); + } + static std::vector sigmoid_backward(const std::vector & s, + const std::vector & d) { + return CB::sigmoid_backward(s, d); + } + static std::vector conv1d_forward(const std::vector & x, int Ci, int Ti, + const std::vector & w, const std::vector & b, + int Co, int k, int s, int p, int dl, int To) { + return CB::conv1d_forward(x, Ci, Ti, w, b, Co, k, s, p, dl, To); + } + static std::vector conv1d_backward_input(const std::vector & d, int Ci, int Ti, + const std::vector & w, int Co, int k, int s, + int p, int dl, int To) { + return CB::conv1d_backward_input(d, Ci, Ti, w, Co, k, s, p, dl, To); + } + static std::vector conv2d_forward(const std::vector & x, int Ci, int H, int W, + const std::vector & w, const std::vector & b, + int Co, int kH, int kW, int sH, int sW, int pH, int pW, + int Ho, int Wo) { + return CB::conv2d_forward(x, Ci, H, W, w, b, Co, kH, kW, sH, sW, pH, pW, Ho, Wo); + } + static std::vector conv2d_backward_input(const std::vector & d, int Ci, int H, int W, + const std::vector & w, int Co, int kH, int kW, + int sH, int sW, int pH, int pW, int Ho, int Wo) { + return CB::conv2d_backward_input(d, Ci, H, W, w, Co, kH, kW, sH, sW, pH, pW, Ho, Wo); + } + static std::vector mean_T_forward(const std::vector & x, int C, int T) { + return CB::mean_T_forward(x, C, T); + } + static std::vector mean_T_backward(const std::vector & d, int C, int T) { + return CB::mean_T_backward(d, C, T); + } + static std::vector seg_pool_forward(const std::vector & x, int C, int T, int sl) { + return CB::seg_pool_forward(x, C, T, sl); + } + static std::vector seg_pool_backward(const std::vector & d, int C, int T, int sl) { + return CB::seg_pool_backward(d, C, T, sl); + } + static std::vector stats_pool_forward(const std::vector & x, int C, int T, + std::vector & m, std::vector & sd) { + return CB::stats_pool_forward(x, C, T, m, sd); + } + static std::vector stats_pool_backward_input(const std::vector & d, + const std::vector & x, int C, int T, + const std::vector & m, + const std::vector & sd) { + return CB::stats_pool_backward_input(d, x, C, T, m, sd); + } + static std::vector resblock_forward(const CB & cb, const CpResBlock & blk, + const std::vector & x, int Ci, int H, int W, + int & Ho, int & Wo, CpResBlockActs & a) { + return cb.fcm_resblock_forward(blk, x, Ci, H, W, Ho, Wo, a); + } + static std::vector resblock_backward(const CB & cb, const CpResBlock & blk, + const CpResBlockActs & a, const std::vector & d, + int Ci) { + return cb.fcm_resblock_backward(blk, a, d, Ci); + } + static std::vector cam_layer_forward(const CB & cb, const CpCamLayer & L, + const std::vector & x, int Ci, int T, int g, int k, + int dl, int bn, int sl, CpCamLayerActs & a) { + return cb.cam_layer_forward(L, x, Ci, T, g, k, dl, bn, sl, a); + } + static std::vector cam_layer_backward(const CB & cb, const CpCamLayer & L, + const CpCamLayerActs & a, const std::vector & d, + int Ci, int T, int g, int k, int dl, int bn, int sl) { + return cb.cam_layer_backward(L, a, d, Ci, T, g, k, dl, bn, sl); + } +}; +} // namespace cp_grad +} // namespace tts_cpp + +namespace { + +using Tester = tts_cpp::cp_grad::CampplusBackwardTester; + +int g_failures = 0; +int g_checks = 0; + +#define CHECK(cond, ...) do { \ + ++g_checks; \ + if (!(cond)) { \ + ++g_failures; \ + fprintf(stderr, "FAIL %s:%d ", __FILE__, __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } \ +} while (0) + +double sample(int i, double phase) { return std::sin(i * 0.9 + phase) * 0.8; } + +std::vector make_vector(int n, double phase) { + std::vector v((std::size_t) n); + for (int i = 0; i < n; ++i) v[i] = sample(i, phase); + return v; +} + +// ReLU input kept away from the kink: |v| >= 0.3 so a +-eps perturbation never +// flips its sign and the central difference matches the analytic mask. +std::vector make_relu_input(int n, double phase) { + std::vector v((std::size_t) n); + for (int i = 0; i < n; ++i) { + const double s = sample(i, phase); + v[i] = std::copysign(0.3 + std::fabs(s), s == 0.0 ? 1.0 : s); + } + return v; +} + +double dot(const std::vector & a, const std::vector & b) { + double acc = 0.0; + for (std::size_t i = 0; i < a.size(); ++i) acc += a[i] * b[i]; + return acc; +} + +double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); } + +void report_check(const char * name, const GradcheckReport & r) { + CHECK(r.passed, "%s: gradcheck failed (max_abs=%.3e max_rel=%.3e worst=%zu)", name, r.max_abs_err, + r.max_rel_err, r.worst_index); +} + +// --- primitive gradchecks --------------------------------------------------- + +void test_bn_backward() { + const int C = 4, T = 5; + const std::vector scale = make_vector(C, 0.2); + const std::vector shift = make_vector(C, 1.0); + const std::vector coeffs = make_vector(C * T, 2.0); + const std::vector x0 = make_vector(C * T, 0.7); + const ScalarLossFn f = [&](const std::vector & x) { + return dot(coeffs, Tester::bn_forward(x, C, T, scale, shift)); + }; + report_check("bn_backward_input", compare_gradients(finite_diff_gradient(f, x0), + Tester::bn_backward_input(coeffs, C, T, scale))); +} + +void test_relu_backward() { + const int n = 20; + const std::vector coeffs = make_vector(n, 1.3); + const std::vector x0 = make_relu_input(n, 0.25); + const ScalarLossFn f = [&](const std::vector & x) { + return dot(coeffs, Tester::relu_forward(x)); + }; + report_check("relu_backward", compare_gradients(finite_diff_gradient(f, x0), + Tester::relu_backward(x0, coeffs))); +} + +void test_sigmoid_backward() { + const int n = 16; + const std::vector coeffs = make_vector(n, 1.1); + const std::vector x0 = make_vector(n, 0.4); + const ScalarLossFn f = [&](const std::vector & x) { + std::vector y(x.size()); + for (std::size_t i = 0; i < x.size(); ++i) y[i] = sigmoid(x[i]); + return dot(coeffs, y); + }; + std::vector s0(x0.size()); + for (std::size_t i = 0; i < x0.size(); ++i) s0[i] = sigmoid(x0[i]); + report_check("sigmoid_backward", compare_gradients(finite_diff_gradient(f, x0), + Tester::sigmoid_backward(s0, coeffs))); +} + +void test_conv1d_backward() { + const int Ci = 3, Ti = 7, Co = 4, k = 3, stride = 2, pad = 1, dilation = 2; + const int To = (Ti + 2 * pad - dilation * (k - 1) - 1) / stride + 1; + const std::vector w = make_vector(Co * Ci * k, 0.3); + const std::vector b = make_vector(Co, 1.1); + const std::vector coeffs = make_vector(Co * To, 2.0); + const std::vector x0 = make_vector(Ci * Ti, 0.7); + const ScalarLossFn f = [&](const std::vector & x) { + return dot(coeffs, Tester::conv1d_forward(x, Ci, Ti, w, b, Co, k, stride, pad, dilation, To)); + }; + report_check("conv1d_backward_input", + compare_gradients(finite_diff_gradient(f, x0), + Tester::conv1d_backward_input(coeffs, Ci, Ti, w, Co, k, stride, pad, + dilation, To))); +} + +void test_conv2d_backward() { + const int Ci = 2, H = 5, W = 4, Co = 3, kH = 3, kW = 3, sH = 2, sW = 1, pH = 1, pW = 1; + const int Ho = (H + 2 * pH - (kH - 1) - 1) / sH + 1; + const int Wo = (W + 2 * pW - (kW - 1) - 1) / sW + 1; + const std::vector w = make_vector(Co * Ci * kH * kW, 0.3); + const std::vector b = make_vector(Co, 0.9); + const std::vector coeffs = make_vector(Co * Ho * Wo, 1.4); + const std::vector x0 = make_vector(Ci * H * W, 0.5); + const ScalarLossFn f = [&](const std::vector & x) { + return dot(coeffs, Tester::conv2d_forward(x, Ci, H, W, w, b, Co, kH, kW, sH, sW, pH, pW, Ho, Wo)); + }; + report_check("conv2d_backward_input", + compare_gradients(finite_diff_gradient(f, x0), + Tester::conv2d_backward_input(coeffs, Ci, H, W, w, Co, kH, kW, sH, sW, + pH, pW, Ho, Wo))); +} + +void test_mean_T_backward() { + const int C = 4, T = 6; + const std::vector coeffs = make_vector(C, 1.7); + const std::vector x0 = make_vector(C * T, 0.3); + const ScalarLossFn f = [&](const std::vector & x) { + return dot(coeffs, Tester::mean_T_forward(x, C, T)); + }; + report_check("mean_T_backward", compare_gradients(finite_diff_gradient(f, x0), + Tester::mean_T_backward(coeffs, C, T))); +} + +void test_seg_pool_backward() { + const int C = 3, T = 7, seg = 3; // S = 3 bins: 3, 3, 1 (partial last) + const std::vector coeffs = make_vector(C * T, 1.2); + const std::vector x0 = make_vector(C * T, 0.6); + const ScalarLossFn f = [&](const std::vector & x) { + return dot(coeffs, Tester::seg_pool_forward(x, C, T, seg)); + }; + report_check("seg_pool_backward", compare_gradients(finite_diff_gradient(f, x0), + Tester::seg_pool_backward(coeffs, C, T, seg))); +} + +void test_stats_pool_backward() { + const int C = 4, T = 6; + const std::vector coeffs = make_vector(2 * C, 1.5); + const std::vector x0 = make_vector(C * T, 0.4); + std::vector mean, std_; + Tester::stats_pool_forward(x0, C, T, mean, std_); + const ScalarLossFn f = [&](const std::vector & x) { + std::vector m, s; + return dot(coeffs, Tester::stats_pool_forward(x, C, T, m, s)); + }; + report_check("stats_pool_backward_input", + compare_gradients(finite_diff_gradient(f, x0), + Tester::stats_pool_backward_input(coeffs, x0, C, T, mean, std_))); +} + +// --- module gradchecks ------------------------------------------------------ + +// BN with a firmly positive shift so the downstream ReLU stays in its active +// (locally linear) region at the evaluation point: a +-eps finite-difference +// step never crosses the kink, so the central difference matches the analytic +// mask. The ReLU mask=0 branch is covered by the dedicated relu unit test. +CpBn make_bn(int C, double phase) { + CpBn bn; + bn.scale.resize((std::size_t) C); + bn.shift.resize((std::size_t) C); + for (int c = 0; c < C; ++c) { + bn.scale[(std::size_t) c] = 0.5 + 0.3 * std::fabs(sample(c, phase)); // positive scale + bn.shift[(std::size_t) c] = 1.2 + 0.2 * sample(c, phase + 1.0); // firmly positive bias + } + return bn; +} + +CpConv make_conv1d(int Co, int Ci, int k, int stride, int pad, int dil, double phase, bool bias) { + CpConv c; + c.C_out = Co; c.C_in = Ci; c.k = k; c.stride = stride; c.pad = pad; c.dilation = dil; + c.w = make_vector(Co * Ci * k, phase); + for (double & v : c.w) v *= 0.1; // small weights keep BN shift dominant -> ReLU active + if (bias) c.b = make_vector(Co, phase + 0.5); + return c; +} + +CpConv make_conv2d(int Co, int Ci, int kH, int kW, int sH, int sW, int pH, int pW, double phase) { + CpConv c; + c.C_out = Co; c.C_in = Ci; c.kH = kH; c.kW = kW; + c.stride_h = sH; c.stride_w = sW; c.pad_h = pH; c.pad_w = pW; + c.w = make_vector(Co * Ci * kH * kW, phase); + for (double & v : c.w) v *= 0.1; + return c; +} + +void test_resblock_backward(bool shortcut) { + const int Ci = 4, H = 6, W = 5; + const int stride = shortcut ? 2 : 1; + CpResBlock blk; + blk.stride_h = stride; + blk.has_shortcut = shortcut; + blk.conv1 = make_conv2d(Ci, Ci, 3, 3, stride, 1, 1, 1, 0.2); + blk.bn1 = make_bn(Ci, 0.3); + blk.conv2 = make_conv2d(Ci, Ci, 3, 3, 1, 1, 1, 1, 0.4); + blk.bn2 = make_bn(Ci, 0.5); + if (shortcut) { + blk.sc = make_conv2d(Ci, Ci, 1, 1, stride, 1, 0, 0, 0.6); + blk.sc_bn = make_bn(Ci, 0.7); + } + const CampplusBackward cb{CpWeights{}}; + const std::vector x0 = make_vector(Ci * H * W, 0.35); + + int Ho = 0, Wo = 0; + CpResBlockActs acts; + Tester::resblock_forward(cb, blk, x0, Ci, H, W, Ho, Wo, acts); + const std::vector coeffs = make_vector(Ci * Ho * Wo, 1.1); + const std::vector analytic = Tester::resblock_backward(cb, blk, acts, coeffs, Ci); + + const ScalarLossFn f = [&](const std::vector & x) { + int h, w; + CpResBlockActs a; + return dot(coeffs, Tester::resblock_forward(cb, blk, x, Ci, H, W, h, w, a)); + }; + report_check(shortcut ? "fcm_resblock(shortcut) d_x" : "fcm_resblock(identity) d_x", + compare_gradients(finite_diff_gradient(f, x0), analytic)); +} + +void test_cam_layer_backward() { + const int Ci = 6, T = 9, growth = 4, k = 3, dil = 2, bn = 8, seg = 4; + CpCamLayer L; + L.bn1 = make_bn(Ci, 0.2); + L.linear1 = make_conv1d(bn, Ci, 1, 1, 0, 1, 0.3, false); + L.bn2 = make_bn(bn, 0.4); + L.loc = make_conv1d(growth, bn, k, 1, (k - 1) / 2 * dil, dil, 0.5, false); + L.cam1 = make_conv1d(bn / 2, bn, 1, 1, 0, 1, 0.6, true); + L.cam2 = make_conv1d(growth, bn / 2, 1, 1, 0, 1, 0.7, true); + + const CampplusBackward cb{CpWeights{}}; + const std::vector x0 = make_vector(Ci * T, 0.3); + + CpCamLayerActs acts; + Tester::cam_layer_forward(cb, L, x0, Ci, T, growth, k, dil, bn, seg, acts); + const std::vector coeffs = make_vector((Ci + growth) * T, 1.0); + const std::vector analytic = + Tester::cam_layer_backward(cb, L, acts, coeffs, Ci, T, growth, k, dil, bn, seg); + + const ScalarLossFn f = [&](const std::vector & x) { + CpCamLayerActs a; + return dot(coeffs, Tester::cam_layer_forward(cb, L, x, Ci, T, growth, k, dil, bn, seg, a)); + }; + report_check("cam_dense_tdnn_layer d_x", compare_gradients(finite_diff_gradient(f, x0), analytic)); +} + +// --- full chain ------------------------------------------------------------- + +CpCamBlock make_block(int num_layers, int dilation, int C_in, int growth, int bn_channels, int k, + double phase) { + CpCamBlock blk; + blk.num_layers = num_layers; + blk.kernel_size = k; + blk.dilation = dilation; + blk.growth = growth; + blk.bn_channels = bn_channels; + blk.C_in = C_in; + blk.layers.resize((std::size_t) num_layers); + for (int i = 0; i < num_layers; ++i) { + const int lc = C_in + i * growth; + CpCamLayer & L = blk.layers[(std::size_t) i]; + const double p = phase + i; + L.bn1 = make_bn(lc, p + 0.1); + L.linear1 = make_conv1d(bn_channels, lc, 1, 1, 0, 1, p + 0.2, false); + L.bn2 = make_bn(bn_channels, p + 0.3); + L.loc = make_conv1d(growth, bn_channels, k, 1, (k - 1) / 2 * dilation, dilation, p + 0.4, false); + L.cam1 = make_conv1d(bn_channels / 2, bn_channels, 1, 1, 0, 1, p + 0.5, true); + L.cam2 = make_conv1d(growth, bn_channels / 2, 1, 1, 0, 1, p + 0.6, true); + } + return blk; +} + +CpTransit make_transit(int C_in, double phase) { + CpTransit t; + t.bn = make_bn(C_in, phase); + t.linear = make_conv1d(C_in / 2, C_in, 1, 1, 0, 1, phase + 0.5, false); + return t; +} + +CpResBlock make_resblock(int Ci, int stride, bool shortcut, double phase) { + CpResBlock blk; + blk.stride_h = stride; + blk.has_shortcut = shortcut; + blk.conv1 = make_conv2d(Ci, Ci, 3, 3, stride, 1, 1, 1, phase); + blk.bn1 = make_bn(Ci, phase + 0.1); + blk.conv2 = make_conv2d(Ci, Ci, 3, 3, 1, 1, 1, 1, phase + 0.2); + blk.bn2 = make_bn(Ci, phase + 0.3); + if (shortcut) { + blk.sc = make_conv2d(Ci, Ci, 1, 1, stride, 1, 0, 0, phase + 0.4); + blk.sc_bn = make_bn(Ci, phase + 0.5); + } + return blk; +} + +CpWeights make_tiny_weights() { + const int feat_dim = 8; // FCM downsamples H by 8 -> H_after = 1 + const int growth = 4; + const int bn_channels = 8; + const int init_C = 8; + const int k = 3; + + CpWeights w; + w.feat_dim = feat_dim; + w.seg_pool_len = 4; + w.embedding_size = 4; + + // FCM + w.head.conv1 = make_conv2d(32, 1, 3, 3, 1, 1, 1, 1, 0.1); + w.head.bn1 = make_bn(32, 0.2); + w.head.layer1 = {make_resblock(32, 2, true, 1.0), make_resblock(32, 1, false, 2.0)}; + w.head.layer2 = {make_resblock(32, 2, true, 3.0), make_resblock(32, 1, false, 4.0)}; + w.head.conv2 = make_conv2d(32, 32, 3, 3, 2, 1, 1, 1, 5.0); + w.head.bn2 = make_bn(32, 5.5); + + const int fcm_out = 32; // 32 * H_after(1) + w.tdnn = make_conv1d(init_C, fcm_out, 5, 2, 2, 1, 6.0, false); + w.tdnn_bn = make_bn(init_C, 6.5); + + w.block1 = make_block(2, 1, init_C, growth, bn_channels, k, 10.0); + const int after_b1 = init_C + 2 * growth; + w.transit1 = make_transit(after_b1, 20.0); + + const int b2_in = after_b1 / 2; + w.block2 = make_block(2, 2, b2_in, growth, bn_channels, k, 30.0); + const int after_b2 = b2_in + 2 * growth; + w.transit2 = make_transit(after_b2, 40.0); + + const int b3_in = after_b2 / 2; + w.block3 = make_block(1, 2, b3_in, growth, bn_channels, k, 50.0); + const int after_b3 = b3_in + 1 * growth; + w.transit3 = make_transit(after_b3, 60.0); + + const int final_ch = after_b3 / 2; + w.out_bn = make_bn(final_ch, 70.0); + w.dense = make_conv1d(w.embedding_size, final_ch * 2, 1, 1, 0, 1, 80.0, false); + w.dense_bn = make_bn(w.embedding_size, 85.0); + return w; +} + +void test_full_chain_backward() { + const int T = 12; + const CpWeights w = make_tiny_weights(); + CampplusBackward cb{w}; + + const std::vector fbank0 = make_vector(T * w.feat_dim, 0.3); + const std::vector emb = cb.forward(fbank0, T); + const std::vector coeffs = make_vector((int) emb.size(), 1.0); + const std::vector analytic = cb.backward(coeffs); + + const ScalarLossFn f = [&](const std::vector & fb) { + CampplusBackward local{w}; + return dot(coeffs, local.forward(fb, T)); + }; + report_check("campplus full-chain d_fbank", + compare_gradients(finite_diff_gradient(f, fbank0), analytic)); +} + +} // namespace + +int main() { + try { + test_bn_backward(); + test_relu_backward(); + test_sigmoid_backward(); + test_conv1d_backward(); + test_conv2d_backward(); + test_mean_T_backward(); + test_seg_pool_backward(); + test_stats_pool_backward(); + test_resblock_backward(/*shortcut=*/false); + test_resblock_backward(/*shortcut=*/true); + test_cam_layer_backward(); + test_full_chain_backward(); + } catch (const std::exception & e) { + ++g_failures; + fprintf(stderr, "FAIL uncaught exception: %s\n", e.what()); + } + fprintf(stderr, "\n%s: %d/%d checks passed\n", g_failures == 0 ? "PASS" : "FAIL", + g_checks - g_failures, g_checks); + return g_failures == 0 ? 0 : 1; +} diff --git a/tts-cpp/test/test_campplus_backward_parity.cpp b/tts-cpp/test/test_campplus_backward_parity.cpp new file mode 100644 index 00000000000..07c90ef3654 --- /dev/null +++ b/tts-cpp/test/test_campplus_backward_parity.cpp @@ -0,0 +1,340 @@ +// Forward-parity check for the CAMPPlus backward module (QVAC-20984). +// +// The gradcheck self-test (test_campplus_backward.cpp) validates the analytic +// backward against finite differences of the SAME double forward. That proves +// the backward is the exact derivative of `CampplusBackward::forward`, but not +// that this forward matches the model CAMPPlus actually runs. This test closes +// that gap: it feeds identical synthetic weights and the same fbank to the +// production scalar forward (`campplus_embed` with backend==nullptr, i.e. +// `campplus_embed_cpu`) and to `CampplusBackward::forward`, and asserts the two +// 192-d embeddings agree. Any drift in layout, dilation schedule, seg-pool +// geometry, stats-pool variance convention or per-channel scaling would surface +// here, so the gradcheck's relevance is anchored to the real forward. +// +// `campplus_embed_cpu` hardcodes growth=32 and bn_channels=128, so the synthetic +// topology below uses those values. +// +// Trust chain: the scalar CPU forward is what every `campplus_embed` caller in +// the repo actually uses (production `main.cpp`, `test-campplus`, +// `test-voice-embedding` all pass backend==nullptr), and `test-campplus` / +// `test-voice-embedding` validate it against the Python reference embedding. So +// anchoring this parity to `campplus_embed_cpu` ties the analytic forward (and +// therefore the gradchecked backward) to the real model: Python -> CPU forward +// -> analytic forward -> backward. The `campplus_embed_ggml` graph path is not +// exercised by any caller today; if it is wired up later it gets its own +// fixture parity against the CPU/Python path. +// +// Built via CMake (links campplus.cpp -> ggml). Runs in the `unit` ctest tier. + +#include "campplus.h" +#include "campplus_backward.h" + +#include +#include +#include + +using namespace tts_cpp::cp_grad; + +namespace { + +int g_failures = 0; + +double sample(int i, double phase) { return std::sin(i * 0.7 + phase) * 0.5; } + +std::vector gen_f(int n, double phase) { + std::vector v((std::size_t) n); + for (int i = 0; i < n; ++i) v[i] = (float) sample(i, phase); + return v; +} + +std::vector widen(const std::vector & v) { + return std::vector(v.begin(), v.end()); +} + +// --- synthetic float weight builders ---------------------------------------- + +// Small weights so the deep ReLU stack stays numerically bounded (real CAMPPlus +// weights are BN-normalized; unscaled synthetic weights would blow activations +// up exponentially across the ~10 conv layers and overflow float). +constexpr double kWeightScale = 0.1; + +campplus_conv mk_conv2d(int Co, int Ci, int kH, int kW, int sH, int sW, int pH, int pW, double phase) { + campplus_conv c; + c.w = gen_f(Co * Ci * kH * kW, phase); + for (float & v : c.w) v *= (float) kWeightScale; + c.C_out = Co; c.C_in = Ci; c.kH = kH; c.kW = kW; + c.stride_h = sH; c.stride_w = sW; c.pad_h = pH; c.pad_w = pW; + c.dilation_h = 1; c.dilation_w = 1; c.is_2d = true; + return c; +} + +campplus_conv mk_conv1d(int Co, int Ci, int k, int stride, int pad, int dil, double phase, bool bias) { + campplus_conv c; + c.w = gen_f(Co * Ci * k, phase); + for (float & v : c.w) v *= (float) kWeightScale; + if (bias) c.b = gen_f(Co, phase + 0.5); + c.C_out = Co; c.C_in = Ci; c.k = k; + c.stride_w = stride; c.pad_w = pad; c.dilation_w = dil; c.is_2d = false; + return c; +} + +// Positive-biased scale/shift so signal propagates through the ReLU stack and +// the embedding is non-degenerate (a zero-mean BN would let the ReLUs collapse +// everything to the final bias, making the parity comparison vacuous). +campplus_bn mk_bn(int C, double phase) { + campplus_bn bn; + bn.scale.resize((std::size_t) C); + bn.shift.resize((std::size_t) C); + for (int c = 0; c < C; ++c) { + bn.scale[(std::size_t) c] = (float) (0.5 + 0.3 * std::fabs(sample(c, phase))); + bn.shift[(std::size_t) c] = (float) (0.6 + 0.3 * sample(c, phase + 1.0)); + } + return bn; +} + +campplus_res_block mk_resblock(int Ci, int stride, bool shortcut, double phase) { + campplus_res_block b; + b.stride_h = stride; + b.conv1 = mk_conv2d(Ci, Ci, 3, 3, stride, 1, 1, 1, phase); + b.bn1 = mk_bn(Ci, phase + 0.1); + b.conv2 = mk_conv2d(Ci, Ci, 3, 3, 1, 1, 1, 1, phase + 0.2); + b.bn2 = mk_bn(Ci, phase + 0.3); + if (shortcut) { + b.shortcut_conv = mk_conv2d(Ci, Ci, 1, 1, stride, 1, 0, 0, phase + 0.4); + b.shortcut_bn = mk_bn(Ci, phase + 0.5); + } + return b; +} + +campplus_cam_block mk_cam_block(int num_layers, int kernel_size, int dilation, int C_in, int growth, + int bn_channels, double phase) { + campplus_cam_block blk; + blk.num_layers = num_layers; + blk.kernel_size = kernel_size; + blk.dilation = dilation; + blk.layers.resize((std::size_t) num_layers); + const int pad = (kernel_size - 1) / 2 * dilation; + for (int i = 0; i < num_layers; ++i) { + const int lc = C_in + i * growth; + campplus_cam_dense_tdnn_layer & L = blk.layers[(std::size_t) i]; + const double p = phase + i; + L.bn1 = mk_bn(lc, p + 0.1); + L.linear1 = mk_conv1d(bn_channels, lc, 1, 1, 0, 1, p + 0.2, false); + L.bn2 = mk_bn(bn_channels, p + 0.3); + L.cam_linear_local = mk_conv1d(growth, bn_channels, kernel_size, 1, pad, dilation, p + 0.4, false); + L.cam_linear1 = mk_conv1d(bn_channels / 2, bn_channels, 1, 1, 0, 1, p + 0.5, true); + L.cam_linear2 = mk_conv1d(growth, bn_channels / 2, 1, 1, 0, 1, p + 0.6, true); + } + return blk; +} + +campplus_transit mk_transit(int C_in, double phase) { + campplus_transit t; + t.bn = mk_bn(C_in, phase); + t.linear = mk_conv1d(C_in / 2, C_in, 1, 1, 0, 1, phase + 0.5, false); + return t; +} + +struct Topo { + // campplus_embed_cpu's fcm_forward hardcodes F=80 (H: 80->40->20->10), so the + // production CPU path is only self-consistent at feat_dim=80; the parity check + // must use it. fcm_out = 32 * (80/8) = 320. + int feat_dim = 80; + int init_C = 32; + int growth = 32; // hardcoded in campplus_embed_cpu + int bn_channels = 128; // hardcoded in campplus_embed_cpu + int kernel_size = 3; + int embedding = 8; + int seg_pool_len = 5; +}; + +campplus_weights build_weights(const Topo & d) { + campplus_weights w; + w.feat_dim = d.feat_dim; + w.embedding_size = d.embedding; + w.seg_pool_len = d.seg_pool_len; + w.sample_rate = 16000; + + w.head.conv1 = mk_conv2d(32, 1, 3, 3, 1, 1, 1, 1, 0.1); + w.head.bn1 = mk_bn(32, 0.2); + w.head.layer1 = {mk_resblock(32, 2, true, 1.0), mk_resblock(32, 1, false, 2.0)}; + w.head.layer2 = {mk_resblock(32, 2, true, 3.0), mk_resblock(32, 1, false, 4.0)}; + w.head.conv2 = mk_conv2d(32, 32, 3, 3, 2, 1, 1, 1, 5.0); + w.head.bn2 = mk_bn(32, 5.5); + + const int fcm_out = 32 * (d.feat_dim / 8); // 320 at feat_dim=80 + w.tdnn_linear = mk_conv1d(d.init_C, fcm_out, 5, 2, 2, 1, 6.0, false); + w.tdnn_bn = mk_bn(d.init_C, 6.5); + + // Multi-layer CAM blocks (2/3/2) so the dense-concat accumulation (layer i + // enters with C_in + i*growth) is anchored to production, not only to the + // self-referential full-chain gradcheck. + const int b1_layers = 2, b2_layers = 3, b3_layers = 2; + w.block1 = mk_cam_block(b1_layers, d.kernel_size, 1, d.init_C, d.growth, d.bn_channels, 10.0); + const int after_b1 = d.init_C + b1_layers * d.growth; + w.transit1 = mk_transit(after_b1, 20.0); + + const int b2_in = after_b1 / 2; + w.block2 = mk_cam_block(b2_layers, d.kernel_size, 2, b2_in, d.growth, d.bn_channels, 30.0); + const int after_b2 = b2_in + b2_layers * d.growth; + w.transit2 = mk_transit(after_b2, 40.0); + + const int b3_in = after_b2 / 2; + w.block3 = mk_cam_block(b3_layers, d.kernel_size, 2, b3_in, d.growth, d.bn_channels, 50.0); + const int after_b3 = b3_in + b3_layers * d.growth; + w.transit3 = mk_transit(after_b3, 60.0); + + const int final_ch = after_b3 / 2; + w.out_nonlinear_bn = mk_bn(final_ch, 70.0); + w.dense_linear = mk_conv1d(d.embedding, final_ch * 2, 1, 1, 0, 1, 80.0, false); + w.dense_bn = mk_bn(d.embedding, 85.0); + return w; +} + +// --- float -> double weight conversion (campplus_weights -> CpWeights) ------- + +CpConv to_cp_conv(const campplus_conv & c) { + CpConv o; + o.w = widen(c.w); + o.b = widen(c.b); + o.C_out = c.C_out; o.C_in = c.C_in; o.k = c.k; + o.kH = c.kH; o.kW = c.kW; + o.stride = c.stride_w; o.pad = c.pad_w; o.dilation = c.dilation_w; + o.stride_h = c.stride_h; o.stride_w = c.stride_w; o.pad_h = c.pad_h; o.pad_w = c.pad_w; + return o; +} + +CpBn to_cp_bn(const campplus_bn & b) { + CpBn o; + o.scale = widen(b.scale); + o.shift = widen(b.shift); + return o; +} + +CpResBlock to_cp_resblock(const campplus_res_block & b) { + CpResBlock o; + o.conv1 = to_cp_conv(b.conv1); o.bn1 = to_cp_bn(b.bn1); + o.conv2 = to_cp_conv(b.conv2); o.bn2 = to_cp_bn(b.bn2); + o.has_shortcut = !b.shortcut_conv.w.empty(); + if (o.has_shortcut) { o.sc = to_cp_conv(b.shortcut_conv); o.sc_bn = to_cp_bn(b.shortcut_bn); } + o.stride_h = b.stride_h; + return o; +} + +CpCamBlock to_cp_block(const campplus_cam_block & b, int C_in, int growth, int bn_channels) { + CpCamBlock o; + o.num_layers = b.num_layers; + o.kernel_size = b.kernel_size; + o.dilation = b.dilation; + o.growth = growth; + o.bn_channels = bn_channels; + o.C_in = C_in; + o.layers.resize(b.layers.size()); + for (std::size_t i = 0; i < b.layers.size(); ++i) { + const campplus_cam_dense_tdnn_layer & L = b.layers[i]; + CpCamLayer & d = o.layers[i]; + d.bn1 = to_cp_bn(L.bn1); + d.linear1 = to_cp_conv(L.linear1); + d.bn2 = to_cp_bn(L.bn2); + d.loc = to_cp_conv(L.cam_linear_local); + d.cam1 = to_cp_conv(L.cam_linear1); + d.cam2 = to_cp_conv(L.cam_linear2); + } + return o; +} + +CpTransit to_cp_transit(const campplus_transit & t) { + CpTransit o; + o.bn = to_cp_bn(t.bn); + o.linear = to_cp_conv(t.linear); + return o; +} + +CpWeights to_cp_weights(const campplus_weights & w, const Topo & d) { + CpWeights o; + o.feat_dim = w.feat_dim; + o.embedding_size = w.embedding_size; + o.seg_pool_len = w.seg_pool_len; + + o.head.conv1 = to_cp_conv(w.head.conv1); o.head.bn1 = to_cp_bn(w.head.bn1); + for (const auto & b : w.head.layer1) o.head.layer1.push_back(to_cp_resblock(b)); + for (const auto & b : w.head.layer2) o.head.layer2.push_back(to_cp_resblock(b)); + o.head.conv2 = to_cp_conv(w.head.conv2); o.head.bn2 = to_cp_bn(w.head.bn2); + + o.tdnn = to_cp_conv(w.tdnn_linear); o.tdnn_bn = to_cp_bn(w.tdnn_bn); + + const int after_b1 = d.init_C + w.block1.num_layers * d.growth; + const int b2_in = after_b1 / 2; + const int after_b2 = b2_in + w.block2.num_layers * d.growth; + const int b3_in = after_b2 / 2; + o.block1 = to_cp_block(w.block1, d.init_C, d.growth, d.bn_channels); + o.transit1 = to_cp_transit(w.transit1); + o.block2 = to_cp_block(w.block2, b2_in, d.growth, d.bn_channels); + o.transit2 = to_cp_transit(w.transit2); + o.block3 = to_cp_block(w.block3, b3_in, d.growth, d.bn_channels); + o.transit3 = to_cp_transit(w.transit3); + + o.out_bn = to_cp_bn(w.out_nonlinear_bn); + o.dense = to_cp_conv(w.dense_linear); + o.dense_bn = to_cp_bn(w.dense_bn); + return o; +} + +} // namespace + +int main() { + const Topo d; + const int T = 16; + + const campplus_weights w_f = build_weights(d); + const CpWeights w_d = to_cp_weights(w_f, d); + + const std::vector fbank_f = gen_f(T * d.feat_dim, 0.3); + + std::vector prod; + const bool ok = campplus_embed(fbank_f, T, w_f, /*backend=*/nullptr, prod); + if (!ok) { + fprintf(stderr, "FAIL campplus_embed (cpu path) returned false\n"); + return 1; + } + + CampplusBackward backward(w_d); + const std::vector ref = backward.forward(widen(fbank_f), T); + + if (prod.size() != ref.size() || (int) prod.size() != d.embedding) { + fprintf(stderr, "FAIL embedding size mismatch: prod=%zu ref=%zu expected=%d\n", prod.size(), + ref.size(), d.embedding); + return 1; + } + + double max_abs = 0.0, max_rel = 0.0; + for (std::size_t i = 0; i < ref.size(); ++i) { + const double a = (double) prod[i]; + const double b = ref[i]; + if (!std::isfinite(a) || !std::isfinite(b)) { + ++g_failures; + fprintf(stderr, "FAIL non-finite embedding at %zu: prod=%g ref=%g\n", i, a, b); + continue; + } + const double abs_err = std::fabs(a - b); + const double rel_err = abs_err / (std::fabs(b) + 1e-6); + if (abs_err > max_abs) max_abs = abs_err; + if (rel_err > max_rel) max_rel = rel_err; + } + + // float production vs double reference: the only difference is float rounding + // accumulated through the chain (bn_channels=128 reductions dominate). The + // measured error is ~3e-8; 1e-4 leaves ample float-accumulation margin while + // still catching any real layout / convention / wiring drift (which shows up + // orders of magnitude larger). + constexpr double kAbsTol = 1e-4; + if (max_abs > kAbsTol) { + ++g_failures; + fprintf(stderr, "FAIL forward parity exceeded tolerance: max_abs=%.3e max_rel=%.3e\n", max_abs, + max_rel); + } + + fprintf(stderr, "%s: forward parity max_abs=%.3e max_rel=%.3e (emb[0]=%.6f ref[0]=%.6f)\n", + g_failures == 0 ? "PASS" : "FAIL", max_abs, max_rel, (double) prod[0], ref[0]); + return g_failures == 0 ? 0 : 1; +}