diff --git a/tts-cpp/CMakeLists.txt b/tts-cpp/CMakeLists.txt index d359335a066..a835eb5ee2d 100644 --- a/tts-cpp/CMakeLists.txt +++ b/tts-cpp/CMakeLists.txt @@ -857,6 +857,40 @@ if (TTS_CPP_BUILD_TESTS) tts_cpp_apply_ccache(test-supertonic-vector-estimator-backward) tts_cpp_register_test(test-supertonic-vector-estimator-backward LABEL "unit") + # QVAC-20983 — analytic backward of the vocoder (latent unpack, denorm, + # causal conv1d, causal depthwise, channel layer norm, gelu, per-channel-gamma + # convnext, affine batch norm, leaky-relu head, full chain d(loss)/d(latent)). + # The "transposed convolution" upsampling is a fixed reshape+permute (the + # latent unpack), so its backward is a pure permutation. Model-free: every + # analytic gradient is gradchecked against finite differences via the Task 2 + # harness, so it ALWAYS runs on a fresh checkout (no-skip policy). + add_executable(test-supertonic-vocoder-backward + test/test_supertonic_vocoder_backward.cpp + src/supertonic_vocoder_backward.cpp + src/supertonic_vector_estimator_backward.cpp + src/voiceclone_gradcheck.cpp) + target_include_directories(test-supertonic-vocoder-backward PRIVATE src) + tts_cpp_apply_ccache(test-supertonic-vocoder-backward) + tts_cpp_register_test(test-supertonic-vocoder-backward LABEL "unit") + + # QVAC-20983 — forward-parity guard for the vocoder backward. The gradcheck + # above is self-referential (analytic backward vs the in-file reference + # forward); this builds a synthetic CPU-backed `supertonic_model` from + # deterministic weights and asserts the reference forward + # (`VocoderBackward::forward`) matches the PRODUCTION + # `supertonic_vocoder_forward_cpu` on the same buffers. Catches any drift + # between the two forwards (per-channel gamma layout, dilation schedule, + # weight index order) the gradcheck cannot see. Model-free (weights + # synthesized in-memory) so it always runs in the `unit` tier. 
+ add_executable(test-supertonic-vocoder-backward-parity + test/test_supertonic_vocoder_backward_parity.cpp + src/supertonic_vocoder_backward.cpp + src/supertonic_vector_estimator_backward.cpp) + target_link_libraries(test-supertonic-vocoder-backward-parity PRIVATE tts-cpp ggml) + target_include_directories(test-supertonic-vocoder-backward-parity PRIVATE ggml/include src include) + tts_cpp_apply_ccache(test-supertonic-vocoder-backward-parity) + tts_cpp_register_test(test-supertonic-vocoder-backward-parity LABEL "unit") + # Engine-level streaming-callback contract test for the per-sentence # segmentation path (Fix #2): monotonic global chunk_index, single final # is_last, result.pcm == concat(callbacks), accumulated stats. Gated on diff --git a/tts-cpp/docs/voiceclone-backward-vocoder.md b/tts-cpp/docs/voiceclone-backward-vocoder.md new file mode 100644 index 00000000000..af33fd1b245 --- /dev/null +++ b/tts-cpp/docs/voiceclone-backward-vocoder.md @@ -0,0 +1,122 @@ +# Voice-clone backward — vocoder (QVAC-20983) + +Scope for ticket *"6. GGML backward pass: vocoder"*. Make the Supertonic vocoder +differentiable for enrollment, with the **transposed convolution** called out as +the main risk op. This doc records the op × backend gap for the vocoder path and +the CPU-fallback behavior, and is committed alongside the deliverable: an +analytic, gradchecked C++ backward of the full vocoder +(`src/supertonic_vocoder_backward.{h,cpp}`). + +It is the vocoder counterpart of `voiceclone-backward-gap-matrix.md` (text +encoder); the *why analytic* rationale and the Task-2 gradcheck contract are +shared. + +## Why the gap exists + +Voice cloning optimizes only `style_ttl` (model weights frozen). The vocoder +maps the CFM latent to a waveform, so the gradient enrollment needs from it is +`d(loss)/d(latent)` — the audio-loss gradient backpropagated through the frozen +vocoder into the latent, which then flows back through the vector estimator and +text encoder to `style_ttl`. 
+ +A fully GGML-native backward (the on-device goal) needs every op on the vocoder +path to have a case in `ggml_compute_backward` (`ggml/src/ggml.c`) **and** a CPU +kernel for the ops the backward expands into. Several are missing, and the +vocoder additionally leans on custom `ggml_supertonic_*` ops that only run on the +CPU backend. + +## The "transposed convolution" risk, resolved + +The ticket flags the transposed convolution as the main risk. In the Supertonic +vocoder there is **no `ggml_conv_transpose_*` op**: the time upsampling (factor +`ttl_chunk_compress_factor`) is realized as a fixed `reshape + permute + cont` +(the latent unpack, `build_supertonic_vocoder_cache`), i.e. a pure permutation +`x[(t*factor+r)*C_latent + c] = latent[(c*factor+r)*latent_len + t]`. Its +backward is the transpose gather (`latent_unpack_backward`), with no kernel risk +— the feared conv-transpose backward does not arise here. + +## Forward ops on the vocoder path + +Source: `src/supertonic_vocoder.cpp`. + +| Forward op | Where (forward) | +| --- | --- | +| reshape/permute/cont | latent unpack (the notional "transposed conv" upsample) | +| `ggml_scale`, `ggml_mul`, `ggml_add` (broadcast) | denorm, BN affine, residuals, per-channel gamma | +| `ggml_im2col` + `ggml_mul_mat` | causal conv1d (embed, pw1/pw2, head1/head2) | +| custom causal depthwise (`ggml_custom` / `ggml_supertonic_depthwise_1d_causal_ct`) | ConvNeXt depthwise | +| `ggml_norm` (+ `ggml_supertonic_layer_norm_channel*`) | ConvNeXt channel layer norm | +| `ggml_gelu_erf` (+ `ggml_supertonic_bias_gelu*`) | ConvNeXt FFN | +| leaky-relu lowering (`leaky_relu_portable_ggml`) | head PReLU | +| `ggml_supertonic_edge_pad_1d`, `convnext_block_fused` | fused CPU/Metal fast paths | + +## Gap matrix + +Legend: **OK** = implemented; **MISSING** = aborts / not implemented. + +"Graph backward" = a case in `ggml_compute_backward` (`ggml/src/ggml.c`), +backend-agnostic. 
"CPU bwd kernel" = the kernels the backward expands into exist +for the CPU backend, which is the only backend enrollment needs. GPU columns are +out of scope for Phase 2 (enrollment runs on CPU). + +| Op | Graph backward (ggml.c) | CPU bwd kernel | CUDA / Metal / Vulkan / OpenCL | +| --- | --- | --- | --- | +| `RESHAPE`/`PERMUTE`/`CONT` (latent unpack) | OK | OK | out of scope | +| `SCALE` / `ADD` / `MUL` (denorm, BN, gamma) | OK | OK | out of scope | +| `MUL_MAT` / `IM2COL` (conv1d) | OK (`mul_mat`/`im2col_back`) | OK | out of scope | +| `NORM` (channel layer norm) | **MISSING** | — | — | +| `GELU_ERF` (unary) | **MISSING** | — | — | +| leaky-relu / PReLU | partial (`STEP`/`RELU` only) | — | — | +| custom `ggml_supertonic_*` ops | **MISSING** | — | CPU-only forward (see below) | + +Confirmed against the `ggml_compute_backward` switch: `NORM`, `GELU`/`GELU_ERF` +fall through to `GGML_ABORT`; the custom ops have no backward at all. This +mirrors the text-encoder matrix — the blocking gaps are the same `NORM` and the +elementwise activation (`GELU_ERF` here; the vector estimator adds it too). + +## CPU fallback behavior (enrollment) + +Two layers of CPU-only behavior matter for enrollment: + +1. **Forward custom ops are CPU-only.** `GGML_OP_CUSTOM` is rejected on every GPU + backend (CUDA / Metal / Vulkan / OpenCL), so the vocoder's custom causal + depthwise, fused ConvNeXt block, edge-pad and `_ct` fused ops only execute on + the CPU backend. On GPU backends the forward already falls back to the + pure-GGML `im2col + mul_mat` / granular-op chain (see + `supertonic_use_cpu_custom_ops()` / `supertonic_use_fused_supertonic_ops()` + guards in `supertonic_vocoder.cpp`). +2. **The backward runs on CPU, analytically.** Because `NORM`, `GELU_ERF` and the + custom ops have no GGML backward, the enrollment gradient cannot be produced + by `ggml`'s autodiff on any backend today. 
The differentiable vocoder is +therefore provided as the analytic C++ backward in this PR, which runs on the + CPU (the enrollment target). **Every backend must fall back to CPU for the + vocoder backward during enrollment.** This is acceptable: enrollment is a + one-time, offline optimization loop, not the realtime synthesis path (which + keeps its GPU fast paths unchanged). + +## Solution shipped in this PR + +The `VocoderBackward` class (`src/supertonic_vocoder_backward.{h,cpp}`) owns the +frozen weights and caches per-call activations as state: `forward(latent)` runs +the chain and `backward(d_wav)` consumes the cached activations to return +`d(loss)/d(latent)`. It is model-free and validated component-wise against +central finite differences via the Task-2 gradcheck harness +(`test/test_supertonic_vocoder_backward.cpp`, always-on `unit` tier). The +stateless math primitives are private `const` members, each gradchecked +individually via the `VocoderBackwardTester` friend (plus the full chain): + +- `denorm_backward_input` — latent denormalization +- `conv1d_causal_backward_input` — full causal conv1d (embed / head1) +- `depthwise_causal_backward_input` — causal depthwise conv1d +- `batch_norm_backward_input` — affine BN at inference +- `leaky_relu_backward` — head PReLU +- `latent_unpack_backward` — the "transposed conv" upsample (permutation) +- `convnext_backward_input` — full per-channel-gamma ConvNeXt block +- `VocoderBackward::backward` — the whole chain → `d(loss)/d(latent)` + +Channel layer norm, erf-GELU and pointwise (1x1) convs are shared with the +vector-estimator backward (`tts_cpp::ve_grad`), since the math is identical. + +This is mathematically exact, runs on CPU, and serves as the reference oracle for +the per-stage gradcheck once the GGML-native ops (`NORM`, `GELU_ERF`, custom-op +backward / lowering) are implemented. 
diff --git a/tts-cpp/src/supertonic_vocoder.cpp b/tts-cpp/src/supertonic_vocoder.cpp index d7f51c420c7..dfb255591cb 100644 --- a/tts-cpp/src/supertonic_vocoder.cpp +++ b/tts-cpp/src/supertonic_vocoder.cpp @@ -453,12 +453,10 @@ ggml_tensor * convnext_block_ggml(ggml_context * ctx, y = conv1d_causal_ggml(ctx, y, w.pw1_w, w.pw1_b); y = ggml_gelu_erf(ctx, y); } - // NOTE: the vector_estimator's `ggml_supertonic_pw2_residual` op - // expects `gamma` to be `[C]` (per-channel scale); the vocoder - // however stores `gamma` as a `[1]` scalar (single learnable - // scale per ConvNeXt block). The shapes are incompatible, so we - // keep the unfused chain here. A vocoder-specific fused op with - // scalar gamma is possible but the win would be tiny (~10 + // NOTE: `gamma` is the per-channel `[C]` residual scale (same shape as the + // vector_estimator's), broadcast over time by `repeat_like` below. We keep + // the unfused `mul + add` tail rather than the vector_estimator's + // `ggml_supertonic_pw2_residual` fused op because the win would be tiny (~10 // dispatches × ~40μs = 0.4 ms). y = conv1d_causal_ggml(ctx, y, w.pw2_w, w.pw2_b); y = ggml_mul(ctx, y, repeat_like(ctx, w.gamma, y)); @@ -488,10 +486,10 @@ ggml_tensor * pointwise_matmul_ct_voc(ggml_context * ctx, // Vocoder ConvNeXt differs from vector_estimator's: (1) depthwise is // **causal** (left-only pad) rather than symmetric edge-clamp — handled // by the `_causal_ct` variant of the fused depthwise kernel (port-v14). -// (2) `gamma` is a scalar `[1]`, not per-channel, so the `pw2_residual_ct` -// fused op doesn't fit — unfused scalar `mul + add` tail. (3) `norm_g` / -// `norm_b` ship as `[1, C]` (same flatten-needed quirk as vector_estimator's -// `.gamma`). +// (2) the per-channel `[C]` `gamma` residual scale is applied with an +// unfused `mul + add` tail (the `pw2_residual_ct` fused op isn't wired up +// here). (3) `norm_g` / `norm_b` ship as `[1, C]` (same flatten-needed +// quirk as vector_estimator's `.gamma`). 
// // Caller: `SUPERTONIC_DISABLE_CT_VOCODER=1` reverts to legacy // `convnext_block_ggml`. @@ -515,7 +513,7 @@ ggml_tensor * convnext_block_ggml_ct(ggml_context * ctx, y_ct = pointwise_matmul_ct_voc(ctx, y_ct, w.pw1_w, /*bias=*/nullptr); y_ct = ggml_supertonic_bias_gelu_ct(ctx, y_ct, flatten_1d(w.pw1_b)); y_ct = pointwise_matmul_ct_voc(ctx, y_ct, w.pw2_w, flatten_1d(w.pw2_b)); - // Scalar gamma multiply (broadcasts in any layout). + // Per-channel `[C]` gamma multiply (broadcasts over time in any layout). y_ct = ggml_mul(ctx, y_ct, repeat_like(ctx, w.gamma, y_ct)); return ggml_add(ctx, residual, y_ct); } @@ -630,7 +628,7 @@ void build_supertonic_vocoder_cache(vocoder_graph_cache & cache, ggml_set_name(x, "vocoder_embed"); // Phase B2 follow-up: route the 10-block ConvNeXt chain through the // `[C, T]` variant on Metal. Each block runs depthwise (causal_ct) + - // layer_norm + pw1 + bias_gelu + pw2 + scalar gamma + residual add + // layer_norm + pw1 + bias_gelu + pw2 + per-channel gamma + residual add // entirely on `[C, T]` — no intra-block permutes. The single // `[T, C] -> [C, T]` permute happens once before the chain and the // single reverse permute once after. Override: diff --git a/tts-cpp/src/supertonic_vocoder_backward.cpp b/tts-cpp/src/supertonic_vocoder_backward.cpp new file mode 100644 index 00000000000..2b2cfea5175 --- /dev/null +++ b/tts-cpp/src/supertonic_vocoder_backward.cpp @@ -0,0 +1,305 @@ +#include "supertonic_vocoder_backward.h" + +#include "supertonic_vector_estimator_backward.h" // shared conv1x1 / layer_norm / gelu + +#include +#include +#include +#include + +namespace tts_cpp { +namespace voc_grad { + +namespace { + +inline int causal_src_index(int t, int k, int dilation, int pad_left) { + int st = t + k * dilation - pad_left; + return st < 0 ? 
0 : st; // replicate ("causal") left padding +} + +} // namespace + +VocoderBackward::VocoderBackward(VocoderWeights weights) : weights_(std::move(weights)) {} + +// --- denorm ----------------------------------------------------------------- + +std::vector VocoderBackward::denorm_forward(const std::vector & x, int L, int C, + double normalizer_scale, const std::vector & std, + const std::vector & mean) const { + std::vector y((std::size_t) L * C); + const double inv = 1.0 / normalizer_scale; + for (int t = 0; t < L; ++t) { + for (int c = 0; c < C; ++c) { + const std::size_t i = (std::size_t) t * C + c; + y[i] = x[i] * inv * std[(std::size_t) c] + mean[(std::size_t) c]; + } + } + return y; +} + +std::vector VocoderBackward::denorm_backward_input(const std::vector & d_y, int L, int C, + double normalizer_scale, + const std::vector & std) const { + std::vector d_x((std::size_t) L * C); + const double inv = 1.0 / normalizer_scale; + for (int t = 0; t < L; ++t) { + for (int c = 0; c < C; ++c) { + const std::size_t i = (std::size_t) t * C + c; + d_x[i] = d_y[i] * std[(std::size_t) c] * inv; + } + } + return d_x; +} + +// --- full causal conv1d ----------------------------------------------------- + +std::vector VocoderBackward::conv1d_causal_forward(const std::vector & x, int L, int IC, + int OC, int K, const std::vector & w, + const std::vector & b) const { + std::vector y((std::size_t) L * OC); + const int pad_left = K - 1; + const bool has_bias = !b.empty(); + for (int t = 0; t < L; ++t) { + for (int oc = 0; oc < OC; ++oc) { + double sum = has_bias ? 
b[(std::size_t) oc] : 0.0; + for (int ic = 0; ic < IC; ++ic) { + const std::size_t wbase = ((std::size_t) oc * IC + ic) * K; + for (int k = 0; k < K; ++k) { + const int st = causal_src_index(t, k, 1, pad_left); + sum += w[wbase + k] * x[(std::size_t) st * IC + ic]; + } + } + y[(std::size_t) t * OC + oc] = sum; + } + } + return y; +} + +std::vector VocoderBackward::conv1d_causal_backward_input(const std::vector & d_y, int L, + int IC, int OC, int K, + const std::vector & w) const { + std::vector d_x((std::size_t) L * IC, 0.0); + const int pad_left = K - 1; + for (int t = 0; t < L; ++t) { + for (int oc = 0; oc < OC; ++oc) { + const double g = d_y[(std::size_t) t * OC + oc]; + for (int ic = 0; ic < IC; ++ic) { + const std::size_t wbase = ((std::size_t) oc * IC + ic) * K; + for (int k = 0; k < K; ++k) { + const int st = causal_src_index(t, k, 1, pad_left); + d_x[(std::size_t) st * IC + ic] += g * w[wbase + k]; + } + } + } + } + return d_x; +} + +// --- causal depthwise conv1d ------------------------------------------------ + +std::vector VocoderBackward::depthwise_causal_forward(const std::vector & x, int L, int C, + int K, int dilation, + const std::vector & w, + const std::vector & b) const { + std::vector y((std::size_t) L * C); + const int pad_left = (K - 1) * dilation; + for (int t = 0; t < L; ++t) { + for (int c = 0; c < C; ++c) { + double sum = b[(std::size_t) c]; + const std::size_t wbase = (std::size_t) c * K; + for (int k = 0; k < K; ++k) { + const int st = causal_src_index(t, k, dilation, pad_left); + sum += w[wbase + k] * x[(std::size_t) st * C + c]; + } + y[(std::size_t) t * C + c] = sum; + } + } + return y; +} + +std::vector VocoderBackward::depthwise_causal_backward_input(const std::vector & d_y, int L, + int C, int K, int dilation, + const std::vector & w) const { + std::vector d_x((std::size_t) L * C, 0.0); + const int pad_left = (K - 1) * dilation; + for (int t = 0; t < L; ++t) { + for (int c = 0; c < C; ++c) { + const double g = d_y[(std::size_t) t 
* C + c]; + const std::size_t wbase = (std::size_t) c * K; + for (int k = 0; k < K; ++k) { + const int st = causal_src_index(t, k, dilation, pad_left); + d_x[(std::size_t) st * C + c] += g * w[wbase + k]; + } + } + } + return d_x; +} + +// --- affine batch norm (inference) ------------------------------------------ + +std::vector VocoderBackward::batch_norm_forward(const std::vector & x, int L, int C, + const std::vector & gamma, + const std::vector & beta, + const std::vector & running_mean, + const std::vector & running_var, + double eps) const { + std::vector y((std::size_t) L * C); + for (int t = 0; t < L; ++t) { + for (int c = 0; c < C; ++c) { + const std::size_t i = (std::size_t) t * C + c; + const double inv = 1.0 / std::sqrt(running_var[(std::size_t) c] + eps); + y[i] = (x[i] - running_mean[(std::size_t) c]) * inv * gamma[(std::size_t) c] + beta[(std::size_t) c]; + } + } + return y; +} + +std::vector VocoderBackward::batch_norm_backward_input(const std::vector & d_y, int L, int C, + const std::vector & gamma, + const std::vector & running_var, + double eps) const { + std::vector d_x((std::size_t) L * C); + for (int t = 0; t < L; ++t) { + for (int c = 0; c < C; ++c) { + const std::size_t i = (std::size_t) t * C + c; + const double inv = 1.0 / std::sqrt(running_var[(std::size_t) c] + eps); + d_x[i] = d_y[i] * gamma[(std::size_t) c] * inv; + } + } + return d_x; +} + +// --- leaky relu / prelu ----------------------------------------------------- + +std::vector VocoderBackward::leaky_relu_forward(const std::vector & x, double slope) const { + std::vector y(x.size()); + for (std::size_t i = 0; i < x.size(); ++i) y[i] = x[i] >= 0.0 ? x[i] : slope * x[i]; + return y; +} + +std::vector VocoderBackward::leaky_relu_backward(const std::vector & x, + const std::vector & d_y, double slope) const { + std::vector d_x(x.size()); + for (std::size_t i = 0; i < x.size(); ++i) d_x[i] = d_y[i] * (x[i] >= 0.0 ? 
1.0 : slope); + return d_x; +} + +// --- latent unpack ---------------------------------------------------------- + +std::vector VocoderBackward::latent_unpack_forward(const std::vector & latent, int latent_len, + int C_latent, int factor) const { + const int T0 = latent_len * factor; + std::vector x((std::size_t) T0 * C_latent); + for (int c = 0; c < C_latent; ++c) { + for (int t = 0; t < latent_len; ++t) { + for (int r = 0; r < factor; ++r) { + const int src_c = c * factor + r; + x[(std::size_t) (t * factor + r) * C_latent + c] = + latent[(std::size_t) src_c * latent_len + t]; + } + } + } + return x; +} + +std::vector VocoderBackward::latent_unpack_backward(const std::vector & d_x, int latent_len, + int C_latent, int factor) const { + const int latent_channels = C_latent * factor; + std::vector d_latent((std::size_t) latent_channels * latent_len); + for (int c = 0; c < C_latent; ++c) { + for (int t = 0; t < latent_len; ++t) { + for (int r = 0; r < factor; ++r) { + const int src_c = c * factor + r; + d_latent[(std::size_t) src_c * latent_len + t] = + d_x[(std::size_t) (t * factor + r) * C_latent + c]; + } + } + } + return d_latent; +} + +// --- vocoder ConvNeXt block ------------------------------------------------- + +std::vector VocoderBackward::convnext_forward(const VocConvNextWeights & w, + const std::vector & x, int L, + VocConvNextActivations & acts) const { + acts.dw_out = depthwise_causal_forward(x, L, w.C, w.K, w.dilation, w.dw_w, w.dw_b); + const std::vector ln = ve_grad::layer_norm_forward(acts.dw_out, L, w.C, w.ln_gamma, w.ln_beta); + acts.z1 = ve_grad::conv1x1_forward(ln, L, w.C, w.hidden, w.pw1_w, w.pw1_b); + const std::vector g = ve_grad::gelu_forward(acts.z1); + const std::vector z2 = ve_grad::conv1x1_forward(g, L, w.hidden, w.C, w.pw2_w, w.pw2_b); + + std::vector out((std::size_t) L * w.C); + for (std::size_t i = 0; i < out.size(); ++i) { + const std::size_t c = i % (std::size_t) w.C; // [L, C] time-major, gamma is per-channel + out[i] = x[i] + 
w.gamma[c] * z2[i]; + } + return out; +} + +std::vector VocoderBackward::convnext_backward_input(const VocConvNextWeights & w, + const VocConvNextActivations & acts, + const std::vector & d_out, int L) const { + std::vector d_z2((std::size_t) L * w.C); + for (std::size_t i = 0; i < d_z2.size(); ++i) { + const std::size_t c = i % (std::size_t) w.C; // [L, C] time-major, gamma is per-channel + d_z2[i] = w.gamma[c] * d_out[i]; + } + + const std::vector d_g = ve_grad::conv1x1_backward_input(d_z2, L, w.hidden, w.C, w.pw2_w); + const std::vector d_z1 = ve_grad::gelu_backward(acts.z1, d_g); + const std::vector d_ln = ve_grad::conv1x1_backward_input(d_z1, L, w.C, w.hidden, w.pw1_w); + const std::vector d_dw = ve_grad::layer_norm_backward_input(acts.dw_out, L, w.C, w.ln_gamma, d_ln); + const std::vector d_x_dw = + depthwise_causal_backward_input(d_dw, L, w.C, w.K, w.dilation, w.dw_w); + + std::vector d_x((std::size_t) L * w.C); + for (std::size_t i = 0; i < d_x.size(); ++i) d_x[i] = d_out[i] + d_x_dw[i]; // residual path + return d_x; +} + +// --- full vocoder ----------------------------------------------------------- + +std::vector VocoderBackward::forward(const std::vector & latent) { + const VocoderWeights & w = weights_; + const int T0 = w.latent_len * w.factor; + + std::vector x = latent_unpack_forward(latent, w.latent_len, w.C_latent, w.factor); + x = denorm_forward(x, T0, w.C_latent, w.normalizer_scale, w.latent_std, w.latent_mean); + x = conv1d_causal_forward(x, T0, w.C_latent, w.C, w.K_embed, w.embed_w, w.embed_b); + + block_acts_.assign(w.convnext.size(), VocConvNextActivations{}); + for (std::size_t i = 0; i < w.convnext.size(); ++i) { + x = convnext_forward(w.convnext[i], x, T0, block_acts_[i]); + } + + x = batch_norm_forward(x, T0, w.C, w.bn_gamma, w.bn_beta, w.bn_running_mean, w.bn_running_var); + + head1_out_ = conv1d_causal_forward(x, T0, w.C, w.Hh, w.K_head1, w.head1_w, w.head1_b); + const std::vector p = leaky_relu_forward(head1_out_, w.prelu_slope); + 
return ve_grad::conv1x1_forward(p, T0, w.Hh, w.OUT, w.head2_w, /*b=*/{}); +} + +std::vector VocoderBackward::backward(const std::vector & d_wav) const { + if (head1_out_.empty()) { + throw std::logic_error("VocoderBackward::backward called before forward (no cached activations)"); + } + const VocoderWeights & w = weights_; + const int T0 = w.latent_len * w.factor; + + std::vector d_p = ve_grad::conv1x1_backward_input(d_wav, T0, w.Hh, w.OUT, w.head2_w); + std::vector d_x = leaky_relu_backward(head1_out_, d_p, w.prelu_slope); + d_x = conv1d_causal_backward_input(d_x, T0, w.C, w.Hh, w.K_head1, w.head1_w); + d_x = batch_norm_backward_input(d_x, T0, w.C, w.bn_gamma, w.bn_running_var); + + for (std::size_t i = w.convnext.size(); i-- > 0;) { + d_x = convnext_backward_input(w.convnext[i], block_acts_[i], d_x, T0); + } + + d_x = conv1d_causal_backward_input(d_x, T0, w.C_latent, w.C, w.K_embed, w.embed_w); + d_x = denorm_backward_input(d_x, T0, w.C_latent, w.normalizer_scale, w.latent_std); + return latent_unpack_backward(d_x, w.latent_len, w.C_latent, w.factor); +} + +} // namespace voc_grad +} // namespace tts_cpp diff --git a/tts-cpp/src/supertonic_vocoder_backward.h b/tts-cpp/src/supertonic_vocoder_backward.h new file mode 100644 index 00000000000..b407ad77531 --- /dev/null +++ b/tts-cpp/src/supertonic_vocoder_backward.h @@ -0,0 +1,210 @@ +#pragma once + +// Analytic backward pass for the Supertonic vocoder — voice-clone roadmap, +// ticket "GGML backward pass: vocoder" (QVAC-20983). +// +// Scope: the vocoder maps a latent (the CFM output) to a waveform. For +// gradient-based voice cloning only `style_ttl` is optimized, so the gradient +// this class produces is `d(loss)/d(latent)` — the signal the on-device +// enrollment loop backprops from the audio loss down through the vocoder into +// the latent, which then flows back through the vector estimator and text +// encoder to `style_ttl`. Model weights are frozen, so the backward returns the +// input gradient only. 
+// +// Why analytic (not ggml autodiff): the vocoder forward uses ops whose backward +// is not implemented in the vendored ggml (`ggml_norm` layer norm, `ggml_gelu_*`, +// the custom causal depthwise op, and the leaky-relu/prelu lowering). The +// "transposed convolution" the upsampling notionally needs is realized as a +// fixed reshape+permute (the latent unpack), so its backward is a pure +// permutation rather than a conv-transpose kernel. +// +// `VocoderBackward` owns the frozen weights and caches the per-call activations +// as state: `forward(latent)` runs the chain and stores the activations needed +// by `backward(d_wav)`. The math is computed in double for a well-conditioned +// reference and validated component-wise against central finite differences by +// the voiceclone gradcheck harness (Task 2 / QVAC-20979). The class has no +// dependency on `supertonic_model`; a thin adapter binds the real GGUF weights +// into `VocoderWeights` elsewhere. +// +// Pointwise (1x1) convs, channel layer norm and erf-GELU are shared with the +// vector-estimator backward (`tts_cpp::ve_grad`) since the math is identical; +// the class adds the vocoder-specific causal convs, affine batch norm, +// leaky-relu, per-channel-gamma ConvNeXt block, latent unpack and the full chain. + +#include + +namespace tts_cpp { +namespace voc_grad { + +// --- Plain data holders ------------------------------------------------------ + +// Vocoder ConvNeXt block weights (matches `convnext_block`). Differs from the +// vector-estimator block in that the depthwise conv is *causal* (left pad); +// `gamma` is the per-channel `[C]` residual scale, as in the production model. 
+struct VocConvNextWeights { + std::vector dw_w; // depthwise [C * K] + std::vector dw_b; // [C] + std::vector ln_gamma; // [C] + std::vector ln_beta; // [C] + std::vector pw1_w; // [hidden * C] + std::vector pw1_b; // [hidden] + std::vector pw2_w; // [C * hidden] + std::vector pw2_b; // [C] + std::vector gamma; // [C], per-channel residual scale + int C = 0; + int hidden = 0; + int K = 0; + int dilation = 1; +}; + +// Activations cached by a ConvNeXt forward for reuse in its backward. +struct VocConvNextActivations { + std::vector dw_out; // [L, C], depthwise output (input to layer norm) + std::vector z1; // [L, hidden], pwconv1 output (input to gelu) +}; + +// Full vocoder weights (matches `supertonic_vocoder_forward_cpu`). +struct VocoderWeights { + int latent_len = 0; // L in the packed latent + int C_latent = 0; // unpacked latent channels + int factor = 0; // time upsample factor (latent_channels = C_latent * factor) + int C = 0; // embed output channels (ConvNeXt width) + double normalizer_scale = 1.0; + std::vector latent_mean; // [C_latent] + std::vector latent_std; // [C_latent] + + std::vector embed_w; // [C * C_latent * K_embed] (OC=C, IC=C_latent) + std::vector embed_b; // [C] + int K_embed = 0; + + std::vector convnext; // ConvNeXt chain (carries dilations) + + std::vector bn_gamma; // [C] + std::vector bn_beta; // [C] + std::vector bn_running_mean; // [C] + std::vector bn_running_var; // [C] + + std::vector head1_w; // [Hh * C * K_head1] + std::vector head1_b; // [Hh] + int K_head1 = 0; + int Hh = 0; // head1 output channels + double prelu_slope = 0.0; + + std::vector head2_w; // [OUT * Hh] pointwise (K=1), no bias + int OUT = 0; // waveform output channels +}; + +// --- Vocoder backward -------------------------------------------------------- +// +// Stateful: construct with the frozen weights, call `forward(latent)` (which +// caches the activations), then `backward(d_wav)` (which consumes them). 
Only +// the construction + forward/backward surface is public; the stateless math +// primitives are private implementation details. The gradcheck self-tests reach +// them through a `friend` so each primitive is still validated individually +// against finite differences without widening the public API. +class VocoderBackward { +public: + explicit VocoderBackward(VocoderWeights weights); + + const VocoderWeights & weights() const { return weights_; } + + // Forward: `latent` is channel-major [latent_channels, latent_len]. Runs the + // chain, caches the activations as state and returns the waveform, time-major + // [T0, OUT] with T0 = latent_len * factor. + std::vector forward(const std::vector & latent); + + // Backward: from d_wav [T0, OUT] return d_latent in the channel-major + // [latent_channels, latent_len] layout the forward consumes. Uses the + // activations cached by the most recent `forward`. + std::vector backward(const std::vector & d_wav) const; + +private: + // Grants the gradcheck self-tests access to the private primitives below. + friend struct VocoderBackwardTester; + + // The primitives below are pure: they read no member state, so they are + // marked `const` (callable from `backward`). They are private helpers of the + // forward/backward chain, individually gradchecked through the friend tester. + + // --- Latent denormalization (matches the vocoder "denorm" stage) --------- + // y[t, c] = (x[t, c] / normalizer_scale) * std[c] + mean[c]. `x` is + // time-major [L, C]; `std`/`mean` are per-channel [C]. + std::vector denorm_forward(const std::vector & x, int L, int C, double normalizer_scale, + const std::vector & std, + const std::vector & mean) const; + + std::vector denorm_backward_input(const std::vector & d_y, int L, int C, + double normalizer_scale, const std::vector & std) const; + + // --- Full causal conv1d, IC -> OC, K taps (matches `conv1d_causal`) ------ + // Left replicate ("causal") padding by K-1. 
Weight is ONNX row-major + // [OC, IC, K] with raw index ((oc * IC + ic) * K + k); bias is optional + // ([OC] or empty). `x` is time-major [L, IC]; output is [L, OC]. + std::vector conv1d_causal_forward(const std::vector & x, int L, int IC, int OC, int K, + const std::vector & w, + const std::vector & b) const; + + std::vector conv1d_causal_backward_input(const std::vector & d_y, int L, int IC, int OC, + int K, const std::vector & w) const; + + // --- Causal depthwise conv1d (matches `depthwise_conv1d_causal`) --------- + // Left replicate padding by (K-1)*dilation. Weight is [C, K] with raw index + // c * K + k; bias is per-channel [C]. `x` is time-major [L, C]. + std::vector depthwise_causal_forward(const std::vector & x, int L, int C, int K, + int dilation, const std::vector & w, + const std::vector & b) const; + + std::vector depthwise_causal_backward_input(const std::vector & d_y, int L, int C, int K, + int dilation, const std::vector & w) const; + + // --- Affine batch norm at inference (matches `batch_norm_channel`) ------- + // y[t, c] = (x[t, c] - running_mean[c]) / sqrt(running_var[c] + eps) * + // gamma[c] + beta[c]. Per-channel [C]; constants at inference, so + // the backward into the input is a per-channel scale. + std::vector batch_norm_forward(const std::vector & x, int L, int C, + const std::vector & gamma, const std::vector & beta, + const std::vector & running_mean, + const std::vector & running_var, double eps = 1e-5) const; + + std::vector batch_norm_backward_input(const std::vector & d_y, int L, int C, + const std::vector & gamma, + const std::vector & running_var, + double eps = 1e-5) const; + + // --- Leaky-relu / prelu with a scalar negative slope (head prelu) -------- + // y = x >= 0 ? x : slope * x, elementwise. 
+ std::vector leaky_relu_forward(const std::vector & x, double slope) const; + + std::vector leaky_relu_backward(const std::vector & x, const std::vector & d_y, + double slope) const; + + // --- Latent unpack (the notional "transposed conv" upsampling) ----------- + // The latent ships channel-major [latent_channels, latent_len] (raw index + // (c*factor + r) * latent_len + t) and is unpacked to the time-major + // activation [T0, C_latent] with T0 = latent_len * factor and + // latent_channels = C_latent * factor, via + // x[(t*factor + r) * C_latent + c] = latent[(c*factor + r)*latent_len + t]. + // This is a pure permutation; its backward is the transpose gather. + std::vector latent_unpack_forward(const std::vector & latent, int latent_len, int C_latent, + int factor) const; + + std::vector latent_unpack_backward(const std::vector & d_x, int latent_len, int C_latent, + int factor) const; + + // --- Vocoder ConvNeXt block (matches `convnext_block`) ------------------- + // out = x + gamma * pwconv2(gelu(pwconv1(layer_norm(depthwise_causal(x))))). + // pwconv1/pwconv2 are 1x1 out-major-weight convs (shared `ve_grad::conv1x1`). 
+ std::vector convnext_forward(const VocConvNextWeights & w, const std::vector & x, int L, + VocConvNextActivations & acts) const; + + std::vector convnext_backward_input(const VocConvNextWeights & w, + const VocConvNextActivations & acts, + const std::vector & d_out, int L) const; + + VocoderWeights weights_; + std::vector block_acts_; // per ConvNeXt block + std::vector head1_out_; // [T0, Hh], the prelu input +}; + +} // namespace voc_grad +} // namespace tts_cpp diff --git a/tts-cpp/test/test_supertonic_vocoder_backward.cpp b/tts-cpp/test/test_supertonic_vocoder_backward.cpp new file mode 100644 index 00000000000..51c6a86a21b --- /dev/null +++ b/tts-cpp/test/test_supertonic_vocoder_backward.cpp @@ -0,0 +1,291 @@ +// Gradcheck self-tests for the Supertonic vocoder backward (voice-clone ticket +// "GGML backward pass: vocoder", QVAC-20983). Pure host logic, model-free: every +// analytic gradient is checked component-wise against a central finite-difference +// numeric gradient of the matching forward. Runs in the always-on `unit` ctest +// tier. +// +// Standalone build (single line): +// g++ -std=c++17 -I src test/test_supertonic_vocoder_backward.cpp \ +// src/supertonic_vocoder_backward.cpp \ +// src/supertonic_vector_estimator_backward.cpp \ +// src/voiceclone_gradcheck.cpp -o /tmp/t && /tmp/t + +#include "supertonic_vocoder_backward.h" +#include "voiceclone_gradcheck.h" + +#include +#include +#include +#include +#include + +using namespace tts_cpp::voc_grad; +using tts_cpp::voiceclone::compare_gradients; +using tts_cpp::voiceclone::finite_diff_gradient; +using tts_cpp::voiceclone::GradcheckReport; +using tts_cpp::voiceclone::ScalarLossFn; + +namespace { + +int g_failures = 0; +int g_checks = 0; + +#define CHECK(cond, ...) 
do { \ + ++g_checks; \ + if (!(cond)) { \ + ++g_failures; \ + fprintf(stderr, "FAIL %s:%d ", __FILE__, __LINE__); \ + fprintf(stderr, __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } \ +} while (0) + +double sample(int i, double phase) { + return std::sin(i * 0.9 + phase) * 0.8; +} + +std::vector make_vector(int n, double phase) { + std::vector v((std::size_t) n); + for (int i = 0; i < n; ++i) v[i] = sample(i, phase); + return v; +} + +// Strictly positive samples for variance-like quantities (running_var). +std::vector make_positive(int n, double phase) { + std::vector v((std::size_t) n); + for (int i = 0; i < n; ++i) v[i] = 0.5 + 0.4 * (sample(i, phase) + 1.0); + return v; +} + +double dot(const std::vector & a, const std::vector & b) { + double acc = 0.0; + for (std::size_t i = 0; i < a.size(); ++i) acc += a[i] * b[i]; + return acc; +} + +void report_check(const char * name, const GradcheckReport & r) { + CHECK(r.passed, "%s: gradcheck failed (max_abs=%.3e max_rel=%.3e worst=%zu)", + name, r.max_abs_err, r.max_rel_err, r.worst_index); +} + +// --------------------------------------------------------------------------- + +VocConvNextWeights make_block(int C, int hidden, int K, int dilation, double phase) { + VocConvNextWeights w; + w.C = C; + w.hidden = hidden; + w.K = K; + w.dilation = dilation; + w.dw_w = make_vector(C * K, phase + 0.1); + w.dw_b = make_vector(C, phase + 0.2); + w.ln_gamma = make_vector(C, phase + 0.3); + w.ln_beta = make_vector(C, phase + 0.4); + w.pw1_w = make_vector(hidden * C, phase + 0.5); + w.pw1_b = make_vector(hidden, phase + 0.6); + w.pw2_w = make_vector(C * hidden, phase + 0.7); + w.pw2_b = make_vector(C, phase + 0.8); + w.gamma = make_vector(C, phase + 0.9); // per-channel residual scale + return w; +} + +VocoderWeights make_vocoder() { + VocoderWeights w; + w.latent_len = 2; + w.C_latent = 3; + w.factor = 2; // latent_channels = 6, T0 = 4 + w.C = 5; + w.normalizer_scale = 1.7; + w.latent_mean = make_vector(w.C_latent, 0.2); + 
w.latent_std = make_vector(w.C_latent, 0.5); + + w.K_embed = 3; + w.embed_w = make_vector(w.C * w.C_latent * w.K_embed, 0.9); + w.embed_b = make_vector(w.C, 1.0); + + // Three ConvNeXt blocks exercising the production dilation variety (1/2/4). + const int hidden = 7, Kdw = 3; + w.convnext.push_back(make_block(w.C, hidden, Kdw, 1, 1.0)); + w.convnext.push_back(make_block(w.C, hidden, Kdw, 2, 2.0)); + w.convnext.push_back(make_block(w.C, hidden, Kdw, 4, 3.0)); + + w.bn_gamma = make_vector(w.C, 0.4); + w.bn_beta = make_vector(w.C, 0.7); + w.bn_running_mean = make_vector(w.C, 0.1); + w.bn_running_var = make_positive(w.C, 0.5); + + w.Hh = 6; + w.K_head1 = 3; + w.head1_w = make_vector(w.Hh * w.C * w.K_head1, 0.3); + w.head1_b = make_vector(w.Hh, 0.45); + w.prelu_slope = 0.1; + + w.OUT = 1; + w.head2_w = make_vector(w.OUT * w.Hh, 0.65); + return w; +} + +} // namespace + +namespace tts_cpp { +namespace voc_grad { + +// Friend of VocoderBackward: validates the private math primitives individually +// (and the full forward/backward chain) against finite differences. Declared a +// friend so the gradchecks reach the primitives without widening the public API. +struct VocoderBackwardTester { + // The primitives are pure (ignore weights), so a default-constructed instance + // is all the friend tester needs to exercise them. 
+ static VocoderBackward op() { return VocoderBackward{VocoderWeights{}}; } + + static void test_denorm_backward() { + const int L = 4, C = 3; + const double normalizer_scale = 1.7; + const std::vector std = make_vector(C, 0.4); + const std::vector mean = make_vector(C, 1.2); + const std::vector coeffs = make_vector(L * C, 2.0); + const std::vector x0 = make_vector(L * C, 0.7); + + const VocoderBackward vb = op(); + const ScalarLossFn f = [&](const std::vector & x) { + return dot(coeffs, vb.denorm_forward(x, L, C, normalizer_scale, std, mean)); + }; + const std::vector analytic = vb.denorm_backward_input(coeffs, L, C, normalizer_scale, std); + report_check("denorm_backward_input", compare_gradients(finite_diff_gradient(f, x0), analytic)); + } + + static void test_conv1d_causal_backward() { + const int L = 6, IC = 4, OC = 5, K = 3; + const std::vector w = make_vector(OC * IC * K, 0.3); + const std::vector b = make_vector(OC, 1.1); + const std::vector coeffs = make_vector(L * OC, 2.0); + const std::vector x0 = make_vector(L * IC, 0.7); + + const VocoderBackward vb = op(); + const ScalarLossFn f = [&](const std::vector & x) { + return dot(coeffs, vb.conv1d_causal_forward(x, L, IC, OC, K, w, b)); + }; + const std::vector analytic = vb.conv1d_causal_backward_input(coeffs, L, IC, OC, K, w); + report_check("conv1d_causal_backward_input", compare_gradients(finite_diff_gradient(f, x0), analytic)); + } + + static void test_depthwise_causal_backward() { + const int L = 7, C = 4, K = 3, dilation = 2; + const std::vector w = make_vector(C * K, 0.4); + const std::vector b = make_vector(C, 0.9); + const std::vector coeffs = make_vector(L * C, 1.6); + const std::vector x0 = make_vector(L * C, 0.5); + + const VocoderBackward vb = op(); + const ScalarLossFn f = [&](const std::vector & x) { + return dot(coeffs, vb.depthwise_causal_forward(x, L, C, K, dilation, w, b)); + }; + const std::vector analytic = vb.depthwise_causal_backward_input(coeffs, L, C, K, dilation, w); + 
report_check("depthwise_causal_backward_input", compare_gradients(finite_diff_gradient(f, x0), analytic)); + } + + static void test_batch_norm_backward() { + const int L = 5, C = 4; + const std::vector gamma = make_vector(C, 0.5); + const std::vector beta = make_vector(C, 1.7); + const std::vector rmean = make_vector(C, 0.3); + const std::vector rvar = make_positive(C, 0.8); + const std::vector coeffs = make_vector(L * C, 2.3); + const std::vector x0 = make_vector(L * C, 0.2); + + const VocoderBackward vb = op(); + const ScalarLossFn f = [&](const std::vector & x) { + return dot(coeffs, vb.batch_norm_forward(x, L, C, gamma, beta, rmean, rvar)); + }; + const std::vector analytic = vb.batch_norm_backward_input(coeffs, L, C, gamma, rvar); + report_check("batch_norm_backward_input", compare_gradients(finite_diff_gradient(f, x0), analytic)); + } + + static void test_leaky_relu_backward() { + const int n = 16; + const double slope = 0.1; + const std::vector coeffs = make_vector(n, 1.3); + const std::vector x0 = make_vector(n, 0.6); + + const VocoderBackward vb = op(); + const ScalarLossFn f = [&](const std::vector & x) { + return dot(coeffs, vb.leaky_relu_forward(x, slope)); + }; + const std::vector analytic = vb.leaky_relu_backward(x0, coeffs, slope); + // The kink at 0 is measure-zero for these samples; central diff is exact away from it. 
+ report_check("leaky_relu_backward", compare_gradients(finite_diff_gradient(f, x0), analytic)); + } + + static void test_latent_unpack_backward() { + const int latent_len = 3, C_latent = 4, factor = 2; + const int T0 = latent_len * factor; + const int latent_channels = C_latent * factor; + const std::vector coeffs = make_vector(T0 * C_latent, 1.4); + const std::vector latent0 = make_vector(latent_channels * latent_len, 0.5); + + const VocoderBackward vb = op(); + const ScalarLossFn f = [&](const std::vector & latent) { + return dot(coeffs, vb.latent_unpack_forward(latent, latent_len, C_latent, factor)); + }; + const std::vector analytic = vb.latent_unpack_backward(coeffs, latent_len, C_latent, factor); + report_check("latent_unpack_backward", compare_gradients(finite_diff_gradient(f, latent0), analytic)); + } + + static void test_convnext_backward() { + const int L = 6, C = 5, hidden = 7, K = 3, dilation = 2; + const VocConvNextWeights w = make_block(C, hidden, K, dilation, 0.0); + const std::vector coeffs = make_vector(L * C, 1.1); + const std::vector x0 = make_vector(L * C, 0.3); + + const VocoderBackward vb = op(); + VocConvNextActivations acts; + vb.convnext_forward(w, x0, L, acts); + const std::vector analytic = vb.convnext_backward_input(w, acts, coeffs, L); + + const ScalarLossFn f = [&](const std::vector & x) { + VocConvNextActivations a; + return dot(coeffs, vb.convnext_forward(w, x, L, a)); + }; + report_check("voc_convnext d_x", compare_gradients(finite_diff_gradient(f, x0), analytic)); + } + + static void test_vocoder_backward() { + const VocoderWeights w = make_vocoder(); + const int T0 = w.latent_len * w.factor; + const int latent_channels = w.C_latent * w.factor; + const std::vector coeffs = make_vector(T0 * w.OUT, 1.2); + const std::vector latent0 = make_vector(latent_channels * w.latent_len, 0.4); + + VocoderBackward vb(w); + vb.forward(latent0); + const std::vector analytic = vb.backward(coeffs); + + const ScalarLossFn f = [&](const std::vector & 
latent) { + VocoderBackward local(w); + return dot(coeffs, local.forward(latent)); + }; + report_check("vocoder d_latent", compare_gradients(finite_diff_gradient(f, latent0), analytic)); + } +}; + +} // namespace voc_grad +} // namespace tts_cpp + +int main() { + using tts_cpp::voc_grad::VocoderBackwardTester; + try { + VocoderBackwardTester::test_denorm_backward(); + VocoderBackwardTester::test_conv1d_causal_backward(); + VocoderBackwardTester::test_depthwise_causal_backward(); + VocoderBackwardTester::test_batch_norm_backward(); + VocoderBackwardTester::test_leaky_relu_backward(); + VocoderBackwardTester::test_latent_unpack_backward(); + VocoderBackwardTester::test_convnext_backward(); + VocoderBackwardTester::test_vocoder_backward(); + } catch (const std::exception & e) { + ++g_failures; + fprintf(stderr, "FAIL uncaught exception: %s\n", e.what()); + } + fprintf(stderr, "\n%s: %d/%d checks passed\n", + g_failures == 0 ? "PASS" : "FAIL", g_checks - g_failures, g_checks); + return g_failures == 0 ? 0 : 1; +} diff --git a/tts-cpp/test/test_supertonic_vocoder_backward_parity.cpp b/tts-cpp/test/test_supertonic_vocoder_backward_parity.cpp new file mode 100644 index 00000000000..2fe6c96b27e --- /dev/null +++ b/tts-cpp/test/test_supertonic_vocoder_backward_parity.cpp @@ -0,0 +1,381 @@ +// Forward-parity test for the Supertonic vocoder backward (voice-clone ticket +// "GGML backward pass: vocoder", QVAC-20983). +// +// Why this test exists +// -------------------- +// `test_supertonic_vocoder_backward.cpp` gradchecks the analytic backward +// against the *in-file* `VocoderBackward::forward`. That proves the backward is +// the correct derivative of that forward, but it is self-referential: if +// `VocoderBackward::forward` itself drifted from the production vocoder, the +// gradcheck would still pass while the gradients flow through the wrong +// function (this is exactly how a `gamma` dimensionality bug slipped past it). +// +// This test closes that gap. 
It builds a synthetic `supertonic_model` on a CPU +// backend with deterministic weights, feeds the *identical* raw weight buffers +// to both `supertonic_vocoder_forward_cpu` (production) and +// `VocoderBackward::forward` (the backward's reference forward), and asserts the +// two waveforms match. Any divergence between the reference forward and the +// production forward — wrong gamma layout, wrong dilation schedule, swapped +// weight index order — fails here. +// +// Model-free: weights are synthesized in-memory, so it always runs in the +// always-on `unit` ctest tier (no GGUF, no fixtures). + +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-cpu.h" + +#include "supertonic_internal.h" +#include "supertonic_vocoder_backward.h" + +#include +#include +#include +#include +#include +#include + +using namespace tts_cpp::supertonic::detail; +using tts_cpp::voc_grad::VocConvNextWeights; +using tts_cpp::voc_grad::VocoderBackward; +using tts_cpp::voc_grad::VocoderWeights; + +namespace { + +int g_failures = 0; +int g_checks = 0; + +#define CHECK(cond, ...) do { \ + ++g_checks; \ + if (!(cond)) { \ + ++g_failures; \ + std::fprintf(stderr, "FAIL %s:%d ", __FILE__, __LINE__); \ + std::fprintf(stderr, __VA_ARGS__); \ + std::fprintf(stderr, "\n"); \ + } \ +} while (0) + +// Production ConvNeXt dilation schedule (`convnext_block` in +// supertonic_vocoder.cpp). The 10-block count is fixed by the model. +constexpr int kNumBlocks = 10; +constexpr int kDilations[kNumBlocks] = {1, 2, 4, 1, 2, 4, 1, 1, 1, 1}; + +// Topology kept tiny (microsecond runtime) but structurally faithful: real +// dilation variety, per-channel gamma, the full denorm -> embed -> 10x convnext +// -> batch-norm -> head1 -> prelu -> head2 chain. 
struct VocoderDims {
    int C_latent   = 4;
    int factor     = 2;   // latent_channels = C_latent * factor = 8
    int latent_len = 5;   // T0 = latent_len * factor = 10
    int C          = 8;   // ConvNeXt width
    int K_embed    = 3;
    int hidden     = 16;
    int K_dw       = 3;
    int Hh         = 4;   // head1 output channels
    int K_head1    = 3;
    int OUT        = 1;   // waveform channels

    // Channel count of the packed latent: C_latent * factor.
    int latent_channels() const { return C_latent * factor; }
    // Time-step count after unpacking: latent_len * factor.
    int T0() const { return latent_len * factor; }
};

// Deterministic, bounded weight generator. Small magnitudes keep the 10-block
// residual chain well-scaled so float (production) vs double (reference)
// rounding stays in the sub-1e-3 band; the values still vary per element so the
// test is not degenerate.
//
// Returns scale * sin(index * 0.7 + phase), computed in double and narrowed to
// float, so |result| <= |scale|.
float gen_value(int index, double phase, double scale) {
    return static_cast<float>(scale * std::sin(index * 0.7 + phase));
}

// Buffer of `n` generated values: element i is gen_value(i, phase, scale).
// A non-positive `n` yields an empty buffer (a negative count would otherwise
// wrap to a huge std::size_t allocation request).
std::vector<float> gen_buffer(int n, double phase, double scale = 0.25) {
    std::vector<float> values;
    if (n <= 0) {
        return values;
    }
    values.reserve(static_cast<std::size_t>(n));
    for (int i = 0; i < n; ++i) {
        values.push_back(gen_value(i, phase, scale));
    }
    return values;
}

// Strictly positive buffer for batch-norm running variance.
// Each element is 0.5 + 0.3 * (s + 1) with s = gen_value(i, phase, 1.0) in
// [-1, 1], i.e. it lies in [0.5, 1.1]. A non-positive `n` yields an empty
// buffer.
std::vector<float> gen_positive(int n, double phase) {
    std::vector<float> values;
    if (n <= 0) {
        return values;
    }
    values.reserve(static_cast<std::size_t>(n));
    for (int i = 0; i < n; ++i) {
        values.push_back(0.5f + 0.3f * (gen_value(i, phase, 1.0) + 1.0f));
    }
    return values;
}

// Raw float weight buffers of one ConvNeXt block plus its depthwise dilation.
struct BlockBuffers {
    std::vector<float> dw_w, dw_b, norm_g, norm_b, pw1_w, pw1_b, pw2_w, pw2_b, gamma;
    int dilation = 1;
};

// All vocoder weight buffers in their raw (production / ggml-linear) layout.
// The same buffers feed both forwards, so any layout mismatch is a real bug,
// not a test artifact.
+struct VocoderBuffers { + VocoderDims dims; + std::vector normalizer_scale, latent_mean, latent_std, embed_w, embed_b; + std::vector blocks; + std::vector final_g, final_b, final_mean, final_var; + std::vector head1_w, head1_b, head_prelu, head2_w; + std::vector latent; +}; + +BlockBuffers gen_block(const VocoderDims & d, int dilation, double phase) { + BlockBuffers b; + b.dilation = dilation; + b.dw_w = gen_buffer(d.C * d.K_dw, phase + 0.1); + b.dw_b = gen_buffer(d.C, phase + 0.2); + b.norm_g = gen_buffer(d.C, phase + 0.3); + b.norm_b = gen_buffer(d.C, phase + 0.4); + b.pw1_w = gen_buffer(d.hidden * d.C, phase + 0.5); + b.pw1_b = gen_buffer(d.hidden, phase + 0.6); + b.pw2_w = gen_buffer(d.C * d.hidden, phase + 0.7); + b.pw2_b = gen_buffer(d.C, phase + 0.8); + b.gamma = gen_buffer(d.C, phase + 0.9, 0.15); // per-channel residual scale + return b; +} + +VocoderBuffers gen_vocoder_buffers() { + VocoderBuffers vb; + const VocoderDims & d = vb.dims; + + vb.normalizer_scale = {1.7f}; + vb.latent_mean = gen_buffer(d.C_latent, 0.2); + vb.latent_std = gen_buffer(d.C_latent, 0.5); + vb.embed_w = gen_buffer(d.C * d.C_latent * d.K_embed, 0.9); + vb.embed_b = gen_buffer(d.C, 1.0); + + for (int i = 0; i < kNumBlocks; ++i) { + vb.blocks.push_back(gen_block(d, kDilations[i], 1.0 + 0.31 * i)); + } + + vb.final_g = gen_buffer(d.C, 0.4); + vb.final_b = gen_buffer(d.C, 0.7); + vb.final_mean = gen_buffer(d.C, 0.1); + vb.final_var = gen_positive(d.C, 0.5); + + vb.head1_w = gen_buffer(d.Hh * d.C * d.K_head1, 0.3); + vb.head1_b = gen_buffer(d.Hh, 0.45); + vb.head_prelu = {0.1f}; + vb.head2_w = gen_buffer(d.OUT * d.Hh, 0.65); + + vb.latent = gen_buffer(d.latent_channels() * d.latent_len, 1.3, 0.6); + return vb; +} + +// --- production-side model assembly ----------------------------------------- + +// Owns the ggml resources backing the synthetic model so the test can release +// them deterministically (free_supertonic_model is avoided — this model never +// went through 
load_supertonic_gguf and carries no scheduler / source map). +struct GgmlModelArena { + ggml_backend_t backend = nullptr; + ggml_context * ctx = nullptr; + ggml_backend_buffer_t buffer = nullptr; + + ~GgmlModelArena() { + if (buffer) ggml_backend_buffer_free(buffer); + if (ctx) ggml_free(ctx); + if (backend) ggml_backend_free(backend); + } +}; + +void set_tensor(ggml_tensor * t, const std::vector & data) { + if ((std::size_t) ggml_nelements(t) != data.size()) { + throw std::runtime_error("tensor element count mismatch while uploading weights"); + } + ggml_backend_tensor_set(t, data.data(), 0, data.size() * sizeof(float)); +} + +void build_block_tensors(ggml_context * ctx, const VocoderDims & d, + supertonic_vocoder_convnext_weights & w) { + w.dw_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d.K_dw, 1, d.C); + w.dw_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C); + w.norm_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C); + w.norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C); + w.pw1_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, d.C, d.hidden); + w.pw1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.hidden); + w.pw2_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, d.hidden, d.C); + w.pw2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C); + w.gamma = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C); +} + +void build_vocoder_tensors(ggml_context * ctx, const VocoderDims & d, supertonic_model & model) { + supertonic_vocoder_weights & v = model.vocoder; + v.normalizer_scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + v.latent_mean = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C_latent); + v.latent_std = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C_latent); + v.embed_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d.K_embed, d.C_latent, d.C); + v.embed_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C); + for (int i = 0; i < kNumBlocks; ++i) build_block_tensors(ctx, d, v.convnext[(std::size_t) i]); + v.final_norm_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C); + v.final_norm_b = 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C); + v.final_norm_running_mean = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C); + v.final_norm_running_var = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.C); + v.head1_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d.K_head1, d.C, d.Hh); + v.head1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, d.Hh); + v.head_prelu = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); + v.head2_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, d.Hh, d.OUT); +} + +void upload_vocoder_weights(const VocoderBuffers & vb, supertonic_model & model) { + const supertonic_vocoder_weights & v = model.vocoder; + set_tensor(v.normalizer_scale, vb.normalizer_scale); + set_tensor(v.latent_mean, vb.latent_mean); + set_tensor(v.latent_std, vb.latent_std); + set_tensor(v.embed_w, vb.embed_w); + set_tensor(v.embed_b, vb.embed_b); + for (int i = 0; i < kNumBlocks; ++i) { + const BlockBuffers & b = vb.blocks[(std::size_t) i]; + const supertonic_vocoder_convnext_weights & w = v.convnext[(std::size_t) i]; + set_tensor(w.dw_w, b.dw_w); + set_tensor(w.dw_b, b.dw_b); + set_tensor(w.norm_g, b.norm_g); + set_tensor(w.norm_b, b.norm_b); + set_tensor(w.pw1_w, b.pw1_w); + set_tensor(w.pw1_b, b.pw1_b); + set_tensor(w.pw2_w, b.pw2_w); + set_tensor(w.pw2_b, b.pw2_b); + set_tensor(w.gamma, b.gamma); + } + set_tensor(v.final_norm_g, vb.final_g); + set_tensor(v.final_norm_b, vb.final_b); + set_tensor(v.final_norm_running_mean, vb.final_mean); + set_tensor(v.final_norm_running_var, vb.final_var); + set_tensor(v.head1_w, vb.head1_w); + set_tensor(v.head1_b, vb.head1_b); + set_tensor(v.head_prelu, vb.head_prelu); + set_tensor(v.head2_w, vb.head2_w); +} + +// Builds a CPU-backed `supertonic_model` whose vocoder weights are the synthetic +// buffers. `arena` owns the ggml resources for deterministic teardown. 
+void build_synthetic_model(const VocoderBuffers & vb, supertonic_model & model, GgmlModelArena & arena) { + const VocoderDims & d = vb.dims; + model.hparams.latent_dim = d.C_latent; + model.hparams.ttl_chunk_compress_factor = d.factor; + model.hparams.latent_channels = d.latent_channels(); + + arena.backend = ggml_backend_cpu_init(); + if (!arena.backend) throw std::runtime_error("ggml_backend_cpu_init failed"); + + constexpr int kMaxTensors = 256; // 5 + 10*9 + 4 + 4 = 103 tensors, padded. + const std::size_t mem = ggml_tensor_overhead() * kMaxTensors; + ggml_init_params params = { mem, nullptr, /*no_alloc=*/true }; + arena.ctx = ggml_init(params); + if (!arena.ctx) throw std::runtime_error("ggml_init failed"); + + build_vocoder_tensors(arena.ctx, d, model); + arena.buffer = ggml_backend_alloc_ctx_tensors(arena.ctx, arena.backend); + if (!arena.buffer) throw std::runtime_error("ggml_backend_alloc_ctx_tensors failed"); + + upload_vocoder_weights(vb, model); + + model.backend = arena.backend; + model.backend_is_cpu = true; +} + +// --- reference-side weight assembly ----------------------------------------- + +std::vector to_double(const std::vector & v) { + return std::vector(v.begin(), v.end()); +} + +VocoderWeights build_reference_weights(const VocoderBuffers & vb) { + const VocoderDims & d = vb.dims; + VocoderWeights w; + w.latent_len = d.latent_len; + w.C_latent = d.C_latent; + w.factor = d.factor; + w.C = d.C; + w.normalizer_scale = vb.normalizer_scale[0]; + w.latent_mean = to_double(vb.latent_mean); + w.latent_std = to_double(vb.latent_std); + + w.K_embed = d.K_embed; + w.embed_w = to_double(vb.embed_w); + w.embed_b = to_double(vb.embed_b); + + for (int i = 0; i < kNumBlocks; ++i) { + const BlockBuffers & b = vb.blocks[(std::size_t) i]; + VocConvNextWeights c; + c.C = d.C; c.hidden = d.hidden; c.K = d.K_dw; c.dilation = b.dilation; + c.dw_w = to_double(b.dw_w); + c.dw_b = to_double(b.dw_b); + c.ln_gamma = to_double(b.norm_g); + c.ln_beta = 
to_double(b.norm_b); + c.pw1_w = to_double(b.pw1_w); + c.pw1_b = to_double(b.pw1_b); + c.pw2_w = to_double(b.pw2_w); + c.pw2_b = to_double(b.pw2_b); + c.gamma = to_double(b.gamma); + w.convnext.push_back(std::move(c)); + } + + w.bn_gamma = to_double(vb.final_g); + w.bn_beta = to_double(vb.final_b); + w.bn_running_mean = to_double(vb.final_mean); + w.bn_running_var = to_double(vb.final_var); + + w.Hh = d.Hh; + w.K_head1 = d.K_head1; + w.head1_w = to_double(vb.head1_w); + w.head1_b = to_double(vb.head1_b); + w.prelu_slope = vb.head_prelu[0]; + + w.OUT = d.OUT; + w.head2_w = to_double(vb.head2_w); + return w; +} + +// --- the parity check -------------------------------------------------------- + +void test_forward_parity() { + const VocoderBuffers vb = gen_vocoder_buffers(); + const VocoderDims & d = vb.dims; + + supertonic_model model; + GgmlModelArena arena; + build_synthetic_model(vb, model, arena); + + std::vector wav_prod; + std::string error; + const bool ok = supertonic_vocoder_forward_cpu(model, vb.latent.data(), d.latent_len, wav_prod, &error); + CHECK(ok, "supertonic_vocoder_forward_cpu failed: %s", error.c_str()); + if (!ok) return; + + VocoderBackward backward(build_reference_weights(vb)); + const std::vector wav_ref = backward.forward(to_double(vb.latent)); + + const std::size_t expected = (std::size_t) d.T0() * d.OUT; + CHECK(wav_prod.size() == expected, "production wav size %zu != expected %zu", wav_prod.size(), expected); + CHECK(wav_ref.size() == expected, "reference wav size %zu != expected %zu", wav_ref.size(), expected); + if (wav_prod.size() != expected || wav_ref.size() != expected) return; + + double max_abs = 0.0; + double max_mag = 0.0; + for (std::size_t i = 0; i < expected; ++i) { + max_abs = std::max(max_abs, std::fabs((double) wav_prod[i] - wav_ref[i])); + max_mag = std::max(max_mag, std::fabs(wav_ref[i])); + } + // float production vs double reference over a 10-block chain; observed error + // is ~2e-8, so this tight bar is a meaningful 
parity check (~500x margin) + // while the bug it guards (per-channel gamma modeled as scalar, wrong + // dilation, swapped weight index) shifts the output by O(output magnitude). + constexpr double kAbsTol = 1e-5; + std::fprintf(stderr, "[vocoder forward parity] max_abs_err=%.3e max_ref_mag=%.3e atol=%.0e\n", + max_abs, max_mag, kAbsTol); + CHECK(max_abs <= kAbsTol, "forward parity exceeded tolerance: max_abs=%.3e > %.0e", max_abs, kAbsTol); +} + +} // namespace + +int main() { + try { + test_forward_parity(); + } catch (const std::exception & e) { + ++g_failures; + std::fprintf(stderr, "FAIL uncaught exception: %s\n", e.what()); + } + std::fprintf(stderr, "\n%s: %d/%d checks passed\n", + g_failures == 0 ? "PASS" : "FAIL", g_checks - g_failures, g_checks); + return g_failures == 0 ? 0 : 1; +}