Skip to content

Commit 4c8767a

Browse files
authored
QVAC-16579 feat: add LavaSR enhancer (CPU/GGML) to tts-cpp (#68)
* QVAC-16579 feat: add LavaSR enhancer (CPU/GGML) to tts-cpp Port the LavaSR Vocos bandwidth-extension enhancer from @qvac/tts-onnx to tts-cpp as an opt-in CPU/GGML post-process: synthesized PCM is neurally upsampled to 48 kHz via a ConvNeXt backbone + ISTFT spec head. - DSP core: Lanczos resampler, radix-2 FFT + STFT/ISTFT, Slaney mel filterbank, FastLR spectral crossover merge (+ standalone unit test). - Scalar CPU enhancer forward + high-level enhance(pcm, sr) pipeline (resample -> mel -> backbone/spec-head -> ISTFT -> FastLR -> 48 kHz). - GGUF weight loader + public tts_cpp::lavasr::Enhancer API. - convert-lavasr-enhancer-to-gguf.py + dump-lavasr-enhancer-fixtures.py. - onnxruntime-parity test for the enhancer core (max abs err ~1e-3). The denoiser stage (UL-UNAS GRU U-Net) is a planned follow-up. * QVAC-16579 feat: f16 enhancer GGUF support + GGUF round-trip test - enhancer GGUF loader: dequant f16 weights to f32 on load (the converter already emits --ftype f16). Halves the model to ~28 MB at ~0.5% relative error vs f32. - test-lavasr-enhancer-gguf: load the converted GGUF via the public tts_cpp::lavasr::Enhancer API + load_enhancer_gguf() and compare the backbone/spec-head output to the onnxruntime golden. Validated end-to-end against a CPU ggml build (f32 ~1e-3 abs; f16 ~0.5% rel). * QVAC-16579 review: address LavaSR enhancer feedback (GustavoA1604) - enhancer_gguf: read lavasr.enhancer.work_sample_rate + mel_ref_sample_rate and use them (drop the hardcoded 48000/44100 + the misleading comment). - enhancer_gguf: validate n_fft/win are powers of two and key tensor shapes (embed/dwconv/pwconv/spec_head) at load; StftProcessor::fft now throws on a non-power-of-two N instead of silently corrupting output. - enhancer_core: drop the dead `eps` placeholder. - converter: derive clip_max from the spec-head Clip node's max input (not a scalar-range scan), with a documented fallback + warning. 
- fixtures + test: add an end-to-end golden (numpy reference pipeline: resample -> mel -> backbone/spec-head -> ISTFT -> FastLR) and compare enhance(pcm_in) against it in test-lavasr-enhancer-gguf (f32 e2e ~2e-5); soften the core-test wording to "core parity + pipeline smoke". --------- Co-authored-by: Zbigniew Herman <212399199+Zbig9000@users.noreply.github.com>
1 parent 586268b commit 4c8767a

23 files changed

Lines changed: 2197 additions & 0 deletions

tts-cpp/CMakeLists.txt

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,15 @@ set(TTS_CPP_LIB_SOURCES
252252
src/supertonic_chunker.cpp
253253
src/mtl_tokenizer.cpp
254254
src/text_preprocess.cpp
255+
# LavaSR neural speech enhancement (QVAC-16579) — CPU/GGML post-process.
256+
src/lavasr/dsp/resampler.cpp
257+
src/lavasr/dsp/stft_processor.cpp
258+
src/lavasr/dsp/mel_filterbank.cpp
259+
src/lavasr/dsp/fastlr_merge.cpp
260+
src/lavasr/enhancer_core.cpp
261+
src/lavasr/enhancer.cpp
262+
src/lavasr/enhancer_gguf.cpp
263+
src/lavasr/enhancer_api.cpp
255264
)
256265

257266
if (TTS_CPP_BUILD_LIBRARY)
@@ -821,6 +830,58 @@ if (TTS_CPP_BUILD_TESTS)
821830
tts_cpp_apply_ccache(test-text-split)
822831
tts_cpp_register_test(test-text-split LABEL "cpu")
823832

833+
# test-lavasr-dsp (QVAC-16579): unit test for the LavaSR DSP primitives --
# resampler, STFT/ISTFT, Slaney mel filterbank and FastLR crossover merge.
# The lavasr/dsp sources are compiled straight into the test binary (pure
# host math, no model and no ggml), so the test always runs in CI.
add_executable(
    test-lavasr-dsp
    test/test_lavasr_dsp.cpp
    src/lavasr/dsp/resampler.cpp
    src/lavasr/dsp/stft_processor.cpp
    src/lavasr/dsp/mel_filterbank.cpp
    src/lavasr/dsp/fastlr_merge.cpp
)
target_include_directories(test-lavasr-dsp PRIVATE src)
tts_cpp_apply_ccache(test-lavasr-dsp)
tts_cpp_register_test(test-lavasr-dsp LABEL "unit")
845+
846+
# test-lavasr-enhancer-core (QVAC-16579): scalar-core parity test.
#   * Checks the neural core (mel -> backbone -> spec head -> real/imag)
#     numerically against an onnxruntime golden.
#   * Runs a finite/length smoke test over the full enhance() pipeline.
# Pure host math (no ggml): the enhancer and DSP sources are compiled directly
# into the test. Fixtures are produced by
# scripts/dump-lavasr-enhancer-fixtures.py. The end-to-end DSP comparison
# (resampler/mel/ISTFT/FastLR vs the numpy reference) is done by
# test-lavasr-enhancer-gguf instead. The test is gated on the golden real.npy
# through REQUIRES, i.e. it stays DISABLED until the fixtures exist.
add_executable(
    test-lavasr-enhancer-core
    test/test_lavasr_enhancer_core.cpp
    src/lavasr/enhancer_core.cpp
    src/lavasr/enhancer.cpp
    src/lavasr/dsp/resampler.cpp
    src/lavasr/dsp/stft_processor.cpp
    src/lavasr/dsp/mel_filterbank.cpp
    src/lavasr/dsp/fastlr_merge.cpp
)
target_include_directories(test-lavasr-enhancer-core PRIVATE src)
tts_cpp_apply_ccache(test-lavasr-enhancer-core)
tts_cpp_register_test(
    test-lavasr-enhancer-core
    LABEL "fixture"
    ARGS "${TTS_CPP_TEST_REF_DIR}/lavasr-enhancer"
    REQUIRES "${TTS_CPP_TEST_REF_DIR}/lavasr-enhancer/real.npy"
)
868+
869+
# LavaSR enhancer GGUF round-trip (QVAC-16579): load the converted GGUF via
# load_enhancer_gguf() + the public tts_cpp::lavasr::Enhancer API, compare
# the neural core to the onnxruntime golden, AND compare the full
# enhance(pcm_in) output (resampler + mel + ISTFT + FastLR + neural) against
# the numpy end-to-end golden (enhanced_48k.npy). Links ggml for GGUF I/O
# (mirrors test-gguf-stream). DISABLED until the GGUF + fixtures exist
# (convert-lavasr-enhancer-to-gguf.py + dump-lavasr-enhancer-fixtures.py).
#
# REQUIRES lists every golden the test compares against: the core golden
# (real.npy) and the end-to-end golden (enhanced_48k.npy). Gating on both
# keeps a fixture directory that lacks the end-to-end golden from running the
# test without one of its inputs.
# NOTE(review): assumes enhanced_48k.npy sits next to real.npy in the
# lavasr-enhancer reference directory -- confirm against
# dump-lavasr-enhancer-fixtures.py.
add_executable(test-lavasr-enhancer-gguf test/test_lavasr_enhancer_gguf.cpp)
target_link_libraries(test-lavasr-enhancer-gguf PRIVATE tts-cpp ggml)
target_include_directories(test-lavasr-enhancer-gguf PRIVATE include src ggml/include)
tts_cpp_apply_ccache(test-lavasr-enhancer-gguf)
tts_cpp_register_test(test-lavasr-enhancer-gguf
    LABEL "fixture"
    ARGS
        "${TTS_CPP_TEST_MODEL_DIR}/lavasr-enhancer.gguf"
        "${TTS_CPP_TEST_REF_DIR}/lavasr-enhancer"
    REQUIRES
        "${TTS_CPP_TEST_MODEL_DIR}/lavasr-enhancer.gguf"
        "${TTS_CPP_TEST_REF_DIR}/lavasr-enhancer/real.npy"
        "${TTS_CPP_TEST_REF_DIR}/lavasr-enhancer/enhanced_48k.npy")
884+
824885
# QVAC-20979 — voice-clone test harness.
825886
#
826887
# Two header-only, model-free harnesses that every later cloning task builds
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
#pragma once
2+
3+
// Public LavaSR enhancer API (QVAC-16579).
4+
//
5+
// Opt-in neural speech enhancement applied to a synthesized PCM signal:
6+
// bandwidth-extends the engine output to 48 kHz using the LavaSR Vocos
7+
// enhancer (ConvNeXt backbone + ISTFT spec head), converted to a single GGUF
8+
// and run on the CPU/GGML path. The denoiser stage is a planned follow-up.
9+
//
10+
// Usage (e.g. from the tts-ggml addon, after engine->synthesize()):
11+
//
12+
// auto enh = tts_cpp::lavasr::Enhancer::load("lavasr-enhancer.gguf");
13+
// result.pcm = enh->enhance(result.pcm, result.sample_rate);
14+
// result.sample_rate = enh->output_sample_rate(); // 48000
15+
//
16+
// The Enhancer is immutable after load and safe to share across threads for
17+
// concurrent enhance() calls (it holds only const weights).
18+
19+
#include "tts-cpp/export.h"
20+
21+
#include <memory>
22+
#include <string>
23+
#include <vector>
24+
25+
namespace tts_cpp::lavasr {
26+
27+
class TTS_CPP_API Enhancer {
28+
public:
29+
// Load the enhancer GGUF. Throws std::runtime_error on failure (file
30+
// missing, wrong architecture, missing tensors).
31+
static std::unique_ptr<Enhancer> load(const std::string & gguf_path);
32+
33+
~Enhancer();
34+
Enhancer(const Enhancer &) = delete;
35+
Enhancer & operator=(const Enhancer &) = delete;
36+
37+
// Enhance mono float32 PCM at `sr_in` Hz (the engine's native rate) to a
38+
// 48 kHz enhanced signal. Returns empty for empty input.
39+
std::vector<float> enhance(const std::vector<float> & pcm_in, int sr_in) const;
40+
41+
// Output sample rate of enhance() (48 kHz).
42+
int output_sample_rate() const;
43+
44+
private:
45+
Enhancer();
46+
struct Impl;
47+
std::unique_ptr<Impl> impl_;
48+
};
49+
50+
} // namespace tts_cpp::lavasr
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#!/usr/bin/env python3
2+
"""Convert the LavaSR enhancer (Vocos bandwidth-extension) ONNX pair into a
3+
single GGUF for the tts-cpp CPU/GGML enhancer.
4+
5+
The enhancer is two ONNX graphs:
6+
* enhancer_backbone.onnx mel[B,80,T] -> hidden[B,T,512]
7+
embed Conv1d(80->512,k7,pad3) -> LayerNorm
8+
8x ConvNeXt block:
9+
dwconv Conv1d(512->512,k7,pad3,group=512)
10+
LayerNorm(eps=1e-6)
11+
pwconv1 Linear(512->1536) + erf-GELU
12+
pwconv2 Linear(1536->512)
13+
*gamma (layer scale) + residual
14+
final LayerNorm
15+
* enhancer_spec_head.onnx hidden[B,T,512] -> real[B,1025,T], imag[B,1025,T]
16+
Linear(512->2050) -> transpose -> split(1025,1025)
17+
mag = clip(exp(split0), max=clip_max); real = mag*cos(split1); imag = mag*sin(split1)
18+
19+
Linear (MatMul) weights are stored ONNX-side as [in,out]; we transpose them to
20+
[out,in] (PyTorch convention) so the C++ loader reads ggml ne=[in,out] and runs
21+
ggml_mul_mat(W, x) directly. Conv weights are stored as ONNX [out,in,k].
22+
23+
Usage:
24+
python convert-lavasr-enhancer-to-gguf.py \
25+
--backbone enhancer_backbone.onnx \
26+
--spec-head enhancer_spec_head.onnx \
27+
--out lavasr-enhancer.gguf \
28+
--ftype f32 # or f16
29+
"""
30+
import argparse
31+
import sys
32+
33+
import numpy as np
34+
import onnx
35+
from gguf import GGUFWriter
36+
from onnx import numpy_helper
37+
38+
# Architecture tag passed to GGUFWriter.
ARCH = "lavasr-enhancer"

# Backbone hyper-parameters (must match src/lavasr/dsp + the @qvac/tts-onnx
# enhancer).
N_MELS, DIM, FFN_DIM = 80, 512, 1536
N_BLOCKS, KERNEL = 8, 7

# STFT geometry; the spec head emits N_FFT // 2 + 1 bins.
N_FFT, HOP, WIN = 2048, 512, 2048
SPEC_BINS = 1 + N_FFT // 2  # 1025

# Sample rates: Slaney mel reference rate (Vocos training) and the rate of
# the 48 kHz audio the enhancer operates on.
MEL_REF_SR, WORK_SR = 44100, 48000

# LayerNorm epsilon.
LN_EPS = 1e-6
53+
54+
55+
def init_map(graph):
    """Return a dict mapping each initializer name in `graph` to its value,
    converted with `numpy_helper.to_array`."""
    arrays = {}
    for tensor in graph.initializer:
        arrays[tensor.name] = numpy_helper.to_array(tensor)
    return arrays
57+
58+
59+
def node_by_output(graph):
    """Return a dict mapping every output name in `graph` to the node that
    produces it. If a name is listed more than once, the last node wins."""
    return {name: node for node in graph.node for name in node.output}
65+
66+
67+
def find_matmul_weight(graph, inits, by_out, bias_name):
    """Return the weight of the MatMul whose result is added to `bias_name`.

    Looks at the first `Add` node in `graph` that has `bias_name` among its
    inputs, follows the Add's other operand to its producing node via
    `by_out` (output name -> node), and returns the first input of that
    MatMul that is present in `inits` (initializer name -> value).

    Raises:
        RuntimeError: if no Add consumes the bias, the Add has no operand
            other than the bias, the other operand is not produced by a
            MatMul, or the MatMul has no initializer input.
    """
    for node in graph.node:
        if node.op_type != "Add" or bias_name not in node.input:
            continue
        operands = [name for name in node.input if name != bias_name]
        if not operands:
            # Previously surfaced as a bare IndexError; report it like the
            # other graph-shape failures in this function.
            raise RuntimeError(f"Add consuming {bias_name} has no non-bias input")
        producer = by_out.get(operands[0])
        if producer is None or producer.op_type != "MatMul":
            raise RuntimeError(f"expected MatMul feeding Add of {bias_name}")
        for name in producer.input:
            if name in inits:
                return inits[name]
        raise RuntimeError(f"MatMul for {bias_name} has no initializer input")
    raise RuntimeError(f"no Add node consuming bias {bias_name}")
81+
82+
83+
def store(writer, name, arr, ftype, allow_f16=True):
    """Add `arr` to `writer` under `name` and print a one-line summary.

    The array is made contiguous and anything that is not already
    float32/float16 is converted to float32. When `ftype` is "f16",
    `allow_f16` is true and the array has at least two dimensions, a float32
    array is then downcast to float16. Callers pass `allow_f16=False` for
    tensors that must stay float32 (biases, norms, gamma).

    Normalising the dtype *before* the f16 decision means a non-float32
    matrix (e.g. float64) also honours `--ftype f16`; it used to be written
    as f32 regardless.
    """
    arr = np.ascontiguousarray(arr)
    if arr.dtype != np.float32 and arr.dtype != np.float16:
        arr = arr.astype(np.float32)
    if ftype == "f16" and allow_f16 and arr.ndim >= 2 and arr.dtype == np.float32:
        arr = arr.astype(np.float16)
    writer.add_tensor(name, arr)
    print(f"  {name:42s} {str(arr.dtype):8s} {list(arr.shape)}")
91+
92+
93+
def main():
    """CLI entry point: convert the backbone + spec-head ONNX pair to a GGUF.

    Steps:
      1. Load both ONNX graphs and index their initializers / node outputs.
      2. Resolve the spec-head clamp (`clip_max`) from the Clip node.
      3. Check that the key weight shapes agree with the hyper-parameters
         hard-coded in this script, so the metadata written to the GGUF
         cannot disagree with the tensors.
      4. Write metadata and tensors (Linear weights transposed to [out,in]).

    Raises:
        RuntimeError: on an unexpected graph structure or weight shape.
        KeyError: if an expected initializer is missing from a graph.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--backbone", required=True)
    ap.add_argument("--spec-head", required=True)
    ap.add_argument("--out", required=True)
    ap.add_argument("--ftype", choices=["f32", "f16"], default="f32")
    args = ap.parse_args()

    bb = onnx.load(args.backbone, load_external_data=True).graph
    sh = onnx.load(args.spec_head, load_external_data=True).graph
    bi = init_map(bb)
    si = init_map(sh)
    bb_by_out = node_by_output(bb)
    sh_by_out = node_by_output(sh)

    # The spec head clamps the magnitude via Clip(exp(x), None, max) before
    # the cos/sin polar reconstruction (see the graph: Exp -> Clip -> Mul). Read
    # the clamp upper bound straight from that Clip node's `max` input (Clip's
    # 3rd input in opset >= 11) rather than scanning all scalar constants, so a
    # re-export carrying another scalar can't silently change the clamp. Falls
    # back to 1000.0 (with a warning) only if the graph shape ever changes.
    clip_max = 1000.0
    clip_nodes = [n for n in sh.node if n.op_type == "Clip"]
    if (len(clip_nodes) == 1 and len(clip_nodes[0].input) >= 3
            and clip_nodes[0].input[2] in si):
        # si values are already numpy arrays (init_map -> numpy_helper.to_array).
        clip_max = float(si[clip_nodes[0].input[2]].reshape(-1)[0])
    else:
        # Diagnostics go to stderr so they are not lost in the tensor listing.
        print(f"WARNING: could not uniquely resolve the spec-head Clip max input "
              f"({len(clip_nodes)} Clip node(s)); using fallback clip_max={clip_max}",
              file=sys.stderr)

    def expect_shape(label, arr, shape):
        # Fail at convert time if a tensor disagrees with the hard-coded
        # hyper-parameters that are written as GGUF metadata below.
        if tuple(arr.shape) != tuple(shape):
            raise RuntimeError(
                f"{label}: shape {list(arr.shape)} does not match the expected "
                f"{list(shape)} implied by this script's hyper-parameters")

    # Resolve and validate every weight BEFORE the output file is created.
    # Conv weights are ONNX [out, in/groups, k]; MatMul weights are [in, out].
    expect_shape("backbone.embed.weight", bi["backbone.embed.weight"],
                 (DIM, N_MELS, KERNEL))
    pw_weights = []
    for i in range(N_BLOCKS):
        p = f"backbone.convnext.{i}"
        expect_shape(f"{p}.dwconv.weight", bi[f"{p}.dwconv.weight"], (DIM, 1, KERNEL))
        w1 = find_matmul_weight(bb, bi, bb_by_out, f"{p}.pwconv1.bias")  # [in=512, out=1536]
        w2 = find_matmul_weight(bb, bi, bb_by_out, f"{p}.pwconv2.bias")  # [in=1536, out=512]
        expect_shape(f"{p}.pwconv1 MatMul weight", w1, (DIM, FFN_DIM))
        expect_shape(f"{p}.pwconv2 MatMul weight", w2, (FFN_DIM, DIM))
        pw_weights.append((w1, w2))
    w_out = find_matmul_weight(sh, si, sh_by_out, "out.bias")  # [in=512, out=2050]
    expect_shape("spec head MatMul weight", w_out, (DIM, 2 * SPEC_BINS))

    writer = GGUFWriter(args.out, ARCH)
    try:
        writer.add_uint32("lavasr.enhancer.dim", DIM)
        writer.add_uint32("lavasr.enhancer.ffn_dim", FFN_DIM)
        writer.add_uint32("lavasr.enhancer.n_blocks", N_BLOCKS)
        writer.add_uint32("lavasr.enhancer.n_mels", N_MELS)
        writer.add_uint32("lavasr.enhancer.kernel", KERNEL)
        writer.add_uint32("lavasr.enhancer.n_fft", N_FFT)
        writer.add_uint32("lavasr.enhancer.hop", HOP)
        writer.add_uint32("lavasr.enhancer.win", WIN)
        writer.add_uint32("lavasr.enhancer.spec_bins", SPEC_BINS)
        writer.add_uint32("lavasr.enhancer.mel_ref_sample_rate", MEL_REF_SR)
        writer.add_uint32("lavasr.enhancer.work_sample_rate", WORK_SR)
        writer.add_float32("lavasr.enhancer.clip_max", clip_max)
        writer.add_float32("lavasr.enhancer.layernorm_eps", LN_EPS)

        print("tensors:")
        # --- embed + first norm ---
        store(writer, "enhancer.embed.weight", bi["backbone.embed.weight"], args.ftype)
        store(writer, "enhancer.embed.bias", bi["backbone.embed.bias"], args.ftype, allow_f16=False)
        store(writer, "enhancer.norm.weight", bi["backbone.norm.weight"], args.ftype, allow_f16=False)
        store(writer, "enhancer.norm.bias", bi["backbone.norm.bias"], args.ftype, allow_f16=False)

        # --- 8 ConvNeXt blocks ---
        for i, (w1, w2) in enumerate(pw_weights):
            p = f"backbone.convnext.{i}"
            store(writer, f"enhancer.block.{i}.dwconv.weight", bi[f"{p}.dwconv.weight"], args.ftype)
            store(writer, f"enhancer.block.{i}.dwconv.bias", bi[f"{p}.dwconv.bias"], args.ftype, allow_f16=False)
            store(writer, f"enhancer.block.{i}.norm.weight", bi[f"{p}.norm.weight"], args.ftype, allow_f16=False)
            store(writer, f"enhancer.block.{i}.norm.bias", bi[f"{p}.norm.bias"], args.ftype, allow_f16=False)
            store(writer, f"enhancer.block.{i}.pwconv1.weight", w1.T, args.ftype)  # -> [out,in]
            store(writer, f"enhancer.block.{i}.pwconv1.bias", bi[f"{p}.pwconv1.bias"], args.ftype, allow_f16=False)
            store(writer, f"enhancer.block.{i}.pwconv2.weight", w2.T, args.ftype)  # -> [out,in]
            store(writer, f"enhancer.block.{i}.pwconv2.bias", bi[f"{p}.pwconv2.bias"], args.ftype, allow_f16=False)
            store(writer, f"enhancer.block.{i}.gamma", bi[f"{p}.gamma"], args.ftype, allow_f16=False)

        # --- final layer norm ---
        store(writer, "enhancer.final_norm.weight", bi["backbone.final_layer_norm.weight"], args.ftype, allow_f16=False)
        store(writer, "enhancer.final_norm.bias", bi["backbone.final_layer_norm.bias"], args.ftype, allow_f16=False)

        # --- spec head ---
        store(writer, "spec_head.out.weight", w_out.T, args.ftype)  # -> [out=2050, in=512]
        store(writer, "spec_head.out.bias", si["out.bias"], args.ftype, allow_f16=False)

        writer.write_header_to_file()
        writer.write_kv_data_to_file()
        writer.write_tensors_to_file()
    finally:
        # Release the output handle even if a store/write step raised.
        writer.close()
    print(f"\nWrote {args.out} (arch={ARCH}, ftype={args.ftype}, clip_max={clip_max})")


if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)