Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ cmake --build build -j4
```

> **Note:** The top-level CMake currently expects GGML in `./ggml` with libraries under `./ggml/build/src`.
> For NVIDIA CUDA, build GGML with `-DGGML_CUDA=ON` and run with `QWEN3_TTS_BACKEND=cuda`.

## Model Setup (Recommended)

Expand Down Expand Up @@ -193,6 +194,10 @@ At runtime, each component logs its selected backend (for example, `TTSTransform
- Preferred order: `IGPU` -> `GPU` -> `ACCEL` -> `CPU`
- Encoder and transformer can run on Metal/other accelerators with CPU fallback in the scheduler
- Decoder now follows the same backend preference and will use Metal when available
- `QWEN3_TTS_BACKEND` overrides runtime selection: `auto` (default), `cuda`, or `cpu`
- `QWEN3_TTS_DEVICE` selects CUDA device index when `QWEN3_TTS_BACKEND=cuda` (default device is index 0)
- `QWEN3_TTS_DECODER_GPU_MAX_FRAMES` sets the maximum number of frames per CUDA vocoder chunk, including any left-context frames (default: `34`; `0` disables chunking)
- `QWEN3_TTS_DECODER_GPU_CONTEXT_FRAMES` sets how many left-context frames are prepended to each CUDA vocoder chunk after the first (default: `12`; capped at `QWEN3_TTS_DECODER_GPU_MAX_FRAMES - 1`)

## Architecture

Expand Down
132 changes: 128 additions & 4 deletions src/audio_tokenizer_decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,38 @@
#include "ggml-cpu.h"

#include <cmath>
#include <cstdlib>
#include <cstring>
#include <algorithm>
#include <limits>
#include <numeric>

#define QWEN3_TTS_DEC_MAX_NODES 32768

namespace qwen3_tts {

// Reads environment variable `key` as a non-negative base-10 integer.
//
// Returns `default_value` when the variable is unset, empty, not fully
// consumed by the numeric parse (e.g. trailing characters), or negative.
// Values above INT32_MAX are clamped to INT32_MAX.
static int32_t get_env_i32(const char * key, int32_t default_value) {
    const char * raw = std::getenv(key);
    const bool is_set = raw != nullptr && raw[0] != '\0';
    if (!is_set) {
        return default_value;
    }

    char * tail = nullptr;
    const long value = std::strtol(raw, &tail, 10);

    // The whole string must have been consumed, and at least one character parsed.
    const bool fully_parsed = tail != raw && *tail == '\0';
    if (!fully_parsed || value < 0) {
        return default_value;
    }

    const long upper = std::numeric_limits<int32_t>::max();
    return (int32_t) (value > upper ? upper : value);
}

// Defaulted constructor: members take their in-class / default-initialized values.
AudioTokenizerDecoder::AudioTokenizerDecoder() = default;

AudioTokenizerDecoder::~AudioTokenizerDecoder() {
Expand Down Expand Up @@ -366,7 +390,7 @@ bool AudioTokenizerDecoder::load_model(const std::string & model_path) {
error_msg_ = "Failed to create backend scheduler";
return false;
}

state_.compute_meta.resize(ggml_tensor_overhead() * QWEN3_TTS_DEC_MAX_NODES + ggml_graph_overhead());

return true;
Expand Down Expand Up @@ -801,8 +825,38 @@ struct ggml_cgraph * AudioTokenizerDecoder::build_graph(int32_t n_frames) {
return gf;
}

bool AudioTokenizerDecoder::decode(const int32_t * codes, int32_t n_frames,
std::vector<float> & samples) {
// Computes the number of output samples expected for `n_frames` input frames.
//
// The length is propagated through two stride-2 upsample stages and then four
// decoder blocks, using kernel sizes read from the loaded weights when present
// (fallbacks: 4 for the upsample stages, 2 * stride for the decoder blocks).
// Returns 0 for non-positive `n_frames` or if the computed length is negative.
int64_t AudioTokenizerDecoder::output_samples_for_frames(int32_t n_frames) const {
    if (n_frames <= 0) {
        return 0;
    }

    // ConvTranspose1D output length with padding=0, dilation=1, output_padding=0.
    const auto transposed_len = [](int64_t len, int64_t stride, int64_t kernel) -> int64_t {
        return (len - 1) * stride + kernel;
    };

    int64_t len = n_frames;

    // Two ConvNeXt-style upsample blocks (stride=2, no crop).
    for (int stage = 0; stage < 2; ++stage) {
        const auto * weight = model_.upsample[stage].conv_w;
        const int64_t kernel = weight ? weight->ne[0] : 4;
        len = transposed_len(len, 2, kernel);
    }

    // Decoder blocks: ConvTranspose1D followed by symmetric crop of (kernel - stride) on both sides.
    for (int block = 0; block < 4; ++block) {
        const int64_t stride = model_.config.upsample_rates[block];
        const auto * weight = model_.dec_blocks[block].conv_t_w;
        const int64_t kernel = weight ? weight->ne[0] : 2 * stride;
        const int64_t trim = kernel - stride;
        len = transposed_len(len, stride, kernel) - trim - trim;
    }

    if (len < 0) {
        len = 0;
    }
    return len;
}

bool AudioTokenizerDecoder::decode_single(const int32_t * codes, int32_t n_frames, int32_t position_offset,
std::vector<float> & samples) {
if (!model_.ctx) {
error_msg_ = "Model not loaded";
return false;
Expand Down Expand Up @@ -848,7 +902,7 @@ bool AudioTokenizerDecoder::decode(const int32_t * codes, int32_t n_frames,
if (positions_tensor) {
std::vector<int32_t> positions(n_frames);
for (int i = 0; i < n_frames; ++i) {
positions[i] = i;
positions[i] = position_offset + i;
}
ggml_backend_tensor_set(positions_tensor, positions.data(), 0,
n_frames * sizeof(int32_t));
Expand Down Expand Up @@ -878,6 +932,76 @@ bool AudioTokenizerDecoder::decode(const int32_t * codes, int32_t n_frames,
return true;
}

bool AudioTokenizerDecoder::is_primary_backend_cuda() const {
ggml_backend_dev_t device = state_.backend ? ggml_backend_get_device(state_.backend) : nullptr;
if (!device) {
return false;
}

ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(device);
const char * reg_name = reg ? ggml_backend_reg_name(reg) : nullptr;
return reg_name && strcmp(reg_name, "CUDA") == 0;
}

// Decodes `n_frames` frames in overlapping segments of at most `max_gpu_frames`
// frames each, appending the results to `samples` (which is cleared first).
//
// Every segment after the first is extended to the left by up to
// `context_frames_cfg` already-decoded frames (capped at max_gpu_frames - 1);
// the samples attributed to those warm-up frames are dropped before appending.
// Returns false as soon as any segment fails to decode.
bool AudioTokenizerDecoder::decode_chunked_cuda(const int32_t * codes, int32_t n_frames,
                                                std::vector<float> & samples,
                                                int32_t max_gpu_frames, int32_t context_frames_cfg) {
    // Chunked CUDA decode to avoid large IM2COL launches for long utterances.
    // Context is capped so that every segment carries at least one new frame.
    const int32_t context_limit = std::max(0, max_gpu_frames - 1);
    const int32_t context_frames = std::min(context_frames_cfg, context_limit);
    const int32_t payload_frames = std::max(1, max_gpu_frames - context_frames);

    fprintf(stderr,
            " AudioTokenizerDecoder: chunked GPU decode enabled (frames=%d, chunk=%d, context=%d)\n",
            n_frames, max_gpu_frames, context_frames);

    samples.clear();
    samples.reserve((size_t) output_samples_for_frames(n_frames));

    for (int32_t payload_begin = 0; payload_begin < n_frames; payload_begin += payload_frames) {
        // Segment = [seg_begin, seg_end): left context followed by the new payload frames.
        const int32_t seg_begin = std::max(0, payload_begin - context_frames);
        const int32_t seg_end = std::min(n_frames, payload_begin + payload_frames);

        const int32_t * seg_codes = codes + (size_t) seg_begin * model_.config.n_codebooks;

        std::vector<float> chunk;
        if (!decode_single(seg_codes, seg_end - seg_begin, seg_begin, chunk)) {
            return false;
        }

        // Skip the samples produced by the warm-up (context) frames, never past the end.
        const int64_t warmup_samples = output_samples_for_frames(payload_begin - seg_begin);
        const int64_t available = (int64_t) chunk.size();
        const size_t skip = (size_t) std::min<int64_t>(warmup_samples, available);

        samples.insert(samples.end(),
                       chunk.begin() + (std::vector<float>::difference_type) skip,
                       chunk.end());
    }

    return true;
}

// Decodes `n_frames` frames of codes into `samples`.
//
// Fails with "Model not loaded" when no model context is present. A
// non-positive frame count clears `samples` and succeeds. On a CUDA primary
// backend, requests longer than QWEN3_TTS_DECODER_GPU_MAX_FRAMES (default 34)
// are decoded in chunks with QWEN3_TTS_DECODER_GPU_CONTEXT_FRAMES (default 12)
// frames of left context; everything else is decoded in one pass.
bool AudioTokenizerDecoder::decode(const int32_t * codes, int32_t n_frames,
                                   std::vector<float> & samples) {
    if (!model_.ctx) {
        error_msg_ = "Model not loaded";
        return false;
    }

    if (n_frames <= 0) {
        samples.clear();
        return true;
    }

    const int32_t max_gpu_frames = get_env_i32("QWEN3_TTS_DECODER_GPU_MAX_FRAMES", 34);
    const int32_t context_frames_cfg = get_env_i32("QWEN3_TTS_DECODER_GPU_CONTEXT_FRAMES", 12);

    // Chunk only on CUDA, with chunking enabled, and when the request exceeds one chunk.
    const bool needs_chunking =
        is_primary_backend_cuda() && max_gpu_frames > 0 && n_frames > max_gpu_frames;

    if (needs_chunking) {
        return decode_chunked_cuda(codes, n_frames, samples, max_gpu_frames, context_frames_cfg);
    }

    // Fast path: non-CUDA backends, or requests that fit one decode chunk.
    return decode_single(codes, n_frames, 0, samples);
}

void free_audio_decoder_model(audio_decoder_model & model) {
if (model.buffer) {
ggml_backend_buffer_free(model.buffer);
Expand Down
7 changes: 7 additions & 0 deletions src/audio_tokenizer_decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,13 @@ class AudioTokenizerDecoder {
private:
// Build computation graph for decoding
struct ggml_cgraph * build_graph(int32_t n_frames);
// Decode n_frames frames starting at `codes` into `samples`; frame i is given
// position index position_offset + i. Returns false on failure.
bool decode_single(const int32_t * codes, int32_t n_frames, int32_t position_offset,
std::vector<float> & samples);
// True when the primary backend's registry is named "CUDA".
bool is_primary_backend_cuda() const;
// Decode in segments of at most max_gpu_frames frames, each after the first
// preceded by up to context_frames_cfg left-context frames whose output
// samples are dropped; results are concatenated into `samples`.
bool decode_chunked_cuda(const int32_t * codes, int32_t n_frames,
std::vector<float> & samples,
int32_t max_gpu_frames, int32_t context_frames_cfg);
// Number of output samples computed for n_frames input frames from the
// upsample / decoder-block strides and kernel sizes (0 if n_frames <= 0).
int64_t output_samples_for_frames(int32_t n_frames) const;

// Apply Snake activation: x + (1/alpha) * sin^2(alpha * x)
struct ggml_tensor * apply_snake(struct ggml_context * ctx,
Expand Down
Loading