diff --git a/README.md b/README.md index 4a085ea..f74a594 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,7 @@ cmake --build build -j4 ``` > **Note:** The top-level CMake currently expects GGML in `./ggml` with libraries under `./ggml/build/src`. +> For NVIDIA CUDA, build GGML with `-DGGML_CUDA=ON` and run with `QWEN3_TTS_BACKEND=cuda`. ## Model Setup (Recommended) @@ -193,6 +194,10 @@ At runtime, each component logs its selected backend (for example, `TTSTransform - Preferred order: `IGPU` -> `GPU` -> `ACCEL` -> `CPU` - Encoder and transformer can run on Metal/other accelerators with CPU fallback in the scheduler - Decoder now follows the same backend preference and will use Metal when available +- `QWEN3_TTS_BACKEND` overrides runtime backend selection: `auto` (default), `cuda`, or `cpu` (case-insensitive; unrecognized values fall back to `auto`, and `cuda` falls back to CPU when no CUDA device can be initialized) +- `QWEN3_TTS_DEVICE` selects the zero-based CUDA device index when `QWEN3_TTS_BACKEND=cuda` (default: the first CUDA device that initializes, normally index 0) +- `QWEN3_TTS_DECODER_GPU_MAX_FRAMES` sets the maximum number of frames per CUDA vocoder chunk, including left context (default: `34`; `0` disables chunking) +- `QWEN3_TTS_DECODER_GPU_CONTEXT_FRAMES` sets the number of left-context frames per CUDA vocoder chunk (default: `12`; capped at `QWEN3_TTS_DECODER_GPU_MAX_FRAMES - 1`) ## Architecture diff --git a/src/audio_tokenizer_decoder.cpp b/src/audio_tokenizer_decoder.cpp index 8129ab0..1dd8051 100644 --- a/src/audio_tokenizer_decoder.cpp +++ b/src/audio_tokenizer_decoder.cpp @@ -3,14 +3,38 @@ #include "ggml-cpu.h" #include +#include #include #include +#include #include #define QWEN3_TTS_DEC_MAX_NODES 32768 namespace qwen3_tts { +static int32_t get_env_i32(const char * key, int32_t default_value) { + const char * v = std::getenv(key); + if (!v || !*v) { + return default_value; + } + + char * end = nullptr; + long parsed = strtol(v, &end, 10); + if (end == v || *end != '\0') { + return default_value; + } + + if (parsed < 0) { + return default_value; + } + if (parsed > std::numeric_limits::max()) { + return std::numeric_limits::max(); + } + + return (int32_t) parsed; +} + AudioTokenizerDecoder::AudioTokenizerDecoder() = default; 
AudioTokenizerDecoder::~AudioTokenizerDecoder() { @@ -366,7 +390,7 @@ bool AudioTokenizerDecoder::load_model(const std::string & model_path) { error_msg_ = "Failed to create backend scheduler"; return false; } - + state_.compute_meta.resize(ggml_tensor_overhead() * QWEN3_TTS_DEC_MAX_NODES + ggml_graph_overhead()); return true; @@ -801,8 +825,38 @@ struct ggml_cgraph * AudioTokenizerDecoder::build_graph(int32_t n_frames) { return gf; } -bool AudioTokenizerDecoder::decode(const int32_t * codes, int32_t n_frames, - std::vector & samples) { +int64_t AudioTokenizerDecoder::output_samples_for_frames(int32_t n_frames) const { + if (n_frames <= 0) { + return 0; + } + + int64_t n = n_frames; + + auto conv_t_out = [](int64_t in_len, int64_t stride, int64_t kernel) -> int64_t { + // ConvTranspose1D output length with padding=0, dilation=1, output_padding=0. + return (in_len - 1) * stride + kernel; + }; + + // Two ConvNeXt-style upsample blocks (stride=2, no crop). + const int64_t up0_k = model_.upsample[0].conv_w ? model_.upsample[0].conv_w->ne[0] : 4; + const int64_t up1_k = model_.upsample[1].conv_w ? model_.upsample[1].conv_w->ne[0] : 4; + n = conv_t_out(n, 2, up0_k); + n = conv_t_out(n, 2, up1_k); + + // Decoder blocks: ConvTranspose1D followed by symmetric crop of (k - s) on both sides. + for (int i = 0; i < 4; ++i) { + const int64_t s = model_.config.upsample_rates[i]; + const int64_t k = model_.dec_blocks[i].conv_t_w ? model_.dec_blocks[i].conv_t_w->ne[0] : 2 * s; + const int64_t out_full = conv_t_out(n, s, k); + const int64_t crop = k - s; + n = out_full - crop - crop; + } + + return n > 0 ? 
n : 0; +} + +bool AudioTokenizerDecoder::decode_single(const int32_t * codes, int32_t n_frames, int32_t position_offset, + std::vector & samples) { if (!model_.ctx) { error_msg_ = "Model not loaded"; return false; @@ -848,7 +902,7 @@ bool AudioTokenizerDecoder::decode(const int32_t * codes, int32_t n_frames, if (positions_tensor) { std::vector positions(n_frames); for (int i = 0; i < n_frames; ++i) { - positions[i] = i; + positions[i] = position_offset + i; } ggml_backend_tensor_set(positions_tensor, positions.data(), 0, n_frames * sizeof(int32_t)); @@ -878,6 +932,76 @@ bool AudioTokenizerDecoder::decode(const int32_t * codes, int32_t n_frames, return true; } +bool AudioTokenizerDecoder::is_primary_backend_cuda() const { + ggml_backend_dev_t device = state_.backend ? ggml_backend_get_device(state_.backend) : nullptr; + if (!device) { + return false; + } + + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(device); + const char * reg_name = reg ? ggml_backend_reg_name(reg) : nullptr; + return reg_name && strcmp(reg_name, "CUDA") == 0; +} + +bool AudioTokenizerDecoder::decode_chunked_cuda(const int32_t * codes, int32_t n_frames, + std::vector & samples, + int32_t max_gpu_frames, int32_t context_frames_cfg) { + // Chunked CUDA decode to avoid large IM2COL launches for long utterances. 
+ const int32_t context_frames = std::min(context_frames_cfg, std::max(0, max_gpu_frames - 1)); + const int32_t chunk_payload = std::max(1, max_gpu_frames - context_frames); + + fprintf(stderr, + " AudioTokenizerDecoder: chunked GPU decode enabled (frames=%d, chunk=%d, context=%d)\n", + n_frames, max_gpu_frames, context_frames); + + const auto & cfg = model_.config; + samples.clear(); + samples.reserve((size_t) output_samples_for_frames(n_frames)); + + for (int32_t start = 0; start < n_frames; start += chunk_payload) { + const int32_t ctx_start = std::max(0, start - context_frames); + const int32_t end = std::min(n_frames, start + chunk_payload); + const int32_t seg_frames = end - ctx_start; + const int32_t warmup_frames = start - ctx_start; + + std::vector seg_samples; + if (!decode_single(codes + (size_t) ctx_start * cfg.n_codebooks, seg_frames, ctx_start, seg_samples)) { + return false; + } + + const int64_t drop = output_samples_for_frames(warmup_frames); + const size_t keep_from = (size_t) std::min(drop, (int64_t) seg_samples.size()); + samples.insert(samples.end(), + seg_samples.begin() + (std::vector::difference_type) keep_from, + seg_samples.end()); + } + + return true; +} + +bool AudioTokenizerDecoder::decode(const int32_t * codes, int32_t n_frames, + std::vector & samples) { + if (!model_.ctx) { + error_msg_ = "Model not loaded"; + return false; + } + + if (n_frames <= 0) { + samples.clear(); + return true; + } + + const int32_t max_gpu_frames = get_env_i32("QWEN3_TTS_DECODER_GPU_MAX_FRAMES", 34); + const int32_t context_frames_cfg = get_env_i32("QWEN3_TTS_DECODER_GPU_CONTEXT_FRAMES", 12); + + // Fast path: non-CUDA backends, or requests that fit one decode chunk. 
+ if (!is_primary_backend_cuda() || max_gpu_frames <= 0 || n_frames <= max_gpu_frames) { + return decode_single(codes, n_frames, 0, samples); + } + + return decode_chunked_cuda(codes, n_frames, samples, max_gpu_frames, context_frames_cfg); +} + void free_audio_decoder_model(audio_decoder_model & model) { if (model.buffer) { ggml_backend_buffer_free(model.buffer); diff --git a/src/audio_tokenizer_decoder.h b/src/audio_tokenizer_decoder.h index 03d7344..ebb9a37 100644 --- a/src/audio_tokenizer_decoder.h +++ b/src/audio_tokenizer_decoder.h @@ -181,6 +181,13 @@ class AudioTokenizerDecoder { private: // Build computation graph for decoding struct ggml_cgraph * build_graph(int32_t n_frames); + bool decode_single(const int32_t * codes, int32_t n_frames, int32_t position_offset, + std::vector & samples); + bool is_primary_backend_cuda() const; + bool decode_chunked_cuda(const int32_t * codes, int32_t n_frames, + std::vector & samples, + int32_t max_gpu_frames, int32_t context_frames_cfg); + int64_t output_samples_for_frames(int32_t n_frames) const; // Apply Snake activation: x + (1/alpha) * sin^2(alpha * x) struct ggml_tensor * apply_snake(struct ggml_context * ctx, diff --git a/src/gguf_loader.cpp b/src/gguf_loader.cpp index 649af52..ff62052 100644 --- a/src/gguf_loader.cpp +++ b/src/gguf_loader.cpp @@ -1,7 +1,11 @@ #include "gguf_loader.h" +#include +#include +#include #include #include +#include #include namespace qwen3_tts { @@ -16,6 +20,129 @@ shared_backend_state & get_shared_backend_state() { static shared_backend_state state; return state; } + +enum class backend_mode { + AUTO, + CPU, + CUDA, +}; + +bool iequals(const char * a, const char * b) { + if (!a || !b) { + return false; + } + while (*a && *b) { + if (std::tolower((unsigned char) *a) != std::tolower((unsigned char) *b)) { + return false; + } + ++a; + ++b; + } + return *a == '\0' && *b == '\0'; +} + +bool parse_non_negative_int(const char * s, int & out) { + if (!s || s[0] == '\0') { + return false; + } + + 
errno = 0; + char * end = nullptr; + long v = std::strtol(s, &end, 10); + if (errno != 0 || end == s || *end != '\0' || v < 0 || v > INT_MAX) { + return false; + } + + out = (int) v; + return true; +} + +backend_mode get_backend_mode_from_env() { + const char * env = std::getenv("QWEN3_TTS_BACKEND"); + if (!env || env[0] == '\0' || iequals(env, "auto")) { + return backend_mode::AUTO; + } + if (iequals(env, "cpu")) { + return backend_mode::CPU; + } + if (iequals(env, "cuda")) { + return backend_mode::CUDA; + } + + fprintf(stderr, " [backend] Unknown QWEN3_TTS_BACKEND=%s, using auto\n", env); + return backend_mode::AUTO; +} + +int get_cuda_device_index_from_env() { + const char * env = std::getenv("QWEN3_TTS_DEVICE"); + if (!env || env[0] == '\0') { + return -1; // first CUDA device + } + + int parsed = -1; + if (!parse_non_negative_int(env, parsed)) { + fprintf(stderr, " [backend] Invalid QWEN3_TTS_DEVICE=%s, using default CUDA device\n", env); + return -1; + } + return parsed; +} + +ggml_backend_t init_cuda_backend_from_env() { + const int target_cuda_idx = get_cuda_device_index_from_env(); + int matched_cuda_devices = 0; + + const size_t n_devs = ggml_backend_dev_count(); + for (size_t i = 0; i < n_devs; ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + const char * reg_name = reg ? 
ggml_backend_reg_name(reg) : nullptr; + if (!reg_name || !iequals(reg_name, "CUDA")) { + continue; + } + + if (target_cuda_idx >= 0 && matched_cuda_devices != target_cuda_idx) { + matched_cuda_devices++; + continue; + } + + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + if (backend) { + return backend; + } + + if (target_cuda_idx >= 0) { + break; + } + matched_cuda_devices++; + } + + if (target_cuda_idx >= 0) { + fprintf(stderr, " [backend] Requested CUDA device index %d not available\n", target_cuda_idx); + } + + return nullptr; +} + +ggml_backend_t init_tensor_loader_backend(enum ggml_backend_dev_type preferred_backend_type) { + const backend_mode mode = get_backend_mode_from_env(); + + if (mode == backend_mode::CPU) { + return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + } + if (mode == backend_mode::CUDA) { + ggml_backend_t backend = init_cuda_backend_from_env(); + if (!backend) { + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + } + return backend; + } + + ggml_backend_t backend = ggml_backend_init_by_type(preferred_backend_type, nullptr); + if (!backend && preferred_backend_type != GGML_BACKEND_DEVICE_TYPE_CPU) { + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + } + return backend; +} } GGUFLoader::GGUFLoader() = default; @@ -33,20 +160,33 @@ ggml_backend_t init_preferred_backend(const char * component_name, std::string * return shared.backend; } - ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr); - if (!backend) { - backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr); - } - if (!backend) { - backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_ACCEL, nullptr); - } - if (!backend) { + ggml_backend_t backend = nullptr; + const backend_mode mode = get_backend_mode_from_env(); + if (mode == backend_mode::CPU) { backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + } 
else if (mode == backend_mode::CUDA) { + backend = init_cuda_backend_from_env(); + if (!backend) { + fprintf(stderr, " [backend] CUDA requested but unavailable, falling back to CPU\n"); + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + } + } else { + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr); + if (!backend) { + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr); + } + if (!backend) { + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_ACCEL, nullptr); + } + if (!backend) { + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + } } if (!backend && error_msg) { const char * name = component_name ? component_name : "component"; - *error_msg = "Failed to initialize backend (IGPU/GPU/ACCEL/CPU) for " + std::string(name); + *error_msg = "Failed to initialize backend for " + std::string(name) + + " (QWEN3_TTS_BACKEND=auto|cpu|cuda)"; } if (backend) { @@ -160,10 +300,7 @@ bool load_tensor_data_from_file( std::string & error_msg, enum ggml_backend_dev_type preferred_backend_type ) { - ggml_backend_t backend = ggml_backend_init_by_type(preferred_backend_type, nullptr); - if (!backend && preferred_backend_type != GGML_BACKEND_DEVICE_TYPE_CPU) { - backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - } + ggml_backend_t backend = init_tensor_loader_backend(preferred_backend_type); if (!backend) { error_msg = "Failed to initialize backend for GGUF tensor loader"; return false; diff --git a/src/main.cpp b/src/main.cpp index b609099..e6207a7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -21,6 +21,13 @@ void print_usage(const char * program) { fprintf(stderr, " -j, --threads Number of threads (default: 4)\n"); fprintf(stderr, " -h, --help Show this help\n"); fprintf(stderr, "\n"); + fprintf(stderr, "Environment:\n"); + fprintf(stderr, " QWEN3_TTS_BACKEND Runtime backend override: auto|cuda|cpu\n"); + fprintf(stderr, " 
QWEN3_TTS_DEVICE CUDA device index when backend=cuda (default: 0)\n"); + fprintf(stderr, " QWEN3_TTS_DECODER_GPU_MAX_FRAMES Max frames per CUDA vocoder chunk (default: 34)\n"); + fprintf(stderr, " QWEN3_TTS_DECODER_GPU_CONTEXT_FRAMES Left context per CUDA vocoder chunk (default: 12)\n"); + fprintf(stderr, " QWEN3_TTS_LOW_MEM Enable low-memory mode (set to 1)\n"); + fprintf(stderr, "\n"); fprintf(stderr, "Example:\n"); fprintf(stderr, " %s -m ./models -t \"Hello, world!\" -o hello.wav\n", program); fprintf(stderr, " %s -m ./models -t \"Hello!\" -r reference.wav -o cloned.wav\n", program);