diff --git a/README.md b/README.md
index 4a085ea..f74a594 100644
--- a/README.md
+++ b/README.md
@@ -114,6 +114,7 @@ cmake --build build -j4
 ```
 
 > **Note:** The top-level CMake currently expects GGML in `./ggml` with libraries under `./ggml/build/src`.
+> For NVIDIA CUDA, build GGML with `-DGGML_CUDA=ON` and run with `QWEN3_TTS_BACKEND=cuda`.
 
 ## Model Setup (Recommended)
 
@@ -193,6 +194,10 @@ At runtime, each component logs its selected backend (for example, `TTSTransform
 - Preferred order: `IGPU` -> `GPU` -> `ACCEL` -> `CPU`
 - Encoder and transformer can run on Metal/other accelerators with CPU fallback in the scheduler
 - Decoder now follows the same backend preference and will use Metal when available
+- `QWEN3_TTS_BACKEND` overrides runtime backend selection: `auto` (default), `cuda`, or `cpu` (case-insensitive; `cuda` falls back to CPU if no CUDA device can be initialized)
+- `QWEN3_TTS_DEVICE` selects the CUDA device index when `QWEN3_TTS_BACKEND=cuda` (default: the first CUDA device that initializes, normally index 0)
+- `QWEN3_TTS_DECODER_GPU_MAX_FRAMES` controls the maximum number of frames per CUDA vocoder chunk (default: `34`; `0` disables chunking)
+- `QWEN3_TTS_DECODER_GPU_CONTEXT_FRAMES` controls left-context frames per CUDA vocoder chunk (default: `12`)
 
 ## Architecture
 
diff --git a/src/audio_tokenizer_decoder.cpp b/src/audio_tokenizer_decoder.cpp
index 8129ab0..1dd8051 100644
--- a/src/audio_tokenizer_decoder.cpp
+++ b/src/audio_tokenizer_decoder.cpp
@@ -3,14 +3,38 @@
 #include "ggml-cpu.h"
 
 #include <cmath>
+#include <cstdlib>
 #include <cstring>
 #include <algorithm>
+#include <limits>
 #include <numeric>
 
 #define QWEN3_TTS_DEC_MAX_NODES 32768
 
 namespace qwen3_tts {
 
+static int32_t get_env_i32(const char * key, int32_t default_value) { // env var -> non-negative int32
+    const char * v = std::getenv(key);
+    if (!v || !*v) {
+        return default_value; // variable unset or empty
+    }
+    // Parse as base 10; the entire string must be consumed to count as valid.
+    char * end = nullptr;
+    long parsed = strtol(v, &end, 10);
+    if (end == v || *end != '\0') {
+        return default_value; // no digits, or trailing non-numeric characters
+    }
+    // Negative values fall back to the default; values above INT32_MAX are clamped.
+    if (parsed < 0) {
+        return default_value;
+    }
+    if (parsed > std::numeric_limits<int32_t>::max()) {
+        return std::numeric_limits<int32_t>::max();
+    }
+
+    return (int32_t) parsed;
+}
+
 AudioTokenizerDecoder::AudioTokenizerDecoder() = default;
 
 AudioTokenizerDecoder::~AudioTokenizerDecoder() {
@@ -366,7 +390,7 @@ bool AudioTokenizerDecoder::load_model(const std::string & model_path) {
         error_msg_ = "Failed to create backend scheduler";
         return false;
     }
-    
+
     state_.compute_meta.resize(ggml_tensor_overhead() * QWEN3_TTS_DEC_MAX_NODES + ggml_graph_overhead());
     
     return true;
@@ -801,8 +825,38 @@ struct ggml_cgraph * AudioTokenizerDecoder::build_graph(int32_t n_frames) {
     return gf;
 }
 
-bool AudioTokenizerDecoder::decode(const int32_t * codes, int32_t n_frames,
-                                    std::vector<float> & samples) {
+int64_t AudioTokenizerDecoder::output_samples_for_frames(int32_t n_frames) const {
+    if (n_frames <= 0) {
+        return 0;
+    }
+    // Compute the output length by propagating n through each length-changing stage.
+    int64_t n = n_frames;
+
+    auto conv_t_out = [](int64_t in_len, int64_t stride, int64_t kernel) -> int64_t {
+        // ConvTranspose1D output length with padding=0, dilation=1, output_padding=0.
+        return (in_len - 1) * stride + kernel;
+    };
+
+    // Two ConvNeXt-style upsample blocks (stride=2, no crop); kernel size defaults to 4 if the weight is null.
+    const int64_t up0_k = model_.upsample[0].conv_w ? model_.upsample[0].conv_w->ne[0] : 4;
+    const int64_t up1_k = model_.upsample[1].conv_w ? model_.upsample[1].conv_w->ne[0] : 4;
+    n = conv_t_out(n, 2, up0_k);
+    n = conv_t_out(n, 2, up1_k);
+
+    // Decoder blocks: ConvTranspose1D followed by symmetric crop of (k - s) on both sides.
+    for (int i = 0; i < 4; ++i) {
+        const int64_t s = model_.config.upsample_rates[i];
+        const int64_t k = model_.dec_blocks[i].conv_t_w ? model_.dec_blocks[i].conv_t_w->ne[0] : 2 * s; // defaults to 2*s if the weight is null
+        const int64_t out_full = conv_t_out(n, s, k);
+        const int64_t crop = k - s;
+        n = out_full - crop - crop;
+    }
+    // Clamp so that a non-positive computed length is reported as 0.
+    return n > 0 ? n : 0;
+}
+
+bool AudioTokenizerDecoder::decode_single(const int32_t * codes, int32_t n_frames, int32_t position_offset,
+                                          std::vector<float> & samples) {
     if (!model_.ctx) {
         error_msg_ = "Model not loaded";
         return false;
@@ -848,7 +902,7 @@ bool AudioTokenizerDecoder::decode(const int32_t * codes, int32_t n_frames,
     if (positions_tensor) {
         std::vector<int32_t> positions(n_frames);
         for (int i = 0; i < n_frames; ++i) {
-            positions[i] = i;
+            positions[i] = position_offset + i;
         }
         ggml_backend_tensor_set(positions_tensor, positions.data(), 0, 
                                 n_frames * sizeof(int32_t));
@@ -878,6 +932,76 @@ bool AudioTokenizerDecoder::decode(const int32_t * codes, int32_t n_frames,
     return true;
 }
 
+bool AudioTokenizerDecoder::is_primary_backend_cuda() const {
+    ggml_backend_dev_t device = state_.backend ? ggml_backend_get_device(state_.backend) : nullptr;
+    if (!device) {
+        return false; // no backend set, or the backend reports no device
+    }
+    // Identify the backend by its registry name; this comparison is case-sensitive.
+    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(device);
+    const char * reg_name = reg ? ggml_backend_reg_name(reg) : nullptr;
+    return reg_name && strcmp(reg_name, "CUDA") == 0;
+}
+
+bool AudioTokenizerDecoder::decode_chunked_cuda(const int32_t * codes, int32_t n_frames,
+                                                std::vector<float> & samples,
+                                                int32_t max_gpu_frames, int32_t context_frames_cfg) {
+    // Chunked CUDA decode to avoid large IM2COL launches for long utterances.
+    const int32_t context_frames = std::min(context_frames_cfg, std::max(0, max_gpu_frames - 1));
+    const int32_t chunk_payload = std::max(1, max_gpu_frames - context_frames);
+    // Context is capped at max_gpu_frames - 1 and chunk_payload is >= 1, so the loop below always advances.
+    fprintf(stderr,
+            "  AudioTokenizerDecoder: chunked GPU decode enabled (frames=%d, chunk=%d, context=%d)\n",
+            n_frames, max_gpu_frames, context_frames);
+
+    const auto & cfg = model_.config;
+    samples.clear();
+    samples.reserve((size_t) output_samples_for_frames(n_frames));
+    // Each segment is up to context_frames of left context followed by up to chunk_payload new frames.
+    for (int32_t start = 0; start < n_frames; start += chunk_payload) {
+        const int32_t ctx_start = std::max(0, start - context_frames);
+        const int32_t end = std::min(n_frames, start + chunk_payload);
+        const int32_t seg_frames = end - ctx_start;
+        const int32_t warmup_frames = start - ctx_start;
+        // Decode the segment; ctx_start is passed as the position offset so positions stay global.
+        std::vector<float> seg_samples;
+        if (!decode_single(codes + (size_t) ctx_start * cfg.n_codebooks, seg_frames, ctx_start, seg_samples)) {
+            return false;
+        }
+        // Drop the samples attributed to the left-context frames and append the remainder.
+        const int64_t drop = output_samples_for_frames(warmup_frames);
+        const size_t keep_from = (size_t) std::min<int64_t>(drop, (int64_t) seg_samples.size());
+        samples.insert(samples.end(),
+                       seg_samples.begin() + (std::vector<float>::difference_type) keep_from,
+                       seg_samples.end());
+    }
+
+    return true;
+}
+
+bool AudioTokenizerDecoder::decode(const int32_t * codes, int32_t n_frames,
+                                    std::vector<float> & samples) { // entry point: picks single-pass or chunked decode
+    if (!model_.ctx) {
+        error_msg_ = "Model not loaded";
+        return false;
+    }
+
+    if (n_frames <= 0) {
+        samples.clear(); // nothing to decode: succeed with empty output
+        return true;
+    }
+    // Chunking parameters are read from the environment on every call.
+    const int32_t max_gpu_frames = get_env_i32("QWEN3_TTS_DECODER_GPU_MAX_FRAMES", 34);
+    const int32_t context_frames_cfg = get_env_i32("QWEN3_TTS_DECODER_GPU_CONTEXT_FRAMES", 12);
+
+    // Fast path: non-CUDA backends, or requests that fit one decode chunk.
+    if (!is_primary_backend_cuda() || max_gpu_frames <= 0 || n_frames <= max_gpu_frames) {
+        return decode_single(codes, n_frames, 0, samples);
+    }
+    // CUDA backend with more frames than one chunk allows: decode in overlapping chunks.
+    return decode_chunked_cuda(codes, n_frames, samples, max_gpu_frames, context_frames_cfg);
+}
+
 void free_audio_decoder_model(audio_decoder_model & model) {
     if (model.buffer) {
         ggml_backend_buffer_free(model.buffer);
diff --git a/src/audio_tokenizer_decoder.h b/src/audio_tokenizer_decoder.h
index 03d7344..ebb9a37 100644
--- a/src/audio_tokenizer_decoder.h
+++ b/src/audio_tokenizer_decoder.h
@@ -181,6 +181,13 @@ class AudioTokenizerDecoder {
 private:
     // Build computation graph for decoding
     struct ggml_cgraph * build_graph(int32_t n_frames);
+    bool decode_single(const int32_t * codes, int32_t n_frames, int32_t position_offset,
+                       std::vector<float> & samples); // decode one segment; positions start at position_offset
+    bool is_primary_backend_cuda() const; // true if the backend's registry name is "CUDA"
+    bool decode_chunked_cuda(const int32_t * codes, int32_t n_frames,
+                             std::vector<float> & samples,
+                             int32_t max_gpu_frames, int32_t context_frames_cfg); // decode in overlapping chunks
+    int64_t output_samples_for_frames(int32_t n_frames) const; // output sample count for n_frames input frames
     
     // Apply Snake activation: x + (1/alpha) * sin^2(alpha * x)
     struct ggml_tensor * apply_snake(struct ggml_context * ctx,
diff --git a/src/gguf_loader.cpp b/src/gguf_loader.cpp
index 649af52..ff62052 100644
--- a/src/gguf_loader.cpp
+++ b/src/gguf_loader.cpp
@@ -1,7 +1,11 @@
 #include "gguf_loader.h"
 
+#include <cerrno>
+#include <climits>
+#include <cctype>
 #include <cstdio>
 #include <cstring>
+#include <cstdlib>
 #include <fstream>
 
 namespace qwen3_tts {
@@ -16,6 +20,129 @@ shared_backend_state & get_shared_backend_state() {
     static shared_backend_state state;
     return state;
 }
+
+enum class backend_mode { // backend choice parsed from QWEN3_TTS_BACKEND
+    AUTO, // unset, empty, "auto", or an unrecognized value
+    CPU, // "cpu"
+    CUDA, // "cuda"
+};
+
+bool iequals(const char * a, const char * b) { // case-insensitive C-string equality via std::tolower
+    if (!a || !b) {
+        return false; // a null pointer never compares equal, even to another null
+    }
+    while (*a && *b) {
+        if (std::tolower((unsigned char) *a) != std::tolower((unsigned char) *b)) { // cast: tolower requires an unsigned char value
+            return false;
+        }
+        ++a;
+        ++b;
+    }
+    return *a == '\0' && *b == '\0'; // equal only if both strings ended at the same position
+}
+
+bool parse_non_negative_int(const char * s, int & out) { // strict base-10 parse into [0, INT_MAX]; out is written only on success
+    if (!s || s[0] == '\0') {
+        return false;
+    }
+    // Clear errno first so that an error reported by strtol can be detected afterwards.
+    errno = 0;
+    char * end = nullptr;
+    long v = std::strtol(s, &end, 10);
+    if (errno != 0 || end == s || *end != '\0' || v < 0 || v > INT_MAX) {
+        return false; // strtol error, no digits, trailing characters, negative, or too large for int
+    }
+
+    out = (int) v;
+    return true;
+}
+
+backend_mode get_backend_mode_from_env() { // maps QWEN3_TTS_BACKEND to a backend_mode (case-insensitive)
+    const char * env = std::getenv("QWEN3_TTS_BACKEND");
+    if (!env || env[0] == '\0' || iequals(env, "auto")) {
+        return backend_mode::AUTO;
+    }
+    if (iequals(env, "cpu")) {
+        return backend_mode::CPU;
+    }
+    if (iequals(env, "cuda")) {
+        return backend_mode::CUDA;
+    }
+    // Unrecognized value: warn on stderr and behave like auto.
+    fprintf(stderr, "  [backend] Unknown QWEN3_TTS_BACKEND=%s, using auto\n", env);
+    return backend_mode::AUTO;
+}
+
+int get_cuda_device_index_from_env() { // returns the index from QWEN3_TTS_DEVICE, or -1 when none is requested
+    const char * env = std::getenv("QWEN3_TTS_DEVICE");
+    if (!env || env[0] == '\0') {
+        return -1; // first CUDA device
+    }
+    // A value that is not a non-negative integer is reported on stderr and ignored.
+    int parsed = -1;
+    if (!parse_non_negative_int(env, parsed)) {
+        fprintf(stderr, "  [backend] Invalid QWEN3_TTS_DEVICE=%s, using default CUDA device\n", env);
+        return -1;
+    }
+    return parsed;
+}
+
+ggml_backend_t init_cuda_backend_from_env() {
+    const int target_cuda_idx = get_cuda_device_index_from_env();
+    int matched_cuda_devices = 0; // CUDA devices passed over so far
+    // Walk every registered device, considering only those whose registry is named "CUDA".
+    const size_t n_devs = ggml_backend_dev_count();
+    for (size_t i = 0; i < n_devs; ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+        const char * reg_name = reg ? ggml_backend_reg_name(reg) : nullptr;
+        if (!reg_name || !iequals(reg_name, "CUDA")) {
+            continue;
+        }
+        // A specific index was requested: skip CUDA devices until the ordinal matches.
+        if (target_cuda_idx >= 0 && matched_cuda_devices != target_cuda_idx) {
+            matched_cuda_devices++;
+            continue;
+        }
+        // Either this is the requested device, or no index was requested and each device is tried in order.
+        ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+        if (backend) {
+            return backend;
+        }
+        // Init failed: stop if a specific device was requested, otherwise try the next CUDA device.
+        if (target_cuda_idx >= 0) {
+            break;
+        }
+        matched_cuda_devices++;
+    }
+
+    if (target_cuda_idx >= 0) {
+        fprintf(stderr, "  [backend] Requested CUDA device index %d not available\n", target_cuda_idx);
+    }
+
+    return nullptr; // no CUDA backend could be initialized
+}
+
+ggml_backend_t init_tensor_loader_backend(enum ggml_backend_dev_type preferred_backend_type) {
+    const backend_mode mode = get_backend_mode_from_env();
+    // An explicit cpu/cuda override takes precedence over preferred_backend_type.
+    if (mode == backend_mode::CPU) {
+        return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+    }
+    if (mode == backend_mode::CUDA) {
+        ggml_backend_t backend = init_cuda_backend_from_env();
+        if (!backend) {
+            backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); // CUDA unavailable: fall back to CPU
+        }
+        return backend;
+    }
+    // auto: use the caller's preferred device type, falling back to CPU if it cannot be initialized.
+    ggml_backend_t backend = ggml_backend_init_by_type(preferred_backend_type, nullptr);
+    if (!backend && preferred_backend_type != GGML_BACKEND_DEVICE_TYPE_CPU) {
+        backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+    }
+    return backend;
+}
 }
 
 GGUFLoader::GGUFLoader() = default;
@@ -33,20 +160,33 @@ ggml_backend_t init_preferred_backend(const char * component_name, std::string *
         return shared.backend;
     }
 
-    ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr);
-    if (!backend) {
-        backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
-    }
-    if (!backend) {
-        backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_ACCEL, nullptr);
-    }
-    if (!backend) {
+    ggml_backend_t backend = nullptr;
+    const backend_mode mode = get_backend_mode_from_env();
+    if (mode == backend_mode::CPU) {
         backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+    } else if (mode == backend_mode::CUDA) {
+        backend = init_cuda_backend_from_env();
+        if (!backend) {
+            fprintf(stderr, "  [backend] CUDA requested but unavailable, falling back to CPU\n");
+            backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        }
+    } else {
+        backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, nullptr);
+        if (!backend) {
+            backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr);
+        }
+        if (!backend) {
+            backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_ACCEL, nullptr);
+        }
+        if (!backend) {
+            backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        }
     }
 
     if (!backend && error_msg) {
         const char * name = component_name ? component_name : "component";
-        *error_msg = "Failed to initialize backend (IGPU/GPU/ACCEL/CPU) for " + std::string(name);
+        *error_msg = "Failed to initialize backend for " + std::string(name)
+            + " (QWEN3_TTS_BACKEND=auto|cpu|cuda)";
     }
 
     if (backend) {
@@ -160,10 +300,7 @@ bool load_tensor_data_from_file(
     std::string & error_msg,
     enum ggml_backend_dev_type preferred_backend_type
 ) {
-    ggml_backend_t backend = ggml_backend_init_by_type(preferred_backend_type, nullptr);
-    if (!backend && preferred_backend_type != GGML_BACKEND_DEVICE_TYPE_CPU) {
-        backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
-    }
+    ggml_backend_t backend = init_tensor_loader_backend(preferred_backend_type);
     if (!backend) {
         error_msg = "Failed to initialize backend for GGUF tensor loader";
         return false;
diff --git a/src/main.cpp b/src/main.cpp
index b609099..e6207a7 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -21,6 +21,13 @@ void print_usage(const char * program) {
     fprintf(stderr, "  -j, --threads <n>      Number of threads (default: 4)\n");
     fprintf(stderr, "  -h, --help             Show this help\n");
     fprintf(stderr, "\n");
+    fprintf(stderr, "Environment:\n");
+    fprintf(stderr, "  QWEN3_TTS_BACKEND      Runtime backend override: auto|cuda|cpu\n");
+    fprintf(stderr, "  QWEN3_TTS_DEVICE       CUDA device index when backend=cuda (default: 0)\n");
+    fprintf(stderr, "  QWEN3_TTS_DECODER_GPU_MAX_FRAMES     Max frames per CUDA vocoder chunk (default: 34)\n");
+    fprintf(stderr, "  QWEN3_TTS_DECODER_GPU_CONTEXT_FRAMES Left context per CUDA vocoder chunk (default: 12)\n");
+    fprintf(stderr, "  QWEN3_TTS_LOW_MEM      Enable low-memory mode (set to 1)\n");
+    fprintf(stderr, "\n");
     fprintf(stderr, "Example:\n");
     fprintf(stderr, "  %s -m ./models -t \"Hello, world!\" -o hello.wav\n", program);
     fprintf(stderr, "  %s -m ./models -t \"Hello!\" -r reference.wav -o cloned.wav\n", program);