71 changes: 61 additions & 10 deletions README.md
@@ -1,22 +1,73 @@
# DeepSeek v4 Flash experimental support

This is a fork of llama.cpp that implements DSv4 support, with a generated GGUF that targets MacBooks with just 128 GB of RAM using 2-bit quantization of the routed experts.
This is a fork of [antirez/llama.cpp-deepseek-v4-flash](https://github.com/antirez/llama.cpp-deepseek-v4-flash) that adds **NVIDIA CUDA support** for running DeepSeek V4 Flash quantized GGUF models with GPU acceleration.

Disclaimer:
* This code was written with heavy help from GPT 5.5 and the official DeepSeek v4 Flash as a reference.
* The model quantized this way behaves very, very well in chat (frontier-model vibes), but it was not extensively tested.
* The code runs on both the CPU and Metal backends; Metal is faster.
## CUDA fix (this fork)

Download the GGUF from: https://huggingface.co/antirez/deepseek-v4-gguf
The original fork only supported CPU and Metal (macOS). On NVIDIA GPUs, running with `-ngl` caused an immediate crash:

```
GGML_ASSERT(src0->type == GGML_TYPE_F32) failed
```

**Root causes fixed:**
- `ggml_cuda_op_concat` had three hard assertions requiring F32 type, blocking any quantized input
- Offset arithmetic was done in float units (a hardcoded `/ 4`, i.e. `sizeof(float)`), which misaddresses block-quantized data whose rows do not occupy `ne0 * 4` bytes; byte counts from `ggml_nbytes()` / `nb[]` are needed instead (see the sketch below)
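
A minimal illustration of that size mismatch (an illustrative sketch, not part of the patch; `ne0 = 4096` is an arbitrary example, and the printed byte counts depend on the quantization headers your ggml build ships):

```cpp
// sketch: rows of block-quantized tensors are not ne0 * sizeof(float) bytes,
// so pointer arithmetic in float units (the old "/ 4") lands mid-block.
#include <cstdio>
#include <cstdint>
#include "ggml.h"

static void show_row_bytes(enum ggml_type type, int64_t ne0) {
    // actual bytes occupied by one row of ne0 elements (block math);
    // current ggml also exposes ggml_row_size() for the same computation
    const size_t real_bytes  = (size_t)(ne0 / ggml_blck_size(type)) * ggml_type_size(type);
    // what float-unit offsets implicitly assume: 4 bytes per element
    const size_t assumed_f32 = (size_t) ne0 * sizeof(float);
    printf("%-10s ne0=%lld  real=%zu bytes  assumed-as-F32=%zu bytes\n",
           ggml_type_name(type), (long long) ne0, real_bytes, assumed_f32);
}

int main(void) {
    show_row_bytes(GGML_TYPE_F32,     4096); // identical, so "/ 4" happened to work
    show_row_bytes(GGML_TYPE_Q2_K,    4096); // block-packed: far fewer bytes per row
    show_row_bytes(GGML_TYPE_IQ2_XXS, 4096);
    return 0;
}
```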

Then, to test it, run the following (for production you may want to tune the context size, disable thinking for faster replies, and so forth):
**Changes in [`ggml/src/ggml-cuda/concat.cu`](ggml/src/ggml-cuda/concat.cu):**
- Removed the F32-only assertions, replaced with `src0->type == src1->type` and `src0->type == dst->type` consistency checks
- Added a byte-level `cudaMemcpyAsync` path for contiguous quantized tensors along dims 1/2/3
- F32 path left entirely unchanged

Tested on **NVIDIA GB10** (122 GB unified memory) with `IQ2XXS-w2Q2K-AProjQ8-SExpQ8` quantization.

## Build (NVIDIA CUDA)

```bash
git clone https://github.com/cdome94/llama.cpp-deepseek-v4-flash
cd llama.cpp-deepseek-v4-flash
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -j$(nproc)
```
llama-cli \
-m DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat.gguf \
-cnv

## Download the model

```bash
pip install huggingface_hub
huggingface-cli download antirez/deepseek-v4-gguf \
DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf \
--local-dir .
```

## Run (interactive chat)

```bash
./build/bin/llama-cli \
-m DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf \
-cnv \
-ngl 999 \
-c 8192
```

## Run (API server, OpenAI-compatible)

```bash
./build/bin/llama-server \
-m DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf \
-ngl 999 \
-c 8192 \
--host 0.0.0.0 --port 8080
```
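
As a quick sanity check, the server speaks the OpenAI chat-completions protocol, so a plain `curl` request against `/v1/chat/completions` works (the `model` field is arbitrary here, since the server only serves the single loaded model):

```bash
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "deepseek-v4-flash",
        "messages": [{"role": "user", "content": "Hello! Briefly introduce yourself."}],
        "max_tokens": 128
      }'
```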

---

Disclaimer (from original fork):
* This code was written with heavy help from GPT 5.5 and the official DeepSeek v4 Flash as a reference.
* The model quantized this way behaves very, very well in chat (frontier-model vibes), but it was not extensively tested.
* The original code runs on CPU and Metal backends. This fork adds NVIDIA CUDA support.

Download the GGUF from: https://huggingface.co/antirez/deepseek-v4-gguf

# llama.cpp

![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
89 changes: 69 additions & 20 deletions ggml/src/ggml-cuda/concat.cu
@@ -162,33 +162,82 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

const int32_t dim = ((int32_t *) dst->op_params)[0];

GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->type == src1->type);
GGML_ASSERT(src0->type == dst->type);

const bool is_f32 = (src0->type == GGML_TYPE_F32);

if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
const float * src0_d = (const float *)src0->data;
const float * src1_d = (const float *)src1->data;

float * dst_d = (float *)dst->data;

if (dim != 3) {
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
concat_f32_cuda(
src0_d + i3 * (src0->nb[3] / 4),
src1_d + i3 * (src1->nb[3] / 4),
dst_d + i3 * ( dst->nb[3] / 4),
src0->ne[0], src0->ne[1], src0->ne[2],
dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
if (is_f32) {
// Original F32 fast path (unchanged)
const float * src0_d = (const float *)src0->data;
const float * src1_d = (const float *)src1->data;
float * dst_d = (float *)dst->data;

if (dim != 3) {
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
concat_f32_cuda(
src0_d + i3 * (src0->nb[3] / sizeof(float)),
src1_d + i3 * (src1->nb[3] / sizeof(float)),
dst_d + i3 * ( dst->nb[3] / sizeof(float)),
src0->ne[0], src0->ne[1], src0->ne[2],
dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
}
} else {
const size_t size0 = ggml_nbytes(src0);
const size_t size1 = ggml_nbytes(src1);
CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync((char *)dst_d + size0, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
}
} else {
const size_t size0 = ggml_nbytes(src0);
const size_t size1 = ggml_nbytes(src1);
// Generic byte-level path for quantized / non-F32 types
const char * src0_d = (const char *)src0->data;
const char * src1_d = (const char *)src1->data;
char * dst_d = (char *)dst->data;

CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
if (dim == 3) {
const size_t size0 = ggml_nbytes(src0);
const size_t size1 = ggml_nbytes(src1);
CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(dst_d + size0, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
} else if (dim == 2) {
// Iterate over dim3 slices; within each, src0 planes then src1 planes
const size_t src0_slice = src0->nb[3]; // bytes per i3 slice in src0
const size_t src1_slice = src1->nb[3];
const size_t dst_slice = dst->nb[3];
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
CUDA_CHECK(cudaMemcpyAsync(dst_d + i3 * dst_slice,
src0_d + i3 * src0_slice,
src0_slice, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(dst_d + i3 * dst_slice + src0_slice,
src1_d + i3 * src1_slice,
src1_slice, cudaMemcpyDeviceToDevice, stream));
}
} else if (dim == 1) {
// Iterate over dim3 and dim2 slices
const size_t src0_row = src0->nb[2]; // bytes per (i3,i2) slice in src0
const size_t src1_row = src1->nb[2];
const size_t dst_row = dst->nb[2];
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
for (int i2 = 0; i2 < dst->ne[2]; i2++) {
CUDA_CHECK(cudaMemcpyAsync(
dst_d + i3 * (size_t)dst->nb[3] + i2 * dst_row,
src0_d + i3 * (size_t)src0->nb[3] + i2 * src0_row,
src0_row, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(
dst_d + i3 * (size_t)dst->nb[3] + i2 * dst_row + src0_row,
src1_d + i3 * (size_t)src1->nb[3] + i2 * src1_row,
src1_row, cudaMemcpyDeviceToDevice, stream));
}
}
} else {
// dim == 0: concatenation within a row — not supported for quantized types
GGML_ABORT("concat along dim 0 not supported for non-F32 quantized types");
}
}
} else {
// non-contiguous path: only supported for F32
GGML_ASSERT(is_f32 && "non-contiguous concat only supported for F32; use contiguous tensors for quantized types");
dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
auto launch_kernel = [&](auto dim) {
concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(