diff --git a/README.md b/README.md
index 44ec52766..43828eab2 100644
--- a/README.md
+++ b/README.md
@@ -1,22 +1,73 @@
 # DeepSeek v4 Flash experimental support
 
-This is a fork of llama.cpp that implements DSv4 support, with generated GGUF that aims to target MacBooks with just 128GB of RAM using 2bit quantization of routed experts.
+This is a fork of [antirez/llama.cpp-deepseek-v4-flash](https://github.com/antirez/llama.cpp-deepseek-v4-flash) that adds **NVIDIA CUDA support** for running DeepSeek V4 Flash quantized GGUF models with GPU acceleration.
 
-Disclaimer:
-* This code was written with heavy help from GPT 5.5 and the official DeepSeek v4 Flash as reference.
-* The model quantized in this way behaves very very well in the chat, frontier-model vibes, but it was not extensively tested.
-* The code runs both with CPU and Metal backends. With Metal is faster.
+## CUDA fix (this fork)
 
-Download the GGUF from: https://huggingface.co/antirez/deepseek-v4-gguf
+The original fork only supported CPU and Metal (macOS). On NVIDIA GPUs, running with `-ngl` (GPU offload) caused an immediate crash:
+
+```
+GGML_ASSERT(src0->type == GGML_TYPE_F32) failed
+```
+
+**Root causes fixed:**
+- `ggml_cuda_op_concat` had three hard assertions requiring F32 tensors, which blocked any quantized input
+- Offset arithmetic divided byte sizes by a hardcoded `4` (`sizeof(float)`), which is only valid for F32 tensors
 
-Then to test it run with (but for production you may want to tune context, disable thinking for faster replies and so forth):
+**Changes in [`ggml/src/ggml-cuda/concat.cu`](ggml/src/ggml-cuda/concat.cu):**
+- Removed the F32-only assertions and replaced them with `src0->type == src1->type` / `src0->type == dst->type` consistency checks
+- Added a byte-level `cudaMemcpyAsync` path for contiguous quantized tensors along dims 1/2/3 (dim 0 is rejected explicitly for quantized types)
+- The F32 fast path is behaviorally unchanged (the hardcoded `/ 4` is now written as `/ sizeof(float)`)
+
+Tested on **NVIDIA GB10** (122 GB unified memory) with `IQ2XXS-w2Q2K-AProjQ8-SExpQ8` quantization.
+
+## Build (NVIDIA CUDA)
+
+```bash
+git clone https://github.com/cdome94/llama.cpp-deepseek-v4-flash
+cd llama.cpp-deepseek-v4-flash
+cmake -B build -DGGML_CUDA=ON
+cmake --build build --config Release -j$(nproc)
 ```
-llama-cli \
-    -m DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat.gguf \
-    -cnv
+
+## Download the model
+
+```bash
+pip install huggingface_hub
+huggingface-cli download antirez/deepseek-v4-gguf \
+    DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf \
+    --local-dir .
 ```
 
+## Run (interactive chat)
+
+```bash
+./build/bin/llama-cli \
+    -m DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf \
+    -cnv \
+    -ngl 999 \
+    -c 8192
+```
+
+## Run (API server, OpenAI-compatible)
+
+```bash
+./build/bin/llama-server \
+    -m DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf \
+    -ngl 999 \
+    -c 8192 \
+    --host 0.0.0.0 --port 8080
+```
+
+An example `curl` request against the server is shown after the disclaimer below.
+
+---
+
+Disclaimer (from original fork):
+* This code was written with heavy help from GPT 5.5 and the official DeepSeek v4 Flash as reference.
+* The model quantized in this way behaves very very well in the chat, frontier-model vibes, but it was not extensively tested.
+* The original code runs on CPU and Metal backends. This fork adds NVIDIA CUDA support.
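+
+Once `llama-server` is up, any OpenAI-compatible client can talk to it. Below is a minimal sketch of a request with `curl`; it assumes the default `--host 0.0.0.0 --port 8080` from the command above, and the prompt text is only an example:
+
+```bash
+# Minimal chat-completion request against the local llama-server instance.
+curl http://localhost:8080/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [
+      {"role": "user", "content": "Summarize what DeepSeek V4 Flash is in one sentence."}
+    ]
+  }'
+```
+
+The response comes back in the standard OpenAI chat-completions JSON format.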
+
+Download the GGUF from: https://huggingface.co/antirez/deepseek-v4-gguf
+
 # llama.cpp
 
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu
index e9ffd274b..fce5585c2 100644
--- a/ggml/src/ggml-cuda/concat.cu
+++ b/ggml/src/ggml-cuda/concat.cu
@@ -162,33 +162,82 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int32_t dim = ((int32_t *) dst->op_params)[0];
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == src1->type);
+    GGML_ASSERT(src0->type == dst->type);
+
+    const bool is_f32 = (src0->type == GGML_TYPE_F32);
 
     if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
-        const float * src0_d = (const float *)src0->data;
-        const float * src1_d = (const float *)src1->data;
-
-        float * dst_d = (float *)dst->data;
-
-        if (dim != 3) {
-            for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-                concat_f32_cuda(
-                    src0_d + i3 * (src0->nb[3] / 4),
-                    src1_d + i3 * (src1->nb[3] / 4),
-                    dst_d + i3 * ( dst->nb[3] / 4),
-                    src0->ne[0], src0->ne[1], src0->ne[2],
-                    dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
+        if (is_f32) {
+            // Original F32 fast path (unchanged)
+            const float * src0_d = (const float *)src0->data;
+            const float * src1_d = (const float *)src1->data;
+            float * dst_d = (float *)dst->data;
+
+            if (dim != 3) {
+                for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+                    concat_f32_cuda(
+                        src0_d + i3 * (src0->nb[3] / sizeof(float)),
+                        src1_d + i3 * (src1->nb[3] / sizeof(float)),
+                        dst_d + i3 * ( dst->nb[3] / sizeof(float)),
+                        src0->ne[0], src0->ne[1], src0->ne[2],
+                        dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
+                }
+            } else {
+                const size_t size0 = ggml_nbytes(src0);
+                const size_t size1 = ggml_nbytes(src1);
+                CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
+                CUDA_CHECK(cudaMemcpyAsync((char *)dst_d + size0, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
             }
         } else {
-            const size_t size0 = ggml_nbytes(src0);
-            const size_t size1 = ggml_nbytes(src1);
+            // Generic byte-level path for quantized / non-F32 types
+            const char * src0_d = (const char *)src0->data;
+            const char * src1_d = (const char *)src1->data;
+            char * dst_d = (char *)dst->data;
 
-            CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
-            CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
+            if (dim == 3) {
+                const size_t size0 = ggml_nbytes(src0);
+                const size_t size1 = ggml_nbytes(src1);
+                CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
+                CUDA_CHECK(cudaMemcpyAsync(dst_d + size0, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
+            } else if (dim == 2) {
+                // Iterate over dim3 slices; within each, src0 planes then src1 planes
+                const size_t src0_slice = src0->nb[3]; // bytes per i3 slice in src0
+                const size_t src1_slice = src1->nb[3];
+                const size_t dst_slice = dst->nb[3];
+                for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+                    CUDA_CHECK(cudaMemcpyAsync(dst_d + i3 * dst_slice,
+                                               src0_d + i3 * src0_slice,
+                                               src0_slice, cudaMemcpyDeviceToDevice, stream));
+                    CUDA_CHECK(cudaMemcpyAsync(dst_d + i3 * dst_slice + src0_slice,
+                                               src1_d + i3 * src1_slice,
+                                               src1_slice, cudaMemcpyDeviceToDevice, stream));
+                }
+            } else if (dim == 1) {
+                // Iterate over dim3 and dim2 slices
+                const size_t src0_row = src0->nb[2]; // bytes per (i3,i2) slice in src0
+                const size_t src1_row = src1->nb[2];
+                const size_t dst_row = dst->nb[2];
+                for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+                    for (int i2 = 0; i2 < dst->ne[2]; i2++) {
+                        CUDA_CHECK(cudaMemcpyAsync(
+                            dst_d + i3 * (size_t)dst->nb[3] + i2 * dst_row,
+                            src0_d + i3 * (size_t)src0->nb[3] + i2 * src0_row,
+                            src0_row, cudaMemcpyDeviceToDevice, stream));
+                        CUDA_CHECK(cudaMemcpyAsync(
+                            dst_d + i3 * (size_t)dst->nb[3] + i2 * dst_row + src0_row,
+                            src1_d + i3 * (size_t)src1->nb[3] + i2 * src1_row,
+                            src1_row, cudaMemcpyDeviceToDevice, stream));
+                    }
+                }
+            } else {
+                // dim == 0: concatenation within a row is not supported for quantized types
+                GGML_ABORT("concat along dim 0 not supported for non-F32 quantized types");
+            }
         }
     } else {
+        // non-contiguous path: only supported for F32
+        GGML_ASSERT(is_f32 && "non-contiguous concat only supported for F32; use contiguous tensors for quantized types");
         dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
         auto launch_kernel = [&](auto dim) {
             concat_f32_non_cont<<>>(