71 changes: 61 additions & 10 deletions README.md
@@ -1,22 +1,73 @@
# DeepSeek v4 Flash experimental support

This is a fork of llama.cpp that implements DSv4 support, with a generated GGUF that targets MacBooks with just 128 GB of RAM using 2-bit quantization of the routed experts.
This is a fork of [antirez/llama.cpp-deepseek-v4-flash](https://github.com/antirez/llama.cpp-deepseek-v4-flash) that adds **NVIDIA CUDA support** for running DeepSeek V4 Flash quantized GGUF models with GPU acceleration.

Disclaimer:
* This code was written with heavy help from GPT 5.5 and the official DeepSeek v4 Flash as a reference.
* The model quantized this way behaves very, very well in chat (frontier-model vibes), but it was not extensively tested.
* The code runs on both the CPU and Metal backends; Metal is faster.
## CUDA fix (this fork)

Download the GGUF from: https://huggingface.co/antirez/deepseek-v4-gguf
The original fork only supported CPU and Metal (macOS). On NVIDIA GPUs, running with `-ngl` caused an immediate crash:

```
GGML_ASSERT(src0->type == GGML_TYPE_F32) failed
```

**Root causes fixed:**
- `ggml_cuda_op_concat` had three hard assertions requiring F32 type, blocking any quantized input
- Offset arithmetic was done in float units (a hardcoded `/ 4`, i.e. `sizeof(float)`), which misaddresses block-quantized data whose rows do not occupy `ne0 * 4` bytes; byte counts from `ggml_nbytes()` / `nb[]` are needed instead (see the sketch below)
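
A minimal illustration of that size mismatch (an illustrative sketch, not part of the patch; `ne0 = 4096` is an arbitrary example, and the printed byte counts depend on the quantization headers your ggml build ships):

```cpp
// sketch: rows of block-quantized tensors are not ne0 * sizeof(float) bytes,
// so pointer arithmetic in float units (the old "/ 4") lands mid-block.
#include <cstdio>
#include <cstdint>
#include "ggml.h"

static void show_row_bytes(enum ggml_type type, int64_t ne0) {
    // actual bytes occupied by one row of ne0 elements (block math);
    // current ggml also exposes ggml_row_size() for the same computation
    const size_t real_bytes  = (size_t)(ne0 / ggml_blck_size(type)) * ggml_type_size(type);
    // what float-unit offsets implicitly assume: 4 bytes per element
    const size_t assumed_f32 = (size_t) ne0 * sizeof(float);
    printf("%-10s ne0=%lld  real=%zu bytes  assumed-as-F32=%zu bytes\n",
           ggml_type_name(type), (long long) ne0, real_bytes, assumed_f32);
}

int main(void) {
    show_row_bytes(GGML_TYPE_F32,     4096); // identical, so "/ 4" happened to work
    show_row_bytes(GGML_TYPE_Q2_K,    4096); // block-packed: far fewer bytes per row
    show_row_bytes(GGML_TYPE_IQ2_XXS, 4096);
    return 0;
}
```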

Then, to test it, run the following (for production you may want to tune the context size, disable thinking for faster replies, and so forth):
**Changes in [`ggml/src/ggml-cuda/concat.cu`](ggml/src/ggml-cuda/concat.cu):**
- Removed the F32-only assertions, replaced with `src0->type == src1->type` and `src0->type == dst->type` consistency checks
- Added a byte-level `cudaMemcpyAsync` path for contiguous quantized tensors along dims 1/2/3
- F32 path left entirely unchanged

Tested on **NVIDIA GB10** (122 GB unified memory) with `IQ2XXS-w2Q2K-AProjQ8-SExpQ8` quantization.

## Build (NVIDIA CUDA)

```bash
git clone https://github.com/cdome94/llama.cpp-deepseek-v4-flash
cd llama.cpp-deepseek-v4-flash
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -j$(nproc)
```
llama-cli \
-m DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat.gguf \
-cnv

## Download the model

```bash
pip install huggingface_hub
huggingface-cli download antirez/deepseek-v4-gguf \
DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf \
--local-dir .
```

## Run (interactive chat)

```bash
./build/bin/llama-cli \
-m DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf \
-cnv \
-ngl 999 \
-c 8192
```

## Run (API server, OpenAI-compatible)

```bash
./build/bin/llama-server \
-m DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf \
-ngl 999 \
-c 8192 \
--host 0.0.0.0 --port 8080
```
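
As a quick sanity check, the server speaks the OpenAI chat-completions protocol, so a plain `curl` request against `/v1/chat/completions` works (the `model` field is arbitrary here, since the server only serves the single loaded model):

```bash
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "deepseek-v4-flash",
        "messages": [{"role": "user", "content": "Hello! Briefly introduce yourself."}],
        "max_tokens": 128
      }'
```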

---

Disclaimer (from original fork):
* This code was written with heavy help from GPT 5.5 and the official DeepSeek v4 Flash as a reference.
* The model quantized this way behaves very, very well in chat (frontier-model vibes), but it was not extensively tested.
* The original code runs on CPU and Metal backends. This fork adds NVIDIA CUDA support.

Download the GGUF from: https://huggingface.co/antirez/deepseek-v4-gguf

# llama.cpp

![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
89 changes: 69 additions & 20 deletions ggml/src/ggml-cuda/concat.cu
@@ -162,33 +162,82 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

const int32_t dim = ((int32_t *) dst->op_params)[0];

GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->type == src1->type);
GGML_ASSERT(src0->type == dst->type);

const bool is_f32 = (src0->type == GGML_TYPE_F32);

if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
const float * src0_d = (const float *)src0->data;
const float * src1_d = (const float *)src1->data;

float * dst_d = (float *)dst->data;

if (dim != 3) {
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
concat_f32_cuda(
src0_d + i3 * (src0->nb[3] / 4),
src1_d + i3 * (src1->nb[3] / 4),
dst_d + i3 * ( dst->nb[3] / 4),
src0->ne[0], src0->ne[1], src0->ne[2],
dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
if (is_f32) {
// Original F32 fast path (unchanged)
const float * src0_d = (const float *)src0->data;
const float * src1_d = (const float *)src1->data;
float * dst_d = (float *)dst->data;

if (dim != 3) {
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
concat_f32_cuda(
src0_d + i3 * (src0->nb[3] / sizeof(float)),
src1_d + i3 * (src1->nb[3] / sizeof(float)),
dst_d + i3 * ( dst->nb[3] / sizeof(float)),
src0->ne[0], src0->ne[1], src0->ne[2],
dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
}
} else {
const size_t size0 = ggml_nbytes(src0);
const size_t size1 = ggml_nbytes(src1);
CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync((char *)dst_d + size0, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
}
} else {
const size_t size0 = ggml_nbytes(src0);
const size_t size1 = ggml_nbytes(src1);
// Generic byte-level path for quantized / non-F32 types
const char * src0_d = (const char *)src0->data;
const char * src1_d = (const char *)src1->data;
char * dst_d = (char *)dst->data;

CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
if (dim == 3) {
const size_t size0 = ggml_nbytes(src0);
const size_t size1 = ggml_nbytes(src1);
CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(dst_d + size0, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
} else if (dim == 2) {
// Iterate over dim3 slices; within each, src0 planes then src1 planes
const size_t src0_slice = src0->nb[3]; // bytes per i3 slice in src0
const size_t src1_slice = src1->nb[3];
const size_t dst_slice = dst->nb[3];
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
CUDA_CHECK(cudaMemcpyAsync(dst_d + i3 * dst_slice,
src0_d + i3 * src0_slice,
src0_slice, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(dst_d + i3 * dst_slice + src0_slice,
src1_d + i3 * src1_slice,
src1_slice, cudaMemcpyDeviceToDevice, stream));
}
} else if (dim == 1) {
// Iterate over dim3 and dim2 slices
const size_t src0_row = src0->nb[2]; // bytes per (i3,i2) slice in src0
const size_t src1_row = src1->nb[2];
const size_t dst_row = dst->nb[2];
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
for (int i2 = 0; i2 < dst->ne[2]; i2++) {
CUDA_CHECK(cudaMemcpyAsync(
dst_d + i3 * (size_t)dst->nb[3] + i2 * dst_row,
src0_d + i3 * (size_t)src0->nb[3] + i2 * src0_row,
src0_row, cudaMemcpyDeviceToDevice, stream));
CUDA_CHECK(cudaMemcpyAsync(
dst_d + i3 * (size_t)dst->nb[3] + i2 * dst_row + src0_row,
src1_d + i3 * (size_t)src1->nb[3] + i2 * src1_row,
src1_row, cudaMemcpyDeviceToDevice, stream));
}
}
} else {
// dim == 0: concatenation within a row — not supported for quantized types
GGML_ABORT("concat along dim 0 not supported for non-F32 quantized types");
}
}
} else {
// non-contiguous path: only supported for F32
GGML_ASSERT(is_f32 && "non-contiguous concat only supported for F32; use contiguous tensors for quantized types");
dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]);
auto launch_kernel = [&](auto dim) {
concat_f32_non_cont<dim><<<grid_dim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(