From 64effb4ca303d4ba925783dbdabdbf80f0d606af Mon Sep 17 00:00:00 2001 From: Jake Mackay Date: Fri, 8 May 2026 08:51:37 -0700 Subject: [PATCH 1/2] fix: two x86_64 Linux inference crashes Patch 1 (src/llama-context.cpp): when n_ctx <= n_batch, sched_reserve() set reserve_pos0 = n_tokens, placing the dry-run batch outside the KV cache window. This caused: GGML_ASSERT(n_comp_visible <= n_comp_cache) failed common_params_fit_impl() probes n_ctx=n_batch first, so this fires on every Linux run. Fix: use 0 as fallback (prefill-from-zero is the only valid graph shape when n_ctx == n_batch). Patch 2 (ggml/src/ggml-cuda/ggml-cuda.cu): supports_op() reported GGML_OP_CONCAT as OK for any non-I32/I16 type, but the kernel asserts src0->type == F32. DeepSeek V4 attention state tensors are non-F32, so the scheduler dispatched them to CUDA and the kernel aborted with: GGML_ASSERT(src0->type == GGML_TYPE_F32) failed Fix: restrict supports_op for CONCAT to F32 only, matching the kernel. Verified on Ubuntu 24.04 x86_64, CUDA 12.1, RTX 3090 with DeepSeek-V4-Flash-IQ2XXS GGUF. Both bugs reproduced 100% before fix. 
--- ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- src/llama-context.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 1c2c3b4ac..edb93d34e 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -5044,7 +5044,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_CONCAT: { ggml_type src0_type = op->src[0]->type; - return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; + return src0_type == GGML_TYPE_F32; } break; case GGML_OP_CONV_TRANSPOSE_1D: { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index cfcb5699a..de7d2385e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -579,7 +579,7 @@ void llama_context::sched_reserve() { // graph, which is larger than the position-zero prefill graph. if (model.arch == LLM_ARCH_DEEPSEEK4 && n_tokens > 1) { const llama_pos reserve_pos0 = std::min( - cparams.n_ctx > n_tokens ? cparams.n_ctx - n_tokens : n_tokens, + cparams.n_ctx > n_tokens ? 
cparams.n_ctx - n_tokens : 0, std::max(cparams.n_batch, 8u*n_tokens)); auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc, nullptr, reserve_pos0); From 2d6935aeea20f9b7c465f5dfdd746728842dacaf Mon Sep 17 00:00:00 2001 From: Billy McGee Date: Fri, 8 May 2026 09:01:31 -0700 Subject: [PATCH 2/2] tests: add regression tests for the two Linux inference crash fixes --- tests/CMakeLists.txt | 3 + tests/test-deepseek4-regressions.cpp | 96 ++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 tests/test-deepseek4-regressions.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index edb585b9f..721141316 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -294,6 +294,9 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama) llama_build_and_test(test-alloc.cpp) target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) +llama_build(test-deepseek4-regressions.cpp) +llama_test(test-deepseek4-regressions) + llama_build(export-graph-ops.cpp) target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) if (TARGET gguf-model-data) diff --git a/tests/test-deepseek4-regressions.cpp b/tests/test-deepseek4-regressions.cpp new file mode 100644 index 000000000..a330fdde7 --- /dev/null +++ b/tests/test-deepseek4-regressions.cpp @@ -0,0 +1,96 @@ +// Regression tests for the two x86_64 Linux inference crashes fixed in +// https://github.com/antirez/llama.cpp-deepseek-v4-flash/pull/7 + +#include "ggml.h" +#include "ggml-backend.h" + +#include <algorithm> +#include <cassert> +#include <cstdio> +#include <cstring> + +// --------------------------------------------------------------------------- +// Test 1 — Patch 1: sched_reserve reserve_pos0 formula +// +// Regression: when n_ctx <= n_tokens (common_params_fit_impl probes +// n_ctx = n_batch as its first candidate, so this fired on every run), +// the old fallback was n_tokens, placing the dry-run batch at positions +// [n_tokens ..
2*n_tokens-1] — entirely outside the KV window [0 .. n_ctx-1]. +// This triggered GGML_ASSERT(n_comp_visible <= n_comp_cache). +// Fix: use 0 as the fallback (prefill-from-zero is always a valid graph shape). +// --------------------------------------------------------------------------- + +static long reserve_pos0(long n_ctx, long n_tokens, long n_batch) { + return std::min( + n_ctx > n_tokens ? n_ctx - n_tokens : 0L, + std::max(n_batch, 8L * n_tokens)); +} + +static void test_reserve_pos0_formula(void) { + // Regression case: n_ctx == n_batch → reserve_pos0 must be 0, not n_tokens. + assert(reserve_pos0(512, 512, 512) == 0); + // Normal decode case: n_ctx > n_tokens → yields n_ctx - n_tokens. + assert(reserve_pos0(1024, 512, 512) == 512); + // Degenerate case: n_ctx < n_tokens → fallback must also be 0. + assert(reserve_pos0(256, 512, 512) == 0); + + printf("PASS test_reserve_pos0_formula\n"); +} + +// --------------------------------------------------------------------------- +// Test 2 — Patch 2: CUDA CONCAT supports_op type filter +// +// Regression: ggml_backend_cuda_device_supports_op returned true for +// GGML_OP_CONCAT with any type except I32/I16, but ggml_cuda_op_concat() +// asserts src0->type == F32. DeepSeek V4 attention state tensors are non-F32, +// so the scheduler dispatched them to CUDA and the kernel aborted: +// GGML_ASSERT(src0->type == GGML_TYPE_F32) failed +// Fix: restrict supports_op for CONCAT to F32 only, matching the kernel. 
+// --------------------------------------------------------------------------- + +static ggml_backend_dev_t find_cuda_dev(void) { + size_t n = ggml_backend_dev_count(); + for (size_t i = 0; i < n; i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (strncmp(ggml_backend_dev_name(dev), "CUDA", 4) == 0) { + return dev; + } + } + return nullptr; +} + +static void test_cuda_concat_type_filter(void) { + ggml_backend_load_all(); + + ggml_backend_dev_t cuda_dev = find_cuda_dev(); + if (!cuda_dev) { + printf("SKIP test_cuda_concat_type_filter (no CUDA device)\n"); + return; + } + + // Construct minimal fake op nodes. src[i].buffer = nullptr so the + // per-device buffer-location check inside supports_op is skipped. + + struct ggml_tensor src_f32 = {}; + src_f32.type = GGML_TYPE_F32; + struct ggml_tensor op_f32 = {}; + op_f32.op = GGML_OP_CONCAT; + op_f32.src[0] = &src_f32; + assert(ggml_backend_dev_supports_op(cuda_dev, &op_f32) == true); + + // Before the fix this returned true, dispatching to a kernel that aborts. + struct ggml_tensor src_f16 = {}; + src_f16.type = GGML_TYPE_F16; + struct ggml_tensor op_f16 = {}; + op_f16.op = GGML_OP_CONCAT; + op_f16.src[0] = &src_f16; + assert(ggml_backend_dev_supports_op(cuda_dev, &op_f16) == false); + + printf("PASS test_cuda_concat_type_filter\n"); +} + +int main(void) { + test_reserve_pos0_formula(); + test_cuda_concat_type_filter(); + return 0; +}