2 changes: 1 addition & 1 deletion ggml/src/ggml-cuda/ggml-cuda.cu
@@ -5044,7 +5044,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_CONCAT:
{
ggml_type src0_type = op->src[0]->type;
- return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+ return src0_type == GGML_TYPE_F32;
} break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
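The rule this patch restores: a backend's supports_op predicate must be at least as strict as the assertions inside the kernels it dispatches to. A minimal sketch of the tightened gate (hypothetical standalone helper; the real check lives inline in ggml_backend_cuda_device_supports_op):

    // ggml_cuda_op_concat() asserts src0->type == GGML_TYPE_F32; claiming any
    // other type here would route the op to a kernel that aborts at runtime.
    static bool cuda_supports_concat(const struct ggml_tensor * op) {
        return op->src[0]->type == GGML_TYPE_F32;
    }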
2 changes: 1 addition & 1 deletion src/llama-context.cpp
@@ -579,7 +579,7 @@ void llama_context::sched_reserve() {
// graph, which is larger than the position-zero prefill graph.
if (model.arch == LLM_ARCH_DEEPSEEK4 && n_tokens > 1) {
const llama_pos reserve_pos0 = std::min<llama_pos>(
- cparams.n_ctx > n_tokens ? cparams.n_ctx - n_tokens : n_tokens,
+ cparams.n_ctx > n_tokens ? cparams.n_ctx - n_tokens : 0,
std::max<uint32_t>(cparams.n_batch, 8u*n_tokens));
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
model.hparams.no_alloc, nullptr, reserve_pos0);
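Worked through with the first candidate common_params_fit_impl probes (n_ctx = n_batch = 512, full reserve batch of n_tokens = 512; values illustrative):

    // old: reserve_pos0 = min(n_tokens, max(n_batch, 8*n_tokens))
    //                   = min(512, 4096) = 512
    //      -> dry-run batch at positions [512 .. 1023], entirely outside the
    //         KV window [0 .. 511]; GGML_ASSERT(n_comp_visible <= n_comp_cache) fires
    // new: reserve_pos0 = min(0, 4096) = 0
    //      -> prefill from position zero, always a valid graph shape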
3 changes: 3 additions & 0 deletions tests/CMakeLists.txt
@@ -294,6 +294,9 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
llama_build_and_test(test-alloc.cpp)
target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)

+ llama_build(test-deepseek4-regressions.cpp)
+ llama_test(test-deepseek4-regressions)

llama_build(export-graph-ops.cpp)
target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
if (TARGET gguf-model-data)
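Assuming the standard llama.cpp CMake workflow, the new test then builds and runs like any other registered target (commands illustrative):

    cmake --build build --target test-deepseek4-regressions
    ctest --test-dir build -R test-deepseek4-regressions --output-on-failure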
96 changes: 96 additions & 0 deletions tests/test-deepseek4-regressions.cpp
@@ -0,0 +1,96 @@
// Regression tests for the two x86_64 Linux inference crashes fixed in
// https://github.com/antirez/llama.cpp-deepseek-v4-flash/pull/7

#include <ggml.h>
#include <ggml-backend.h>

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstring>

// ---------------------------------------------------------------------------
// Test 1 — Patch 1: sched_reserve reserve_pos0 formula
//
// Regression: when n_ctx <= n_tokens (common_params_fit_impl probes
// n_ctx = n_batch as its first candidate, so this fired on every run),
// the old fallback was n_tokens, placing the dry-run batch at positions
// [n_tokens .. 2*n_tokens-1] — entirely outside the KV window [0 .. n_ctx-1].
// This triggered GGML_ASSERT(n_comp_visible <= n_comp_cache).
// Fix: use 0 as the fallback (prefill-from-zero is always a valid graph shape).
// ---------------------------------------------------------------------------

static long reserve_pos0(long n_ctx, long n_tokens, long n_batch) {
return std::min<long>(
n_ctx > n_tokens ? n_ctx - n_tokens : 0L,
std::max<long>(n_batch, 8L * n_tokens));
}

static void test_reserve_pos0_formula(void) {
// Regression case: n_ctx == n_batch → reserve_pos0 must be 0, not n_tokens.
assert(reserve_pos0(512, 512, 512) == 0);
// Normal decode case: n_ctx > n_tokens → yields n_ctx - n_tokens.
assert(reserve_pos0(1024, 512, 512) == 512);
// Degenerate case: n_ctx < n_tokens → fallback must also be 0.
assert(reserve_pos0(256, 512, 512) == 0);

printf("PASS test_reserve_pos0_formula\n");
}

// ---------------------------------------------------------------------------
// Test 2 — Patch 2: CUDA CONCAT supports_op type filter
//
// Regression: ggml_backend_cuda_device_supports_op returned true for
// GGML_OP_CONCAT with any type except I32/I16, but ggml_cuda_op_concat()
// asserts src0->type == F32. DeepSeek V4 attention state tensors are non-F32,
// so the scheduler dispatched them to CUDA and the kernel aborted:
// GGML_ASSERT(src0->type == GGML_TYPE_F32) failed
// Fix: restrict supports_op for CONCAT to F32 only, matching the kernel.
// ---------------------------------------------------------------------------

static ggml_backend_dev_t find_cuda_dev(void) {
size_t n = ggml_backend_dev_count();
for (size_t i = 0; i < n; i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (strncmp(ggml_backend_dev_name(dev), "CUDA", 4) == 0) {
return dev;
}
}
return nullptr;
}

static void test_cuda_concat_type_filter(void) {
ggml_backend_load_all();

ggml_backend_dev_t cuda_dev = find_cuda_dev();
if (!cuda_dev) {
printf("SKIP test_cuda_concat_type_filter (no CUDA device)\n");
return;
}

// Construct minimal fake op nodes. src[i].buffer = nullptr so the
// per-device buffer-location check inside supports_op is skipped.

struct ggml_tensor src_f32 = {};
src_f32.type = GGML_TYPE_F32;
struct ggml_tensor op_f32 = {};
op_f32.op = GGML_OP_CONCAT;
op_f32.src[0] = &src_f32;
assert(ggml_backend_dev_supports_op(cuda_dev, &op_f32) == true);

// Before the fix this returned true, dispatching to a kernel that aborts.
struct ggml_tensor src_f16 = {};
src_f16.type = GGML_TYPE_F16;
struct ggml_tensor op_f16 = {};
op_f16.op = GGML_OP_CONCAT;
op_f16.src[0] = &src_f16;
assert(ggml_backend_dev_supports_op(cuda_dev, &op_f16) == false);

printf("PASS test_cuda_concat_type_filter\n");
}

int main(void) {
test_reserve_pos0_formula();
test_cuda_concat_type_filter();
return 0;
}