From 64effb4ca303d4ba925783dbdabdbf80f0d606af Mon Sep 17 00:00:00 2001 From: Jake Mackay Date: Fri, 8 May 2026 08:51:37 -0700 Subject: [PATCH 1/2] fix: two x86_64 Linux inference crashes Patch 1 (src/llama-context.cpp): when n_ctx <= n_batch, sched_reserve() set reserve_pos0 = n_tokens, placing the dry-run batch outside the KV cache window. This caused: GGML_ASSERT(n_comp_visible <= n_comp_cache) failed common_params_fit_impl() probes n_ctx=n_batch first, so this fires on every Linux run. Fix: use 0 as fallback (prefill-from-zero is the only valid graph shape when n_ctx == n_batch). Patch 2 (ggml/src/ggml-cuda/ggml-cuda.cu): supports_op() reported GGML_OP_CONCAT as OK for any non-I32/I16 type, but the kernel asserts src0->type == F32. DeepSeek V4 attention state tensors are non-F32, so the scheduler dispatched them to CUDA and the kernel aborted with: GGML_ASSERT(src0->type == GGML_TYPE_F32) failed Fix: restrict supports_op for CONCAT to F32 only, matching the kernel. Verified on Ubuntu 24.04 x86_64, CUDA 12.1, RTX 3090 with DeepSeek-V4-Flash-IQ2XXS GGUF. Both bugs reproduced 100% before fix. 
--- ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- src/llama-context.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 1c2c3b4ac..edb93d34e 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -5044,7 +5044,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_CONCAT: { ggml_type src0_type = op->src[0]->type; - return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; + return src0_type == GGML_TYPE_F32; } break; case GGML_OP_CONV_TRANSPOSE_1D: { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index cfcb5699a..de7d2385e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -579,7 +579,7 @@ void llama_context::sched_reserve() { // graph, which is larger than the position-zero prefill graph. if (model.arch == LLM_ARCH_DEEPSEEK4 && n_tokens > 1) { const llama_pos reserve_pos0 = std::min( - cparams.n_ctx > n_tokens ? cparams.n_ctx - n_tokens : n_tokens, + cparams.n_ctx > n_tokens ? 
cparams.n_ctx - n_tokens : 0, std::max(cparams.n_batch, 8u*n_tokens)); auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc, nullptr, reserve_pos0); From 2d6935aeea20f9b7c465f5dfdd746728842dacaf Mon Sep 17 00:00:00 2001 From: Billy McGee Date: Fri, 8 May 2026 09:01:31 -0700 Subject: [PATCH 2/2] tests: add regression tests for the two Linux inference crash fixes --- tests/CMakeLists.txt | 3 + tests/test-deepseek4-regressions.cpp | 96 ++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 tests/test-deepseek4-regressions.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index edb585b9f..721141316 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -294,6 +294,9 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama) llama_build_and_test(test-alloc.cpp) target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) +llama_build(test-deepseek4-regressions.cpp) +llama_test(test-deepseek4-regressions) + llama_build(export-graph-ops.cpp) target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) if (TARGET gguf-model-data) diff --git a/tests/test-deepseek4-regressions.cpp b/tests/test-deepseek4-regressions.cpp new file mode 100644 index 000000000..a330fdde7 --- /dev/null +++ b/tests/test-deepseek4-regressions.cpp @@ -0,0 +1,96 @@ +// Regression tests for the two x86_64 Linux inference crashes fixed in +// https://github.com/antirez/llama.cpp-deepseek-v4-flash/pull/7 + +#include "ggml.h" +#include "ggml-backend.h" + +#include <algorithm> +#include <cassert> +#include <cstdio> +#include <cstring> + +// --------------------------------------------------------------------------- +// Test 1 — Patch 1: sched_reserve reserve_pos0 formula +// +// Regression: when n_ctx <= n_tokens (common_params_fit_impl probes +// n_ctx = n_batch as its first candidate, so this fired on every run), +// the old fallback was n_tokens, placing the dry-run batch at positions +// [n_tokens ..
2*n_tokens-1] — entirely outside the KV window [0 .. n_ctx-1]. +// This triggered GGML_ASSERT(n_comp_visible <= n_comp_cache). +// Fix: use 0 as the fallback (prefill-from-zero is always a valid graph shape). +// --------------------------------------------------------------------------- + +static long reserve_pos0(long n_ctx, long n_tokens, long n_batch) { + return std::min( + n_ctx > n_tokens ? n_ctx - n_tokens : 0L, + std::max(n_batch, 8L * n_tokens)); +} + +static void test_reserve_pos0_formula(void) { + // Regression case: n_ctx == n_batch → reserve_pos0 must be 0, not n_tokens. + assert(reserve_pos0(512, 512, 512) == 0); + // Normal decode case: n_ctx > n_tokens → yields n_ctx - n_tokens. + assert(reserve_pos0(1024, 512, 512) == 512); + // Degenerate case: n_ctx < n_tokens → fallback must also be 0. + assert(reserve_pos0(256, 512, 512) == 0); + + printf("PASS test_reserve_pos0_formula\n"); +} + +// --------------------------------------------------------------------------- +// Test 2 — Patch 2: CUDA CONCAT supports_op type filter +// +// Regression: ggml_backend_cuda_device_supports_op returned true for +// GGML_OP_CONCAT with any type except I32/I16, but ggml_cuda_op_concat() +// asserts src0->type == F32. DeepSeek V4 attention state tensors are non-F32, +// so the scheduler dispatched them to CUDA and the kernel aborted: +// GGML_ASSERT(src0->type == GGML_TYPE_F32) failed +// Fix: restrict supports_op for CONCAT to F32 only, matching the kernel. 
+// --------------------------------------------------------------------------- + +static ggml_backend_dev_t find_cuda_dev(void) { + size_t n = ggml_backend_dev_count(); + for (size_t i = 0; i < n; i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + if (strncmp(ggml_backend_dev_name(dev), "CUDA", 4) == 0) { + return dev; + } + } + return nullptr; +} + +static void test_cuda_concat_type_filter(void) { + ggml_backend_load_all(); + + ggml_backend_dev_t cuda_dev = find_cuda_dev(); + if (!cuda_dev) { + printf("SKIP test_cuda_concat_type_filter (no CUDA device)\n"); + return; + } + + // Construct minimal fake op nodes. src[i].buffer = nullptr so the + // per-device buffer-location check inside supports_op is skipped. + + struct ggml_tensor src_f32 = {}; + src_f32.type = GGML_TYPE_F32; + struct ggml_tensor op_f32 = {}; + op_f32.op = GGML_OP_CONCAT; + op_f32.src[0] = &src_f32; + assert(ggml_backend_dev_supports_op(cuda_dev, &op_f32) == true); + + // Before the fix this returned true, dispatching to a kernel that aborts. + struct ggml_tensor src_f16 = {}; + src_f16.type = GGML_TYPE_F16; + struct ggml_tensor op_f16 = {}; + op_f16.op = GGML_OP_CONCAT; + op_f16.src[0] = &src_f16; + assert(ggml_backend_dev_supports_op(cuda_dev, &op_f16) == false); + + printf("PASS test_cuda_concat_type_filter\n"); +} + +int main(void) { + test_reserve_pos0_formula(); + test_cuda_concat_type_filter(); + return 0; +}