diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 1c2c3b4ac..edb93d34e 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -5044,7 +5044,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_CONCAT:
             {
                 ggml_type src0_type = op->src[0]->type;
-                return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+                return src0_type == GGML_TYPE_F32;
             } break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             {
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index cfcb5699a..de7d2385e 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -579,7 +579,7 @@ void llama_context::sched_reserve() {
     // graph, which is larger than the position-zero prefill graph.
     if (model.arch == LLM_ARCH_DEEPSEEK4 && n_tokens > 1) {
         const llama_pos reserve_pos0 = std::min(
-            cparams.n_ctx > n_tokens ? cparams.n_ctx - n_tokens : n_tokens,
+            cparams.n_ctx > n_tokens ? cparams.n_ctx - n_tokens : 0,
             std::max(cparams.n_batch, 8u*n_tokens));
 
         auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc, nullptr, reserve_pos0);
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index edb585b9f..721141316 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -294,6 +294,9 @@
 target_link_libraries(${TEST_TARGET} PRIVATE llama)
 llama_build_and_test(test-alloc.cpp)
 target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
+llama_build(test-deepseek4-regressions.cpp)
+llama_test(test-deepseek4-regressions)
+
 llama_build(export-graph-ops.cpp)
 target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
 if (TARGET gguf-model-data)
diff --git a/tests/test-deepseek4-regressions.cpp b/tests/test-deepseek4-regressions.cpp
new file mode 100644
index 000000000..a330fdde7
--- /dev/null
+++ b/tests/test-deepseek4-regressions.cpp
@@ -0,0 +1,96 @@
+// Regression tests for the two x86_64 Linux inference crashes fixed in
+// https://github.com/antirez/llama.cpp-deepseek-v4-flash/pull/7
+
+#include <ggml.h>
+#include <ggml-backend.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+
+// ---------------------------------------------------------------------------
+// Test 1 — Patch 1: sched_reserve reserve_pos0 formula
+//
+// Regression: when n_ctx <= n_tokens (common_params_fit_impl probes
+// n_ctx = n_batch as its first candidate, so this fired on every run),
+// the old fallback was n_tokens, placing the dry-run batch at positions
+// [n_tokens .. 2*n_tokens-1] — entirely outside the KV window [0 .. n_ctx-1].
+// This triggered GGML_ASSERT(n_comp_visible <= n_comp_cache).
+// Fix: use 0 as the fallback (prefill-from-zero is always a valid graph shape).
+// ---------------------------------------------------------------------------
+
+static long reserve_pos0(long n_ctx, long n_tokens, long n_batch) {
+    return std::min(
+        n_ctx > n_tokens ? n_ctx - n_tokens : 0L,
+        std::max(n_batch, 8L * n_tokens));
+}
+
+static void test_reserve_pos0_formula(void) {
+    // Regression case: n_ctx == n_tokens == n_batch → reserve_pos0 must be 0, not n_tokens.
+    assert(reserve_pos0(512, 512, 512) == 0);
+    // Normal decode case: n_ctx > n_tokens → yields n_ctx - n_tokens.
+    assert(reserve_pos0(1024, 512, 512) == 512);
+    // Degenerate case: n_ctx < n_tokens → fallback must also be 0.
+    assert(reserve_pos0(256, 512, 512) == 0);
+
+    printf("PASS test_reserve_pos0_formula\n");
+}
+
+// ---------------------------------------------------------------------------
+// Test 2 — Patch 2: CUDA CONCAT supports_op type filter
+//
+// Regression: ggml_backend_cuda_device_supports_op returned true for
+// GGML_OP_CONCAT with any type except I32/I16, but ggml_cuda_op_concat()
+// asserts src0->type == F32. DeepSeek V4 attention state tensors are non-F32,
+// so the scheduler dispatched them to CUDA and the kernel aborted:
+//     GGML_ASSERT(src0->type == GGML_TYPE_F32) failed
+// Fix: restrict supports_op for CONCAT to F32 only, matching the kernel.
+// ---------------------------------------------------------------------------
+
+static ggml_backend_dev_t find_cuda_dev(void) {
+    size_t n = ggml_backend_dev_count();
+    for (size_t i = 0; i < n; i++) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (strncmp(ggml_backend_dev_name(dev), "CUDA", 4) == 0) {
+            return dev;
+        }
+    }
+    return nullptr;
+}
+
+static void test_cuda_concat_type_filter(void) {
+    ggml_backend_load_all();
+
+    ggml_backend_dev_t cuda_dev = find_cuda_dev();
+    if (!cuda_dev) {
+        printf("SKIP test_cuda_concat_type_filter (no CUDA device)\n");
+        return;
+    }
+
+    // Construct minimal fake op nodes. src[i].buffer = nullptr so the
+    // per-device buffer-location check inside supports_op is skipped.
+
+    struct ggml_tensor src_f32 = {};
+    src_f32.type = GGML_TYPE_F32;
+    struct ggml_tensor op_f32 = {};
+    op_f32.op = GGML_OP_CONCAT;
+    op_f32.src[0] = &src_f32;
+    assert(ggml_backend_dev_supports_op(cuda_dev, &op_f32) == true);
+
+    // Before the fix this returned true, dispatching to a kernel that aborts.
+    struct ggml_tensor src_f16 = {};
+    src_f16.type = GGML_TYPE_F16;
+    struct ggml_tensor op_f16 = {};
+    op_f16.op = GGML_OP_CONCAT;
+    op_f16.src[0] = &src_f16;
+    assert(ggml_backend_dev_supports_op(cuda_dev, &op_f16) == false);
+
+    printf("PASS test_cuda_concat_type_filter\n");
+}
+
+int main(void) {
+    test_reserve_pos0_formula();
+    test_cuda_concat_type_filter();
+    return 0;
+}
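
Testing note (not part of the patch; assumes the stock llama.cpp CMake/ctest
workflow, with the target and test names registered by the tests/CMakeLists.txt
hunk above):

    cmake -B build -DGGML_CUDA=ON
    cmake --build build --target test-deepseek4-regressions
    ctest --test-dir build -R test-deepseek4-regressions --output-on-failure

Without a CUDA device the second test prints SKIP and the binary still exits 0,
so the suite also passes on CPU-only CI runners.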