2 changes: 1 addition & 1 deletion ggml/src/ggml-cuda/ggml-cuda.cu
@@ -5044,7 +5044,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_CONCAT:
{
ggml_type src0_type = op->src[0]->type;
- return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
+ return src0_type == GGML_TYPE_F32;
} break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
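The rule this patch restores: a backend's supports_op predicate must be at least as strict as the assertions inside the kernels it dispatches to. A minimal sketch of the tightened gate (hypothetical standalone helper; the real check lives inline in ggml_backend_cuda_device_supports_op):

    // ggml_cuda_op_concat() asserts src0->type == GGML_TYPE_F32; claiming any
    // other type here would route the op to a kernel that aborts at runtime.
    static bool cuda_supports_concat(const struct ggml_tensor * op) {
        return op->src[0]->type == GGML_TYPE_F32;
    }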
2 changes: 1 addition & 1 deletion src/llama-context.cpp
@@ -579,7 +579,7 @@ void llama_context::sched_reserve() {
// graph, which is larger than the position-zero prefill graph.
if (model.arch == LLM_ARCH_DEEPSEEK4 && n_tokens > 1) {
const llama_pos reserve_pos0 = std::min<llama_pos>(
- cparams.n_ctx > n_tokens ? cparams.n_ctx - n_tokens : n_tokens,
+ cparams.n_ctx > n_tokens ? cparams.n_ctx - n_tokens : 0,
std::max<uint32_t>(cparams.n_batch, 8u*n_tokens));
auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
model.hparams.no_alloc, nullptr, reserve_pos0);
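Worked through with the first candidate common_params_fit_impl probes (n_ctx = n_batch = 512, full reserve batch of n_tokens = 512; values illustrative):

    // old: reserve_pos0 = min(n_tokens, max(n_batch, 8*n_tokens))
    //                   = min(512, 4096) = 512
    //      -> dry-run batch at positions [512 .. 1023], entirely outside the
    //         KV window [0 .. 511]; GGML_ASSERT(n_comp_visible <= n_comp_cache) fires
    // new: reserve_pos0 = min(0, 4096) = 0
    //      -> prefill from position zero, always a valid graph shape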
3 changes: 3 additions & 0 deletions tests/CMakeLists.txt
@@ -294,6 +294,9 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama)
llama_build_and_test(test-alloc.cpp)
target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)

+ llama_build(test-deepseek4-regressions.cpp)
+ llama_test(test-deepseek4-regressions)

llama_build(export-graph-ops.cpp)
target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
if (TARGET gguf-model-data)
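Assuming the standard llama.cpp CMake workflow, the new test then builds and runs like any other registered target (commands illustrative):

    cmake --build build --target test-deepseek4-regressions
    ctest --test-dir build -R test-deepseek4-regressions --output-on-failure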
96 changes: 96 additions & 0 deletions tests/test-deepseek4-regressions.cpp
@@ -0,0 +1,96 @@
// Regression tests for the two x86_64 Linux inference crashes fixed in
// https://github.com/antirez/llama.cpp-deepseek-v4-flash/pull/7

#include <ggml.h>
#include <ggml-backend.h>

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstring>

// ---------------------------------------------------------------------------
// Test 1 — Patch 1: sched_reserve reserve_pos0 formula
//
// Regression: when n_ctx <= n_tokens (common_params_fit_impl probes
// n_ctx = n_batch as its first candidate, so this fired on every run),
// the old fallback was n_tokens, placing the dry-run batch at positions
// [n_tokens .. 2*n_tokens-1] — entirely outside the KV window [0 .. n_ctx-1].
// This triggered GGML_ASSERT(n_comp_visible <= n_comp_cache).
// Fix: use 0 as the fallback (prefill-from-zero is always a valid graph shape).
// ---------------------------------------------------------------------------

static long reserve_pos0(long n_ctx, long n_tokens, long n_batch) {
return std::min<long>(
n_ctx > n_tokens ? n_ctx - n_tokens : 0L,
std::max<long>(n_batch, 8L * n_tokens));
}

static void test_reserve_pos0_formula(void) {
// Regression case: n_ctx == n_batch → reserve_pos0 must be 0, not n_tokens.
assert(reserve_pos0(512, 512, 512) == 0);
// Normal decode case: n_ctx > n_tokens → yields n_ctx - n_tokens.
assert(reserve_pos0(1024, 512, 512) == 512);
// Degenerate case: n_ctx < n_tokens → fallback must also be 0.
assert(reserve_pos0(256, 512, 512) == 0);

printf("PASS test_reserve_pos0_formula\n");
}

// ---------------------------------------------------------------------------
// Test 2 — Patch 2: CUDA CONCAT supports_op type filter
//
// Regression: ggml_backend_cuda_device_supports_op returned true for
// GGML_OP_CONCAT with any type except I32/I16, but ggml_cuda_op_concat()
// asserts src0->type == F32. DeepSeek V4 attention state tensors are non-F32,
// so the scheduler dispatched them to CUDA and the kernel aborted:
// GGML_ASSERT(src0->type == GGML_TYPE_F32) failed
// Fix: restrict supports_op for CONCAT to F32 only, matching the kernel.
// ---------------------------------------------------------------------------

static ggml_backend_dev_t find_cuda_dev(void) {
size_t n = ggml_backend_dev_count();
for (size_t i = 0; i < n; i++) {
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
if (strncmp(ggml_backend_dev_name(dev), "CUDA", 4) == 0) {
return dev;
}
}
return nullptr;
}

static void test_cuda_concat_type_filter(void) {
ggml_backend_load_all();

ggml_backend_dev_t cuda_dev = find_cuda_dev();
if (!cuda_dev) {
printf("SKIP test_cuda_concat_type_filter (no CUDA device)\n");
return;
}

// Construct minimal fake op nodes. src[i].buffer = nullptr so the
// per-device buffer-location check inside supports_op is skipped.

struct ggml_tensor src_f32 = {};
src_f32.type = GGML_TYPE_F32;
struct ggml_tensor op_f32 = {};
op_f32.op = GGML_OP_CONCAT;
op_f32.src[0] = &src_f32;
assert(ggml_backend_dev_supports_op(cuda_dev, &op_f32) == true);

// Before the fix this returned true, dispatching to a kernel that aborts.
struct ggml_tensor src_f16 = {};
src_f16.type = GGML_TYPE_F16;
struct ggml_tensor op_f16 = {};
op_f16.op = GGML_OP_CONCAT;
op_f16.src[0] = &src_f16;
assert(ggml_backend_dev_supports_op(cuda_dev, &op_f16) == false);

printf("PASS test_cuda_concat_type_filter\n");
}

int main(void) {
test_reserve_pos0_formula();
test_cuda_concat_type_filter();
return 0;
}