We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 8a4cc1a, commit 44656e1 (Copy full SHA for 44656e1)
2 files changed
third_party/xllm_atb_layers
xllm/core/runtime/worker_impl.cpp
@@ -288,9 +288,12 @@ bool WorkerImpl::allocate_kv_cache(
288
} else {
289
// Full attention layer: allocate key_cache and value_cache only
290
#if defined(USE_NPU)
291
+ // Keep runtime allocation format consistent with capacity estimation in
292
+ // llm_engine: only deepseek_v3 and deepseek_v3_mtp use FRACTAL_NZ with prefix cache.
293
+ const auto& model_type = context_.get_model_args().model_type();
294
aclFormat npu_format_type =
- context_.get_model_args().model_type() == "deepseek_v3" &&
- FLAGS_enable_prefix_cache
295
+ ((model_type == "deepseek_v3" || model_type == "deepseek_v3_mtp") &&
296
+ FLAGS_enable_prefix_cache)
297
? ACL_FORMAT_FRACTAL_NZ
298
: ACL_FORMAT_ND;
299
key_cache = at_npu::native::npu_format_cast(
0 commit comments