Skip to content

Commit 4ac70ce

Browse files
committed
models : optimize qwen3next graph (llama/19375)
* models : optimizing qwen3next graph
* cont
* wip (x12)
* cont : remove redundant q, g chunking
* minor (x2)
* avoid passing masks around
* avoid concats during chunking
* naming + shapes
* update names and use prefix to disable CUDA graphs
1 parent 226e8c0 commit 4ac70ce

2 files changed

Lines changed: 6 additions & 1 deletion

File tree

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 5 additions & 1 deletion
```diff
@@ -2872,6 +2872,7 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
     const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
     const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
     const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
+    const std::string delta_net_prefix = "dnet_add";

     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
```
```diff
@@ -2902,7 +2903,8 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
             strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
             strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
             strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
-            strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
+            strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0 &&
+            strncmp(node->name, delta_net_prefix.c_str(), delta_net_prefix.size()) != 0) {
             // disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
             // by means of matching node names. See
             // https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
```
```diff
@@ -4544,6 +4546,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_UNARY_OP_CEIL:
         case GGML_UNARY_OP_ROUND:
         case GGML_UNARY_OP_TRUNC:
+            // TODO: should become:
+            //return ggml_is_contiguous_rows(op->src[0]);
             return ggml_is_contiguous(op->src[0]);
         default:
             return false;
```

ggml/src/ggml-metal/ggml-metal-common.cpp

Lines changed: 1 addition & 0 deletions
```diff
@@ -273,6 +273,7 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
         case GGML_OP_DIAG:
         case GGML_OP_MUL:
         case GGML_OP_ADD:
+        case GGML_OP_SUB:
         case GGML_OP_DIV:
         case GGML_OP_GLU:
         case GGML_OP_SCALE:
```

Comments (0)