@@ -699,6 +699,7 @@ struct vk_device_struct {
699699
700700 bool add_rms_fusion;
701701 uint32_t partials_binding_alignment;
702+ uint32_t max_nodes_per_submit;
702703
703704 bool shader_64b_indexing;
704705
@@ -5878,6 +5879,14 @@ static vk_device ggml_vk_get_device(size_t idx) {
58785879 device->subgroup_vote = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
58795880 (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eVote);
58805881
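5882+ // Submit at least every 100 nodes by default (overridable via the GGML_VK_MAX_NODES_PER_SUBMIT environment variable, clamped to a minimum of 1), in case there are workloads without as much matmul.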
5883+ device->max_nodes_per_submit = 100;
5884+ const char* GGML_VK_MAX_NODES_PER_SUBMIT = getenv("GGML_VK_MAX_NODES_PER_SUBMIT");
5885+ if (GGML_VK_MAX_NODES_PER_SUBMIT != nullptr) {
5886+ uint32_t max_nodes_per_submit = std::stoul(GGML_VK_MAX_NODES_PER_SUBMIT);
5887+ device->max_nodes_per_submit = std::max(max_nodes_per_submit, 1u);
5888+ }
5889+
58815890 const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr;
58825891
58835892 device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
@@ -16173,8 +16182,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1617316182 // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
1617416183 // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
1617516184 // (and scaled down based on model size, so smaller models submit earlier).
16176- // Also submit at least every 100 nodes, in case there are workloads without as much matmul.
16177- int nodes_per_submit = 100;
1617816185 int submitted_nodes = 0;
1617916186 int submit_count = 0;
1618016187 uint64_t mul_mat_bytes = 0;
@@ -16400,7 +16407,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
1640016407
1640116408 // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining)
1640216409 bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5;
16403- bool submit = (submitted_nodes >= nodes_per_submit ) ||
16410+ bool submit = ((uint32_t) submitted_nodes >= ctx->device->max_nodes_per_submit ) ||
1640416411 (mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) ||
1640516412 (i + ctx->num_additional_fused_ops >= last_node) ||
1640616413 (almost_ready && !ctx->almost_ready_fence_pending);
0 commit comments