From 9f4ddbcca9f346cdc5dd25775d7e0c56765b5a65 Mon Sep 17 00:00:00 2001 From: aendk Date: Wed, 21 Jan 2026 11:29:52 +0100 Subject: [PATCH 01/24] Adds initial PDL setup. --- ggml/src/ggml-cuda/common.cuh | 4 ++ ggml/src/ggml-cuda/ggml-cuda.cu | 92 ++++++++++++++++++++++++++++++--- 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index a3256d59dd0..e0ab1db0a01 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -50,6 +50,7 @@ #define GGML_CUDA_CC_TURING 750 #define GGML_CUDA_CC_AMPERE 800 #define GGML_CUDA_CC_ADA_LOVELACE 890 +#define GGML_CUDA_CC_HOPPER 900 // While BW spans CC 1000, 1100 & 1200, we are integrating Tensor Core instructions available to 1200 family, see // https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#blackwell-sm120-gemms #define GGML_CUDA_CC_BLACKWELL 1200 @@ -1175,6 +1176,9 @@ struct ggml_cuda_graph { static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates); } + std::vector<cudaGraphNode_t> graph_nodes; + std::vector<cudaGraphNode_t> graph_dependencies; + bool allow_pdl = true; #endif }; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index eeb8625dbeb..080d3b2d519 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1176,6 +1176,14 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { // return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name; //} +static void ggml_backend_cuda_graph_disable_pdl(ggml_backend_cuda_context & ctx, const void * graph_key) { + + if (graph_key) { + ggml_cuda_graph * graph = ctx.cuda_graph(graph_key); + graph->allow_pdl = false; + } +} + /// kernels typedef void (*ggml_cuda_op_mul_mat_t)( @@ -2180,7 +2188,7 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) { return use_mul_mat_vec_q; } -static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const void * graph_key) { const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
@@ -2253,6 +2261,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } else if (!split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // general KQ + KQV multi-batch without FlashAttention + ggml_backend_cuda_graph_disable_pdl(ctx, graph_key); ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); } else if (use_mul_mat_vec_f) { ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_f, nullptr); @@ -2261,11 +2270,12 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } else if (use_mul_mat_q) { ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda); } else { + ggml_backend_cuda_graph_disable_pdl(ctx, graph_key); ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr); } } -static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { +static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst, const void * graph_key) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * ids = dst->src[2]; @@ -2406,7 +2416,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst_slice.nb[3] = dst_slice.ne[2] * dst_slice.nb[2]; dst_slice.data = dst_data_cur; - ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice); + ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice, graph_key); CUDA_CHECK(cudaGetLastError()); src1_data_cur += src1_slice.nb[2]; @@ -2419,7 +2429,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * nb1, nb2, nb3, stream); } -static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { +static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst, const void * graph_key) { // why is this here instead of mul_mat? 
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) { ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device); @@ -2614,10 +2624,10 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg ggml_cuda_op_rms_norm_back(ctx, dst); break; case GGML_OP_MUL_MAT: - ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst); + ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst, graph_key); break; case GGML_OP_MUL_MAT_ID: - ggml_cuda_mul_mat_id(ctx, dst); + ggml_cuda_mul_mat_id(ctx, dst, graph_key); break; case GGML_OP_OUT_PROD: ggml_cuda_out_prod(ctx, dst); @@ -3873,7 +3883,13 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud GGML_UNUSED(integrated); #endif // NDEBUG - bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); +#ifdef USE_CUDA_GRAPH + const void * graph_key = ggml_cuda_graph_get_key(cgraph); + bool ok = ggml_cuda_compute_forward(*cuda_ctx, node, graph_key); +#else + bool ok = ggml_cuda_compute_forward(*cuda_ctx, node, nullptr); +#endif // USE_CUDA_GRAPH + if (!ok) { GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -3894,6 +3910,68 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud } CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &graph->graph)); + +#if CUDA_VERSION >= 12030 + // Set programmatic dependent launch (PDL) properties for all edges + // This will only have an effect on Hopper and later GPUs, but is harmless on older GPUs. + // Only allow PDL if it hasn't been disabled due to presence of library kernels in CUDA graph + // since we can't add corresponding CUDA API sync calls to these. + // TO DO identify graph nodes that contain such library kernels and refrain from setting PDL + // launch properties only on those nodes (non-trivial). 
+ if (graph->allow_pdl) { + + size_t num_nodes = 0; + // First call with null arg gives number of nodes + CUDA_CHECK(cudaGraphGetNodes(graph->graph, nullptr, &num_nodes)); + + if (num_nodes > graph->graph_nodes.size()) { + graph->graph_nodes.resize(num_nodes); + } + if (num_nodes > 0) { + // This call gives actual nodes + CUDA_CHECK(cudaGraphGetNodes(graph->graph, graph->graph_nodes.data(), &num_nodes)); + } + + size_t max_dependencies = 0; + for (size_t i = 0; i < num_nodes; i++) { + size_t num_dependencies = 0; + // First call with null arg gives number of dependencies + CUDA_CHECK(cudaGraphNodeGetDependencies(graph->graph_nodes[i], nullptr, nullptr, &num_dependencies)); + if (num_dependencies > max_dependencies) + max_dependencies = num_dependencies; + } + if (max_dependencies > graph->graph_dependencies.size()) { + graph->graph_dependencies.resize(max_dependencies); + } + + if (num_nodes > 0) { + cudaGraphNodeType prev_node_type = cudaGraphNodeTypeKernel; + for (size_t i = 0; i < num_nodes; i++) { + cudaGraphNodeType node_type; + CUDA_CHECK(cudaGraphNodeGetType(graph->graph_nodes[i], &node_type)); + if (node_type == cudaGraphNodeTypeKernel && prev_node_type == cudaGraphNodeTypeKernel) { + size_t num_dependencies = 0; + // First call with null arg gives number of dependencies + CUDA_CHECK(cudaGraphNodeGetDependencies(graph->graph_nodes[i], nullptr, nullptr, &num_dependencies)); + if (num_dependencies > 0) { + // This call gives actual dependencies + CUDA_CHECK(cudaGraphNodeGetDependencies(graph->graph_nodes[i], graph->graph_dependencies.data(), nullptr, &num_dependencies)); + for (size_t j = 0; j < num_dependencies; j++) { + cudaGraphEdgeData edge_data = {}; + edge_data.type = cudaGraphDependencyTypeProgrammatic; + edge_data.from_port = cudaGraphKernelNodePortProgrammatic; + edge_data.to_port = 0; + // Remove existing dependency and add it back with PDL edge properties + CUDA_CHECK(cudaGraphRemoveDependencies(graph->graph, &graph->graph_dependencies[j], &graph->graph_nodes[i], nullptr, 1)); + CUDA_CHECK(cudaGraphAddDependencies(graph->graph, &graph->graph_dependencies[j], &graph->graph_nodes[i], &edge_data, 1)); + } + } + } + prev_node_type = node_type; + } + } + } +#endif // CUDA_VERSION >= 12030 graph_evaluated_or_captured = true; // CUDA graph has been captured std::lock_guard<std::mutex> lock(ggml_cuda_lock); From 73d28e4077c3e91919207d055ee97bde5b93a783 Mon Sep 17 00:00:00 2001 From: aendk Date: Wed, 21 Jan 2026 15:58:28 +0100 Subject: [PATCH 02/24] Adds PDL barriers based on simple heuristic: place "sync" before first input pointer access, and "launch" after last write, e.g. to tensors like dst.
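As an illustration of the heuristic (a sketch, not part of the diff below), a hypothetical elementwise kernel would be instrumented as follows; GGML_CUDA_PDL_SYNC() and GGML_CUDA_PDL_LC() are the thin wrappers around cudaGridDependencySynchronize() and cudaTriggerProgrammaticLaunchCompletion() introduced in common.cuh by this patch:

    static __global__ void k_example_scale(const float * x, float * dst, const int64_t ne, const float scale) {
        const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
        if (i >= ne) {
            GGML_CUDA_PDL_LC(); // early-exit threads also signal launch completion
            return;
        }
        GGML_CUDA_PDL_SYNC();  // "sync": wait for the producer kernel directly before the first read of x
        dst[i] = scale * x[i];
        GGML_CUDA_PDL_LC();    // "launch": signal completion directly after the last write to dst
    }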
--- ggml/src/ggml-cuda/binbcast.cu | 9 +++++++++ ggml/src/ggml-cuda/common.cuh | 12 ++++++++++++ ggml/src/ggml-cuda/cpy.cu | 14 ++++++++++++++ ggml/src/ggml-cuda/fattn-common.cuh | 9 +++++++++ ggml/src/ggml-cuda/fattn-vec.cuh | 4 ++++ ggml/src/ggml-cuda/getrows.cu | 8 ++++++++ ggml/src/ggml-cuda/mmvf.cu | 3 +++ ggml/src/ggml-cuda/mmvq.cu | 3 +++ ggml/src/ggml-cuda/norm.cu | 10 ++++++++++ ggml/src/ggml-cuda/quantize.cu | 6 ++++++ ggml/src/ggml-cuda/rope.cu | 11 +++++++++++ ggml/src/ggml-cuda/set-rows.cu | 6 ++++++ ggml/src/ggml-cuda/topk-moe.cu | 3 +++ 13 files changed, 98 insertions(+) diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index 0e6d777b1e6..d3428f286f3 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -55,6 +55,7 @@ static __global__ void k_bin_bcast(const src0_t * src0, const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z); if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) { + GGML_CUDA_PDL_LC(); return; } @@ -66,6 +67,7 @@ static __global__ void k_bin_bcast(const src0_t * src0, const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + GGML_CUDA_PDL_SYNC(); const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr; dst_t * dst_row = dst + i_dst; @@ -81,6 +83,7 @@ static __global__ void k_bin_bcast(const src0_t * src0, dst_row[i0] = (dst_t) result; } + GGML_CUDA_PDL_LC(); } template = ne0.z || i1 >= ne1.z || i2 >= ne2.z || i3 >= ne3) { + GGML_CUDA_PDL_LC(); return; } @@ -130,6 +134,7 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + GGML_CUDA_PDL_SYNC(); const src0_t * src0_row = src0 ? 
(src0 + i_src0) : nullptr; dst_t * dst_row = dst + i_dst; @@ -143,6 +148,7 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, } dst_row[i0] = (dst_t) result; + GGML_CUDA_PDL_LC(); } template @@ -327,10 +333,12 @@ static __global__ void k_repeat_back( const int64_t tid3 = tid23 / ne2; if (tid0 >= ne0) { + GGML_CUDA_PDL_LC(); return; } T sum = 0; + GGML_CUDA_PDL_SYNC(); for (int64_t i3 = tid3; i3 < ne03; i3 += ne3) { for (int64_t i2 = tid2; i2 < ne02; i2 += ne2) { for (int64_t i1 = tid1; i1 < ne01; i1 += ne1) { @@ -341,6 +349,7 @@ static __global__ void k_repeat_back( } } dst[tid3*ne2*ne1*ne0 + tid2*ne1*ne0 + tid1*ne0 + tid0] = sum; + GGML_CUDA_PDL_LC(); } template diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index e0ab1db0a01..413fe1ab9e8 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -106,6 +106,18 @@ # define GGML_CUDA_USE_CUB #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070 +#if defined(GGML_USE_HIP) || defined(GGML_USE_MUSA) || __CUDA_ARCH__ <= GGML_CUDA_CC_HOPPER +# define GGML_CUDA_PDL_SYNC() // no-op on HIP/MUSA and on CC <= Hopper +#else +# define GGML_CUDA_PDL_SYNC() cudaGridDependencySynchronize() +#endif + +#if defined(GGML_USE_HIP) || defined(GGML_USE_MUSA) || __CUDA_ARCH__ <= GGML_CUDA_CC_HOPPER +# define GGML_CUDA_PDL_LC() // no-op on HIP/MUSA and on CC <= Hopper +#else +# define GGML_CUDA_PDL_LC() cudaTriggerProgrammaticLaunchCompletion() +#endif + #ifdef __CUDA_ARCH_LIST__ constexpr bool ggml_cuda_has_arch_impl(int) { return false; diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index ee84303ef0e..01fb84b1428 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -19,6 +19,7 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; if (i >= ne) { + GGML_CUDA_PDL_LC(); return; } @@ -36,7 +37,9 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; + GGML_CUDA_PDL_SYNC(); cpy_1(cx + x_offset, cdst + dst_offset); + GGML_CUDA_PDL_LC(); } template @@ -58,6 +61,7 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const __shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1]; + GGML_CUDA_PDL_SYNC(); #pragma unroll for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) { @@ -86,6 +90,7 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const } } } + GGML_CUDA_PDL_LC(); GGML_UNUSED_VARS(ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); @@ -124,6 +129,7 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int64_t ne, const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk; if (i >= ne) { + GGML_CUDA_PDL_LC(); return; } @@ -139,7 +145,9 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int64_t ne, const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13; + GGML_CUDA_PDL_SYNC(); cpy_blck(cx + x_offset, cdst + dst_offset); + GGML_CUDA_PDL_LC(); } template @@ -150,6 +158,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int64_t ne, const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk; if (i >= ne) { + GGML_CUDA_PDL_LC(); return; } @@ -165,7 +174,9 @@ static
__global__ void cpy_q_f32(const char * cx, char * cdst, const int64_t ne, const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13; + GGML_CUDA_PDL_SYNC(); cpy_blck(cx + x_offset, cdst + dst_offset); + GGML_CUDA_PDL_LC(); } template @@ -173,13 +184,16 @@ static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; if (i >= ne) { + GGML_CUDA_PDL_LC(); return; } const src_t * x = (const src_t *) cx; dst_t * dst = (dst_t *) cdst; + GGML_CUDA_PDL_SYNC(); dst[i] = ggml_cuda_cast(x[i]); + GGML_CUDA_PDL_LC(); } template diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index b6a7460da83..3d593ac19d2 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -581,6 +581,7 @@ static __global__ void flash_attn_mask_to_KV_max( const int tid = threadIdx.x; const int sequence = blockIdx.y; const int jt = blockIdx.x; + GGML_CUDA_PDL_SYNC(); mask += sequence*s33 + jt*ncols1*s31; @@ -620,10 +621,12 @@ static __global__ void flash_attn_mask_to_KV_max( KV_max_sj += FATTN_KQ_STRIDE; if (threadIdx.x != 0) { + GGML_CUDA_PDL_LC(); return; } KV_max[sequence*ne31 + jt] = KV_max_sj; + GGML_CUDA_PDL_LC(); } template // D == head size @@ -639,6 +642,7 @@ static __global__ void flash_attn_stream_k_fixup( const int jc = j*ncols2 + c; const int tid = threadIdx.x; + GGML_CUDA_PDL_SYNC(); const float * dst_fixup_data = ((const float *) dst_fixup) + gridDim.x*(2*2*ncols); const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. @@ -654,6 +658,7 @@ static __global__ void flash_attn_stream_k_fixup( const bool wrote_beginning_of_tile = kbc0 % iter_k == 0; const bool did_not_write_last = kbc0/iter_k == kbc0_stop/iter_k && kbc0_stop % iter_k != 0; if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) { + GGML_CUDA_PDL_LC(); return; } @@ -666,6 +671,7 @@ static __global__ void flash_attn_stream_k_fixup( const int zt_Q = z_KV*gqa_ratio + zt_gqa*ncols2; // Global Q head start index. 
if (jt*ncols1 + j >= ne01 || zt_gqa*ncols2 + c >= gqa_ratio) { + GGML_CUDA_PDL_LC(); return; } @@ -723,6 +729,7 @@ static __global__ void flash_attn_stream_k_fixup( // Write back final result: *dst = dst_val / rowsum; + GGML_CUDA_PDL_LC(); } template // D == head size @@ -747,6 +754,7 @@ static __global__ void flash_attn_combine_results( const int j_dst_unrolled = (sequence*ne01 + col)*ne02 + head; + GGML_CUDA_PDL_LC(); VKQ_parts += j_dst_unrolled * parallel_blocks*D; VKQ_meta += j_dst_unrolled * parallel_blocks; dst += j_dst_unrolled * D; @@ -776,6 +784,7 @@ static __global__ void flash_attn_combine_results( } dst[tid] = VKQ_numerator / VKQ_denominator; + GGML_CUDA_PDL_LC(); } template diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh index 3f4a78cc6e5..4717708e6ed 100644 --- a/ggml/src/ggml-cuda/fattn-vec.cuh +++ b/ggml/src/ggml-cuda/fattn-vec.cuh @@ -44,6 +44,7 @@ static __global__ void flash_attn_ext_vec( // Skip unused kernel variants for faster compilation: if (use_logit_softcap && !(D == 128 || D == 256)) { + GGML_CUDA_PDL_LC(); GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, max_bias, m0, m1, n_head_log2, logit_softcap, ne00, ne01, ne02, ne03, @@ -97,6 +98,7 @@ static __global__ void flash_attn_ext_vec( const int sequence = blockIdx.z / ne02; const int head = blockIdx.z - sequence*ne02; const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. + GGML_CUDA_PDL_SYNC(); Q += nb03*sequence + nb02* head + nb01*ic0; K += nb13*sequence + nb12*(head / gqa_ratio); V += nb23*sequence + nb22*(head / gqa_ratio); @@ -492,7 +494,9 @@ static __global__ void flash_attn_ext_vec( if (gridDim.y != 1 && tid < ncols && (ncols == 1 || ic0 + tid < int(ne01.z))) { dst_meta[((sequence*int(ne01.z) + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(KQ_max[tid], KQ_sum[tid]); } + GGML_CUDA_PDL_LC(); #else + GGML_CUDA_PDL_LC(); GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, max_bias, m0, m1, n_head_log2, logit_softcap, ne00, ne01, ne02, ne03, diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index 2fab33243dd..6dd870eb875 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -11,6 +11,7 @@ static __global__ void k_get_rows( /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { + GGML_CUDA_PDL_SYNC(); for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) { for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) { // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. @@ -36,6 +37,7 @@ static __global__ void k_get_rows( dst_row[iybs + iqs + y_offset] = ggml_cuda_cast(v.y); } } + GGML_CUDA_PDL_LC(); } template @@ -47,6 +49,7 @@ static __global__ void k_get_rows_float( /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03, const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) { + GGML_CUDA_PDL_SYNC(); for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) { for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) { // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher. 
@@ -55,6 +58,7 @@ static __global__ void k_get_rows_float( const int i12 = z % ne12; if (i00 >= ne00) { + GGML_CUDA_PDL_LC(); return; } @@ -66,6 +70,7 @@ static __global__ void k_get_rows_float( dst_row[i00] = ggml_cuda_cast(src0_row[i00]); } } + GGML_CUDA_PDL_LC(); } template @@ -74,6 +79,7 @@ static __global__ void k_get_rows_back_float( const int col = blockIdx.x*blockDim.x + threadIdx.x; if (col >= ncols) { + GGML_CUDA_PDL_LC(); return; } @@ -81,6 +87,7 @@ static __global__ void k_get_rows_back_float( float sum = 0.0f; + GGML_CUDA_PDL_SYNC(); for (int64_t i = 0; i < nrows_grad; ++i) { if (rows[i] != dst_row) { continue; @@ -89,6 +96,7 @@ static __global__ void k_get_rows_back_float( } dst[dst_row*ncols + col] = sum; + GGML_CUDA_PDL_LC(); } template diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index d9147202429..f9fc0aab16e 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -39,6 +39,7 @@ static __global__ void mul_mat_vec_f( constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + GGML_CUDA_PDL_SYNC(); x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row; y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y; dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst; @@ -334,6 +335,7 @@ static __global__ void mul_mat_vec_f( } if (tid >= ncols_dst) { + GGML_CUDA_PDL_LC(); return; } @@ -367,6 +369,7 @@ static __global__ void mul_mat_vec_f( } dst[tid*stride_col_dst + row] = value; + GGML_CUDA_PDL_LC(); if constexpr (!has_fusion) { GGML_UNUSED_VARS(use_gate, use_bias, use_gate_bias, glu_op, gate_x, x_bias, gate_bias, sumf_gate); diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index ce25ccf427c..62cfabbe394 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -162,6 +162,7 @@ static __global__ void mul_mat_vec_q( const int blocks_per_row_x = ncols_x / qk; constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi; + GGML_CUDA_PDL_SYNC(); const uint32_t channel_dst = blockIdx.y; uint32_t token_idx = 0; @@ -287,6 +288,7 @@ static __global__ void mul_mat_vec_q( } __syncthreads(); if (threadIdx.y > 0) { + GGML_CUDA_PDL_LC(); return; } @@ -350,6 +352,7 @@ static __global__ void mul_mat_vec_q( } } + GGML_CUDA_PDL_LC(); if constexpr (!has_fusion) { GGML_UNUSED_VARS(use_gate, use_bias, use_gate_bias, active_glu, gate_bias, x_bias, tmp_gate); } diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index ef98f675aa7..6daf2092acb 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -13,6 +13,7 @@ static __global__ void norm_f32( const int sample = blockIdx.z; const int tid = threadIdx.x; + GGML_CUDA_PDL_SYNC(); x += sample*stride_sample + channel*stride_channel + row*stride_row; dst += ((sample*nchannels + channel)*nrows + row)*ncols; @@ -35,6 +36,7 @@ static __global__ void norm_f32( for (int col = tid; col < ncols; col += block_size) { dst[col] = (x[col] - mean) * inv_std; } + GGML_CUDA_PDL_LC(); } template @@ -46,6 +48,7 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr float tmp = 0.0f; // partial sum for thread in warp + GGML_CUDA_PDL_SYNC(); for (int j = start; j < end; j += block_size) { tmp += x[j]; } @@ -69,6 +72,7 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr for (int j = start; j < end; j += block_size) { dst[j] *= scale; } + GGML_CUDA_PDL_LC(); } template @@ -105,6 +109,7 @@ static __global__ void 
rms_norm_f32(const float * x, static_assert(!do_add || do_multiply, "fusing add is not supported without multiplying"); + GGML_CUDA_PDL_SYNC(); x += sample*stride_sample + channel*stride_channel + row*stride_row; dst += ((sample*nchannels + channel)*nrows + row)*ncols; @@ -148,6 +153,7 @@ static __global__ void rms_norm_f32(const float * x, dst[col] = scale * x[col]; } } + GGML_CUDA_PDL_LC(); } template @@ -156,6 +162,7 @@ static __global__ void rms_norm_back_f32( const int row = blockIdx.x*blockDim.y + threadIdx.y; const int tid = threadIdx.x; + GGML_CUDA_PDL_SYNC(); grad += int64_t(row)*ncols; xf += int64_t(row)*ncols; dst += int64_t(row)*ncols; @@ -200,6 +207,7 @@ static __global__ void rms_norm_back_f32( for (int col = tid; col < ncols; col += block_size) { dst[col] = scale_grad*grad[col] + scale_x*xf[col]; } + GGML_CUDA_PDL_LC(); } // template @@ -248,6 +256,7 @@ static __global__ void l2_norm_f32( const int sample = blockIdx.z; const int tid = threadIdx.x; + GGML_CUDA_PDL_SYNC(); x += sample*stride_sample + channel*stride_channel + row*stride_row; dst += ((sample*nchannels + channel)*nrows + row)*ncols; @@ -268,6 +277,7 @@ static __global__ void l2_norm_f32( for (int col = tid; col < ncols; col += block_size) { dst[col] = scale * x[col]; } + GGML_CUDA_PDL_LC(); } static void norm_f32_cuda( diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index a8c68e44b16..7ffd4c9fe9d 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -23,6 +23,7 @@ static __global__ void quantize_q8_1( const int64_t i_cont = ((i3*ne2.z + i2) * ne1 + i1) * ne0 + i0; + GGML_CUDA_PDL_SYNC(); block_q8_1 * y = (block_q8_1 *) vy; const int64_t ib = i_cont / QK8_1; // block index @@ -45,6 +46,7 @@ static __global__ void quantize_q8_1( } y[ib].ds = make_half2(d, sum); + GGML_CUDA_PDL_LC(); } __device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) { @@ -100,6 +102,7 @@ static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x, const int64_t i2 = blockIdx.z % ne2; const int64_t i3 = blockIdx.z / ne2; + GGML_CUDA_PDL_SYNC(); const int64_t i01 = ids ? ids[i1] : i1; const int64_t i02 = i2; const int64_t i03 = i3; @@ -170,6 +173,7 @@ static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x, // Store 2 scales packed into 1 uint32 y[ib].d4[quad_idx_in_block] = (scales[1] << 8) | scales[0]; } + GGML_CUDA_PDL_LC(); } template @@ -192,6 +196,7 @@ static __global__ void quantize_mmq_q8_1( const int64_t i3 = blockIdx.z / ne2; const int64_t i00 = i0; + GGML_CUDA_PDL_SYNC(); const int64_t i01 = ids ? ids[i1] : i1; const int64_t i02 = i2; const int64_t i03 = i3; @@ -268,6 +273,7 @@ static __global__ void quantize_mmq_q8_1( } else { y[ib].d4[iqs/32] = d; } + GGML_CUDA_PDL_LC(); } void quantize_row_q8_1_cuda( diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 88ed79111a1..ff2349de6a8 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -127,6 +127,7 @@ static __global__ void rope_neox(const T * x, const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { + GGML_CUDA_PDL_LC(); return; } @@ -137,6 +138,7 @@ static __global__ void rope_neox(const T * x, int idst = row_dst * ne0 + i0 / 2; const int ix = channel_x*s2 + row_x*s1 + i0/2; + GGML_CUDA_PDL_SYNC(); // Fusion optimization: ROPE + VIEW + SET_ROWS. // The rope output is viewed as a 1D tensor and offset based on a row index in row_indices. 
@@ -149,6 +151,7 @@ static __global__ void rope_neox(const T * x, dst[idst + i0 / 2 + 0] = ggml_cuda_cast(x[ix + i0 / 2 + 0]); dst[idst + i0 / 2 + 1] = ggml_cuda_cast(x[ix + i0 / 2 + 1]); + GGML_CUDA_PDL_LC(); return; } @@ -166,6 +169,7 @@ static __global__ void rope_neox(const T * x, dst[idst + 0] = ggml_cuda_cast(x0 * cos_theta - x1 * sin_theta); dst[idst + n_dims / 2] = ggml_cuda_cast(x0 * sin_theta + x1 * cos_theta); + GGML_CUDA_PDL_LC(); } template @@ -176,6 +180,7 @@ static __global__ void rope_multi( const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { + GGML_CUDA_PDL_LC(); return; } @@ -187,10 +192,12 @@ static __global__ void rope_multi( const int idst = row_dst*ne0 + i0/2; const int ix = channel_x*s2 + row_x*s1 + i0/2; + GGML_CUDA_PDL_SYNC(); if (i0 >= n_dims) { dst[idst + i0/2 + 0] = x[ix + i0/2 + 0]; dst[idst + i0/2 + 1] = x[ix + i0/2 + 1]; + GGML_CUDA_PDL_LC(); return; } @@ -236,6 +243,7 @@ static __global__ void rope_multi( dst[idst + 0] = x0*cos_theta - x1*sin_theta; dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta; + GGML_CUDA_PDL_LC(); } template @@ -246,6 +254,7 @@ static __global__ void rope_vision( const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne0) { + GGML_CUDA_PDL_LC(); return; } @@ -257,6 +266,7 @@ static __global__ void rope_vision( const int idst = row_dst*ne0 + i0/2; const int ix = channel_x*s2 + row_x*s1 + i0/2; + GGML_CUDA_PDL_SYNC(); const int sect_dims = sections.v[0] + sections.v[1]; const int sec_w = sections.v[1] + sections.v[0]; const int sector = (i0 / 2) % sect_dims; @@ -283,6 +293,7 @@ static __global__ void rope_vision( dst[idst + 0] = x0*cos_theta - x1*sin_theta; dst[idst + n_dims] = x0*sin_theta + x1*cos_theta; + GGML_CUDA_PDL_LC(); } template diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index 631de7e8fa5..389ef48b49f 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -30,6 +30,7 @@ static __global__ void k_set_rows_quant(const float * __restrict__ src0, const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x; if (i >= ne_total) { + GGML_CUDA_PDL_LC(); return; } @@ -53,6 +54,7 @@ static __global__ void k_set_rows_quant(const float * __restrict__ src0, const int64_t i11 = fastmodulo((uint32_t) i02, ne11_fd); const int64_t i10 = i01; + GGML_CUDA_PDL_SYNC(); const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); const float * src0_row = src0 + i01*s01 + i02*s02 + i03*s03; @@ -62,6 +64,7 @@ static __global__ void k_set_rows_quant(const float * __restrict__ src0, block_type * dst_block = dst_row_ptr + i00 / qk; quantize_func(src_block, dst_block); + GGML_CUDA_PDL_LC(); GGML_UNUSED(ne10); GGML_UNUSED(ne11); @@ -135,6 +138,7 @@ static __global__ void k_set_rows(const src_t * __restrict__ src0, const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x; if (i >= ne_total) { + GGML_CUDA_PDL_LC(); return; } @@ -157,12 +161,14 @@ static __global__ void k_set_rows(const src_t * __restrict__ src0, const int64_t i11 = fastmodulo((uint32_t) i02, ne11_fd); const int64_t i10 = i01; + GGML_CUDA_PDL_SYNC(); const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12); const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03; dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3; dst_row_ptr[i00] = ggml_cuda_cast(src0_row[i00]); + GGML_CUDA_PDL_LC(); GGML_UNUSED(ne10); GGML_UNUSED(ne11); diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu index 08a88990dde..438a10a9299 100644 --- 
a/ggml/src/ggml-cuda/topk-moe.cu +++ b/ggml/src/ggml-cuda/topk-moe.cu @@ -88,9 +88,11 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * const topk_moe_config config) { const int row = blockIdx.x * blockDim.y + threadIdx.y; if (row >= n_rows) { + GGML_CUDA_PDL_LC(); return; } + GGML_CUDA_PDL_SYNC(); logits += n_experts * row; weights += n_expert_used * row; ids += n_experts * row; @@ -239,6 +241,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * weights[idx] = output_weights[i] * scale_val; } } + GGML_CUDA_PDL_LC(); } template From 000f4625abc4efd46ee160e4473ffe0da004082e Mon Sep 17 00:00:00 2001 From: aendk Date: Wed, 21 Jan 2026 17:57:57 +0100 Subject: [PATCH 03/24] Further optimization pass of the first half of kernels --- ggml/src/ggml-cuda/mmvq.cu | 6 +++++- ggml/src/ggml-cuda/quantize.cu | 4 +++- ggml/src/ggml-cuda/topk-moe.cu | 9 +++++---- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 62cfabbe394..c779e76626e 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -248,6 +248,10 @@ static __global__ void mul_mat_vec_q( // x block quant index when casting the quants to int const int kqs = vdr * (tid % (qi/vdr)); + if (!ids) { + GGML_CUDA_PDL_SYNC(); + } + #pragma unroll for (int j = 0; j < ncols_dst; ++j) { #pragma unroll @@ -351,8 +355,8 @@ static __global__ void mul_mat_vec_q( dst[j*stride_col_dst + threadIdx.x] = result; } } - GGML_CUDA_PDL_LC(); + if constexpr (!has_fusion) { GGML_UNUSED_VARS(use_gate, use_bias, use_gate_bias, active_glu, gate_bias, x_bias, tmp_gate); } diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 7ffd4c9fe9d..657ac8be943 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -9,6 +9,7 @@ static __global__ void quantize_q8_1( const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; if (i0 >= ne0) { + GGML_CUDA_PDL_LC(); return; } @@ -23,12 +24,12 @@ static __global__ void quantize_q8_1( const int64_t i_cont = ((i3*ne2.z + i2) * ne1 + i1) * ne0 + i0; - GGML_CUDA_PDL_SYNC(); block_q8_1 * y = (block_q8_1 *) vy; const int64_t ib = i_cont / QK8_1; // block index const int64_t iqs = i_cont % QK8_1; // quant index + GGML_CUDA_PDL_SYNC(); const float xi = i0 < ne00 ? x[i03*s03 + i02*s02 + i01*s01 + i00] : 0.0f; float amax = fabsf(xi); float sum = xi; @@ -42,6 +43,7 @@ static __global__ void quantize_q8_1( y[ib].qs[iqs] = q; if (iqs > 0) { + GGML_CUDA_PDL_LC(); return; } diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu index 438a10a9299..d2f90eb268e 100644 --- a/ggml/src/ggml-cuda/topk-moe.cu +++ b/ggml/src/ggml-cuda/topk-moe.cu @@ -92,6 +92,11 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * return; } + constexpr int experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1; + float wt[experts_per_thread]; + float wt_sum = 0.f; + float output_weights[experts_per_thread]; + GGML_CUDA_PDL_SYNC(); logits += n_experts * row; weights += n_expert_used * row; @@ -142,10 +147,6 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * //or the raw logits. 
We do the argmax reduce over n_expert_used, each time marking //the expert weight as -inf to exclude from the next iteration - float wt_sum = 0.f; - - float output_weights[experts_per_thread]; - #pragma unroll for (int i = 0; i < experts_per_thread; i++) { output_weights[i] = 0.f; From b68aee76493ebe045989935454d1d322fbbe50c8 Mon Sep 17 00:00:00 2001 From: aendk Date: Thu, 22 Jan 2026 15:50:24 +0100 Subject: [PATCH 04/24] Optimized PDL barriers for the second batch of kernels --- ggml/src/ggml-cuda/fattn-common.cuh | 2 +- ggml/src/ggml-cuda/fattn-vec.cuh | 4 +++- ggml/src/ggml-cuda/mmvf.cu | 2 +- ggml/src/ggml-cuda/topk-moe.cu | 6 +----- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index 3d593ac19d2..7534bcf965f 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -754,7 +754,6 @@ static __global__ void flash_attn_combine_results( const int j_dst_unrolled = (sequence*ne01 + col)*ne02 + head; - GGML_CUDA_PDL_LC(); VKQ_parts += j_dst_unrolled * parallel_blocks*D; VKQ_meta += j_dst_unrolled * parallel_blocks; dst += j_dst_unrolled * D; @@ -763,6 +762,7 @@ static __global__ void flash_attn_combine_results( __builtin_assume(tid < D); extern __shared__ float2 meta[]; + GGML_CUDA_PDL_SYNC(); for (int i = tid; i < 2*parallel_blocks; i += D) { ((float *) meta)[i] = ((const float *)VKQ_meta) [i]; } diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh index 4717708e6ed..e5195aae00e 100644 --- a/ggml/src/ggml-cuda/fattn-vec.cuh +++ b/ggml/src/ggml-cuda/fattn-vec.cuh @@ -98,7 +98,7 @@ static __global__ void flash_attn_ext_vec( const int sequence = blockIdx.z / ne02; const int head = blockIdx.z - sequence*ne02; const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - GGML_CUDA_PDL_SYNC(); + Q += nb03*sequence + nb02* head + nb01*ic0; K += nb13*sequence + nb12*(head / gqa_ratio); V += nb23*sequence + nb22*(head / gqa_ratio); @@ -138,6 +138,8 @@ static __global__ void flash_attn_ext_vec( #endif // V_DOT2_F32_F16_AVAILABLE int Q_i32[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)]; float2 Q_ds[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)]; + + GGML_CUDA_PDL_SYNC(); if constexpr (Q_q8_1) { #pragma unroll for (int j0 = 0; j0 < ncols; j0 += nwarps) { diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index f9fc0aab16e..b31a0691d47 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -39,7 +39,6 @@ static __global__ void mul_mat_vec_f( constexpr int warp_size = ggml_cuda_get_physical_warp_size(); - GGML_CUDA_PDL_SYNC(); x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row; y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y; dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst; @@ -80,6 +79,7 @@ static __global__ void mul_mat_vec_f( gate_x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row; } + GGML_CUDA_PDL_SYNC(); const int channel_bias = ids ? 
channel_x : channel_dst; if constexpr (has_fusion) { diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu index d2f90eb268e..0ee203a5688 100644 --- a/ggml/src/ggml-cuda/topk-moe.cu +++ b/ggml/src/ggml-cuda/topk-moe.cu @@ -97,21 +97,17 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * float wt_sum = 0.f; float output_weights[experts_per_thread]; - GGML_CUDA_PDL_SYNC(); logits += n_experts * row; weights += n_expert_used * row; ids += n_experts * row; - constexpr int experts_per_thread = (n_experts > WARP_SIZE) ? n_experts / WARP_SIZE : 1; - - float wt[experts_per_thread]; - // Initialize all slots to -INFINITY #pragma unroll for (int i = 0; i < experts_per_thread; i++) { wt[i] = -INFINITY; } + GGML_CUDA_PDL_SYNC(); #pragma unroll for (int i = 0; i < n_experts; i += WARP_SIZE) { const int expert = i + threadIdx.x; From 101583e249556eee8f598da7c4c1898b53618a50 Mon Sep 17 00:00:00 2001 From: aendk Date: Wed, 4 Feb 2026 16:52:49 +0100 Subject: [PATCH 05/24] Further refinements after rebase. --- ggml/src/ggml-cuda/fattn-vec.cuh | 1 - ggml/src/ggml-cuda/mmvf.cu | 2 +- ggml/src/ggml-cuda/mmvq.cu | 6 +----- ggml/src/ggml-cuda/norm.cu | 2 +- 4 files changed, 3 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh index e5195aae00e..2e6e0b4761f 100644 --- a/ggml/src/ggml-cuda/fattn-vec.cuh +++ b/ggml/src/ggml-cuda/fattn-vec.cuh @@ -98,7 +98,6 @@ static __global__ void flash_attn_ext_vec( const int sequence = blockIdx.z / ne02; const int head = blockIdx.z - sequence*ne02; const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. - Q += nb03*sequence + nb02* head + nb01*ic0; K += nb13*sequence + nb12*(head / gqa_ratio); V += nb23*sequence + nb22*(head / gqa_ratio); diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index b31a0691d47..b8cdb45a49d 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -21,6 +21,7 @@ static __global__ void mul_mat_vec_f( int channel_y; int sample_dst; + GGML_CUDA_PDL_SYNC(); if constexpr (is_multi_token_id) { // Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case token_idx = blockIdx.z; @@ -79,7 +80,6 @@ static __global__ void mul_mat_vec_f( gate_x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row; } - GGML_CUDA_PDL_SYNC(); const int channel_bias = ids ? 
channel_x : channel_dst; if constexpr (has_fusion) { diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index c779e76626e..c3191c636c9 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -162,7 +162,6 @@ static __global__ void mul_mat_vec_q( const int blocks_per_row_x = ncols_x / qk; constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi; - GGML_CUDA_PDL_SYNC(); const uint32_t channel_dst = blockIdx.y; uint32_t token_idx = 0; @@ -170,6 +169,7 @@ static __global__ void mul_mat_vec_q( uint32_t channel_y; uint32_t sample_dst; + GGML_CUDA_PDL_SYNC(); if constexpr (is_multi_token_id) { // Multi-token MUL_MAT_ID path, adding these in the normal path causes a perf regression for n_tokens=1 case token_idx = blockIdx.z; @@ -248,10 +248,6 @@ static __global__ void mul_mat_vec_q( // x block quant index when casting the quants to int const int kqs = vdr * (tid % (qi/vdr)); - if (!ids) { - GGML_CUDA_PDL_SYNC(); - } - #pragma unroll for (int j = 0; j < ncols_dst; ++j) { #pragma unroll diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index 6daf2092acb..c90c07377b0 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -109,7 +109,6 @@ static __global__ void rms_norm_f32(const float * x, static_assert(!do_add || do_multiply, "fusing add is not supported without multiplying"); - GGML_CUDA_PDL_SYNC(); x += sample*stride_sample + channel*stride_channel + row*stride_row; dst += ((sample*nchannels + channel)*nrows + row)*ncols; @@ -129,6 +128,7 @@ static __global__ void rms_norm_f32(const float * x, float tmp = 0.0f; // partial sum for thread in warp + GGML_CUDA_PDL_SYNC(); for (int col = tid; col < ncols; col += block_size) { const float xi = x[col]; tmp += xi * xi; From 0e7aa04ff840eb131c81fd5642b0851c4bfc078e Mon Sep 17 00:00:00 2001 From: aendk Date: Thu, 5 Feb 2026 11:06:20 +0100 Subject: [PATCH 06/24] Moves pdl logic to separate function, removes some whitespace --- ggml/src/ggml-cuda/ggml-cuda.cu | 129 +++++++++++++++++--------------- 1 file changed, 67 insertions(+), 62 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 080d3b2d519..9606b1bf4fd 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3402,6 +3402,72 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, return false; } +static void ggml_cuda_graph_instantiate_pdl(ggml_cuda_graph * graph) { + + +#if CUDA_VERSION >= 12000 + // Set programmatic dependent launch (PDL) properties for all edges + // This will only have an effect on Hopper and later GPUs, but is harmless on older GPUs. + // Only allow PDL if it hasn't been disabled due to presence of library kernels in CUDA graph + // since we can't add corresponding CUDA API sync calls to these. + // TO DO identify graph nodes that contain such library kernels and refrain from setting PDL + // launch properties only on those nodes (non-trivial). 
+ if (graph->allow_pdl) { + + size_t num_nodes = 0; + // First call with null arg gives number of nodes + CUDA_CHECK(cudaGraphGetNodes(graph->graph, nullptr, &num_nodes)); + + if (num_nodes > graph->graph_nodes.size()) { + graph->graph_nodes.resize(num_nodes); + } + if (num_nodes > 0) { + // This call gives actual nodes + CUDA_CHECK(cudaGraphGetNodes(graph->graph, graph->graph_nodes.data(), &num_nodes)); + } + + size_t max_dependencies = 0; + for (size_t i = 0; i < num_nodes; i++) { + size_t num_dependencies = 0; + // First call with null arg gives number of dependencies + CUDA_CHECK(cudaGraphNodeGetDependencies(graph->graph_nodes[i], nullptr, nullptr, &num_dependencies)); + if (num_dependencies > max_dependencies) + max_dependencies = num_dependencies; + } + if (max_dependencies > graph->graph_dependencies.size()) { + graph->graph_dependencies.resize(max_dependencies); + } + + if (num_nodes > 0) { + cudaGraphNodeType prev_node_type = cudaGraphNodeTypeKernel; + for (size_t i = 0; i < num_nodes; i++) { + cudaGraphNodeType node_type; + CUDA_CHECK(cudaGraphNodeGetType(graph->graph_nodes[i], &node_type)); + if (node_type == cudaGraphNodeTypeKernel && prev_node_type == cudaGraphNodeTypeKernel) { + size_t num_dependencies = 0; + // First call with null arg gives number of dependencies + CUDA_CHECK(cudaGraphNodeGetDependencies(graph->graph_nodes[i], nullptr, nullptr, &num_dependencies)); + if (num_dependencies > 0) { + // This call gives actual dependencies + CUDA_CHECK(cudaGraphNodeGetDependencies(graph->graph_nodes[i], graph->graph_dependencies.data(), nullptr, &num_dependencies)); + for (size_t j = 0; j < num_dependencies; j++) { + cudaGraphEdgeData edge_data = {}; + edge_data.type = cudaGraphDependencyTypeProgrammatic; + edge_data.from_port = cudaGraphKernelNodePortProgrammatic; + edge_data.to_port = 0; + // Remove existing dependency and add it back with PDL edge properties + CUDA_CHECK(cudaGraphRemoveDependencies(graph->graph, &graph->graph_dependencies[j], &graph->graph_nodes[i], nullptr, 1)); + CUDA_CHECK(cudaGraphAddDependencies(graph->graph, &graph->graph_dependencies[j], &graph->graph_nodes[i], &edge_data, 1)); + } + } + } + prev_node_type = node_type; + } + } + } +#endif // CUDA_VERSION >=12000 +} + static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, const bool use_cuda_graph, const bool cuda_graph_update_required, const void * graph_key) { bool graph_evaluated_or_captured = false; @@ -3910,68 +3976,7 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud } CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &graph->graph)); - -#if CUDA_VERSION >= 12030 - // Set programmatic dependent launch (PDL) properties for all edges - // This will only have an effect on Hopper and later GPUs, but is harmless on older GPUs. - // Only allow PDL if it hasn't been disabled due to presence of library kernels in CUDA graph - // since we can't add corresponding CUDA API sync calls to these. - // TO DO identify graph nodes that contain such library kernels and refrain from setting PDL - // launch properties only on those nodes (non-trivial). 
- if (graph->allow_pdl) { - - size_t num_nodes = 0; - // First call with null arg gives number of nodes - CUDA_CHECK(cudaGraphGetNodes(graph->graph, nullptr, &num_nodes)); - - if (num_nodes > graph->graph_nodes.size()) { - graph->graph_nodes.resize(num_nodes); - } - if (num_nodes > 0) { - // This call gives actual nodes - CUDA_CHECK(cudaGraphGetNodes(graph->graph, graph->graph_nodes.data(), &num_nodes)); - } - - size_t max_dependencies = 0; - for (size_t i = 0; i < num_nodes; i++) { - size_t num_dependencies = 0; - // First call with null arg gives number of dependencies - CUDA_CHECK(cudaGraphNodeGetDependencies(graph->graph_nodes[i], nullptr, nullptr, &num_dependencies)); - if (num_dependencies > max_dependencies) - max_dependencies = num_dependencies; - } - if (max_dependencies > graph->graph_dependencies.size()) { - graph->graph_dependencies.resize(max_dependencies); - } - - if (num_nodes > 0) { - cudaGraphNodeType prev_node_type = cudaGraphNodeTypeKernel; - for (size_t i = 0; i < num_nodes; i++) { - cudaGraphNodeType node_type; - CUDA_CHECK(cudaGraphNodeGetType(graph->graph_nodes[i], &node_type)); - if (node_type == cudaGraphNodeTypeKernel && prev_node_type == cudaGraphNodeTypeKernel) { - size_t num_dependencies = 0; - // First call with null arg gives number of dependencies - CUDA_CHECK(cudaGraphNodeGetDependencies(graph->graph_nodes[i], nullptr, nullptr, &num_dependencies)); - if (num_dependencies > 0) { - // This call gives actual dependencies - CUDA_CHECK(cudaGraphNodeGetDependencies(graph->graph_nodes[i], graph->graph_dependencies.data(), nullptr, &num_dependencies)); - for (size_t j = 0; j < num_dependencies; j++) { - cudaGraphEdgeData edge_data = {}; - edge_data.type = cudaGraphDependencyTypeProgrammatic; - edge_data.from_port = cudaGraphKernelNodePortProgrammatic; - edge_data.to_port = 0; - // Remove existing dependency and add it back with PDL edge properties - CUDA_CHECK(cudaGraphRemoveDependencies(graph->graph, &graph->graph_dependencies[j], &graph->graph_nodes[i], nullptr, 1)); - CUDA_CHECK(cudaGraphAddDependencies(graph->graph, &graph->graph_dependencies[j], &graph->graph_nodes[i], &edge_data, 1)); - } - } - } - prev_node_type = node_type; - } - } - } -#endif // CUDA_VERSION >= 12030 + ggml_cuda_graph_instantiate_pdl(graph); graph_evaluated_or_captured = true; // CUDA graph has been captured std::lock_guard<std::mutex> lock(ggml_cuda_lock); From d8eb8abf42b45db8322fbf7e3699dffa8ee8d1b7 Mon Sep 17 00:00:00 2001 From: aendk Date: Fri, 13 Feb 2026 11:55:13 +0100 Subject: [PATCH 07/24] Strips post-hoc PDL logic --- ggml/src/ggml-cuda/common.cuh | 3 - ggml/src/ggml-cuda/ggml-cuda.cu | 97 +++------------------------------ 2 files changed, 7 insertions(+), 93 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 413fe1ab9e8..ca3422d99bc 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1188,9 +1188,6 @@ struct ggml_cuda_graph { static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates); } - std::vector<cudaGraphNode_t> graph_nodes; - std::vector<cudaGraphNode_t> graph_dependencies; - bool allow_pdl = true; #endif }; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 9606b1bf4fd..eeb8625dbeb 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1176,14 +1176,6 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { // return
buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name; //} -static void ggml_backend_cuda_graph_disable_pdl(ggml_backend_cuda_context & ctx, const void * graph_key) { - - if (graph_key) { - ggml_cuda_graph * graph = ctx.cuda_graph(graph_key); - graph->allow_pdl = false; - } -} - /// kernels typedef void (*ggml_cuda_op_mul_mat_t)( @@ -2188,7 +2180,7 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) { return use_mul_mat_vec_q; } -static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const void * graph_key) { +static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q. @@ -2261,7 +2253,6 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } else if (!split && (use_batched_cublas_f16 || use_batched_cublas_bf16 || use_batched_cublas_f32) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // general KQ + KQV multi-batch without FlashAttention - ggml_backend_cuda_graph_disable_pdl(ctx, graph_key); ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); } else if (use_mul_mat_vec_f) { ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_f, nullptr); @@ -2270,12 +2261,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } else if (use_mul_mat_q) { ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda); } else { - ggml_backend_cuda_graph_disable_pdl(ctx, graph_key); ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr); } } -static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst, const void * graph_key) { +static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * ids = dst->src[2]; @@ -2416,7 +2406,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst_slice.nb[3] = dst_slice.ne[2] * dst_slice.nb[2]; dst_slice.data = dst_data_cur; - ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice, graph_key); + ggml_cuda_mul_mat(ctx, &src0_slice, &src1_slice, &dst_slice); CUDA_CHECK(cudaGetLastError()); src1_data_cur += src1_slice.nb[2]; @@ -2429,7 +2419,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * nb1, nb2, nb3, stream); } -static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst, const void * graph_key) { +static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { // why is this here instead of mul_mat? 
if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) { ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device); @@ -2624,10 +2614,10 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg ggml_cuda_op_rms_norm_back(ctx, dst); break; case GGML_OP_MUL_MAT: - ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst, graph_key); + ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst); break; case GGML_OP_MUL_MAT_ID: - ggml_cuda_mul_mat_id(ctx, dst, graph_key); + ggml_cuda_mul_mat_id(ctx, dst); break; case GGML_OP_OUT_PROD: ggml_cuda_out_prod(ctx, dst); @@ -3402,72 +3392,6 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, return false; } -static void ggml_cuda_graph_instantiate_pdl(ggml_cuda_graph * graph) { - - -#if CUDA_VERSION >= 12000 - // Set programmatic dependent launch (PDL) properties for all edges - // This will only have an effect on Hopper and later GPUs, but is harmless on older GPUs. - // Only allow PDL if it hasn't been disabled due to presence of library kernels in CUDA graph - // since we can't add corresponding CUDA API sync calls to these. - // TO DO identify graph nodes that contain such library kernels and refrain from setting PDL - // launch properties only on those nodes (non-trivial). - if (graph->allow_pdl) { - - size_t num_nodes = 0; - // First call with null arg gives number of nodes - CUDA_CHECK(cudaGraphGetNodes(graph->graph, nullptr, &num_nodes)); - - if (num_nodes > graph->graph_nodes.size()) { - graph->graph_nodes.resize(num_nodes); - } - if (num_nodes > 0) { - // This call gives actual nodes - CUDA_CHECK(cudaGraphGetNodes(graph->graph, graph->graph_nodes.data(), &num_nodes)); - } - - size_t max_dependencies = 0; - for (size_t i = 0; i < num_nodes; i++) { - size_t num_dependencies = 0; - // First call with null arg gives number of dependencies - CUDA_CHECK(cudaGraphNodeGetDependencies(graph->graph_nodes[i], nullptr, nullptr, &num_dependencies)); - if (num_dependencies > max_dependencies) - max_dependencies = num_dependencies; - } - if (max_dependencies > graph->graph_dependencies.size()) { - graph->graph_dependencies.resize(max_dependencies); - } - - if (num_nodes > 0) { - cudaGraphNodeType prev_node_type = cudaGraphNodeTypeKernel; - for (size_t i = 0; i < num_nodes; i++) { - cudaGraphNodeType node_type; - CUDA_CHECK(cudaGraphNodeGetType(graph->graph_nodes[i], &node_type)); - if (node_type == cudaGraphNodeTypeKernel && prev_node_type == cudaGraphNodeTypeKernel) { - size_t num_dependencies = 0; - // First call with null arg gives number of dependencies - CUDA_CHECK(cudaGraphNodeGetDependencies(graph->graph_nodes[i], nullptr, nullptr, &num_dependencies)); - if (num_dependencies > 0) { - // This call gives actual dependencies - CUDA_CHECK(cudaGraphNodeGetDependencies(graph->graph_nodes[i], graph->graph_dependencies.data(), nullptr, &num_dependencies)); - for (size_t j = 0; j < num_dependencies; j++) { - cudaGraphEdgeData edge_data = {}; - edge_data.type = cudaGraphDependencyTypeProgrammatic; - edge_data.from_port = cudaGraphKernelNodePortProgrammatic; - edge_data.to_port = 0; - // Remove existing dependency and add it back with PDL edge properties - CUDA_CHECK(cudaGraphRemoveDependencies(graph->graph, &graph->graph_dependencies[j], &graph->graph_nodes[i], nullptr, 1)); - CUDA_CHECK(cudaGraphAddDependencies(graph->graph, &graph->graph_dependencies[j], &graph->graph_nodes[i], &edge_data, 1)); - } - } - } - prev_node_type = node_type; - } - } - } -#endif // CUDA_VERSION >=12000 -} - 
static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, const bool use_cuda_graph, const bool cuda_graph_update_required, const void * graph_key) { bool graph_evaluated_or_captured = false; @@ -3949,13 +3873,7 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud GGML_UNUSED(integrated); #endif // NDEBUG -#ifdef USE_CUDA_GRAPH - const void * graph_key = ggml_cuda_graph_get_key(cgraph); - bool ok = ggml_cuda_compute_forward(*cuda_ctx, node, graph_key); -#else - bool ok = ggml_cuda_compute_forward(*cuda_ctx, node, nullptr); -#endif // USE_CUDA_GRAPH - + bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); if (!ok) { GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -3976,7 +3894,6 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud } CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &graph->graph)); - ggml_cuda_graph_instantiate_pdl(graph); graph_evaluated_or_captured = true; // CUDA graph has been captured std::lock_guard<std::mutex> lock(ggml_cuda_lock); From 12ddf128827aade2dd27dbfbd379ec1b3a28f57f Mon Sep 17 00:00:00 2001 From: aendk Date: Fri, 13 Feb 2026 12:50:13 +0100 Subject: [PATCH 08/24] Adds stream capture PDL setup. Enrolls quantize_q8_1 to leverage PDL to overlap execution with previous kernels --- ggml/src/ggml-cuda/common.cuh | 27 +++++++++++++++++++++++++++ ggml/src/ggml-cuda/quantize.cu | 3 ++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index ca3422d99bc..1fdaad8e745 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -118,6 +118,33 @@ # define GGML_CUDA_PDL_LC() cudaTriggerProgrammaticLaunchCompletion() #endif +struct ggml_cuda_pdl_config { + cudaLaunchAttribute attr; + cudaLaunchConfig_t cfg; + + ggml_cuda_pdl_config(dim3 grid, dim3 block, size_t shmem, cudaStream_t s) { + attr.id = cudaLaunchAttributeProgrammaticStreamSerialization; + attr.val.programmaticStreamSerializationAllowed = 1; + + cfg = {}; + cfg.gridDim = grid; + cfg.blockDim = block; + cfg.dynamicSmemBytes = shmem; + cfg.stream = s; + cfg.attrs = &attr; + cfg.numAttrs = 1; + } + + // Copy and copy/move assignment are deleted because cfg.attrs points at this->attr + ggml_cuda_pdl_config(const ggml_cuda_pdl_config&) = delete; + ggml_cuda_pdl_config& operator=(const ggml_cuda_pdl_config&) = delete; + ggml_cuda_pdl_config& operator=(ggml_cuda_pdl_config&&) = delete; + + ggml_cuda_pdl_config(ggml_cuda_pdl_config&& o) noexcept : attr(o.attr), cfg(o.cfg) { + cfg.attrs = &attr; + } }; + #ifdef __CUDA_ARCH_LIST__ constexpr bool ggml_cuda_has_arch_impl(int) { return false; diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 657ac8be943..474bfa1afd4 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -290,7 +290,8 @@ void quantize_row_q8_1_cuda( const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; const dim3 num_blocks(block_num_x, ne1, ne2*ne3); const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); - quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv); + auto pdl_cfg = ggml_cuda_pdl_config(num_blocks, block_size, 0, stream); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, quantize_q8_1, x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv)); GGML_UNUSED(type_src0); } From adfd442ac2212c70a4ad1ba4ec6dcfe0ebbfee69 Mon Sep 17 00:00:00 2001 From: aendk Date: Fri, 13 Feb 2026 14:53:20 +0100 Subject: [PATCH
09/24] Enrolls mul_mat_vec_q, rms_norm_f32 and k_bin_bcast (partly) into PDL --- ggml/src/ggml-cuda/binbcast.cu | 6 ++++-- ggml/src/ggml-cuda/mmvq.cu | 14 ++++++++------ ggml/src/ggml-cuda/norm.cu | 11 +++++++++-- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index d3428f286f3..2ae919239fd 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -304,17 +304,19 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * } else { const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3); if constexpr (sizeof...(I) > 0) { + // TODO revisit this invocation, variadic templates are difficult to use with cudaLaunchKernelEx k_bin_bcast<<>>( src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13, /* s0, */ s1, s2, s3, /* s00,*/ s01, s02, s03, /* s10,*/ s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...); } else { - k_bin_bcast<<>>( + auto pdl_cfg = ggml_cuda_pdl_config(block_nums, block_dims, 0, stream); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, k_bin_bcast, src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13, /* s0, */ s1, s2, s3, /* s00,*/ s01, s02, s03, - /* s10,*/ s11, s12, s13); + /* s10,*/ s11, s12, s13)); } } } diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index c3191c636c9..cba574bb001 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -380,20 +380,22 @@ static void mul_mat_vec_q_switch_fusion( const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; if constexpr (c_ncols_dst == 1) { if (has_fusion) { - mul_mat_vec_q<<>> - (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + auto pdl_cfg = ggml_cuda_pdl_config(block_nums, block_dims, nbytes_shared, stream); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, mul_mat_vec_q, + vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride)); return; } } GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1"); - mul_mat_vec_q<<>> - (vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + auto pdl_cfg = ggml_cuda_pdl_config(block_nums, block_dims, nbytes_shared, stream); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, mul_mat_vec_q, +vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride)); } template diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index c90c07377b0..a2b5ca203c9 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -310,10 +310,17 @@ static void rms_norm_f32_cuda( const dim3 blocks_num(nrows, nchannels, nsamples); if (ncols < 1024) { const dim3 block_dims(256, 1, 1); - rms_norm_f32<256, false><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + auto pdl_cfg = ggml_cuda_pdl_config(blocks_num, block_dims, block_dims.x > WARP_SIZE ? 
32 * sizeof(float): 0, stream); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, rms_norm_f32<256, false>, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, + // rms_norm_f32<256, false><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + // cudaLaunchKernelEx does not support default params + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0))); } else { const dim3 block_dims(1024, 1, 1); - rms_norm_f32<1024, false><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); + auto pdl_cfg = ggml_cuda_pdl_config(blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, rms_norm_f32<1024, false>, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, + // cudaLaunchKernelEx does not support default params + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0))); } } From 7f1342af807c010d86daad8ca94f0e74d62bdf15 Mon Sep 17 00:00:00 2001 From: aendk Date: Wed, 18 Feb 2026 10:43:00 +0100 Subject: [PATCH 10/24] Enrolls mmvf, rope, set-rows and topk kernels for gpt-oss into PDL --- ggml/src/ggml-cuda/mmvf.cu | 13 +++++----- ggml/src/ggml-cuda/rope.cu | 9 ++++--- ggml/src/ggml-cuda/set-rows.cu | 8 +++--- ggml/src/ggml-cuda/topk-moe.cu | 45 +++++++++++++++++----------------- 4 files changed, 40 insertions(+), 35 deletions(-) diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index b8cdb45a49d..37a266c62ae 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -385,23 +385,24 @@ static void mul_mat_vec_f_switch_fusion( const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, const dim3 & block_dims, const dim3 & block_nums, const int nbytes_shared, const int ids_stride, const cudaStream_t stream) { + auto pdl_cfg = ggml_cuda_pdl_config(block_nums, block_dims, nbytes_shared, stream); const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; if constexpr (ncols_dst == 1) { if (has_fusion) { - mul_mat_vec_f<<>> - (x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, mul_mat_vec_f, + x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride)); return; } } GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1"); - mul_mat_vec_f<<>> - (x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, mul_mat_vec_f, + x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride)); } diff --git 
a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index ff2349de6a8..ef3fefcdfd4 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -358,15 +358,16 @@ static void rope_neox_cuda(const T * x, const dim3 block_nums(nr, n_blocks_x, 1); const float theta_scale = powf(freq_base, -2.0f/n_dims); + auto pdl_cfg = ggml_cuda_pdl_config(block_nums, block_dims, 0, stream); if (freq_factors == nullptr) { - rope_neox<<>>( + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, rope_neox, x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, - freq_factors, row_indices, set_rows_stride); + freq_factors, row_indices, set_rows_stride)); } else { - rope_neox<<>>( + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, rope_neox, x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, theta_scale, - freq_factors, row_indices, set_rows_stride); + freq_factors, row_indices, set_rows_stride)); } } diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index 389ef48b49f..b1419f66511 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -209,9 +209,11 @@ static void set_rows_cuda( const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11); const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12); - k_set_rows<<>>(src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01, - s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd, ne01_fd, ne02_fd, - ne11_fd, ne12_fd); + auto pdl_cfg = ggml_cuda_pdl_config(grid_size, block_size, 0, stream); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, k_set_rows, + src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01, + s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd, ne01_fd, ne02_fd, + ne11_fd, ne12_fd)); } } diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu index 0ee203a5688..e7ac3b2f378 100644 --- a/ggml/src/ggml-cuda/topk-moe.cu +++ b/ggml/src/ggml-cuda/topk-moe.cu @@ -259,51 +259,52 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx, dim3 grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1); dim3 block_dims(WARP_SIZE, rows_per_block, 1); cudaStream_t stream = ctx.stream(); + auto pdl_cfg = ggml_cuda_pdl_config(grid_dims, block_dims, 0, stream); switch (n_expert) { case 1: - topk_moe_cuda<1, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<1, has_bias>, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); break; case 2: - topk_moe_cuda<2, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<2, has_bias>, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); break; case 4: - topk_moe_cuda<4, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<4, has_bias>, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); break; case 8: - topk_moe_cuda<8, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<8, has_bias>, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); break; case 16: - topk_moe_cuda<16, has_bias><<>>(logits, weights, ids, bias, n_rows, 
n_expert_used, - clamp_val, scale_val, config); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<16, has_bias>, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); break; case 32: - topk_moe_cuda<32, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<32, has_bias>, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); break; case 64: - topk_moe_cuda<64, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<64, has_bias>, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); break; case 128: - topk_moe_cuda<128, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<128, has_bias>, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); break; case 256: - topk_moe_cuda<256, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<256, has_bias>, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); break; case 512: - topk_moe_cuda<512, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<512, has_bias>, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); break; case 576: - topk_moe_cuda<576, has_bias><<>>(logits, weights, ids, bias, n_rows, n_expert_used, - clamp_val, scale_val, config); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<576, has_bias>, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); break; default: GGML_ASSERT(false && "fatal error"); From c2d9d47a22cf16449578c2647c91ba2d8ddb1860 Mon Sep 17 00:00:00 2001 From: aendk Date: Wed, 18 Feb 2026 16:39:25 +0100 Subject: [PATCH 11/24] Introduce ggml_cuda_kernel_launch, to abstract away cudaLaunchKernelEx, to enable hip/musa compatibility --- ggml/src/ggml-cuda/binbcast.cu | 10 ++-- ggml/src/ggml-cuda/common.cuh | 95 ++++++++++++++++++++++------------ ggml/src/ggml-cuda/mmvf.cu | 11 ++-- ggml/src/ggml-cuda/mmvq.cu | 12 ++--- ggml/src/ggml-cuda/norm.cu | 20 +++---- ggml/src/ggml-cuda/quantize.cu | 4 +- ggml/src/ggml-cuda/rope.cu | 16 ++---- ggml/src/ggml-cuda/set-rows.cu | 6 +-- ggml/src/ggml-cuda/topk-moe.cu | 46 ++++++++-------- 9 files changed, 124 insertions(+), 96 deletions(-) diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index 2ae919239fd..07c4a60ad11 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -304,19 +304,21 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * } else { const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3); if constexpr (sizeof...(I) > 0) { - // TODO revisit this invocation, variadic templates are difficult to use with cudaLaunchKernelEx + // TODO discuss. Variadic templates are difficult to use with cudaLaunchKernelEx. + // For <<<>>>, the compiler can see all args at the call site and deduce src_ptrs... at compile time + // For cudaLaunchKernelEx, we would need to explicitly instantiate the kernel template. 
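// A minimal sketch of the problem the TODO above describes, using a hypothetical kernel
// template k (not the real k_bin_bcast signature): with the <<<>>> syntax the compiler
// deduces the trailing template parameters from the call arguments, while cudaLaunchKernelEx
// takes a function pointer and therefore needs the instantiation spelled out in full:
//
//     template <typename T, typename... Ptrs>
//     __global__ void k(T * dst, Ptrs... srcs);
//
//     k<<<grid, block, 0, stream>>>(dst, s0, s1);                        // Ptrs... deduced
//     cudaLaunchKernelEx(&cfg, k<float, const float *, const float *>,   // spelled out
//                        dst, s0, s1);
//
// Patch 17 later lifts this limitation with a type_for_index alias.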
                k_bin_bcast<bin_op><<<block_nums, block_dims, 0, stream>>>(
                    src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13,
                    /* s0, */ s1, s2, s3,
                    /* s00,*/ s01, s02, s03,
                    /* s10,*/ s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
             } else {
-                auto pdl_cfg = ggml_cuda_pdl_config(block_nums, block_dims, 0, stream);
-                CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, k_bin_bcast,
+                auto launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
+                ggml_cuda_kernel_launch(k_bin_bcast, launch_params,
                     src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13,
                     /* s0, */ s1, s2, s3,
                     /* s00,*/ s01, s02, s03,
-                    /* s10,*/ s11, s12, s13));
+                    /* s10,*/ s11, s12, s13);
             }
         }
     }
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 1fdaad8e745..1d6d4cb936d 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -106,45 +106,22 @@
 # define GGML_CUDA_USE_CUB
 #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
 
-#if defined(GGML_USE_HIP) || defined(GGML_USE_MUSA) || __CUDA_ARCH__ <= GGML_CUDA_CC_HOPPER
-# define GGML_CUDA_PDL_SYNC() // no-op on HIP/MUSA
-#else
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
+# define GGML_CUDA_USE_PDL
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
+
+#if defined(GGML_CUDA_USE_PDL)
 # define GGML_CUDA_PDL_SYNC() cudaGridDependencySynchronize()
+#else
+# define GGML_CUDA_PDL_SYNC() // no-op when PDL is unavailable (HIP/MUSA or pre-Hopper)
 #endif
 
-#if defined(GGML_USE_HIP) || defined(GGML_USE_MUSA) || __CUDA_ARCH__ <= GGML_CUDA_CC_HOPPER
-# define GGML_CUDA_PDL_LC() // no-op on HIP/MUSA
-#else
+#if defined(GGML_CUDA_USE_PDL)
 # define GGML_CUDA_PDL_LC() cudaTriggerProgrammaticLaunchCompletion()
+#else
+# define GGML_CUDA_PDL_LC() // no-op when PDL is unavailable (HIP/MUSA or pre-Hopper)
 #endif
 
-struct ggml_cuda_pdl_config {
-    cudaLaunchAttribute attr;
-    cudaLaunchConfig_t cfg;
-
-    ggml_cuda_pdl_config(dim3 grid, dim3 block, size_t shmem, cudaStream_t s) {
-        attr.id = cudaLaunchAttributeProgrammaticStreamSerialization;
-        attr.val.programmaticStreamSerializationAllowed = 1;
-
-        cfg = {};
-        cfg.gridDim = grid;
-        cfg.blockDim = block;
-        cfg.dynamicSmemBytes = shmem;
-        cfg.stream = s;
-        cfg.attrs = &attr;
-        cfg.numAttrs = 1;
-    }
-
-    // Delete due to &attr
-    ggml_cuda_pdl_config(const ggml_cuda_pdl_config&) = delete;
-    ggml_cuda_pdl_config& operator=(const ggml_cuda_pdl_config&) = delete;
-    ggml_cuda_pdl_config& operator=(ggml_cuda_pdl_config&&) = delete;
-
-    ggml_cuda_pdl_config(ggml_cuda_pdl_config&& o) noexcept : attr(o.attr), cfg(o.cfg) {
-        cfg.attrs = &attr;
-    }
-};
-
 #ifdef __CUDA_ARCH_LIST__
 constexpr bool ggml_cuda_has_arch_impl(int) {
     return false;
@@ -203,6 +180,58 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
 
 #define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
 
+struct ggml_cuda_kernel_launch_params {
+    dim3 block_nums;
+    dim3 block_dims;
+    size_t shmem;
+    cudaStream_t stream;
+
+    // size_t shmem
+    ggml_cuda_kernel_launch_params(const dim3& block_nums_, const dim3& block_dims_, size_t shmem_, cudaStream_t stream_)
+        : block_nums(block_nums_), block_dims(block_dims_), shmem(shmem_), stream(stream_) {}
+
+    // int shmem
+    ggml_cuda_kernel_launch_params(const dim3& block_nums_, const dim3& block_dims_, const int shmem_, cudaStream_t stream_)
+        : block_nums(block_nums_), block_dims(block_dims_), shmem((size_t)shmem_), stream(stream_) {}
+};
+
+#if defined(GGML_CUDA_USE_PDL)
+struct
ggml_cuda_pdl_config { + cudaLaunchAttribute attr; + cudaLaunchConfig_t cfg; + + ggml_cuda_pdl_config(const ggml_cuda_kernel_launch_params & params) { + attr.id = cudaLaunchAttributeProgrammaticStreamSerialization; + attr.val.programmaticStreamSerializationAllowed = 1; + + cfg = {}; + cfg.gridDim = params.block_nums; + cfg.blockDim = params.block_dims; + cfg.dynamicSmemBytes = params.shmem; + cfg.stream = params.stream; + cfg.attrs = &attr; + cfg.numAttrs = 1; + } + + // Delete due to &attr + ggml_cuda_pdl_config(const ggml_cuda_pdl_config&) = delete; + ggml_cuda_pdl_config& operator=(const ggml_cuda_pdl_config&) = delete; + ggml_cuda_pdl_config& operator=(ggml_cuda_pdl_config&&) = delete; + +}; +#endif //defined(GGML_CUDA_USE_PDL) + + +template +void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_kernel_launch_params & launch_params, Args... args) { +#if defined(GGML_CUDA_USE_PDL) + auto pdl_cfg = ggml_cuda_pdl_config(launch_params); + CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, args... )); +#else + kernel<<>>(args... ); +#endif //defined(GGML_CUDA_USE_PDL) +} + #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA) static const char * cublas_get_error_str(const cublasStatus_t err) { return cublasGetStatusString(err); diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index 37a266c62ae..3b21c8a87b0 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -385,24 +385,25 @@ static void mul_mat_vec_f_switch_fusion( const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, const dim3 & block_dims, const dim3 & block_nums, const int nbytes_shared, const int ids_stride, const cudaStream_t stream) { - auto pdl_cfg = ggml_cuda_pdl_config(block_nums, block_dims, nbytes_shared, stream); + ggml_cuda_kernel_launch_params launch_params = {block_nums, block_dims, nbytes_shared, stream}; + const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; if constexpr (ncols_dst == 1) { if (has_fusion) { - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, mul_mat_vec_f, + ggml_cuda_kernel_launch(mul_mat_vec_f, launch_params, x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride)); + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); return; } } GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1"); - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, mul_mat_vec_f, + ggml_cuda_kernel_launch(mul_mat_vec_f, launch_params, x, y, ids, fusion, dst, ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride)); + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); } diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index cba574bb001..a5256be949b 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -380,22 +380,22 @@ static void mul_mat_vec_q_switch_fusion( const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr; if constexpr (c_ncols_dst == 1) { if (has_fusion) { - auto pdl_cfg = ggml_cuda_pdl_config(block_nums, block_dims, nbytes_shared, stream); - 
CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, mul_mat_vec_q, + auto launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, nbytes_shared, stream); + ggml_cuda_kernel_launch(mul_mat_vec_q, launch_params, vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride)); + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); return; } } GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1"); - auto pdl_cfg = ggml_cuda_pdl_config(block_nums, block_dims, nbytes_shared, stream); - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, mul_mat_vec_q, + auto launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, nbytes_shared, stream); + ggml_cuda_kernel_launch(mul_mat_vec_q, launch_params, vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride)); + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride); } template diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index a2b5ca203c9..f50e20f0f9d 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -310,17 +310,19 @@ static void rms_norm_f32_cuda( const dim3 blocks_num(nrows, nchannels, nsamples); if (ncols < 1024) { const dim3 block_dims(256, 1, 1); - auto pdl_cfg = ggml_cuda_pdl_config(blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream); - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, rms_norm_f32<256, false>, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, - // rms_norm_f32<256, false><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps); - // cudaLaunchKernelEx does not support default params - nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0))); + ggml_cuda_kernel_launch_params launch_params = {blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<256, false>, launch_params, + x, dst, ncols, stride_row, stride_channel, stride_sample, eps, + // underlying cudaLaunchKernelEx does not support default params + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0)); } else { const dim3 block_dims(1024, 1, 1); - auto pdl_cfg = ggml_cuda_pdl_config(blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream); - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, rms_norm_f32<1024, false>, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, - // cudaLaunchKernelEx does not support default params - nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0))); + auto launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 
32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<1024, false>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, + // underlying cudaLaunchKernelEx does not support default params + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0)); } } diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 474bfa1afd4..bc6f722b561 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -290,8 +290,8 @@ void quantize_row_q8_1_cuda( const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; const dim3 num_blocks(block_num_x, ne1, ne2*ne3); const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); - auto pdl_cfg = ggml_cuda_pdl_config(num_blocks, block_size, 0, stream); - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, quantize_q8_1, x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv)); + auto launch_params = ggml_cuda_kernel_launch_params(num_blocks, block_size, 0, stream); + ggml_cuda_kernel_launch(quantize_q8_1, launch_params, x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv); GGML_UNUSED(type_src0); } diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 08583a9ff00..0aa18a3aff3 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -410,22 +410,16 @@ static void rope_neox_cuda(const T * x, const dim3 block_nums(nr, n_blocks_x, 1); const float theta_scale = powf(freq_base, -2.0f / n_dims); - auto pdl_cfg = ggml_cuda_pdl_config(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch_params launch_params = {block_nums, block_dims, 0, stream}; if (freq_factors == nullptr) { - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, rope_neox, + ggml_cuda_kernel_launch(rope_neox, launch_params, x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, - attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride)); - // rope_neox<<>>( - // x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, - // attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); + attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); } else { - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, rope_neox, + ggml_cuda_kernel_launch(rope_neox, launch_params, x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, - attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride)); - // rope_neox<<>>( - // x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale, ext_factor, - // attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); + attn_factor, corr_dims, theta_scale, freq_factors, row_indices, set_rows_stride); } } diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index b1419f66511..b614f1fe57f 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -209,11 +209,11 @@ static void set_rows_cuda( const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11); const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12); - auto pdl_cfg = ggml_cuda_pdl_config(grid_size, block_size, 0, stream); - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, k_set_rows, + auto launch_params = ggml_cuda_kernel_launch_params(grid_size, block_size, 0, stream); + 
ggml_cuda_kernel_launch(k_set_rows, launch_params, src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01, s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd, ne01_fd, ne02_fd, - ne11_fd, ne12_fd)); + ne11_fd, ne12_fd); } } diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu index e7ac3b2f378..e142c478feb 100644 --- a/ggml/src/ggml-cuda/topk-moe.cu +++ b/ggml/src/ggml-cuda/topk-moe.cu @@ -259,52 +259,52 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx, dim3 grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1); dim3 block_dims(WARP_SIZE, rows_per_block, 1); cudaStream_t stream = ctx.stream(); - auto pdl_cfg = ggml_cuda_pdl_config(grid_dims, block_dims, 0, stream); + auto launch_params = ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream); switch (n_expert) { case 1: - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<1, has_bias>, - logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); + ggml_cuda_kernel_launch(topk_moe_cuda<1, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 2: - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<2, has_bias>, - logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); + ggml_cuda_kernel_launch(topk_moe_cuda<2, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 4: - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<4, has_bias>, - logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); + ggml_cuda_kernel_launch(topk_moe_cuda<4, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 8: - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<8, has_bias>, - logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); + ggml_cuda_kernel_launch(topk_moe_cuda<8, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 16: - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<16, has_bias>, - logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); + ggml_cuda_kernel_launch(topk_moe_cuda<16, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 32: - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<32, has_bias>, - logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); + ggml_cuda_kernel_launch(topk_moe_cuda<32, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 64: - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<64, has_bias>, - logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); + ggml_cuda_kernel_launch(topk_moe_cuda<64, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 128: - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<128, has_bias>, - logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); + ggml_cuda_kernel_launch(topk_moe_cuda<128, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 256: - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<256, has_bias>, - 
logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); + ggml_cuda_kernel_launch(topk_moe_cuda<256, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 512: - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<512, has_bias>, - logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); + ggml_cuda_kernel_launch(topk_moe_cuda<512, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; case 576: - CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, topk_moe_cuda<576, has_bias>, - logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config)); + ggml_cuda_kernel_launch(topk_moe_cuda<576, has_bias>, launch_params, + logits, weights, ids, bias, n_rows, n_expert_used, clamp_val, scale_val, config); break; default: GGML_ASSERT(false && "fatal error"); From d942a3aee4cdafa024b8db649e1ba2b2096eb34c Mon Sep 17 00:00:00 2001 From: aendk Date: Wed, 18 Feb 2026 17:31:56 +0100 Subject: [PATCH 12/24] Enrolls cpy_scalar_contiguous, k_get_rows_float and rms_norm_f32 --- ggml/src/ggml-cuda/cpy.cu | 4 ++-- ggml/src/ggml-cuda/getrows.cu | 3 ++- ggml/src/ggml-cuda/norm.cu | 20 ++++++++++++++------ 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 01fb84b1428..22c653aab62 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -203,8 +203,8 @@ cudaStream_t stream) { const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; GGML_ASSERT(num_blocks < UINT_MAX); - cpy_scalar_contiguous<<>> - (cx, cdst, ne); + auto launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(cpy_scalar_contiguous, launch_params, cx, cdst, ne); } template diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index 6dd870eb875..4c6d11a78b3 100644 --- a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -154,7 +154,8 @@ static void get_rows_cuda_float( const size_t s12 = nb12 / sizeof(int32_t); // const size_t s13 = nb13 / sizeof(int32_t); - k_get_rows_float<<>>( + auto launch_params = ggml_cuda_kernel_launch_params{block_nums, block_dims, 0, stream}; + ggml_cuda_kernel_launch(k_get_rows_float, launch_params, src0_d, src1_d, dst_d, ne00, /*ne01, ne02, ne03,*/ /*ne10,*/ ne11, ne12, /*ne13,*/ diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index f50e20f0f9d..94b815cf3f5 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -365,14 +365,20 @@ static void rms_norm_mul_f32_cuda(const float * x, const uint3 mul_nsamples_packed = init_fastdiv_values(mul_nsamples); if (ncols < 1024) { const dim3 block_dims(256, 1, 1); - rms_norm_f32<256, true><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>( + auto launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 
32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<256, true>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, - mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed); + mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, + // underlying cudaLaunchKernelEx does not support default params + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0)); } else { const dim3 block_dims(1024, 1, 1); - rms_norm_f32<1024, true><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>( + auto launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<1024, true>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, - mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed); + mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, + // underlying cudaLaunchKernelEx does not support default params + nullptr, 0, 0, 0, make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0), make_uint3(0, 0, 0)); } } else { const uint3 mul_ncols_packed = init_fastdiv_values(mul_ncols); @@ -386,14 +392,16 @@ static void rms_norm_mul_f32_cuda(const float * x, const uint3 add_nsamples_packed = init_fastdiv_values(add_nsamples); if (ncols < 1024) { const dim3 block_dims(256, 1, 1); - rms_norm_f32<256, true, true><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>( + auto launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims,block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream}; + ggml_cuda_kernel_launch(rms_norm_f32<256, true, true>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add, add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed, add_nchannels_packed, add_nsamples_packed); } else { const dim3 block_dims(1024, 1, 1); - rms_norm_f32<1024, true, true><< WARP_SIZE ? 32 * sizeof(float): 0, stream>>>( + auto launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 
32 * sizeof(float): 0, stream};
+            ggml_cuda_kernel_launch(rms_norm_f32<1024, true, true>, launch_params,
                 x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
                 mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed,
                 add, add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,

From 11150f06b9d19df771f57dd9639d44b78d987637 Mon Sep 17 00:00:00 2001
From: aendk
Date: Wed, 18 Feb 2026 17:49:37 +0100
Subject: [PATCH 13/24] Enrolls flash_attn_combine_results

---
 ggml/src/ggml-cuda/fattn-common.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index 7534bcf965f..648bccdc368 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -1037,9 +1037,9 @@ void launch_fattn(
         const dim3 blocks_num_combine(Q->ne[1], Q->ne[2], Q->ne[3]);
         const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2);
 
-        flash_attn_combine_results
-            <<<blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream>>>
-            (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks);
+        auto launch_params = ggml_cuda_kernel_launch_params(blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream);
+        ggml_cuda_kernel_launch(flash_attn_combine_results, launch_params,
+            dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks);
     }
     CUDA_CHECK(cudaGetLastError());
 }

From 71f8f5883c677ae6e27271478fcf6bf845cb3009 Mon Sep 17 00:00:00 2001
From: aendk
Date: Thu, 19 Feb 2026 10:28:10 +0100
Subject: [PATCH 14/24] Fix: Drops needless and broken check of CUDA arch for
 PDL. PDL either works or is without effect.

---
 ggml/src/ggml-cuda/common.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 1d6d4cb936d..dc2a4858ded 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -106,7 +106,7 @@
 # define GGML_CUDA_USE_CUB
 #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
 
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
 # define GGML_CUDA_USE_PDL
 #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER

From 86643109e00d3a0a1650c5fe3659856733d9f949 Mon Sep 17 00:00:00 2001
From: aendk
Date: Thu, 19 Feb 2026 11:17:08 +0100
Subject: [PATCH 15/24] Enrolls flash-attention kernels to PDL

---
 ggml/src/ggml-cuda/fattn-common.cuh  | 4 +++-
 ggml/src/ggml-cuda/fattn-tile.cuh    | 5 +++++
 ggml/src/ggml-cuda/fattn-wmma-f16.cu | 3 +++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index 648bccdc368..c8df5141909 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -1006,7 +1006,9 @@ void launch_fattn(
     const uint3 ne01 = init_fastdiv_values(Q->ne[1]);
 
     GGML_ASSERT(block_dim.x % warp_size == 0);
-    fattn_kernel<<<blocks_num, block_dim, nbytes_shared, main_stream>>>(
+
+    auto launch_params = ggml_cuda_kernel_launch_params(blocks_num, block_dim, nbytes_shared, main_stream);
+    ggml_cuda_kernel_launch(fattn_kernel, launch_params,
         (const char *) Q->data,
         K_data,
         V_data,
diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh
index b6db5822818..6e1e828f3d9 100644
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -769,6 +769,7 @@ static __global__ void flash_attn_tile(
 #endif // GGML_USE_WMMA_FATTN
             (use_logit_softcap && !(DV == 128 || DV == 256))
     ) {
+        GGML_CUDA_PDL_LC();
         GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta,
             scale, max_bias, m0, m1, n_head_log2, logit_softcap,
             ne00, ne01, ne02, ne03,
@@ -820,6 +821,8 @@ static __global__ void flash_attn_tile(
     constexpr int DKQp = (DKQ + 2*warp_size - 1) & ~(2*warp_size - 1); // DKQ padded to multiple of 2*warp_size.
     constexpr int DVp  = (DV  + 2*warp_size - 1) & ~(2*warp_size - 1); // DV padded to multiple of 2*warp_size.
 
+    GGML_CUDA_PDL_SYNC(); // needs to guard Q, K, V, mask, sinks, KV_max, dst, dst_meta data accesses. Conservatively placed, not optimal
+
     // Q_tmp  == SRAM buffer to hold Q data for the entire lifetime of the kernel.
     // KV_tmp == SRAM buffer to hold fragments of K/V data while iterating over ne11.
     // KV_tmp is padded to avoid memory conflicts for K (cpy_ne) and OOB accesses for V (DVp-DV).
@@ -956,6 +959,7 @@ static __global__ void flash_attn_tile(
             KQ_sum_combine[threadIdx.y] = KQ_sum[0];
         }
 
+        GGML_CUDA_PDL_LC();
         return;
     }
 
@@ -1030,6 +1034,7 @@ static __global__ void flash_attn_tile(
         const int c = jc % ncols2;
 
         if (ncols1 > 1 && col_Q_0 + j >= int(ne01.z)) {
+            GGML_CUDA_PDL_LC();
             return;
         }
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
index 8694fd06c7b..f6ac13178a8 100644
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@@ -49,6 +49,7 @@ static __global__ void flash_attn_ext_f16(
     // Skip unused kernel variants for faster compilation:
     if (use_logit_softcap && !(D == 128 || D == 256)) {
         NO_DEVICE_CODE;
+        GGML_CUDA_PDL_LC();
         return;
     }
@@ -78,6 +79,7 @@ static __global__ void flash_attn_ext_f16(
     constexpr int kqs_padded = FATTN_KQ_STRIDE + 8;
     constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half);
 
+    GGML_CUDA_PDL_SYNC(); // needs to guard Q, K, V, mask, sinks, KV_max, dst, dst_meta data accesses. Conservatively placed, not optimal
     const int sequence = blockIdx.z / ne02;
     const int head = blockIdx.z - sequence*ne02;
     const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
@@ -433,6 +435,7 @@ static __global__ void flash_attn_ext_f16(
     for (int j0 = 0; j0 < ncols; j0 += nwarps) {
         const int j_VKQ = j0 + threadIdx.y;
         if (ic0 + j_VKQ >= int(ne01.z)) {
+            GGML_CUDA_PDL_LC();
             return;
         }

From 909ec1f87c25abff8ec7aaa852860c749df07d1f Mon Sep 17 00:00:00 2001
From: aendk
Date: Fri, 20 Feb 2026 10:36:48 +0100
Subject: [PATCH 16/24] Fix: inlines ggml_cuda_kernel_launch and uses perfect
 forwarding for kernel args. This fixes PDL.

---
 ggml/src/ggml-cuda/common.cuh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index dc2a4858ded..0fca13fe858 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include <utility>
 #include
 
 #if defined(GGML_USE_HIP)
@@ -223,12 +224,12 @@ struct ggml_cuda_pdl_config {
 
 template <typename Kernel, typename... Args>
-void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_kernel_launch_params & launch_params, Args... args) {
+static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_kernel_launch_params & launch_params, Args&&... args) {
 #if defined(GGML_CUDA_USE_PDL)
     auto pdl_cfg = ggml_cuda_pdl_config(launch_params);
-    CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, args... ));
+    CUDA_CHECK(cudaLaunchKernelEx(&pdl_cfg.cfg, kernel, std::forward<Args>(args)...));
 #else
-    kernel<<<launch_params.block_nums, launch_params.block_dims, launch_params.shmem, launch_params.stream>>>(args... );
+    kernel<<<launch_params.block_nums, launch_params.block_dims, launch_params.shmem, launch_params.stream>>>(std::forward<Args>(args)... );
 #endif //defined(GGML_CUDA_USE_PDL)
 }

From 25bbc88dd8f2e630850adbd584d30d0a56c826f1 Mon Sep 17 00:00:00 2001
From: aendk
Date: Fri, 20 Feb 2026 15:05:02 +0100
Subject: [PATCH 17/24] Perf: Enrolls k_bin_bcast variadic template invocation
 into PDL, via a template alias and template expansion

---
 ggml/src/ggml-cuda/binbcast.cu | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
index 749a9273147..a1cb05b4bd7 100644
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -2,6 +2,9 @@
 #include
 #include
 
+template <typename T, size_t I>
+using type_for_index = T;
+
 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
     return b;
     GGML_UNUSED(a);
@@ -288,39 +291,24 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
             const uint3 ne1_fastdiv = init_fastdiv_values((uint32_t) ne1);
             const uint3 ne2_fastdiv = init_fastdiv_values((uint32_t) ne2);
 
-            if constexpr (sizeof...(I) > 0) {
-                k_bin_bcast_unravel<bin_op><<<block_num, block_size, 0, stream>>>(
+            {
+                auto launch_params = ggml_cuda_kernel_launch_params((dim3)block_num, block_size, 0, stream);
+                ggml_cuda_kernel_launch(k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t, type_for_index<const src1_t *, I>...>, launch_params,
                     src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv,
                     ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11, ne12, ne13,
                     /*s0,*/ s1, s2, s3,
                     s00, s01, s02, s03,
                     s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
-            } else {
-                k_bin_bcast_unravel<bin_op>
-                    <<<block_num, block_size, 0, stream>>>(src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv,
-                                                           ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11, ne12, ne13,
-                                                           /*s0,*/ s1, s2, s3,
-                                                           s00, s01, s02, s03,
-                                                           s10, s11, s12, s13);
             }
         } else {
             const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
-            if constexpr (sizeof...(I) > 0) {
-                // TODO discuss. Variadic templates are difficult to use with cudaLaunchKernelEx.
-                // For <<<>>>, the compiler can see all args at the call site and deduce src_ptrs... at compile time
-                // For cudaLaunchKernelEx, we would need to explicitly instantiate the kernel template.
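// Sketched with the same hypothetical kernel k as in the earlier TODO note: the alias
// introduced above maps every index of a size_t... I pack back to one fixed type, so
// expanding it names a concrete instantiation that can be handed to cudaLaunchKernelEx
// as a plain function pointer:
//
//     template <typename T, size_t I> using type_for_index = T;
//
//     template <size_t... I>
//     void launch() {
//         auto kernel = k<float, type_for_index<const float *, I>...>; // one type per index
//     }
//
// which, as I read the patch, is how the rewritten k_bin_bcast launch pairs the expanded
// types with the expanded (const src1_t *) dst->src[I + 1]->data... arguments.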
- k_bin_bcast<<>>( - src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13, - /*s0,*/ s1, s2, s3, - s00 ,s01, s02, s03, - s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...); - } else { + { auto launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); - ggml_cuda_kernel_launch(k_bin_bcast, launch_params, + ggml_cuda_kernel_launch(k_bin_bcast...>, launch_params, src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13, - /*s0,*/ s1, s2, s3, + /*s0,*/ s1, s2, s3, s00, s01, s02, s03, - s10, s11, s12, s13); + s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...); } } } From c5044bf5479182c6e4d162404f10476457df05c3 Mon Sep 17 00:00:00 2001 From: aendk Date: Fri, 20 Feb 2026 16:17:15 +0100 Subject: [PATCH 18/24] Enrolls all remaining kernels for qwen3-coder-next into PDL --- ggml/src/ggml-cuda/concat.cu | 6 +++++- ggml/src/ggml-cuda/cpy.cu | 10 ++++++---- ggml/src/ggml-cuda/mean.cu | 6 ++++-- ggml/src/ggml-cuda/norm.cu | 2 +- ggml/src/ggml-cuda/reduce_rows.cuh | 4 ++++ ggml/src/ggml-cuda/scale.cu | 5 ++++- ggml/src/ggml-cuda/ssm-conv.cu | 7 +++++-- ggml/src/ggml-cuda/sumrows.cu | 12 ++++++++---- ggml/src/ggml-cuda/unary.cu | 5 +++++ 9 files changed, 42 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu index e9ffd274b99..353bc74ca12 100644 --- a/ggml/src/ggml-cuda/concat.cu +++ b/ggml/src/ggml-cuda/concat.cu @@ -4,6 +4,7 @@ static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) { int nidx = threadIdx.x + blockIdx.x * blockDim.x; if (nidx >= ne0) { + GGML_CUDA_PDL_LC(); return; } @@ -12,6 +13,7 @@ static __global__ void concat_f32_dim0(const float * x, const float * y, float * blockIdx.y * ne0 + blockIdx.z * ne0 * gridDim.y; + GGML_CUDA_PDL_SYNC(); if (nidx < ne00) { // src0 int offset_src = nidx + @@ -25,6 +27,7 @@ static __global__ void concat_f32_dim0(const float * x, const float * y, float * blockIdx.z * (ne0 - ne00) * gridDim.y; dst[offset_dst] = y[offset_src]; } + GGML_CUDA_PDL_LC(); } static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) { @@ -83,7 +86,8 @@ static void concat_f32_cuda(const float * x, const float * y, float * dst, int n int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE; dim3 gridDim(num_blocks, ne1, ne2); if (dim == 0) { - concat_f32_dim0<<>>(x, y, dst, ne0, ne00); + auto launch_params = ggml_cuda_kernel_launch_params(gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(concat_f32_dim0, launch_params, x, y, dst, ne0, ne00); return; } if (dim == 1) { diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 22c653aab62..e918caac4a4 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -234,13 +234,15 @@ static void ggml_cpy_scalar_cuda( GGML_ASSERT(grid_z < USHRT_MAX); dim3 dimGrid(grid_x, grid_y, grid_z); dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1); - cpy_scalar_transpose<<>> - (cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + auto launch_params = ggml_cuda_kernel_launch_params(dimGrid, dimBlock, 0, stream); + ggml_cuda_kernel_launch(cpy_scalar_transpose, launch_params, + cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } else { const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; 
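// Note on the two lines that follow, assuming CUDA's dim3 semantics (y and z default
// to 1): the GGML_ASSERT guards the int64_t-to-unsigned-int narrowing of the block
// count, and the (dim3)num_blocks cast then builds the 1-D grid {num_blocks, 1, 1}
// that ggml_cuda_kernel_launch_params stores.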
GGML_ASSERT(num_blocks < UINT_MAX); - cpy_scalar><<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + auto launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(cpy_scalar>, launch_params, + cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } } diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu index 49af5389957..3d6d0e00277 100644 --- a/ggml/src/ggml-cuda/mean.cu +++ b/ggml/src/ggml-cuda/mean.cu @@ -67,9 +67,11 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { // See discussion in: https://github.com/ggml-org/llama.cpp/pull/15132 if ((nrows / nsm) < 2) { const dim3 block_dims(512, 1, 1); - reduce_rows_f32<<>>(src0_d, dst_d, ncols); + auto launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, src0_d, dst_d, ncols); } else { const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1); - reduce_rows_f32<<>>(src0_d, dst_d, ncols); + auto launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, src0_d, dst_d, ncols); } } diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index 94b815cf3f5..585e95a51e1 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -256,12 +256,12 @@ static __global__ void l2_norm_f32( const int sample = blockIdx.z; const int tid = threadIdx.x; - GGML_CUDA_PDL_SYNC(); x += sample*stride_sample + channel*stride_channel + row*stride_row; dst += ((sample*nchannels + channel)*nrows + row)*ncols; float tmp = 0.0f; // partial sum for thread in warp + GGML_CUDA_PDL_SYNC(); // needs to guard data access (except pointer arithmetic) for x, dst. for (int col = tid; col < ncols; col += block_size) { const float xi = x[col]; tmp += xi * xi; diff --git a/ggml/src/ggml-cuda/reduce_rows.cuh b/ggml/src/ggml-cuda/reduce_rows.cuh index de240fd4413..a285a23b8e2 100644 --- a/ggml/src/ggml-cuda/reduce_rows.cuh +++ b/ggml/src/ggml-cuda/reduce_rows.cuh @@ -10,6 +10,8 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r const int num_unroll = 8; float temp[num_unroll]; float sum_temp[num_unroll] = { 0.0f }; + + GGML_CUDA_PDL_SYNC(); // needs to guard data access (except pointer arithmetic) for x, dst. for (int i = col; i < ncols;) { for (int j = 0; j < num_unroll; ++j) { if (i < ncols) { @@ -32,8 +34,10 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r sum = block_reduce(sum, shared_vals); if (col != 0) { + GGML_CUDA_PDL_LC(); return; } dst[row] = norm ? sum / ncols : sum; + GGML_CUDA_PDL_LC(); } diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu index 0ddeff6a175..bdc7c69dba9 100644 --- a/ggml/src/ggml-cuda/scale.cu +++ b/ggml/src/ggml-cuda/scale.cu @@ -6,14 +6,17 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x; int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x; + GGML_CUDA_PDL_SYNC(); // needs to guard data access for x, dst. 
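// The enrollment pattern used throughout this series, sketched here since scale_f32 is
// the simplest case (both macros are no-ops when GGML_CUDA_USE_PDL is undefined):
//
//     __global__ void consumer(const float * x, float * dst /*, ... */) {
//         // prologue with no dependent loads: thread indexing, stride math
//         GGML_CUDA_PDL_SYNC(); // cudaGridDependencySynchronize(): wait until the
//                               // producer grid's writes to x are visible
//         // ... read x, write dst ...
//         GGML_CUDA_PDL_LC();   // cudaTriggerProgrammaticLaunchCompletion(): let the
//                               // next grid in the stream start its own prologue early
//     }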
for (int64_t i = tid; i < nelements; i += stride) { dst[i] = scale * x[i] + bias; } + GGML_CUDA_PDL_LC(); } static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) { const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; - scale_f32<<>>(x, dst, scale, bias, nelements); + auto launch_params = ggml_cuda_kernel_launch_params(MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(scale_f32, launch_params, x, dst, scale, bias, nelements); } void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu index 6d5ea704c65..54c4445b175 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cu +++ b/ggml/src/ggml-cuda/ssm-conv.cu @@ -1,3 +1,4 @@ +#include "common.cuh" #include "ssm-conv.cuh" template @@ -9,7 +10,6 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float const int tid = threadIdx.x; const int bidx = blockIdx.x; const int bidy = blockIdx.y; - const float * x_block = (const float *) ((const char *) src0 + bidx * src0_nb2 + bidy * split_d_inner * src0_nb1); const float * w_block = (const float *) ((const char *) src1 + bidy * split_d_inner * src1_nb1); float * y_block = (float *) ((char *) dst + bidx * dst_nb2 + bidy * split_d_inner * dst_nb0); @@ -21,6 +21,7 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float float x[d_conv] = { 0.0f }; float w[d_conv] = { 0.0f }; + GGML_CUDA_PDL_SYNC(); // needs to guard data access for src0, src1, dst. #pragma unroll for (size_t j = 0; j < d_conv; j++) { w[j] = w_block[tid * stride_w + j]; @@ -43,6 +44,7 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float } y_block[i * stride_y + tid] = sumf; } + GGML_CUDA_PDL_LC(); } template @@ -106,7 +108,8 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int constexpr int kNC = decltype(NC)::value; if (n_t <= 32) { const dim3 blocks(n_s, (nr + threads - 1) / threads, 1); - ssm_conv_f32<<>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, + auto launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream); + ggml_cuda_kernel_launch(ssm_conv_f32, launch_params, src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); } else { const int64_t split_n_t = 32; diff --git a/ggml/src/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu index 4025771aadb..99b3d7237a4 100644 --- a/ggml/src/ggml-cuda/sumrows.cu +++ b/ggml/src/ggml-cuda/sumrows.cu @@ -7,10 +7,12 @@ void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int const dim3 block_nums(nrows, 1, 1); if ((nrows / nsm) < 2) { const dim3 block_dims(512, 1, 1); - reduce_rows_f32<<>>(x, dst, ncols); + auto launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, x, dst, ncols); } else { const dim3 block_dims(ncols < 1024 ? 
32 : 128, 1, 1); - reduce_rows_f32<<>>(x, dst, ncols); + auto launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, x, dst, ncols); } } @@ -34,10 +36,12 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { if ((nrows / nsm) < 2) { // Increase num threads to 512 for small nrows to better hide the latency const dim3 block_dims(512, 1, 1); - reduce_rows_f32<<>>(src0_d, dst_d, ncols); + auto launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, src0_d, dst_d, ncols); } else { // Enough active SMs to hide latency, use smaller blocks to allow better scheduling const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1); - reduce_rows_f32<<>>(src0_d, dst_d, ncols); + auto launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); + ggml_cuda_kernel_launch(reduce_rows_f32, launch_params, src0_d, dst_d, ncols); } } diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index d4866067a4f..abe5c15b208 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -114,15 +114,20 @@ static __global__ void unary_op_kernel(const T * x, T * dst, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { + GGML_CUDA_PDL_LC(); return; } + GGML_CUDA_PDL_SYNC(); dst[i] = (T)op((float)x[i]); + GGML_CUDA_PDL_LC(); } template static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE; + auto launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(unary_op_kernel, launch_params, x, dst, k); unary_op_kernel<<>>(x, dst, k); } From 7e76151583b0eaa4e572507b804f8ec49feef634 Mon Sep 17 00:00:00 2001 From: aendk Date: Wed, 11 Mar 2026 15:43:35 +0100 Subject: [PATCH 19/24] Remove all PDL LC calls to create a baseline --- ggml/src/ggml-cuda/binbcast.cu | 6 ------ ggml/src/ggml-cuda/concat.cu | 2 -- ggml/src/ggml-cuda/cpy.cu | 9 --------- ggml/src/ggml-cuda/fattn-common.cuh | 6 ------ ggml/src/ggml-cuda/fattn-tile.cuh | 3 --- ggml/src/ggml-cuda/fattn-vec.cuh | 3 --- ggml/src/ggml-cuda/fattn-wmma-f16.cu | 2 -- ggml/src/ggml-cuda/getrows.cu | 5 ----- ggml/src/ggml-cuda/mmvf.cu | 2 -- ggml/src/ggml-cuda/mmvq.cu | 2 -- ggml/src/ggml-cuda/norm.cu | 5 ----- ggml/src/ggml-cuda/quantize.cu | 5 ----- ggml/src/ggml-cuda/reduce_rows.cuh | 2 -- ggml/src/ggml-cuda/rope.cu | 8 -------- ggml/src/ggml-cuda/scale.cu | 1 - ggml/src/ggml-cuda/set-rows.cu | 4 ---- ggml/src/ggml-cuda/ssm-conv.cu | 1 - ggml/src/ggml-cuda/topk-moe.cu | 2 -- ggml/src/ggml-cuda/unary.cu | 2 -- 19 files changed, 70 deletions(-) diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index a1cb05b4bd7..86ac9955ad9 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -61,7 +61,6 @@ static __global__ void k_bin_bcast(const src0_t * src0, const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z); if (i0s >= (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) { - GGML_CUDA_PDL_LC(); return; } @@ -89,7 +88,6 @@ static __global__ void k_bin_bcast(const src0_t * src0, dst_row[i0] = (dst_t) result; } - GGML_CUDA_PDL_LC(); } template = ne0.z || i1 >= ne1.z || i2 >= ne2.z || i3 >= ne3) { - GGML_CUDA_PDL_LC(); return; } @@ -157,7 +154,6 @@ static 
__global__ void k_bin_bcast_unravel(const src0_t * src0, } dst_row[i0] = (dst_t) result; - GGML_CUDA_PDL_LC(); } template @@ -327,7 +323,6 @@ static __global__ void k_repeat_back( const int64_t tid3 = tid23 / ne2; if (tid0 >= ne0) { - GGML_CUDA_PDL_LC(); return; } @@ -343,7 +338,6 @@ static __global__ void k_repeat_back( } } dst[tid3*ne2*ne1*ne0 + tid2*ne1*ne0 + tid1*ne0 + tid0] = sum; - GGML_CUDA_PDL_LC(); } template diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu index 353bc74ca12..ac4ef346c6e 100644 --- a/ggml/src/ggml-cuda/concat.cu +++ b/ggml/src/ggml-cuda/concat.cu @@ -4,7 +4,6 @@ static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) { int nidx = threadIdx.x + blockIdx.x * blockDim.x; if (nidx >= ne0) { - GGML_CUDA_PDL_LC(); return; } @@ -27,7 +26,6 @@ static __global__ void concat_f32_dim0(const float * x, const float * y, float * blockIdx.z * (ne0 - ne00) * gridDim.y; dst[offset_dst] = y[offset_src]; } - GGML_CUDA_PDL_LC(); } static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) { diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index e918caac4a4..26ba6b236f4 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -19,7 +19,6 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; if (i >= ne) { - GGML_CUDA_PDL_LC(); return; } @@ -39,7 +38,6 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne GGML_CUDA_PDL_SYNC(); cpy_1(cx + x_offset, cdst + dst_offset); - GGML_CUDA_PDL_LC(); } template @@ -90,7 +88,6 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const } } } - GGML_CUDA_PDL_LC(); GGML_UNUSED_VARS(ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); @@ -129,7 +126,6 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int64_t ne, const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk; if (i >= ne) { - GGML_CUDA_PDL_LC(); return; } @@ -147,7 +143,6 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int64_t ne, GGML_CUDA_PDL_SYNC(); cpy_blck(cx + x_offset, cdst + dst_offset); - GGML_CUDA_PDL_LC(); } template @@ -158,7 +153,6 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int64_t ne, const int64_t i = ((int64_t)blockDim.x*blockIdx.x + threadIdx.x)*qk; if (i >= ne) { - GGML_CUDA_PDL_LC(); return; } @@ -176,7 +170,6 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int64_t ne, GGML_CUDA_PDL_SYNC(); cpy_blck(cx + x_offset, cdst + dst_offset); - GGML_CUDA_PDL_LC(); } template @@ -184,7 +177,6 @@ static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; if (i >= ne) { - GGML_CUDA_PDL_LC(); return; } @@ -193,7 +185,6 @@ static __global__ void cpy_scalar_contiguous(const char * cx, char * cdst, const GGML_CUDA_PDL_SYNC(); dst[i] = ggml_cuda_cast(x[i]); - GGML_CUDA_PDL_LC(); } template diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index c8df5141909..f55828a6157 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -621,12 +621,10 @@ static __global__ void flash_attn_mask_to_KV_max( KV_max_sj += FATTN_KQ_STRIDE; if (threadIdx.x != 0) { - GGML_CUDA_PDL_LC(); return; } 
KV_max[sequence*ne31 + jt] = KV_max_sj; - GGML_CUDA_PDL_LC(); } template // D == head size @@ -658,7 +656,6 @@ static __global__ void flash_attn_stream_k_fixup( const bool wrote_beginning_of_tile = kbc0 % iter_k == 0; const bool did_not_write_last = kbc0/iter_k == kbc0_stop/iter_k && kbc0_stop % iter_k != 0; if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) { - GGML_CUDA_PDL_LC(); return; } @@ -671,7 +668,6 @@ static __global__ void flash_attn_stream_k_fixup( const int zt_Q = z_KV*gqa_ratio + zt_gqa*ncols2; // Global Q head start index. if (jt*ncols1 + j >= ne01 || zt_gqa*ncols2 + c >= gqa_ratio) { - GGML_CUDA_PDL_LC(); return; } @@ -729,7 +725,6 @@ static __global__ void flash_attn_stream_k_fixup( // Write back final result: *dst = dst_val / rowsum; - GGML_CUDA_PDL_LC(); } template // D == head size @@ -784,7 +779,6 @@ static __global__ void flash_attn_combine_results( } dst[tid] = VKQ_numerator / VKQ_denominator; - GGML_CUDA_PDL_LC(); } template diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh index f666f04c37b..4cb6ea4a0fe 100644 --- a/ggml/src/ggml-cuda/fattn-tile.cuh +++ b/ggml/src/ggml-cuda/fattn-tile.cuh @@ -769,7 +769,6 @@ static __global__ void flash_attn_tile( #endif // GGML_USE_WMMA_FATTN (use_logit_softcap && !(DV == 128 || DV == 256)) ) { - GGML_CUDA_PDL_LC(); GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, max_bias, m0, m1, n_head_log2, logit_softcap, ne00, ne01, ne02, ne03, @@ -959,7 +958,6 @@ static __global__ void flash_attn_tile( KQ_sum_combine[threadIdx.y] = KQ_sum[0]; } - GGML_CUDA_PDL_LC(); return; } @@ -1034,7 +1032,6 @@ static __global__ void flash_attn_tile( const int c = jc % ncols2; if (ncols1 > 1 && col_Q_0 + j >= int(ne01.z)) { - GGML_CUDA_PDL_LC(); return; } diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh index 2e6e0b4761f..c08afdc5747 100644 --- a/ggml/src/ggml-cuda/fattn-vec.cuh +++ b/ggml/src/ggml-cuda/fattn-vec.cuh @@ -44,7 +44,6 @@ static __global__ void flash_attn_ext_vec( // Skip unused kernel variants for faster compilation: if (use_logit_softcap && !(D == 128 || D == 256)) { - GGML_CUDA_PDL_LC(); GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, max_bias, m0, m1, n_head_log2, logit_softcap, ne00, ne01, ne02, ne03, @@ -495,9 +494,7 @@ static __global__ void flash_attn_ext_vec( if (gridDim.y != 1 && tid < ncols && (ncols == 1 || ic0 + tid < int(ne01.z))) { dst_meta[((sequence*int(ne01.z) + ic0 + tid)*ne02 + head)*gridDim.y + blockIdx.y] = make_float2(KQ_max[tid], KQ_sum[tid]); } - GGML_CUDA_PDL_LC(); #else - GGML_CUDA_PDL_LC(); GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale, max_bias, m0, m1, n_head_log2, logit_softcap, ne00, ne01, ne02, ne03, diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cu b/ggml/src/ggml-cuda/fattn-wmma-f16.cu index 7711bd4383a..0248f2cd331 100644 --- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu +++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu @@ -49,7 +49,6 @@ static __global__ void flash_attn_ext_f16( // Skip unused kernel variants for faster compilation: if (use_logit_softcap && !(D == 128 || D == 256)) { NO_DEVICE_CODE; - GGML_CUDA_PDL_LC(); return; } @@ -456,7 +455,6 @@ static __global__ void flash_attn_ext_f16( for (int j0 = 0; j0 < ncols; j0 += nwarps) { const int j_VKQ = j0 + threadIdx.y; if (ic0 + j_VKQ >= int(ne01.z)) { - GGML_CUDA_PDL_LC(); return; } diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu index 4c6d11a78b3..0314d8149f3 100644 --- 
a/ggml/src/ggml-cuda/getrows.cu +++ b/ggml/src/ggml-cuda/getrows.cu @@ -37,7 +37,6 @@ static __global__ void k_get_rows( dst_row[iybs + iqs + y_offset] = ggml_cuda_cast(v.y); } } - GGML_CUDA_PDL_LC(); } template @@ -58,7 +57,6 @@ static __global__ void k_get_rows_float( const int i12 = z % ne12; if (i00 >= ne00) { - GGML_CUDA_PDL_LC(); return; } @@ -70,7 +68,6 @@ static __global__ void k_get_rows_float( dst_row[i00] = ggml_cuda_cast(src0_row[i00]); } } - GGML_CUDA_PDL_LC(); } template @@ -79,7 +76,6 @@ static __global__ void k_get_rows_back_float( const int col = blockIdx.x*blockDim.x + threadIdx.x; if (col >= ncols) { - GGML_CUDA_PDL_LC(); return; } @@ -96,7 +92,6 @@ static __global__ void k_get_rows_back_float( } dst[dst_row*ncols + col] = sum; - GGML_CUDA_PDL_LC(); } template diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index 3b21c8a87b0..84625a462f2 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -335,7 +335,6 @@ static __global__ void mul_mat_vec_f( } if (tid >= ncols_dst) { - GGML_CUDA_PDL_LC(); return; } @@ -369,7 +368,6 @@ static __global__ void mul_mat_vec_f( } dst[tid*stride_col_dst + row] = value; - GGML_CUDA_PDL_LC(); if constexpr (!has_fusion) { GGML_UNUSED_VARS(use_gate, use_bias, use_gate_bias, glu_op, gate_x, x_bias, gate_bias, sumf_gate); diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index a5256be949b..9c341fb4ec9 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -288,7 +288,6 @@ static __global__ void mul_mat_vec_q( } __syncthreads(); if (threadIdx.y > 0) { - GGML_CUDA_PDL_LC(); return; } @@ -351,7 +350,6 @@ static __global__ void mul_mat_vec_q( dst[j*stride_col_dst + threadIdx.x] = result; } } - GGML_CUDA_PDL_LC(); if constexpr (!has_fusion) { GGML_UNUSED_VARS(use_gate, use_bias, use_gate_bias, active_glu, gate_bias, x_bias, tmp_gate); diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index 585e95a51e1..c5d427b6fcb 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -36,7 +36,6 @@ static __global__ void norm_f32( for (int col = tid; col < ncols; col += block_size) { dst[col] = (x[col] - mean) * inv_std; } - GGML_CUDA_PDL_LC(); } template @@ -72,7 +71,6 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr for (int j = start; j < end; j += block_size) { dst[j] *= scale; } - GGML_CUDA_PDL_LC(); } template @@ -153,7 +151,6 @@ static __global__ void rms_norm_f32(const float * x, dst[col] = scale * x[col]; } } - GGML_CUDA_PDL_LC(); } template @@ -207,7 +204,6 @@ static __global__ void rms_norm_back_f32( for (int col = tid; col < ncols; col += block_size) { dst[col] = scale_grad*grad[col] + scale_x*xf[col]; } - GGML_CUDA_PDL_LC(); } // template @@ -277,7 +273,6 @@ static __global__ void l2_norm_f32( for (int col = tid; col < ncols; col += block_size) { dst[col] = scale * x[col]; } - GGML_CUDA_PDL_LC(); } static void norm_f32_cuda( diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index bc6f722b561..7e9740b928a 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -9,7 +9,6 @@ static __global__ void quantize_q8_1( const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; if (i0 >= ne0) { - GGML_CUDA_PDL_LC(); return; } @@ -43,12 +42,10 @@ static __global__ void quantize_q8_1( y[ib].qs[iqs] = q; if (iqs > 0) { - GGML_CUDA_PDL_LC(); return; } y[ib].ds = make_half2(d, sum); - GGML_CUDA_PDL_LC(); } __device__ __forceinline__ uint8_t 
compute_e8m0_scale(float amax) { @@ -175,7 +172,6 @@ static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x, // Store 2 scales packed into 1 uint32 y[ib].d4[quad_idx_in_block] = (scales[1] << 8) | scales[0]; } - GGML_CUDA_PDL_LC(); } template @@ -275,7 +271,6 @@ static __global__ void quantize_mmq_q8_1( } else { y[ib].d4[iqs/32] = d; } - GGML_CUDA_PDL_LC(); } void quantize_row_q8_1_cuda( diff --git a/ggml/src/ggml-cuda/reduce_rows.cuh b/ggml/src/ggml-cuda/reduce_rows.cuh index a285a23b8e2..0627beb89f4 100644 --- a/ggml/src/ggml-cuda/reduce_rows.cuh +++ b/ggml/src/ggml-cuda/reduce_rows.cuh @@ -34,10 +34,8 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r sum = block_reduce(sum, shared_vals); if (col != 0) { - GGML_CUDA_PDL_LC(); return; } dst[row] = norm ? sum / ncols : sum; - GGML_CUDA_PDL_LC(); } diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 0aa18a3aff3..8c84f26e694 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -137,7 +137,6 @@ static __global__ void rope_neox(const T * x, const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne00) { - GGML_CUDA_PDL_LC(); return; } @@ -162,7 +161,6 @@ static __global__ void rope_neox(const T * x, dst[idst + i0 / 2 + 0] = ggml_cuda_cast(x[ix + i0 / 2 + 0]); dst[idst + i0 / 2 + 1] = ggml_cuda_cast(x[ix + i0 / 2 + 1]); - GGML_CUDA_PDL_LC(); return; } @@ -180,7 +178,6 @@ static __global__ void rope_neox(const T * x, dst[idst + 0] = ggml_cuda_cast(x0 * cos_theta - x1 * sin_theta); dst[idst + n_dims / 2] = ggml_cuda_cast(x0 * sin_theta + x1 * cos_theta); - GGML_CUDA_PDL_LC(); } template @@ -208,7 +205,6 @@ static __global__ void rope_multi(const T * x, const int i0 = 2 * (blockDim.y * blockIdx.y + threadIdx.y); if (i0 >= ne00) { - GGML_CUDA_PDL_LC(); return; } @@ -226,7 +222,6 @@ static __global__ void rope_multi(const T * x, dst[idst + i0/2 + 0] = x[ix + i0/2 + 0]; dst[idst + i0/2 + 1] = x[ix + i0/2 + 1]; - GGML_CUDA_PDL_LC(); return; } @@ -269,7 +264,6 @@ static __global__ void rope_multi(const T * x, dst[idst + 0] = x0*cos_theta - x1*sin_theta; dst[idst + n_dims/2] = x0*sin_theta + x1*cos_theta; - GGML_CUDA_PDL_LC(); } template @@ -296,7 +290,6 @@ static __global__ void rope_vision(const T * x, const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne00) { - GGML_CUDA_PDL_LC(); return; } @@ -335,7 +328,6 @@ static __global__ void rope_vision(const T * x, dst[idst + 0] = x0*cos_theta - x1*sin_theta; dst[idst + n_dims] = x0*sin_theta + x1*cos_theta; - GGML_CUDA_PDL_LC(); } template diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu index bdc7c69dba9..4f66522c700 100644 --- a/ggml/src/ggml-cuda/scale.cu +++ b/ggml/src/ggml-cuda/scale.cu @@ -10,7 +10,6 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale for (int64_t i = tid; i < nelements; i += stride) { dst[i] = scale * x[i] + bias; } - GGML_CUDA_PDL_LC(); } static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) { diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu index b614f1fe57f..6bc022a39df 100644 --- a/ggml/src/ggml-cuda/set-rows.cu +++ b/ggml/src/ggml-cuda/set-rows.cu @@ -30,7 +30,6 @@ static __global__ void k_set_rows_quant(const float * __restrict__ src0, const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x; if (i >= ne_total) { - GGML_CUDA_PDL_LC(); return; } @@ -64,7 +63,6 @@ static __global__ void 
k_set_rows_quant(const float * __restrict__ src0, block_type * dst_block = dst_row_ptr + i00 / qk; quantize_func(src_block, dst_block); - GGML_CUDA_PDL_LC(); GGML_UNUSED(ne10); GGML_UNUSED(ne11); @@ -138,7 +136,6 @@ static __global__ void k_set_rows(const src_t * __restrict__ src0, const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x; if (i >= ne_total) { - GGML_CUDA_PDL_LC(); return; } @@ -168,7 +165,6 @@ static __global__ void k_set_rows(const src_t * __restrict__ src0, dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3; dst_row_ptr[i00] = ggml_cuda_cast(src0_row[i00]); - GGML_CUDA_PDL_LC(); GGML_UNUSED(ne10); GGML_UNUSED(ne11); diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu index 54c4445b175..9c411ef380f 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cu +++ b/ggml/src/ggml-cuda/ssm-conv.cu @@ -44,7 +44,6 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float } y_block[i * stride_y + tid] = sumf; } - GGML_CUDA_PDL_LC(); } template diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu index e142c478feb..27ee7702537 100644 --- a/ggml/src/ggml-cuda/topk-moe.cu +++ b/ggml/src/ggml-cuda/topk-moe.cu @@ -88,7 +88,6 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * const topk_moe_config config) { const int row = blockIdx.x * blockDim.y + threadIdx.y; if (row >= n_rows) { - GGML_CUDA_PDL_LC(); return; } @@ -238,7 +237,6 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float * weights[idx] = output_weights[i] * scale_val; } } - GGML_CUDA_PDL_LC(); } template diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index abe5c15b208..517649c5dde 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -114,13 +114,11 @@ static __global__ void unary_op_kernel(const T * x, T * dst, const int k) { const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { - GGML_CUDA_PDL_LC(); return; } GGML_CUDA_PDL_SYNC(); dst[i] = (T)op((float)x[i]); - GGML_CUDA_PDL_LC(); } template From 23a24c5012621d12a998010b8f34d0f5296f24c3 Mon Sep 17 00:00:00 2001 From: aendk Date: Wed, 25 Mar 2026 17:41:51 +0100 Subject: [PATCH 20/24] Added LC according to internal guidance and tested kernel performance. --- ggml/src/ggml-cuda/binbcast.cu | 2 ++ ggml/src/ggml-cuda/fattn-common.cuh | 2 ++ ggml/src/ggml-cuda/fattn-vec.cuh | 2 ++ ggml/src/ggml-cuda/mmvf.cu | 2 ++ ggml/src/ggml-cuda/mmvq.cu | 2 ++ ggml/src/ggml-cuda/norm.cu | 2 ++ ggml/src/ggml-cuda/quantize.cu | 3 +++ ggml/src/ggml-cuda/rope.cu | 2 ++ ggml/src/ggml-cuda/set-rows.cu | 2 ++ ggml/src/ggml-cuda/topk-moe.cu | 3 +++ 10 files changed, 22 insertions(+) diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index 86ac9955ad9..085d78e785f 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -55,6 +55,7 @@ static __global__ void k_bin_bcast(const src0_t * src0, const int s12, const int s13, src1_ptrs... src1s) { + GGML_CUDA_PDL_LC(); // BINBCAST try 1; 352.28, 352.62, 352.17, 351.96 on maxq const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x; const uint32_t i1 = (blockDim.y * blockIdx.y + threadIdx.y); const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3); @@ -76,6 +77,7 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src0_t * src0_row = src0 ? 
(src0 + i_src0) : nullptr; dst_t * dst_row = dst + i_dst; + // GGML_CUDA_PDL_LC(); // BINBCAST try 2; 352.44 352.42, 352.05 on maxq for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) { const uint32_t i10 = fastmodulo(i0, ne10); diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index c922c0f68a0..dadc5e34961 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -782,6 +782,7 @@ static __global__ void flash_attn_combine_results( const float2 * __restrict__ VKQ_meta, float * __restrict__ dst, const int parallel_blocks) { + GGML_CUDA_PDL_LC(); // FATTN_COMBINE_RESULTS try 1; on maxq // Dimension 0: threadIdx.x // Dimension 1: blockIdx.x // Dimension 2: blockIdx.y @@ -810,6 +811,7 @@ static __global__ void flash_attn_combine_results( ((float *) meta)[i] = ((const float *)VKQ_meta) [i]; } + // GGML_CUDA_PDL_LC(); // FATTN_COMBINE_RESULTS try 2; on maxq __syncthreads(); float kqmax = meta[0].x; diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh index a544d95221c..e4e467eef31 100644 --- a/ggml/src/ggml-cuda/fattn-vec.cuh +++ b/ggml/src/ggml-cuda/fattn-vec.cuh @@ -40,6 +40,7 @@ static __global__ void flash_attn_ext_vec( const int32_t nb21, const int32_t nb22, const int64_t nb23, const int32_t ne31, const int32_t ne32, const int32_t ne33, const int32_t nb31, const int32_t nb32, const int64_t nb33) { + GGML_CUDA_PDL_LC(); // FATTN_VEC try 1; on maxq #ifdef FLASH_ATTN_AVAILABLE // Skip unused kernel variants for faster compilation: @@ -138,6 +139,7 @@ static __global__ void flash_attn_ext_vec( float2 Q_ds[ncols][1 > D/(sizeof(int)*nthreads_KQ) ? 1 : D/(sizeof(int)*nthreads_KQ)]; GGML_CUDA_PDL_SYNC(); + // GGML_CUDA_PDL_LC(); // FATTN_VEC try 2; on maxq if constexpr (Q_q8_1) { #pragma unroll for (int j0 = 0; j0 < ncols; j0 += nwarps) { diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index 84625a462f2..7af807722df 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -11,6 +11,7 @@ static __global__ void mul_mat_vec_f( const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, const int ids_stride) { + // GGML_CUDA_PDL_LC(); // MMVF try 1; on maxq const int row = blockIdx.x; // for MUL_MAT_ID - blockIdx.y = n_expert_used, blockIdx.z = ncols_dst (tokens) const int channel_dst = blockIdx.y; @@ -299,6 +300,7 @@ static __global__ void mul_mat_vec_f( static_assert(std::is_same_v, "unsupported type"); } + GGML_CUDA_PDL_LC(); // MMVF try 2; on maxq #pragma unroll for (int j = 0; j < ncols_dst; ++j) { sumf[j] = warp_reduce_sum(sumf[j]); diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index de05b527741..50634fb6c8d 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -203,6 +203,7 @@ static __global__ void mul_mat_vec_q( const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst, const uint32_t ids_stride) { + // GGML_CUDA_PDL_LC(); // MMVQ try 1; on maxq constexpr int qk = ggml_cuda_type_traits::qk; constexpr int qi = ggml_cuda_type_traits::qi; constexpr int vdr = get_vdr_mmvq(type); @@ -353,6 +354,7 @@ static __global__ void mul_mat_vec_q( dst += token_idx*stride_col_dst; } + // GGML_CUDA_PDL_LC(); // MMVQ try 2; on maxq // sum up partial sums and write back result #pragma unroll for (int j = 0; j < ncols_dst; ++j) 
{ diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu index c5d427b6fcb..97287ab8e1c 100644 --- a/ggml/src/ggml-cuda/norm.cu +++ b/ggml/src/ggml-cuda/norm.cu @@ -97,6 +97,7 @@ static __global__ void rms_norm_f32(const float * x, const uint3 add_nrows_packed = make_uint3(0, 0, 0), const uint3 add_nchannels_packed = make_uint3(0, 0, 0), const uint3 add_nsamples_packed = make_uint3(0, 0, 0)) { + GGML_CUDA_PDL_LC(); // RMS_NORM try 1; on maxq const int nrows = gridDim.x; const int nchannels = gridDim.y; @@ -131,6 +132,7 @@ static __global__ void rms_norm_f32(const float * x, const float xi = x[col]; tmp += xi * xi; } + // GGML_CUDA_PDL_LC(); // RMS_NORM try 2; on maxq // sum up partial sums extern __shared__ float s_sum[]; diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 9480c98cd6c..d8dadd1bc10 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -6,6 +6,8 @@ static __global__ void quantize_q8_1( const float * __restrict__ x, void * __restrict__ vy, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, const int64_t ne0, const uint32_t ne1, const uint3 ne2) { + // aendk test, baseline is at 345.57 on maxq + GGML_CUDA_PDL_LC(); // Try 1; 349.76 on maxq const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; if (i0 >= ne0) { @@ -28,6 +30,7 @@ static __global__ void quantize_q8_1( const int64_t ib = i_cont / QK8_1; // block index const int64_t iqs = i_cont % QK8_1; // quant index + // GGML_CUDA_PDL_LC(); // Try 2; 348.48 on maxq; NSYS: Takes even longer, more contention? GGML_CUDA_PDL_SYNC(); const float xi = i0 < ne00 ? x[i03*s03 + i02*s02 + i01*s01 + i00] : 0.0f; float amax = fabsf(xi); diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index 8c84f26e694..4113aaebeb5 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -134,6 +134,7 @@ static __global__ void rope_neox(const T * x, const float * freq_factors, const int64_t * row_indices, const int set_rows_stride) { + GGML_CUDA_PDL_LC(); // ROPE_NEOX try 1; on maxq const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); if (i0 >= ne00) { @@ -149,6 +150,7 @@ static __global__ void rope_neox(const T * x, int idst = i0 / 2 + i1 * s1 + i2 * s2 + i3 * s3; const int ix = i0 / 2 + i1 * s01 + i2 * s02 + i3 * s03; GGML_CUDA_PDL_SYNC(); // guards x, dst, pos, freq_factors, row_indices data access + // GGML_CUDA_PDL_LC(); // ROPE_NEOX try 2; on maxq // Fusion optimization: ROPE + VIEW + SET_ROWS. // The rope output is viewed as a 1D tensor and offset based on a row index in row_indices. 
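
Note on the placement experiments in this patch: GGML_CUDA_PDL_SYNC() has to execute before the first read of data written by the preceding kernel, while GGML_CUDA_PDL_LC() should fire once a thread will issue no further global writes, so the "try N" comments bracket the legal window for the launch-completion trigger and the recorded throughput numbers pick the winner empirically. A minimal sketch of the pattern being benchmarked, using a made-up elementwise kernel (toy_axpy) rather than any kernel from this series:

    static __global__ void toy_axpy(const float * x, float * dst, const float a, const int n) {
        // Index math depends only on kernel arguments, so it can overlap with
        // the tail end of the upstream kernel under PDL.
        const int i = blockDim.x*blockIdx.x + threadIdx.x;
        if (i >= n) {
            return; // threads that exit also count towards programmatic completion
        }
        GGML_CUDA_PDL_SYNC(); // upstream kernel must have finished writing x
        dst[i] = a*x[i];
        GGML_CUDA_PDL_LC();   // last global write issued; the successor may launch
    }

Triggering the launch completion earlier widens the overlap window but can also add contention, as the quantize_q8_1 NSYS note above observed, which is why each placement is measured rather than assumed.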
diff --git a/ggml/src/ggml-cuda/set-rows.cu b/ggml/src/ggml-cuda/set-rows.cu
index 6bc022a39df..4fa059e54e2 100644
--- a/ggml/src/ggml-cuda/set-rows.cu
+++ b/ggml/src/ggml-cuda/set-rows.cu
@@ -133,6 +133,7 @@ static __global__ void k_set_rows(const src_t * __restrict__ src0,
     const uint3 ne02,
     const uint3 ne11_fd,
     const uint3 ne12_fd) {
+    // GGML_CUDA_PDL_LC(); // SET_ROWS try 1; on maxq
     const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
 
     if (i >= ne_total) {
@@ -160,6 +161,7 @@ static __global__ void k_set_rows(const src_t * __restrict__ src0,
     GGML_CUDA_PDL_SYNC();
     const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
 
+    GGML_CUDA_PDL_LC(); // SET_ROWS try 2; on maxq
     const src_t * src0_row = src0 + i01*s01 + i02*s02 + i03*s03;
     dst_t * dst_row_ptr = dst + dst_row*s1 + i02*s2 + i03*s3;
diff --git a/ggml/src/ggml-cuda/topk-moe.cu b/ggml/src/ggml-cuda/topk-moe.cu
index 1940990dfa2..e312a60deef 100644
--- a/ggml/src/ggml-cuda/topk-moe.cu
+++ b/ggml/src/ggml-cuda/topk-moe.cu
@@ -86,6 +86,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
     const float clamp_val,
     const float scale_val,
     const topk_moe_config config) {
+    // GGML_CUDA_PDL_LC(); // TOPK_MOE try 1; on maxq
     const int row = blockIdx.x * blockDim.y + threadIdx.y;
     if (row >= n_rows) {
         return;
     }
@@ -159,6 +160,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
         output_weights[i] = 0.f;
     }
 
+    GGML_CUDA_PDL_LC(); // TOPK_MOE try 2; on maxq
     for (int k = 0; k < n_expert_used; k++) {
         float max_val = wt[0];
         int max_expert = threadIdx.x;
@@ -228,6 +230,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
         }
     }
 
+    // GGML_CUDA_PDL_LC(); // TOPK_MOE try 3; on maxq
     if (config.with_norm) {
         wt_sum = warp_reduce_sum(wt_sum);
         wt_sum = max(wt_sum, clamp_val);

From ef28cdae23c5c2d96c519c4b118ca4d00f575fe5 Mon Sep 17 00:00:00 2001
From: aendk
Date: Thu, 2 Apr 2026 12:22:53 +0200
Subject: [PATCH 21/24] Enrolls missing qwen3.5 kernels passively into PDL.

---
 ggml/src/ggml-cuda/gated_delta_net.cu | 11 +++++++----
 ggml/src/ggml-cuda/getrows.cu         |  1 +
 ggml/src/ggml-cuda/norm.cu            |  6 ++++--
 ggml/src/ggml-cuda/rope.cu            |  6 ++++--
 ggml/src/ggml-cuda/unary.cu           |  5 +++--
 5 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu
index 6b44bec7317..e77cdc677f6 100644
--- a/ggml/src/ggml-cuda/gated_delta_net.cu
+++ b/ggml/src/ggml-cuda/gated_delta_net.cu
@@ -1,4 +1,5 @@
 #include "gated_delta_net.cuh"
+#include "ggml-cuda/common.cuh"
 
 template
 __global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2)
@@ -48,6 +49,7 @@ gated_delta_net_cuda(const float * q,
     float s_shard[rows_per_lane];
 
     // state is stored transposed: M[col][i] = S[i][col], row col is contiguous
+    GGML_CUDA_PDL_SYNC();
 #pragma unroll
     for (int r = 0; r < rows_per_lane; r++) {
         const int i = r * warp_size + lane;
@@ -167,28 +169,29 @@ static void launch_gated_delta_net(
 
     int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
 
+    auto launch_params = ggml_cuda_kernel_launch_params(grid_dims, block_dims, 0, stream);
     switch (S_v) {
         case 16:
-            gated_delta_net_cuda<16, KDA><<<grid_dims, block_dims, 0, stream>>>(
+            ggml_cuda_kernel_launch(gated_delta_net_cuda<16, KDA>, launch_params,
                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs,
                 sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3,
                 neqk1_magic, rq3_magic, scale);
            break;
         case 32:
-            gated_delta_net_cuda<32, KDA><<<grid_dims, block_dims, 0, stream>>>(
+            ggml_cuda_kernel_launch(gated_delta_net_cuda<32, KDA>, launch_params,
                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs,
                 sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3,
                 neqk1_magic, rq3_magic, scale);
            break;
         case 64: {
-            gated_delta_net_cuda<64, KDA><<<grid_dims, block_dims, 0, stream>>>(
+            ggml_cuda_kernel_launch(gated_delta_net_cuda<64, KDA>, launch_params,
                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs,
                 sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3,
                 neqk1_magic, rq3_magic, scale);
            break;
        }
         case 128: {
-            gated_delta_net_cuda<128, KDA><<<grid_dims, block_dims, 0, stream>>>(
+            ggml_cuda_kernel_launch(gated_delta_net_cuda<128, KDA>, launch_params,
                 q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, n_tokens, n_seqs,
                 sq1, sq2, sq3, sv1, sv2, sv3, sb1, sb2, sb3,
                 neqk1_magic, rq3_magic, scale);
diff --git a/ggml/src/ggml-cuda/getrows.cu b/ggml/src/ggml-cuda/getrows.cu
index 0314d8149f3..0a88add574b 100644
--- a/ggml/src/ggml-cuda/getrows.cu
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -48,6 +48,7 @@ static __global__ void k_get_rows_float(
     /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
     const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
 
+    GGML_CUDA_PDL_LC(); // try1
     GGML_CUDA_PDL_SYNC();
     for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
         for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) {
diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu
index 97287ab8e1c..aeeff41ed3c 100644
--- a/ggml/src/ggml-cuda/norm.cu
+++ b/ggml/src/ggml-cuda/norm.cu
@@ -423,10 +425,12 @@ static void l2_norm_f32_cuda(
     const dim3 blocks_num(nrows, nchannels, nsamples);
     if (ncols < 1024) {
         const dim3 block_dims(WARP_SIZE, 1, 1);
-        l2_norm_f32<<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        auto launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, 0, stream};
+        ggml_cuda_kernel_launch(l2_norm_f32, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
     } else {
         const dim3 block_dims(1024, 1, 1);
-        l2_norm_f32<1024><<<blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
+        auto launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float): 0, stream};
+        ggml_cuda_kernel_launch(l2_norm_f32<1024>, launch_params, x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
     }
 }
diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
index 4113aaebeb5..f3c9a9738da 100644
--- a/ggml/src/ggml-cuda/rope.cu
+++ b/ggml/src/ggml-cuda/rope.cu
@@ -449,11 +449,13 @@ static void rope_multi_cuda(const T * x,
     const float theta_scale = powf(freq_base, -2.0f / n_dims);
 
     if (freq_factors == nullptr) {
-        rope_multi<<<block_nums, block_dims, 0, stream>>>(
+        auto launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
+        ggml_cuda_kernel_launch(rope_multi, launch_params,
             x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale,
             ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
     } else {
-        rope_multi<<<block_nums, block_dims, 0, stream>>>(
+        auto launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream);
+        ggml_cuda_kernel_launch(rope_multi, launch_params,
             x, dst, ne00, ne01, ne02, s01, s02, s03, s1, s2, s3, n_dims, pos, freq_scale,
             ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
     }
diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu
index 1ed0ca24129..94411935e34 100644
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -126,7 +126,6 @@ static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
     auto launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream);
     ggml_cuda_kernel_launch(unary_op_kernel, launch_params, x, dst, k);
-    unary_op_kernel<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
 template
@@ -266,13 +265,15 @@ static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst,
     const int64_t j0 = (i / n) * o0 + (i % n);
     const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
 
+    GGML_CUDA_PDL_SYNC();
     dst[i] = (T)(op((float)x[j0]) * (float)g[j1]);
 }
 
 template
 static void unary_gated_cuda(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1, cudaStream_t stream) {
     const int64_t num_blocks = (k + CUDA_GLU_BLOCK_SIZE - 1) / CUDA_GLU_BLOCK_SIZE;
-    unary_gated_op_kernel<<<num_blocks, CUDA_GLU_BLOCK_SIZE, 0, stream>>>(x, g, dst, k, n, o0, o1);
+    auto launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_GLU_BLOCK_SIZE, 0, stream);
+    ggml_cuda_kernel_launch(unary_gated_op_kernel, launch_params, x, g, dst, k, n, o0, o1);
 }
 
 template

From 5e318bfcb76666436c988a1932d86af4295a7b08 Mon Sep 17 00:00:00 2001
From: aendk
Date: Fri, 10 Apr 2026 14:52:09 +0200
Subject: [PATCH 22/24] Kernel optimizations (LC signals) for qwen3.5

---
 ggml/src/ggml-cuda/cpy.cu             | 1 +
 ggml/src/ggml-cuda/gated_delta_net.cu | 1 +
 ggml/src/ggml-cuda/norm.cu            | 4 ++++
 ggml/src/ggml-cuda/rope.cu            | 3 +++
 ggml/src/ggml-cuda/scale.cu           | 1 +
 ggml/src/ggml-cuda/ssm-conv.cu        | 1 +
 ggml/src/ggml-cuda/unary.cu           | 2 ++
 7 files changed, 13 insertions(+)

diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index f7aa048fc46..93d31b7aff4 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -16,6 +16,7 @@ static __global__ void cpy_scalar(const char * cx, char * cdst, const int64_t ne
     const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02,
     const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11,
     const int64_t nb12, const int64_t nb13) {
+    GGML_CUDA_PDL_LC(); // try 1
     const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= ne) {
diff --git a/ggml/src/ggml-cuda/gated_delta_net.cu b/ggml/src/ggml-cuda/gated_delta_net.cu
index e77cdc677f6..017f33d6be5 100644
--- a/ggml/src/ggml-cuda/gated_delta_net.cu
+++ b/ggml/src/ggml-cuda/gated_delta_net.cu
@@ -25,6 +25,7 @@ gated_delta_net_cuda(const float * q,
     const uint3 neqk1_magic,
     const uint3 rq3_magic,
     float scale) {
+    // GGML_CUDA_PDL_LC(); // GATED_DELTA_NET try 1; always followed by memcpy on qwen3.5, no benefit
     const uint32_t h_idx = blockIdx.x;
     const uint32_t sequence = blockIdx.y;
     // each warp owns one column, using warp-level primitives to reduce across rows
diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu
index aeeff41ed3c..60b928a3d8f 100644
--- a/ggml/src/ggml-cuda/norm.cu
+++ b/ggml/src/ggml-cuda/norm.cu
@@ -246,6 +246,7 @@ template
 static __global__ void l2_norm_f32(
         const float * x, float * dst, const int ncols, const int64_t stride_row, const int64_t stride_channel,
         const int64_t stride_sample, const float eps) {
+    // GGML_CUDA_PDL_LC(); // L2_NORM try 1; on maxq
     const int nrows = gridDim.x;
     const int nchannels = gridDim.y;
@@ -259,15 +260,18 @@ static __global__ void l2_norm_f32(
     float tmp = 0.0f; // partial sum for thread in warp
 
+    // GGML_CUDA_PDL_LC(); // L2_NORM try 2; on maxq
     GGML_CUDA_PDL_SYNC(); // needs to guard data access (except pointer arithmetic) for x, dst.
for (int col = tid; col < ncols; col += block_size) { const float xi = x[col]; tmp += xi * xi; } + // GGML_CUDA_PDL_LC(); // L2_NORM try 3; on maxq // sum up partial sums extern __shared__ float s_sum[]; tmp = block_reduce(tmp, s_sum); + GGML_CUDA_PDL_LC(); // L2_NORM try 4; on maxq // from https://pytorch.org/docs/stable/generated/torch.nn.functional.normalize.html const float scale = rsqrtf(fmaxf(tmp, eps * eps)); diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu index f3c9a9738da..57dec55e340 100644 --- a/ggml/src/ggml-cuda/rope.cu +++ b/ggml/src/ggml-cuda/rope.cu @@ -204,6 +204,7 @@ static __global__ void rope_multi(const T * x, const float * freq_factors, const mrope_sections sections, const bool is_imrope) { + // GGML_CUDA_PDL_LC(); // ROPE_MULTI try 1; on maxq const int i0 = 2 * (blockDim.y * blockIdx.y + threadIdx.y); if (i0 >= ne00) { @@ -253,6 +254,7 @@ static __global__ void rope_multi(const T * x, theta_base = pos[i2 + ne02 * 3] * powf(theta_scale, i0 / 2.0f); } } + // GGML_CUDA_PDL_LC(); // ROPE_MULTI try 2; on maxq const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; @@ -261,6 +263,7 @@ static __global__ void rope_multi(const T * x, rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, cos_theta, sin_theta); + // GGML_CUDA_PDL_LC(); // ROPE_MULTI try 3; on maxq const float x0 = x[ix + 0]; const float x1 = x[ix + n_dims/2]; diff --git a/ggml/src/ggml-cuda/scale.cu b/ggml/src/ggml-cuda/scale.cu index 4f66522c700..e21e5f36038 100644 --- a/ggml/src/ggml-cuda/scale.cu +++ b/ggml/src/ggml-cuda/scale.cu @@ -3,6 +3,7 @@ #define MAX_GRIDDIM_X 0x7FFFFFFF static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) { + GGML_CUDA_PDL_LC(); // SCALE try 1; on maxq int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x; int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x; diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu index 060becc50a2..91059d1bc49 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cu +++ b/ggml/src/ggml-cuda/ssm-conv.cu @@ -7,6 +7,7 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1, float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2, const int64_t n_t) { + GGML_CUDA_PDL_LC(); // SSM_CONV try 1; on maxq GGML_UNUSED(src0_nb0); const int tid = threadIdx.x; const int bidx = blockIdx.x; diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 94411935e34..cbe4a9bd533 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -111,6 +111,7 @@ static __device__ __forceinline__ float op_trunc(float x) { template static __global__ void unary_op_kernel(const T * x, T * dst, const int k) { + GGML_CUDA_PDL_LC(); // try 1 const int i = blockDim.x*blockIdx.x + threadIdx.x; if (i >= k) { @@ -255,6 +256,7 @@ void ggml_cuda_op_softplus(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { template static __global__ void unary_gated_op_kernel(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1) { + GGML_CUDA_PDL_LC(); // try 1 const int64_t i = int64_t(blockDim.x)*blockIdx.x + threadIdx.x; if (i >= k) { From f3b866536ec2c44526e7a5d24dbfd66556cd5735 Mon Sep 17 00:00:00 2001 From: aendk Date: Fri, 10 Apr 2026 16:30:27 +0200 Subject: [PATCH 23/24] Enrolls ssm-scan kernels into PDL --- 
 ggml/src/ggml-cuda/ssm-scan.cu | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/ggml/src/ggml-cuda/ssm-scan.cu b/ggml/src/ggml-cuda/ssm-scan.cu
index c1d4e2bc8df..0f43d366ed0 100644
--- a/ggml/src/ggml-cuda/ssm-scan.cu
+++ b/ggml/src/ggml-cuda/ssm-scan.cu
@@ -26,6 +26,7 @@ __global__ void __launch_bounds__(splitD, 1)
     const int64_t s_off, const int64_t d_inner, const int64_t L_param) {
     const size_t L = L_template == 0 ? L_param : L_template;
+    GGML_CUDA_PDL_SYNC();
     const float *s0_block = (const float *)((const char *)src0 + src6[blockIdx.x] * src0_nb3 + blockIdx.y * splitD * src0_nb2);
     const float *x_block = (const float *)((const char *)src1 + (blockIdx.x * src1_nb3) + blockIdx.y * splitD * sizeof(float));
     const float *dt_block = (const float *)((const char *)src2 + (blockIdx.x * src2_nb2) + blockIdx.y * splitD * sizeof(float));
@@ -134,7 +135,7 @@ __global__ void __launch_bounds__(d_state, 1)
     const int seq_idx = blockIdx.y;
 
     const int group_off = (head_idx / (n_head / n_group)) * d_state * sizeof(float);
-
+    GGML_CUDA_PDL_SYNC(); // src6 ptr dereference
     // TODO: refactor strides to be in elements/floats instead of bytes to be cleaner and consistent with the rest of the codebase
     const float * s0_warp = (const float *) ((const char *) src0 + src6[seq_idx] * src0_nb3 + head_idx * src0_nb2 + head_off * d_state);
     const float * x_warp = (const float *) ((const char *) src1 + (seq_idx * src1_nb3) + (warp_idx * sizeof(float)));
@@ -206,7 +207,8 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
         constexpr int num_warps = threads/WARP_SIZE;
         const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1);
 
-        ssm_scan_f32_group<128/WARP_SIZE, 128><<<blocks, threads, 0, stream>>>(
+        auto launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream);
+        ggml_cuda_kernel_launch(ssm_scan_f32_group<128/WARP_SIZE, 128>, launch_params,
             src0, src1, src2, src3, src4, src5, src6, dst,
             src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
             src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
@@ -215,7 +217,8 @@
         constexpr int num_warps = threads/WARP_SIZE;
         const dim3 blocks((n_head * head_dim + (num_warps - 1)) / num_warps, n_seq, 1);
 
-        ssm_scan_f32_group<256/WARP_SIZE, 256><<<blocks, threads, 0, stream>>>(
+        auto launch_params = ggml_cuda_kernel_launch_params(blocks, threads, 0, stream);
+        ggml_cuda_kernel_launch(ssm_scan_f32_group<256/WARP_SIZE, 256>, launch_params,
             src0, src1, src2, src3, src4, src5, src6, dst,
             src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
             src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, head_dim, n_group, n_tok);
@@ -231,58 +234,59 @@
     const dim3 blocks(n_seq, (n_head + threads - 1) / threads, 1);
     const int smem_size = (threads * (d_state + 1) * 2) * sizeof(float);
     if (d_state == 16) {
+        auto launch_params = ggml_cuda_kernel_launch_params(blocks, threads, smem_size, stream);
         switch (n_tok) {
             case 1:
-                ssm_scan_f32<<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                     src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                     src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             case 2:
-                ssm_scan_f32<<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                     src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                     src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             case 3:
-                ssm_scan_f32<<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                     src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                     src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             case 4:
-                ssm_scan_f32<<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                     src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                     src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             case 5:
-                ssm_scan_f32<<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                     src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                     src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             case 6:
-                ssm_scan_f32<<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                     src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                     src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             case 7:
-                ssm_scan_f32<<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                     src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                     src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             case 8:
-                ssm_scan_f32<<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                     src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                     src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
                 break;
             default:
-                ssm_scan_f32<<<blocks, threads, smem_size, stream>>>(
+                ggml_cuda_kernel_launch(ssm_scan_f32, launch_params,
                     src0, src1, src2, src3, src4, src5, src6, dst,
                     src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2, src3_nb1,
                     src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);

From 83e3c79ba300e5c19d09a3c41ee9c82e7fcf59d3 Mon Sep 17 00:00:00 2001
From: aendk
Date: Wed, 29 Apr 2026 15:45:37 +0200
Subject: [PATCH 24/24] Adds GGML_CUDA_PDL command line option to toggle PDL.

---
 ggml/CMakeLists.txt               |  1 +
 ggml/src/ggml-cuda/CMakeLists.txt |  4 ++++
 ggml/src/ggml-cuda/common.cuh     | 12 ++----------
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
index b9f7deb150d..5a44a108232 100644
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -207,6 +207,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
 option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
 option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
 option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
+option(GGML_CUDA_PDL "ggml: use Programmatic Dependent Launch (NVIDIA CC >= 9.0)" OFF)
 option(GGML_CUDA_NCCL "ggml: use NVIDIA Collective Comm.
Library" ON) set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING "ggml: cuda link binary compression mode; requires cuda 12.8+") diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index b54d4a6b107..fa14646b409 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -134,6 +134,10 @@ if (CUDAToolkit_FOUND) add_compile_definitions(GGML_CUDA_USE_GRAPHS) endif() + if (GGML_CUDA_PDL) + add_compile_definitions(GGML_CUDA_USE_PDL) + endif() + if (GGML_CUDA_FORCE_MMQ) add_compile_definitions(GGML_CUDA_FORCE_MMQ) endif() diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index de692a3c61e..3b870786e93 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -109,20 +109,12 @@ # define GGML_CUDA_USE_CUB #endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070 -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) -# define GGML_CUDA_USE_PDL -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) __CUDA_ARCH__ >= GGML_CUDA_CC_HOPPER - #if defined(GGML_CUDA_USE_PDL) # define GGML_CUDA_PDL_SYNC() cudaGridDependencySynchronize() -#else -# define GGML_CUDA_PDL_SYNC() // no-op on HIP/MUSA -#endif - -#if defined(GGML_CUDA_USE_PDL) # define GGML_CUDA_PDL_LC() cudaTriggerProgrammaticLaunchCompletion() #else -# define GGML_CUDA_PDL_LC() // no-op on HIP/MUSA +# define GGML_CUDA_PDL_SYNC() // no-op when PDL disabled on HIP/MUSA/pre-Hopper +# define GGML_CUDA_PDL_LC() #endif #ifdef __CUDA_ARCH_LIST__