webgpu: Optimize DP4A SmallM MatMulNBits tiling (#27910)

qjia7 · web-flow · commit f22e3a997eb2 · 2026-03-31T14:29:42.000-07:00
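This pull request adjusts the tiling strategy for small matrix sizes (small M)
in the DP4A matmul kernel. The tile sizes previously used only on Intel GPUs
(tile_size_k_vec = 32, tile_size_n = 4) become the default for all vendors,
and Qualcomm GPUs now take the DP4A path regardless of M whenever DP4A is
applicable. The changes are aimed at improving performance and compatibility,
especially on these GPU vendors.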

On Qualcomm GPUs, this improves token generation from ~20 tps to ~25 tps.
diff --git a/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/dp4a_matmul_nbits.cc
@@ -128,13 +128,9 @@ Status ApplyDP4AMatrixMatMulNBits(const Tensor* a, const Tensor* b, const Tensor
   const bool has_weight_idx_indirect = weight_index_indirect != nullptr;
   const bool single_scale_weights = (block_size == K * N);
   if (M < min_M_for_tile_optimization) {
-    uint32_t tile_size_k_vec = 16;
-    uint32_t tile_size_n = 32;
+    uint32_t tile_size_k_vec = 32;
+    uint32_t tile_size_n = 4;
 
-    if (context.AdapterInfo().vendor == std::string_view{"intel"}) {
-      tile_size_k_vec = 32;
-      tile_size_n = 4;
-    }
     const uint32_t b_components = (nbits == 2 ? kVec2Components : kVec4Components);
     DP4AMatMulNBitsSmallMProgram mul_program{tile_size_k_vec, tile_size_n, nbits, has_zero_points, has_bias, has_weight_idx, has_weight_idx_indirect, single_scale_weights};
     uint32_t num_N_tile = (N + tile_size_n - 1) / tile_size_n;
diff --git a/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/webgpu/quantization/matmul_nbits.cc
@@ -230,7 +230,7 @@ Status ApplyMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales,
   }
 #endif
 
-  // On FP32 only GPUs, integer math is faster than FP32 therefore always use DP4A independent of length of M.
+  // On FP32 only GPUs and Qualcomm GPUs, integer math is faster than FP32 therefore always use DP4A independent of length of M.
   // DP4A Q2 path now supports custom zero points via a 1024-entry LUT (4 zero-point sections × 256 byte values).
   if ((M >= kMinMForTileOptimization || y->DataType() == DataTypeImpl::GetType<float>() || context.AdapterInfo().vendor == std::string_view{"qualcomm"}) &&
       CanApplyDP4AMatrixMatMulNBits(context, accuracy_level, block_size, N, K, components_a)) {

Original file line number	Diff line number	Diff line change
`@@ -230,7 +230,7 @@ Status ApplyMatMulNBits(const Tensor* a, const Tensor* b, const Tensor* scales,`
`230`	`230`	`}`
`231`	`231`	`#endif`
`232`	`232`
`233`		`- // On FP32 only GPUs, integer math is faster than FP32 therefore always use DP4A independent of length of M.`
	`233`	`+ // On FP32 only GPUs and Qualcomm GPUs, integer math is faster than FP32 therefore always use DP4A independent of length of M.`
`234`	`234`	`// DP4A Q2 path now supports custom zero points via a 1024-entry LUT (4 zero-point sections × 256 byte values).`
`235`	`235`	`if ((M >= kMinMForTileOptimization || y->DataType() == DataTypeImpl::GetType<float>() || context.AdapterInfo().vendor == std::string_view{"qualcomm"}) &&`
`236`	`236`	`CanApplyDP4AMatrixMatMulNBits(context, accuracy_level, block_size, N, K, components_a)) {`