Address Copilot review comments on PR #28484

hariharans29 · hariharans29 · commit 87e964481622 · 2026-05-19T23:55:25.000-07:00
- [WebGPU] Validate q/k_norm_weight is 1-D of length head_size in the GQA kernel so a hand-authored model with the wrong shape fails with INVALID_ARGUMENT instead of reading wrong offsets.

- [Optimizer] Require SimplifiedLayerNormalization input/scale/output element types to match before fusing, since the fused GQA input slots reuse the projection's element type (T) and a mixed-type SLN would change the node's type constraints.

- [JSEP] Reject the GQA node when q_norm_weight or k_norm_weight is present regardless of rank (including scalars), instead of only checking dims.length > 0.
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts b/js/web/lib/wasm/jsep/webgpu/ops/group-query-attention.ts
@@ -331,11 +331,8 @@ export const groupQueryAttention = (context: ComputeContext, attributes: GroupQu
   // q_norm_weight (input 14) / k_norm_weight (input 15) are emitted by the WebGPU-only
   // GroupQueryAttentionPreNormFusion optimizer pass. JSEP does not implement the fused
   // per-head Q/K RMS normalization prologue, so reject the node if either input is present
-  // rather than silently dropping the normalization.
-  if (
-    (context.inputs.length > 14 && context.inputs[14] && context.inputs[14].dims.length > 0) ||
-    (context.inputs.length > 15 && context.inputs[15] && context.inputs[15].dims.length > 0)
-  ) {
+  // (regardless of rank, including scalars) rather than silently dropping the normalization.
+  if ((context.inputs.length > 14 && context.inputs[14]) || (context.inputs.length > 15 && context.inputs[15])) {
     throw new Error(
       'GroupQueryAttention (JSEP): q_norm_weight / k_norm_weight inputs are not supported. ' +
         'The per-head Q/K RMS normalization prologue is implemented only on the native WebGPU EP.',
diff --git a/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc b/onnxruntime/contrib_ops/webgpu/bert/group_query_attention.cc
@@ -261,6 +261,21 @@ Status GroupQueryAttention::ComputeInternal(onnxruntime::webgpu::ComputeContext&
   WebgpuAttentionParameters parameters(params);
   ORT_RETURN_IF(has_qk_norm && parameters.is_packed_qkv_,
                 "GroupQueryAttention: q_norm_weight / k_norm_weight are not supported when QKV is packed.");
+  if (has_qk_norm) {
+    // The fused prologue indexes q/k_norm_weight as a 1-D tensor of length head_size. Validate
+    // shape here so a hand-authored model with a wrong shape fails with INVALID_ARGUMENT instead
+    // of silently reading the wrong offsets (or out of bounds).
+    const auto& q_norm_shape = q_norm_weight->Shape();
+    ORT_RETURN_IF_NOT(q_norm_shape.NumDimensions() == 1 &&
+                          q_norm_shape[0] == static_cast<int64_t>(parameters.head_size_),
+                      "GroupQueryAttention: q_norm_weight must be a 1-D tensor of shape [head_size=",
+                      parameters.head_size_, "], got ", q_norm_shape.ToString(), ".");
+    const auto& k_norm_shape = k_norm_weight->Shape();
+    ORT_RETURN_IF_NOT(k_norm_shape.NumDimensions() == 1 &&
+                          k_norm_shape[0] == static_cast<int64_t>(parameters.head_size_),
+                      "GroupQueryAttention: k_norm_weight must be a 1-D tensor of shape [head_size=",
+                      parameters.head_size_, "], got ", k_norm_shape.ToString(), ".");
+  }
   TensorShapeVector output_shape(3);
   output_shape[0] = static_cast<int64_t>(parameters.batch_size_);
   output_shape[1] = static_cast<int64_t>(parameters.sequence_length_);
diff --git a/onnxruntime/core/optimizer/group_query_attention_pre_norm_fusion.cc b/onnxruntime/core/optimizer/group_query_attention_pre_norm_fusion.cc
@@ -124,6 +124,31 @@ bool MatchPreNormReshapeChain(Graph& graph,
     return false;
   }
 
+  // SimplifiedLayerNormalization permits its input (T), scale (V) and output (T) to use different
+  // element types. The fused GroupQueryAttention input slots reuse the projection's element type
+  // (T), so we can only fuse when scale and output also use T -- otherwise the rewrite would
+  // change the node's type constraints and produce a semantically different graph. Require all
+  // three to match before fusing.
+  auto get_elem_type = [](const NodeArg* arg) -> int32_t {
+    if (arg == nullptr) {
+      return ONNX_NAMESPACE::TensorProto::UNDEFINED;
+    }
+    const auto* type_proto = arg->TypeAsProto();
+    if (type_proto == nullptr || !type_proto->has_tensor_type() ||
+        !type_proto->tensor_type().has_elem_type()) {
+      return ONNX_NAMESPACE::TensorProto::UNDEFINED;
+    }
+    return type_proto->tensor_type().elem_type();
+  };
+  const int32_t sln_input_elem_type = get_elem_type(sln->InputDefs()[0]);
+  const int32_t sln_scale_elem_type = get_elem_type(sln->InputDefs()[1]);
+  const int32_t sln_output_elem_type = get_elem_type(sln->OutputDefs()[0]);
+  if (sln_input_elem_type == ONNX_NAMESPACE::TensorProto::UNDEFINED ||
+      sln_input_elem_type != sln_scale_elem_type ||
+      sln_input_elem_type != sln_output_elem_type) {
+    return false;
+  }
+
   // Norm weight must be an initializer of shape [head_size].
   NodeArg* norm_weight_arg = sln->MutableInputDefs()[1];
   const ONNX_NAMESPACE::TensorProto* norm_weight_tensor =