[Test] Add dtype-mismatch rejection test for GQA pre-norm fusion

hariharans29 · hariharans29 · commit cef037de051b · 2026-05-15T13:58:27.000-07:00
Covers the new gate that requires SimplifiedLayerNormalization input/scale/output element types to match before fusing into GroupQueryAttention.
diff --git a/onnxruntime/test/optimizer/group_query_attention_pre_norm_fusion_test.cc b/onnxruntime/test/optimizer/group_query_attention_pre_norm_fusion_test.cc
@@ -56,6 +56,11 @@ struct BuildOptions {
   // If true, pre-populate the GQA node's slot 14 with a q_norm_weight initializer so the
   // optimizer treats the node as already fused and skips it.
   bool pre_fused = false;
+  // If true, build the q_norm_weight initializer as float32 while the SLN input/output
+  // remain MLFloat16. SimplifiedLayerNormalization's schema allows scale (V) to differ
+  // from input/output (T), but the fused GQA op reuses T for the norm-weight slots, so
+  // the optimizer must reject the rewrite.
+  bool mismatched_norm_weight_dtype = false;
 };
 
 void BuildQwenQkPostNormPattern(ModelTestBuilder& builder, const BuildOptions& opts) {
@@ -77,10 +82,13 @@ void BuildQwenQkPostNormPattern(ModelTestBuilder& builder, const BuildOptions& o
   NodeArg* seqlens_k = builder.MakeInput<int32_t>(std::vector<int64_t>{kBatch}, std::vector<int32_t>{0});
   NodeArg* total_seq_len = builder.MakeInput<int32_t>(std::vector<int64_t>{1}, std::vector<int32_t>{1});
 
-  // Norm weight initializers: [head_size]. (Or non-1D when forcing a shape mismatch.)
+  // Norm weight initializers: [head_size]. q_norm_weight may instead be non-1D (to force a
+  // shape mismatch) or float32 (to exercise the dtype-mismatch gate).
   std::vector<int64_t> q_norm_weight_shape =
       opts.break_q_norm_weight_shape ? std::vector<int64_t>{1, kHeadSize} : std::vector<int64_t>{kHeadSize};
-  NodeArg* q_norm_weight = builder.MakeInitializer<MLFloat16>(q_norm_weight_shape, MLFloat16(1.0f), MLFloat16(1.0f));
+  NodeArg* q_norm_weight = opts.mismatched_norm_weight_dtype
+                               ? builder.MakeInitializer<float>(q_norm_weight_shape, 1.0f, 1.0f)
+                               : builder.MakeInitializer<MLFloat16>(q_norm_weight_shape, MLFloat16(1.0f), MLFloat16(1.0f));
   NodeArg* k_norm_weight = builder.MakeInitializer<MLFloat16>({kHeadSize}, MLFloat16(1.0f), MLFloat16(1.0f));
 
   // Reshape "shape" initializers.
@@ -254,6 +262,19 @@ TEST_F(GraphTransformationTests, GroupQueryAttentionPreNormFusionRejectsNon1DNor
       TransformerLevel::Level2, /*steps=*/1, nullptr, CheckUnfusedGraph));
 }
 
+TEST_F(GraphTransformationTests, GroupQueryAttentionPreNormFusionRejectsMismatchedNormWeightDtype) {
+  // SimplifiedLayerNormalization permits its scale (V) to differ from input/output (T),
+  // but the fused GroupQueryAttention op reuses T for its norm-weight inputs. When a
+  // float32 scale feeds a float16 chain, the optimizer must skip the rewrite; fusing
+  // would change the type constraints on the fused node.
+  BuildOptions opts;
+  opts.mismatched_norm_weight_dtype = true;  // q_norm_weight is built as float32; k_norm_weight stays MLFloat16.
+  auto build = [opts](ModelTestBuilder& builder) { BuildQwenQkPostNormPattern(builder, opts); };
+  ASSERT_STATUS_OK(TestGraphTransformer(
+      build, /*opset_version=*/21, *logger_, MakeWebGpuTransformer(),
+      TransformerLevel::Level2, /*steps=*/1, nullptr, CheckUnfusedGraph));
+}
+
 TEST_F(GraphTransformationTests, GroupQueryAttentionPreNormFusionSkipsCpuEp) {
   // Build the pattern but assign all nodes to CPU EP. The fusion is gated to WebGPU only,
   // so the graph must remain unfused.