From e8c90e9c585f6068be1c10e32c1fe17947440bc0 Mon Sep 17 00:00:00 2001 From: yongjunlee Date: Sun, 19 Apr 2026 16:11:28 +0900 Subject: [PATCH 1/6] =?UTF-8?q?Reject=20QDQ=20Gemm=E2=86=92QGemm=20fusion?= =?UTF-8?q?=20when=20alpha=20!=3D=201=20with=20bias=20(#28130)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Gemm→QGemm QDQ fusion selector only validated beta == 1, letting Gemms with alpha != 1 and a bias through. QGemm broadcasts the int32 bias into the accumulator before applying the alpha*sa*sb output scale, so the bias ends up scaled by alpha too — producing incorrect outputs when alpha != 1 (bias == 0 masks the issue). Add an alpha == 1 check alongside the existing beta == 1 check in GemmNodeGroupSelector::Check (only when bias is present — without bias the fused path is still exact). Extend QDQTransformerGemmTests and the fastmath variant with an alpha_not_one parameter so the regression is covered. Follow-up tracked in the issue: absorb alpha into the int32 bias in GemmReplaceWithQuant so alpha != 1 cases can keep the fusion. --- .../selectors_actions/qdq_selectors.cc | 7 +++++++ .../optimizer/qdq_transformer_fastmath_test.cc | 14 ++++++++++++-- onnxruntime/test/optimizer/qdq_transformer_test.cc | 14 ++++++++++++-- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc index 2e9d46656b514..dcfad53c47e4b 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc @@ -835,6 +835,13 @@ bool GemmNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& n return true; } + // When bias is present, QGemm folds bias into the int32 accumulator before + // applying the alpha*sa*sb output scale, which would incorrectly scale the + // bias by alpha. Require alpha==1 and beta==1 so the fused path is exact. + if (node.GetAttributes().at("alpha").f() != 1.0) { + return false; + } + if (node.GetAttributes().at("beta").f() != 1.0) { // beta needs to be 1.0 return false; } diff --git a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc index 55f1d212a8034..bb319b785218e 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc @@ -323,7 +323,8 @@ TEST(QDQTransformerTests, MatMul_S8S8U8_DisableFastMath) { } template -void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one = false, bool disable_fastmath = false) { +void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one = false, + bool disable_fastmath = false, bool alpha_not_one = false) { auto test_case = [&](const std::vector& input1_shape, const std::vector& input2_shape, bool use_contrib_qdq = false) { auto build_test_case = [&](ModelTestBuilder& builder) { @@ -396,12 +397,17 @@ void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one if (beta_not_one) { gemm_node->AddAttribute("beta", 2.0f); } + + if (alpha_not_one) { + gemm_node->AddAttribute("alpha", 2.0f); + } }; auto check_binary_op_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); - if ((!has_output_q || std::is_same_v) && (!has_bias || (std::is_same_v && !beta_not_one)) && + if ((!has_output_q || std::is_same_v) && + (!has_bias || (std::is_same_v && !beta_not_one && !alpha_not_one)) && (std::is_same_v || std::is_same_v)) { EXPECT_EQ(op_to_count["com.microsoft.QGemm"], 1); EXPECT_EQ(op_to_count["Gemm"], 0); @@ -490,6 +496,10 @@ void QDQTransformerGemmTests() { QDQTransformerGemmTests(false, true, true); QDQTransformerGemmTests(true, false, true); QDQTransformerGemmTests(true, true, true); + QDQTransformerGemmTests(false, false, false, false, true); + QDQTransformerGemmTests(false, true, false, false, true); + QDQTransformerGemmTests(true, false, false, false, true); + QDQTransformerGemmTests(true, true, false, false, true); // dummy test to disable the fastmath session QDQTransformerGemmTests(true, true, true, true); } diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 85d4c51b9faae..ab2f186487ed8 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -718,7 +718,8 @@ TEST(QDQTransformerTests, MatMul_S8S8U8) { } template -void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one = false) { +void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one = false, + bool alpha_not_one = false) { auto test_case = [&](const std::vector& input1_shape, const std::vector& input2_shape, bool use_contrib_qdq = false) { auto build_test_case = [&](ModelTestBuilder& builder) { @@ -791,12 +792,17 @@ void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one if (beta_not_one) { gemm_node->AddAttribute("beta", 2.0f); } + + if (alpha_not_one) { + gemm_node->AddAttribute("alpha", 2.0f); + } }; auto check_binary_op_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); - if ((!has_output_q || std::is_same_v) && (!has_bias || (std::is_same_v && !beta_not_one)) && + if ((!has_output_q || std::is_same_v) && + (!has_bias || (std::is_same_v && !beta_not_one && !alpha_not_one)) && (std::is_same_v || std::is_same_v)) { EXPECT_EQ(op_to_count["com.microsoft.QGemm"], 1); EXPECT_EQ(op_to_count["Gemm"], 0); @@ -860,6 +866,10 @@ void QDQTransformerGemmTests() { QDQTransformerGemmTests(false, true, true); QDQTransformerGemmTests(true, false, true); QDQTransformerGemmTests(true, true, true); + QDQTransformerGemmTests(false, false, false, true); + QDQTransformerGemmTests(false, true, false, true); + QDQTransformerGemmTests(true, false, false, true); + QDQTransformerGemmTests(true, true, false, true); } TEST(QDQTransformerTests, Gemm_U8U8U8) { From f9833833d6c7399dcecbdd879f03e5e4ce22c548 Mon Sep 17 00:00:00 2001 From: yongjunlee Date: Tue, 19 May 2026 14:47:59 +0900 Subject: [PATCH 2/6] Trim redundant no-bias alpha_not_one cases to reduce ASAN memory pressure --- onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc | 2 -- onnxruntime/test/optimizer/qdq_transformer_test.cc | 2 -- 2 files changed, 4 deletions(-) diff --git a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc index bb319b785218e..ac9d317f9e02e 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc @@ -496,9 +496,7 @@ void QDQTransformerGemmTests() { QDQTransformerGemmTests(false, true, true); QDQTransformerGemmTests(true, false, true); QDQTransformerGemmTests(true, true, true); - QDQTransformerGemmTests(false, false, false, false, true); QDQTransformerGemmTests(false, true, false, false, true); - QDQTransformerGemmTests(true, false, false, false, true); QDQTransformerGemmTests(true, true, false, false, true); // dummy test to disable the fastmath session QDQTransformerGemmTests(true, true, true, true); diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index ab2f186487ed8..f3b02059d228b 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -866,9 +866,7 @@ void QDQTransformerGemmTests() { QDQTransformerGemmTests(false, true, true); QDQTransformerGemmTests(true, false, true); QDQTransformerGemmTests(true, true, true); - QDQTransformerGemmTests(false, false, false, true); QDQTransformerGemmTests(false, true, false, true); - QDQTransformerGemmTests(true, false, false, true); QDQTransformerGemmTests(true, true, false, true); } From f031046ae2f4350e952549866ea7cb1fd212cb0e Mon Sep 17 00:00:00 2001 From: yongjunlee Date: Thu, 21 May 2026 15:51:18 +0900 Subject: [PATCH 3/6] Skip TestDequantizeLinearNoAxis variants under ASan and restore alpha_not_one cases --- onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc | 2 ++ onnxruntime/test/optimizer/qdq_transformer_test.cc | 2 ++ onnxruntime/test/optimizer/transpose_optimizer_test.cc | 3 +++ 3 files changed, 7 insertions(+) diff --git a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc index ac9d317f9e02e..bb319b785218e 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc @@ -496,7 +496,9 @@ void QDQTransformerGemmTests() { QDQTransformerGemmTests(false, true, true); QDQTransformerGemmTests(true, false, true); QDQTransformerGemmTests(true, true, true); + QDQTransformerGemmTests(false, false, false, false, true); QDQTransformerGemmTests(false, true, false, false, true); + QDQTransformerGemmTests(true, false, false, false, true); QDQTransformerGemmTests(true, true, false, false, true); // dummy test to disable the fastmath session QDQTransformerGemmTests(true, true, true, true); diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index f3b02059d228b..ab2f186487ed8 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -866,7 +866,9 @@ void QDQTransformerGemmTests() { QDQTransformerGemmTests(false, true, true); QDQTransformerGemmTests(true, false, true); QDQTransformerGemmTests(true, true, true); + QDQTransformerGemmTests(false, false, false, true); QDQTransformerGemmTests(false, true, false, true); + QDQTransformerGemmTests(true, false, false, true); QDQTransformerGemmTests(true, true, false, true); } diff --git a/onnxruntime/test/optimizer/transpose_optimizer_test.cc b/onnxruntime/test/optimizer/transpose_optimizer_test.cc index 080c382db5d93..e84aa4cfb3a59 100644 --- a/onnxruntime/test/optimizer/transpose_optimizer_test.cc +++ b/onnxruntime/test/optimizer/transpose_optimizer_test.cc @@ -3763,6 +3763,8 @@ TEST(TransposeOptimizerTests, TestDequantizeLinearNoAxis) { std::optional no_axis; // Empty axis value will not be set. RunDequantizeLinearTestCase(zp_input_shape, zp_value_shape, no_axis, kOnnxDomain); + // Skip remaining variants under ASan to stay under the 8 GB allocator limit. +#if !defined(__SANITIZE_ADDRESS__) RunDequantizeLinearTestCase(zp_input_shape, zp_value_shape, no_axis, kOnnxDomain); RunDequantizeLinearTestCase(zp_input_shape, zp_value_shape, no_axis, kOnnxDomain); #if !defined(DISABLE_CONTRIB_OPS) @@ -3771,6 +3773,7 @@ TEST(TransposeOptimizerTests, TestDequantizeLinearNoAxis) { RunDequantizeLinearTestCase(zp_input_shape, zp_value_shape, no_axis, kMSDomain); RunDequantizeLinearTestCase(zp_input_shape, zp_value_shape, no_axis, kMSDomain); #endif +#endif } TEST(TransposeOptimizerTests, TestCast) { From c6a9c6cb351a17ad4453a5f338fd15eba59e8829 Mon Sep 17 00:00:00 2001 From: yongjunlee Date: Wed, 27 May 2026 10:45:10 +0900 Subject: [PATCH 4/6] Revert ASan guard --- onnxruntime/test/optimizer/transpose_optimizer_test.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/onnxruntime/test/optimizer/transpose_optimizer_test.cc b/onnxruntime/test/optimizer/transpose_optimizer_test.cc index e84aa4cfb3a59..080c382db5d93 100644 --- a/onnxruntime/test/optimizer/transpose_optimizer_test.cc +++ b/onnxruntime/test/optimizer/transpose_optimizer_test.cc @@ -3763,8 +3763,6 @@ TEST(TransposeOptimizerTests, TestDequantizeLinearNoAxis) { std::optional no_axis; // Empty axis value will not be set. RunDequantizeLinearTestCase(zp_input_shape, zp_value_shape, no_axis, kOnnxDomain); - // Skip remaining variants under ASan to stay under the 8 GB allocator limit. -#if !defined(__SANITIZE_ADDRESS__) RunDequantizeLinearTestCase(zp_input_shape, zp_value_shape, no_axis, kOnnxDomain); RunDequantizeLinearTestCase(zp_input_shape, zp_value_shape, no_axis, kOnnxDomain); #if !defined(DISABLE_CONTRIB_OPS) @@ -3773,7 +3771,6 @@ TEST(TransposeOptimizerTests, TestDequantizeLinearNoAxis) { RunDequantizeLinearTestCase(zp_input_shape, zp_value_shape, no_axis, kMSDomain); RunDequantizeLinearTestCase(zp_input_shape, zp_value_shape, no_axis, kMSDomain); #endif -#endif } TEST(TransposeOptimizerTests, TestCast) { From 987cb8c4cf3d690b4cc97bc5a09f142ec0a6b948 Mon Sep 17 00:00:00 2001 From: elwhyjay Date: Wed, 3 Jun 2026 05:25:26 +0900 Subject: [PATCH 5/6] Reduce QDQ Gemm alpha test matrix --- .../test/optimizer/qdq_transformer_fastmath_test.cc | 11 +++++++---- onnxruntime/test/optimizer/qdq_transformer_test.cc | 11 +++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc index bb319b785218e..1197a2eea647d 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc @@ -496,10 +496,13 @@ void QDQTransformerGemmTests() { QDQTransformerGemmTests(false, true, true); QDQTransformerGemmTests(true, false, true); QDQTransformerGemmTests(true, true, true); - QDQTransformerGemmTests(false, false, false, false, true); - QDQTransformerGemmTests(false, true, false, false, true); - QDQTransformerGemmTests(true, false, false, false, true); - QDQTransformerGemmTests(true, true, false, false, true); + if constexpr (std::is_same_v && std::is_same_v && + std::is_same_v && std::is_same_v) { + QDQTransformerGemmTests(false, false, false, false, true); + QDQTransformerGemmTests(false, true, false, false, true); + QDQTransformerGemmTests(true, false, false, false, true); + QDQTransformerGemmTests(true, true, false, false, true); + } // dummy test to disable the fastmath session QDQTransformerGemmTests(true, true, true, true); } diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 2e5c5a8f71be9..1b8884b13e8c0 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -866,10 +866,13 @@ void QDQTransformerGemmTests() { QDQTransformerGemmTests(false, true, true); QDQTransformerGemmTests(true, false, true); QDQTransformerGemmTests(true, true, true); - QDQTransformerGemmTests(false, false, false, true); - QDQTransformerGemmTests(false, true, false, true); - QDQTransformerGemmTests(true, false, false, true); - QDQTransformerGemmTests(true, true, false, true); + if constexpr (std::is_same_v && std::is_same_v && + std::is_same_v && std::is_same_v) { + QDQTransformerGemmTests(false, false, false, true); + QDQTransformerGemmTests(false, true, false, true); + QDQTransformerGemmTests(true, false, false, true); + QDQTransformerGemmTests(true, true, false, true); + } } TEST(QDQTransformerTests, Gemm_U8U8U8) { From ea615ecf1276ab98ea2064cdb59d4851cf59b581 Mon Sep 17 00:00:00 2001 From: elwhyjay Date: Wed, 3 Jun 2026 12:10:55 +0900 Subject: [PATCH 6/6] Limit QDQ Gemm alpha test shape variants --- onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc | 6 ++++-- onnxruntime/test/optimizer/qdq_transformer_test.cc | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc index 1197a2eea647d..6b431c10f978a 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc @@ -482,8 +482,10 @@ void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one }; test_case({2, 2}, {2, 4}); - test_case({13, 15}, {15, 15}); - test_case({2, 2}, {2, 4}, true); // Use com.microsoft QDQ ops + if (!alpha_not_one) { + test_case({13, 15}, {15, 15}); + test_case({2, 2}, {2, 4}, true); // Use com.microsoft QDQ ops + } } template diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 1b8884b13e8c0..11183722a74cb 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -852,8 +852,10 @@ void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one }; test_case({2, 2}, {2, 4}); - test_case({13, 15}, {15, 15}); - test_case({2, 2}, {2, 4}, true); // Use com.microsoft QDQ ops + if (!alpha_not_one) { + test_case({13, 15}, {15, 15}); + test_case({2, 2}, {2, 4}, true); // Use com.microsoft QDQ ops + } } template