microsoft · JonathanC-ARM · May 7, 2026 · May 7, 2026 · May 12, 2026 · May 19, 2026
diff --git a/onnxruntime/core/providers/cpu/math/matmul.cc b/onnxruntime/core/providers/cpu/math/matmul.cc
@@ -253,7 +253,14 @@ Status MatMul<float>::PrePack(const Tensor& tensor, int input_idx, /*out*/ Alloc
       dim2 = static_cast<size_t>(b_shape[1]);
     }
 
-    if (use_fastmath_mode_ && (trans_a_attr_ == 0) && (trans_b_attr_ == 0) && ((dim1 * dim2) >= kFastMathModeKernelsizeThreshold)) {
+    const size_t k_dim = b_shape.NumDimensions() >= 2
+                             ? static_cast<size_t>(b_shape[b_shape.NumDimensions() - 2])
+                             : dim1;
+    if (use_fastmath_mode_ &&
+        (trans_a_attr_ == 0) &&
+        (trans_b_attr_ == 0) &&
+        ((k_dim * dim2) >= kFastMathModeKernelsizeThreshold) &&
+        ((k_dim % kFastMathModeKAlignment) == 0)) {
       is_packed = GemmPackBBfloat16(alloc, tensor, trans_a_attr_ != 0, trans_b_attr_ != 0, packed_b_, packed_b_size, b_shape_, &mlas_backend_kernel_selector_config_);
     } else
 #endif
@@ -323,7 +330,10 @@ Status MatMul<float>::Compute(OpKernelContext* ctx) const {
   const size_t lda = helper.Lda(trans_a);
   const size_t ldb = helper.Ldb(trans_b);
 #if defined(__aarch64__) && defined(__linux__)
-  if (use_fastmath_mode_ && !trans_a && !trans_b && ((N * K) >= kFastMathModeKernelsizeThreshold)) {
+  if (use_fastmath_mode_ &&
+      !trans_a && !trans_b &&
+      ((N * K) >= kFastMathModeKernelsizeThreshold) &&
+      ((K % kFastMathModeKAlignment) == 0)) {
     std::vector<MLAS_SBGEMM_DATA_PARAMS> data(max_len);
     for (size_t i = 0; i < max_len; i++) {
       data[i].BIsfp32 = !(bool(packed_b_));

diff --git a/onnxruntime/core/providers/cpu/math/matmul.h b/onnxruntime/core/providers/cpu/math/matmul.h
@@ -102,6 +102,8 @@ class MatMul<float> final : public OpKernel {
   bool use_fastmath_mode_;
   // sbgemm kernel is implemented as 8x8 blocks with weights pre-packed to 4 blocks of 4x2
   // so a minimum of 32 elements is defined to outweigh the additional prepacking overhead
+  // The NEON SBGemm kernel consumes A in 4-float groups. Keep K tails on SGEMM
+  const size_t kFastMathModeKAlignment = 4;
   const size_t kFastMathModeKernelsizeThreshold = 32;
 #endif
 };

diff --git a/onnxruntime/test/providers/cpu/math/matmul_fastmath_test.cc b/onnxruntime/test/providers/cpu/math/matmul_fastmath_test.cc
@@ -5,9 +5,11 @@
 #include "core/session/onnxruntime_session_options_config_keys.h"
 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
+#include "core/session/inference_session.h"
 #include "test/common/dnnl_op_test_utils.h"
 #include "test/common/cuda_op_test_utils.h"
 #include "test/common/tensor_op_test_utils.h"
+#include "test/util/include/test_environment.h"
 #include "default_providers.h"
 
 #if defined(__aarch64__) && defined(__linux__)
@@ -168,6 +170,63 @@ void RunMatMulTest(int32_t opset_version) {
   RunMatMulTest<T>(opset_version, false, false, false);
 }
 
+// With the bfloat16 fast-math option on, a MatMul whose K (13) is not a multiple of 4 must return exact, finite sums.
+TEST(MathOpTest, MatMulFloatTypeFastMathKTailFallsBackToSgemm) {
+  constexpr int64_t M = 1;
+  constexpr int64_t N = 8;
+  constexpr int64_t K = 13;  // 13 % 4 == 1: one element is left over after three groups of four
+
+  OpTester test("MatMul", 7);
+  // A is the row vector [1, 2, ..., 13].
+  std::vector<float> input0_vals(K);
+  std::iota(input0_vals.begin(), input0_vals.end(), 1.0f);
+  test.AddInput<float>("A", {M, K}, input0_vals);
+  // B is all ones; the trailing `true` presumably registers it as an initializer (TODO confirm).
+  std::vector<float> input1_vals(K * N, 1.0f);
+  test.AddInput<float>("B", {K, N}, input1_vals, true);
+  // With B all ones, every output element equals 1 + 2 + ... + 13 = 91.
+  std::vector<float> expected_vals(N, 91.0f);
+  test.AddOutput<float>("Y", {M, N}, expected_vals);
+  // Build the model here and serialize it so it can be loaded into the hand-built session below.
+  Model& model = test.BuildModel();
+  ASSERT_STATUS_OK(model.MainGraph().Resolve());
+  std::string serialized_model;
+  ASSERT_TRUE(model.ToProto().SerializeToString(&serialized_model));
+  // Set the kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 config entry to "1" for this session.
+  SessionOptions so;
+  ASSERT_STATUS_OK(so.config_options.AddConfigEntry(
+      kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "1"));
+
+  InferenceSession session_object{so, GetEnvironment()};
+  ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCpuExecutionProvider()));
+
+  std::stringstream model_stream(serialized_model);
+  ASSERT_STATUS_OK(session_object.Load(model_stream));
+  ASSERT_STATUS_OK(session_object.Initialize());
+  // Back A with a buffer that has three NaNs appended after the K real values.
+  std::vector<float> input0_backing(input0_vals.begin(), input0_vals.end());
+  input0_backing.resize(K + 3, std::numeric_limits<float>::quiet_NaN());
+  // The tensor is created over input0_backing.data() with shape {M, K}, so the NaNs sit just past its last element.
+  OrtValue input0;
+  Tensor::InitOrtValue(DataTypeImpl::GetType<float>(), TensorShape({M, K}), input0_backing.data(),
+                       OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator), input0);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair(std::string("A"), input0));
+
+  std::vector<OrtValue> fetches;
+  ASSERT_STATUS_OK(session_object.Run(RunOptions{}, feeds, AsSpan({std::string("Y")}), &fetches));
+
+  const auto& output_tensor = fetches[0].Get<Tensor>();
+  ASSERT_EQ(output_tensor.Shape(), TensorShape({M, N}));
+  // Any NaN picked up from the padding would make an output non-finite; each value must also equal 91 exactly.
+  const auto* output_data = output_tensor.Data<float>();
+  for (int64_t i = 0; i < N; ++i) {
+    ASSERT_TRUE(std::isfinite(output_data[i])) << "Output " << i << " should not include padded A tail values.";
+    ASSERT_EQ(output_data[i], expected_vals[i]) << "Output " << i;
+  }
+}
+
 TEST(MathOpTest, MatMulFloatType_FastMath) {
   // TODO: Unskip when fixed #41968513
   if (DefaultDmlExecutionProvider().get() != nullptr) {