microsoft · Copilot · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
@@ -214,7 +214,23 @@ class ReduceAggregatorSum : public ReduceAggregator<T, T> {
   inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
   inline void update(const T& v) { this->accumulator_ += v; }
   static T aggall(const T* from_data, int64_t size) {
-    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, onnxruntime::narrow<size_t>(size)).sum();
+    if constexpr (std::is_same_v<T, float>) {
+      // float input: Kahan compensated summation carried out in double to limit round-off error on large reductions.
+      // https://en.wikipedia.org/wiki/Kahan_summation_algorithm
+      double sum = 0.0;
+      double compensation = 0.0;
+      for (int64_t i = 0; i < size; ++i) {
+        // compensation holds the rounding error of the previous addition; subtracting it here feeds that error back into the sum.
+        const double value = static_cast<double>(from_data[i]) - compensation;
+        const double next_sum = sum + value;
+        compensation = (next_sum - sum) - value;
+        sum = next_sum;
+      }
+
+      return static_cast<float>(sum);
+    } else {
+      return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, onnxruntime::narrow<size_t>(size)).sum();
+    }
   }
   inline T aggall(const T* from_data) {
     return aggall(from_data, this->N_);

diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@@ -2684,6 +2684,31 @@ TEST(ReductionOpTest, ReduceSum_default_axes_do_not_keep_dims) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: full reduce without keepDimensions is not supported with explicit batch
 }
 
+TEST(ReductionOpTest, ReduceSum_default_axes_do_not_keep_dims_large_float32_constant_input) {
+  OpTester test("ReduceSum");
+  test.AddAttribute("keepdims", static_cast<int64_t>(0));
+
+  constexpr int64_t d0 = 5;
+  constexpr int64_t d1 = 68;
+  constexpr int64_t d2 = 64;
+  constexpr int64_t d3 = 64;
+  constexpr int64_t numel = d0 * d1 * d2 * d3;
+  constexpr float input_value = 0.1f;
+  // narrow makes the int64_t -> size_t conversion of the element count a checked one.
+  const std::vector<float> input(onnxruntime::narrow<size_t>(numel), input_value);
+  // Compute the reference as numel * input_value in double and round to float once,
+  // so the expected value itself carries no large float32 accumulation error.
+  const float expected = static_cast<float>(static_cast<double>(numel) * static_cast<double>(input_value));
+
+  test.AddInput<float>("data", {d0, d1, d2, d3}, input);
+  test.AddOutput<float>("reduced", {}, {expected});
+  // Tolerances absorb small architecture-dependent summation-order differences while
+  // still catching the large (~1e2) discrepancy reported in the issue.
+  test.SetOutputAbsErr("reduced", 0.05f);
+  test.SetOutputRelErr("reduced", 1e-6f);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: full reduce without keepDimensions is not supported with explicit batch
+}
+
 TEST(ReductionOpTest, ReduceSum_do_not_keepdims) {
   OpTester test("ReduceSum");
   test.AddAttribute("axes", std::vector<int64_t>{1});