From 270f80ead396ceccc7d22d71d7f602ec8f6b2e03 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 20 May 2026 15:37:53 +0000
Subject: [PATCH 1/3] Initial plan


From deb252526a98f17b8cc933062f604e05b66fe730 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 20 May 2026 16:24:49 +0000
Subject: [PATCH 2/3] Fix float32 ReduceSum stability for large full reductions

Agent-Logs-Url: https://github.com/microsoft/onnxruntime/sessions/6d1108ff-0865-40b2-a59a-6d0af71e8b38

Co-authored-by: justinchuby <11205048+justinchuby@users.noreply.github.com>
---
 .../providers/cpu/reduction/reduction_ops.h   | 16 ++++++++++++++-
 .../cpu/reduction/reduction_ops_test.cc       | 20 +++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
index 03be8f3e6686c..797a7c9ca31cf 100644
--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
@@ -214,7 +214,21 @@ class ReduceAggregatorSum : public ReduceAggregator<T, T> {
   inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
   inline void update(const T& v) { this->accumulator_ += v; }
   static T aggall(const T* from_data, int64_t size) {
-    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, onnxruntime::narrow<size_t>(size)).sum();
+    if constexpr (std::is_same_v<T, float>) {
+      // Improve numerical stability for large float reductions.
+      double sum = 0.0;
+      double compensation = 0.0;
+      for (int64_t i = 0; i < size; ++i) {
+        const double value = static_cast<double>(from_data[i]) - compensation;
+        const double next_sum = sum + value;
+        compensation = (next_sum - sum) - value;
+        sum = next_sum;
+      }
+
+      return static_cast<float>(sum);
+    } else {
+      return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, onnxruntime::narrow<size_t>(size)).sum();
+    }
   }
   inline T aggall(const T* from_data) {
     return aggall(from_data, this->N_);
diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
index 79617dc16e1f5..0651dce2b9be1 100644
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@@ -2684,6 +2684,26 @@ TEST(ReductionOpTest, ReduceSum_default_axes_do_not_keep_dims) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: full reduce without keepDimensions is not supported with explicit batch
 }
 
+TEST(ReductionOpTest, ReduceSum_default_axes_do_not_keep_dims_large_float32_constant_input) {
+  OpTester test("ReduceSum");
+  test.AddAttribute("keepdims", static_cast<int64_t>(0));
+
+  constexpr int64_t d0 = 5;
+  constexpr int64_t d1 = 68;
+  constexpr int64_t d2 = 64;
+  constexpr int64_t d3 = 64;
+  constexpr int64_t numel = d0 * d1 * d2 * d3;
+  constexpr float input_value = 0.1f;
+  const std::vector<float> input(onnxruntime::narrow<size_t>(numel), input_value);
+  const float expected = static_cast<float>(static_cast<double>(numel) * static_cast<double>(input_value));
+
+  test.AddInput<float>("data", {d0, d1, d2, d3}, input);
+  test.AddOutput<float>("reduced", {}, {expected});
+  test.SetOutputAbsErr("reduced", 0.1f);
+  test.SetOutputRelErr("reduced", 1e-6f);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: full reduce without keepDimensions is not supported with explicit batch
+}
+
 TEST(ReductionOpTest, ReduceSum_do_not_keepdims) {
   OpTester test("ReduceSum");
   test.AddAttribute("axes", std::vector<int64_t>{1});

From 1d0f58f41d105eea39017014bca1ca195fd2127c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 20 May 2026 16:31:07 +0000
Subject: [PATCH 3/3] Add clarifying notes for compensated ReduceSum fix

Agent-Logs-Url: https://github.com/microsoft/onnxruntime/sessions/6d1108ff-0865-40b2-a59a-6d0af71e8b38

Co-authored-by: justinchuby <11205048+justinchuby@users.noreply.github.com>
---
 onnxruntime/core/providers/cpu/reduction/reduction_ops.h   | 4 +++-
 .../test/providers/cpu/reduction/reduction_ops_test.cc     | 7 ++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
index 797a7c9ca31cf..755ca0cee0dba 100644
--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
@@ -215,10 +215,12 @@ class ReduceAggregatorSum : public ReduceAggregator<T, T> {
   inline void update(const T& v) { this->accumulator_ += v; }
   static T aggall(const T* from_data, int64_t size) {
     if constexpr (std::is_same_v<T, float>) {
-      // Improve numerical stability for large float reductions.
+      // Kahan compensated summation, accumulated in double, to limit round-off error on large float reductions.
+      // https://en.wikipedia.org/wiki/Kahan_summation_algorithm
       double sum = 0.0;
       double compensation = 0.0;
       for (int64_t i = 0; i < size; ++i) {
+        // compensation holds the rounding error introduced by the previous addition; subtracting it from the next input cancels that error.
         const double value = static_cast<double>(from_data[i]) - compensation;
         const double next_sum = sum + value;
         compensation = (next_sum - sum) - value;
diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
index 0651dce2b9be1..60406cc3a122d 100644
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@@ -2694,12 +2694,17 @@ TEST(ReductionOpTest, ReduceSum_default_axes_do_not_keep_dims_large_float32_cons
   constexpr int64_t d3 = 64;
   constexpr int64_t numel = d0 * d1 * d2 * d3;
   constexpr float input_value = 0.1f;
+  // narrow ensures the int64_t->size_t conversion is checked.
   const std::vector<float> input(onnxruntime::narrow<size_t>(numel), input_value);
+  // Compute the expected scalar using higher precision so the reference itself
+  // is not affected by large float32 accumulation error.
   const float expected = static_cast<float>(static_cast<double>(numel) * static_cast<double>(input_value));
 
   test.AddInput<float>("data", {d0, d1, d2, d3}, input);
   test.AddOutput<float>("reduced", {}, {expected});
-  test.SetOutputAbsErr("reduced", 0.1f);
+  // Allow small architecture-dependent differences caused by floating-point summation order
+  // while still catching the large (~1e2) discrepancy reported in the issue.
+  test.SetOutputAbsErr("reduced", 0.05f);
   test.SetOutputRelErr("reduced", 1e-6f);
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: full reduce without keepDimensions is not supported with explicit batch
 }