From 270f80ead396ceccc7d22d71d7f602ec8f6b2e03 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 May 2026 15:37:53 +0000 Subject: [PATCH 1/3] Initial plan From deb252526a98f17b8cc933062f604e05b66fe730 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 May 2026 16:24:49 +0000 Subject: [PATCH 2/3] Fix float32 ReduceSum stability for large full reductions Agent-Logs-Url: https://github.com/microsoft/onnxruntime/sessions/6d1108ff-0865-40b2-a59a-6d0af71e8b38 Co-authored-by: justinchuby <11205048+justinchuby@users.noreply.github.com> --- .../providers/cpu/reduction/reduction_ops.h | 16 ++++++++++++++- .../cpu/reduction/reduction_ops_test.cc | 20 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h index 03be8f3e6686c..797a7c9ca31cf 100644 --- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h @@ -214,7 +214,21 @@ class ReduceAggregatorSum : public ReduceAggregator<T, T> { inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {} inline void update(const T& v) { this->accumulator_ += v; } static T aggall(const T* from_data, int64_t size) { - return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, onnxruntime::narrow<size_t>(size)).sum(); + if constexpr (std::is_same_v<T, float>) { + // Improve numerical stability for large float reductions.
+ double sum = 0.0; + double compensation = 0.0; + for (int64_t i = 0; i < size; ++i) { + const double value = static_cast<double>(from_data[i]) - compensation; + const double next_sum = sum + value; + compensation = (next_sum - sum) - value; + sum = next_sum; + } + + return static_cast<T>(sum); + } else { + return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, onnxruntime::narrow<size_t>(size)).sum(); + } } inline T aggall(const T* from_data) { return aggall(from_data, this->N_); diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index 79617dc16e1f5..0651dce2b9be1 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -2684,6 +2684,26 @@ TEST(ReductionOpTest, ReduceSum_default_axes_do_not_keep_dims) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: full reduce without keepDimensions is not supported with explicit batch } +TEST(ReductionOpTest, ReduceSum_default_axes_do_not_keep_dims_large_float32_constant_input) { + OpTester test("ReduceSum"); + test.AddAttribute("keepdims", static_cast<int64_t>(0)); + + constexpr int64_t d0 = 5; + constexpr int64_t d1 = 68; + constexpr int64_t d2 = 64; + constexpr int64_t d3 = 64; + constexpr int64_t numel = d0 * d1 * d2 * d3; + constexpr float input_value = 0.1f; + const std::vector<float> input(onnxruntime::narrow<size_t>(numel), input_value); + const float expected = static_cast<float>(static_cast<double>(numel) * static_cast<double>(input_value)); + + test.AddInput<float>("data", {d0, d1, d2, d3}, input); + test.AddOutput<float>("reduced", {}, {expected}); + test.SetOutputAbsErr("reduced", 0.1f); + test.SetOutputRelErr("reduced", 1e-6f); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: full reduce without keepDimensions is not supported with explicit batch +} + TEST(ReductionOpTest, ReduceSum_do_not_keepdims) { OpTester test("ReduceSum");
test.AddAttribute("axes", std::vector<int64_t>{1}); From 1d0f58f41d105eea39017014bca1ca195fd2127c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 20 May 2026 16:31:07 +0000 Subject: [PATCH 3/3] Add clarifying notes for compensated ReduceSum fix Agent-Logs-Url: https://github.com/microsoft/onnxruntime/sessions/6d1108ff-0865-40b2-a59a-6d0af71e8b38 Co-authored-by: justinchuby <11205048+justinchuby@users.noreply.github.com> --- onnxruntime/core/providers/cpu/reduction/reduction_ops.h | 4 +++- .../test/providers/cpu/reduction/reduction_ops_test.cc | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h index 797a7c9ca31cf..755ca0cee0dba 100644 --- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h +++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h @@ -215,10 +215,12 @@ class ReduceAggregatorSum : public ReduceAggregator<T, T> { inline void update(const T& v) { this->accumulator_ += v; } static T aggall(const T* from_data, int64_t size) { if constexpr (std::is_same_v<T, float>) { - // Improve numerical stability for large float reductions. + // Kahan compensated summation to reduce float round-off error on large reductions. + // https://en.wikipedia.org/wiki/Kahan_summation_algorithm double sum = 0.0; double compensation = 0.0; for (int64_t i = 0; i < size; ++i) { + // compensation stores the previous step's rounding error that is corrected in this step.
const double value = static_cast<double>(from_data[i]) - compensation; const double next_sum = sum + value; compensation = (next_sum - sum) - value; diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index 0651dce2b9be1..60406cc3a122d 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -2694,12 +2694,17 @@ TEST(ReductionOpTest, ReduceSum_default_axes_do_not_keep_dims_large_float32_cons constexpr int64_t d3 = 64; constexpr int64_t numel = d0 * d1 * d2 * d3; constexpr float input_value = 0.1f; + // narrow ensures the int64_t->size_t conversion is checked. const std::vector<float> input(onnxruntime::narrow<size_t>(numel), input_value); + // Compute the expected scalar using higher precision so the reference itself + // is not affected by large float32 accumulation error. const float expected = static_cast<float>(static_cast<double>(numel) * static_cast<double>(input_value)); test.AddInput<float>("data", {d0, d1, d2, d3}, input); test.AddOutput<float>("reduced", {}, {expected}); - test.SetOutputAbsErr("reduced", 0.1f); + // Allow small architecture-dependent float order differences while still + // catching the large (~1e2) discrepancy reported in the issue. + test.SetOutputAbsErr("reduced", 0.05f); test.SetOutputRelErr("reduced", 1e-6f); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: full reduce without keepDimensions is not supported with explicit batch }