diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
index 03be8f3e6686c..755ca0cee0dba 100644
--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
@@ -214,7 +214,30 @@ class ReduceAggregatorSum : public ReduceAggregator<T> {
   inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator<T>(N, 0) {}
   inline void update(const T& v) { this->accumulator_ += v; }
   static T aggall(const T* from_data, int64_t size) {
-    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, onnxruntime::narrow<size_t>(size)).sum();
+    // Checked int64_t -> size_t conversion, shared by both branches so that the float
+    // path validates size the same way as every other element type.
+    const size_t count = onnxruntime::narrow<size_t>(size);
+    if constexpr (std::is_same_v<T, float>) {
+      // Kahan compensated summation, carried out in double, to reduce float
+      // round-off error on large reductions.
+      // https://en.wikipedia.org/wiki/Kahan_summation_algorithm
+      double sum = 0.0;
+      double compensation = 0.0;
+      for (size_t i = 0; i < count; ++i) {
+        // compensation holds the rounding error of the previous step; subtracting it
+        // from the incoming value corrects that error in this step.
+        const double value = static_cast<double>(from_data[i]) - compensation;
+        const double next_sum = sum + value;
+        // (next_sum - sum) is what was actually added; its difference from value is
+        // the part lost by the addition.
+        compensation = (next_sum - sum) - value;
+        sum = next_sum;
+      }
+
+      return static_cast<float>(sum);
+    } else {
+      return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, count).sum();
+    }
   }
   inline T aggall(const T* from_data) {
     return aggall(from_data, this->N_);
diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
index 79617dc16e1f5..60406cc3a122d 100644
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@@ -2684,6 +2684,33 @@ TEST(ReductionOpTest, ReduceSum_default_axes_do_not_keep_dims) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: full reduce without keepDimensions is not supported with explicit batch
 }
 
+// Regression test: a full ReduceSum over a large constant float32 tensor must match a
+// reference value computed in double precision.
+TEST(ReductionOpTest, ReduceSum_default_axes_do_not_keep_dims_large_float32_constant_input) {
+  OpTester test("ReduceSum");
+  test.AddAttribute("keepdims", static_cast<int64_t>(0));
+
+  constexpr int64_t d0 = 5;
+  constexpr int64_t d1 = 68;
+  constexpr int64_t d2 = 64;
+  constexpr int64_t d3 = 64;
+  constexpr int64_t numel = d0 * d1 * d2 * d3;
+  constexpr float input_value = 0.1f;
+  // narrow ensures the int64_t->size_t conversion is checked.
+  const std::vector<float> input(onnxruntime::narrow<size_t>(numel), input_value);
+  // Compute the expected scalar using higher precision so the reference itself
+  // is not affected by large float32 accumulation error.
+  const float expected = static_cast<float>(static_cast<double>(numel) * static_cast<double>(input_value));
+
+  test.AddInput<float>("data", {d0, d1, d2, d3}, input);
+  test.AddOutput<float>("reduced", {}, {expected});
+  // Allow small architecture-dependent float order differences while still
+  // catching the large (~1e2) discrepancy reported in the issue.
+  test.SetOutputAbsErr("reduced", 0.05f);
+  test.SetOutputRelErr("reduced", 1e-6f);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: full reduce without keepDimensions is not supported with explicit batch
+}
+
 TEST(ReductionOpTest, ReduceSum_do_not_keepdims) {
   OpTester test("ReduceSum");
   test.AddAttribute("axes", std::vector<int64_t>{1});