Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion onnxruntime/core/providers/cpu/reduction/reduction_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,23 @@ class ReduceAggregatorSum : public ReduceAggregator<T, T> {
// Constructs the sum aggregator by forwarding N and a literal 0 to the ReduceAggregator<T, T> base
// (the 0 is presumably the initial accumulator value -- confirm against the base class).
// The second argument is unnamed and is not used by this constructor.
inline ReduceAggregatorSum(int64_t N, const T&) : ReduceAggregator<T, T>(N, 0) {}
// Folds one element into the running total by adding v to this->accumulator_.
inline void update(const T& v) { this->accumulator_ += v; }
// Sums the `size` contiguous elements starting at `from_data` and returns the total as T.
//
// T == float: the total is accumulated in double with Kahan compensated summation and narrowed
//   back to float once at the end, which limits round-off error on large reductions.
//   A `size` <= 0 never enters the loop and yields 0.0f.
//   https://en.wikipedia.org/wiki/Kahan_summation_algorithm
// Any other T: delegates to Eigen's vector sum; `size` goes through onnxruntime::narrow<size_t>,
//   a checked int64_t -> size_t conversion.
static T aggall(const T* from_data, int64_t size) {
  if constexpr (std::is_same_v<T, float>) {
    double sum = 0.0;
    // Rounding error lost by the previous addition; it is subtracted from the next input.
    double compensation = 0.0;
    for (int64_t i = 0; i < size; ++i) {
      const double value = static_cast<double>(from_data[i]) - compensation;
      const double next_sum = sum + value;
      // (next_sum - sum) is what was actually added; its difference from `value` is the new error.
      // The parenthesisation is essential: algebraically this is zero, numerically it is not.
      compensation = (next_sum - sum) - value;
      sum = next_sum;
    }

    return static_cast<float>(sum);
  } else {
    return Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(from_data, onnxruntime::narrow<size_t>(size)).sum();
  }
}
inline T aggall(const T* from_data) {
return aggall(from_data, this->N_);
Expand Down
25 changes: 25 additions & 0 deletions onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2684,6 +2684,31 @@ TEST(ReductionOpTest, ReduceSum_default_axes_do_not_keep_dims) {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: full reduce without keepDimensions is not supported with explicit batch
}

// Full reduction (no axes given, keepdims = 0) over a large float32 tensor filled with a single
// constant. Guards against accumulated float round-off in the CPU ReduceSum kernel.
TEST(ReductionOpTest, ReduceSum_default_axes_do_not_keep_dims_large_float32_constant_input) {
  // Tensor shape and fill value.
  constexpr int64_t dim0 = 5;
  constexpr int64_t dim1 = 68;
  constexpr int64_t dim2 = 64;
  constexpr int64_t dim3 = 64;
  constexpr int64_t element_count = dim0 * dim1 * dim2 * dim3;
  constexpr float fill_value = 0.1f;

  // The reference value is formed in double precision so that it does not itself suffer from
  // float32 accumulation error, then rounded to float once.
  const double reference_sum = static_cast<double>(element_count) * static_cast<double>(fill_value);
  const float expected_sum = static_cast<float>(reference_sum);

  // onnxruntime::narrow performs a checked int64_t -> size_t conversion for the element count.
  const std::vector<float> data(onnxruntime::narrow<size_t>(element_count), fill_value);

  OpTester test("ReduceSum");
  test.AddAttribute("keepdims", static_cast<int64_t>(0));
  test.AddInput<float>("data", {dim0, dim1, dim2, dim3}, data);
  test.AddOutput<float>("reduced", {}, {expected_sum});

  // Tolerances: loose enough for small architecture-dependent summation-order differences,
  // tight enough to flag the large (~1e2) discrepancy reported in the issue.
  test.SetOutputAbsErr("reduced", 0.05f);
  test.SetOutputRelErr("reduced", 1e-6f);

  // TensorRT: full reduce without keepDimensions is not supported with explicit batch
  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
}

TEST(ReductionOpTest, ReduceSum_do_not_keepdims) {
OpTester test("ReduceSum");
test.AddAttribute("axes", std::vector<int64_t>{1});
Expand Down
Loading