Accumulate CPU half-precision sums in float32

sofinvalery · sofinvalery · commit f86988443e29 · 2026-05-07T20:32:46.000+03:00
diff --git a/mlx/backend/cpu/reduce.cpp b/mlx/backend/cpu/reduce.cpp
@@ -3,6 +3,7 @@
 #include <cassert>
 #include <functional>
 #include <limits>
+#include <type_traits>
 
 #include "mlx/backend/common/reduce.h"
 #include "mlx/backend/cpu/encoder.h"
@@ -96,19 +97,29 @@ void strided_reduce(
 };
 
 template <typename T, typename U, typename Op>
-void contiguous_reduce(const T* x, U* accumulator, int size, Op op, U init) {
+U strided_reduce(const T* x, int size, size_t stride, Op op, U accumulator) {
+  for (int i = 0; i < size; i++) {  // visit `size` inputs spaced `stride` apart
+    accumulator = op(accumulator, *x);  // fold *x (a T) into the running U
+    x += stride;
+  }
+  return accumulator;  // handed back by value; nothing is written via pointer
+}
+
+template <typename T, typename U, typename Op>  // T: input elem, U: accumulator
+U contiguous_reduce(const T* x, int size, Op op, U accumulator, U init) {
   constexpr int N = std::min(simd::max_size<T>, simd::max_size<U>);
   simd::Simd<U, N> accumulator_v(init);
   while (size >= N) {
     accumulator_v = op(accumulator_v, simd::Simd<U, N>(simd::load<T, N>(x)));
     x += N;
     size -= N;
   }
-  *accumulator = op(*accumulator, op(accumulator_v));
+  accumulator = op(accumulator, op(accumulator_v));  // op(accumulator_v) presumably collapses the SIMD lanes - confirm
   while (size-- > 0) {
-    *accumulator = op(*accumulator, *x);
+    accumulator = op(accumulator, *x);  // scalar tail: fewer than N elements left
     x++;
   }
+  return accumulator;  // returned by value; the caller decides where to store it
 }
 
 // Helper for the ndimensional strided loop
@@ -135,27 +146,25 @@ void nd_loop(
   loop_inner(0, 0);
 }
 
-template <typename T, typename U, typename Op>
+template <typename T, typename OutT, typename Op, typename AccT = OutT>
 void reduction_op(
     const array& x,
     array& out,
     const std::vector<int>& axes,
-    U init) {
+    AccT init) {
   ReductionPlan plan = get_reduction_plan(x, axes);
 
   auto in_ptr = x.data<T>();
-  auto out_ptr = out.data<U>();
+  auto out_ptr = out.data<OutT>();
   if (plan.type == ContiguousAllReduce) {
-    *out_ptr = init;
-    contiguous_reduce(in_ptr, out_ptr, x.size(), Op{}, init);
+    *out_ptr = contiguous_reduce(in_ptr, x.size(), Op{}, init, init);
     return;
   }
 
   if (plan.type == ContiguousReduce && plan.shape.size() == 1) {
     int reduction_size = plan.shape[0];
     for (int i = 0; i < out.size(); i++, out_ptr++, in_ptr += reduction_size) {
-      *out_ptr = init;
-      contiguous_reduce(in_ptr, out_ptr, reduction_size, Op{}, init);
+      *out_ptr = contiguous_reduce(in_ptr, reduction_size, Op{}, init, init);
     }
     return;
   }
@@ -170,24 +179,25 @@ void reduction_op(
     if (plan.shape.size() == 0) {
       for (int i = 0; i < out.size(); i++, out_ptr++) {
         int offset = elem_to_loc(i, shape, strides);
-        *out_ptr = init;
-        contiguous_reduce(in_ptr + offset, out_ptr, reduction_size, Op{}, init);
+        *out_ptr = contiguous_reduce(
+            in_ptr + offset, reduction_size, Op{}, init, init);
       }
     } else {
       for (int i = 0; i < out.size(); i++, out_ptr++) {
         int offset = elem_to_loc(i, shape, strides);
-        *out_ptr = init;
+        AccT val = init;
         nd_loop(
             [&](int extra_offset) {
-              contiguous_reduce(
+              val = contiguous_reduce(
                   in_ptr + offset + extra_offset,
-                  out_ptr,
                   reduction_size,
                   Op{},
+                  val,
                   init);
             },
             plan.shape,
             plan.strides);
+        *out_ptr = val;
       }
     }
     return;
@@ -199,8 +209,15 @@ void reduction_op(
     plan.shape.pop_back();
     plan.strides.pop_back();
     for (int i = 0; i < out.size(); i += reduction_stride) {
-      std::fill_n(out_ptr, reduction_stride, init);
-      strided_reduce(in_ptr, out_ptr, reduction_size, reduction_stride, Op{});
+      if constexpr (std::is_same_v<OutT, AccT>) {
+        std::fill_n(out_ptr, reduction_stride, init);
+        strided_reduce(in_ptr, out_ptr, reduction_size, reduction_stride, Op{});
+      } else {
+        for (size_t j = 0; j < reduction_stride; j++) {
+          out_ptr[j] = strided_reduce(
+              in_ptr + j, reduction_size, reduction_stride, Op{}, init);
+        }
+      }
       in_ptr += reduction_stride * reduction_size;
       out_ptr += reduction_stride;
     }
@@ -218,26 +235,55 @@ void reduction_op(
     if (plan.shape.size() == 0) {
       for (int i = 0; i < out.size(); i += reduction_stride) {
         int offset = elem_to_loc(i, shape, strides);
-        std::fill_n(out_ptr, reduction_stride, init);
-        strided_reduce(
-            in_ptr + offset, out_ptr, reduction_size, reduction_stride, Op{});
+        if constexpr (std::is_same_v<OutT, AccT>) {
+          std::fill_n(out_ptr, reduction_stride, init);
+          strided_reduce(
+              in_ptr + offset, out_ptr, reduction_size, reduction_stride, Op{});
+        } else {
+          for (size_t j = 0; j < reduction_stride; j++) {
+            out_ptr[j] = strided_reduce(
+                in_ptr + offset + j,
+                reduction_size,
+                reduction_stride,
+                Op{},
+                init);
+          }
+        }
         out_ptr += reduction_stride;
       }
     } else {
       for (int i = 0; i < out.size(); i += reduction_stride) {
         int offset = elem_to_loc(i, shape, strides);
-        std::fill_n(out_ptr, reduction_stride, init);
-        nd_loop(
-            [&](int extra_offset) {
-              strided_reduce(
-                  in_ptr + offset + extra_offset,
-                  out_ptr,
-                  reduction_size,
-                  reduction_stride,
-                  Op{});
-            },
-            plan.shape,
-            plan.strides);
+        if constexpr (std::is_same_v<OutT, AccT>) {
+          std::fill_n(out_ptr, reduction_stride, init);
+          nd_loop(
+              [&](int extra_offset) {
+                strided_reduce(
+                    in_ptr + offset + extra_offset,
+                    out_ptr,
+                    reduction_size,
+                    reduction_stride,
+                    Op{});
+              },
+              plan.shape,
+              plan.strides);
+        } else {
+          for (size_t j = 0; j < reduction_stride; j++) {
+            AccT val = init;
+            nd_loop(
+                [&](int extra_offset) {
+                  val = strided_reduce(
+                      in_ptr + offset + extra_offset + j,
+                      reduction_size,
+                      reduction_stride,
+                      Op{},
+                      val);
+                },
+                plan.shape,
+                plan.strides);
+            out_ptr[j] = val;
+          }
+        }
         out_ptr += reduction_stride;
       }
     }
@@ -249,7 +295,7 @@ void reduction_op(
 
     for (int i = 0; i < out.size(); i++, out_ptr++) {
       int offset = elem_to_loc(i, shape, strides);
-      U val = init;
+      AccT val = init;
       nd_loop(
           [&](int extra_offset) {
             val = Op{}(val, *(in_ptr + offset + extra_offset));
@@ -404,9 +450,9 @@ void reduce_dispatch_and_or(
     Reduce::ReduceType rtype,
     const std::vector<int>& axes) {
   if (rtype == Reduce::And) {
-    reduction_op<InT, bool, AndReduce>(in, out, axes, true);
+    reduction_op<InT, bool, AndReduce, bool>(in, out, axes, true);
   } else {
-    reduction_op<InT, bool, OrReduce>(in, out, axes, false);
+    reduction_op<InT, bool, OrReduce, bool>(in, out, axes, false);
   }
 }
 
@@ -417,16 +463,19 @@ void reduce_dispatch_sum_prod(
     Reduce::ReduceType rtype,
     const std::vector<int>& axes) {
   if (rtype == Reduce::Sum) {
-    if constexpr (std::is_integral_v<InT> && sizeof(InT) <= 4) {
-      reduction_op<InT, int32_t, SumReduce>(in, out, axes, 0);
+    if constexpr (
+        std::is_same_v<InT, float16_t> || std::is_same_v<InT, bfloat16_t>) {
+      reduction_op<InT, InT, SumReduce, float>(in, out, axes, 0.0f);
+    } else if constexpr (std::is_integral_v<InT> && sizeof(InT) <= 4) {
+      reduction_op<InT, int32_t, SumReduce, int32_t>(in, out, axes, 0);
     } else {
-      reduction_op<InT, InT, SumReduce>(in, out, axes, 0);
+      reduction_op<InT, InT, SumReduce, InT>(in, out, axes, 0);
     }
   } else {
     if constexpr (std::is_integral_v<InT> && sizeof(InT) <= 4) {
-      reduction_op<InT, int32_t, ProdReduce>(in, out, axes, 1);
+      reduction_op<InT, int32_t, ProdReduce, int32_t>(in, out, axes, 1);
     } else {
-      reduction_op<InT, InT, ProdReduce>(in, out, axes, 1);
+      reduction_op<InT, InT, ProdReduce, InT>(in, out, axes, 1);
     }
   }
 }
@@ -439,10 +488,10 @@ void reduce_dispatch_min_max(
     const std::vector<int>& axes) {
   if (rtype == Reduce::Max) {
     auto init = Limits<InT>::min;
-    reduction_op<InT, InT, MaxReduce>(in, out, axes, init);
+    reduction_op<InT, InT, MaxReduce, InT>(in, out, axes, init);
   } else {
     auto init = Limits<InT>::max;
-    reduction_op<InT, InT, MinReduce>(in, out, axes, init);
+    reduction_op<InT, InT, MinReduce, InT>(in, out, axes, init);
   }
 }
 
diff --git a/tests/ops_tests.cpp b/tests/ops_tests.cpp
@@ -1029,6 +1029,37 @@ TEST_CASE("test reduction ops") {
     x = array({1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f}, {2, 3});
     CHECK(array_equal(sum(x, 0), full({3}, 3.0f)).item<bool>());
     CHECK(array_equal(sum(x, 1), array({3.0f, 6.0f}, {2})).item<bool>());
+
+    auto check_half_sum = [](Dtype dtype, Shape shape, std::vector<int> axes) {
+      int size = 1;  // becomes the product of every dim in shape
+      for (auto dim : shape) {
+        size *= dim;
+      }
+      auto x = astype(  // arange(size) / 1000, reshaped to shape, cast to dtype
+          reshape(
+              divide(arange(size * 1.0, float32, Device::cpu), array(1000.0f)),
+              shape,
+              Device::cpu),
+          dtype,
+          Device::cpu);
+      auto out = sum(x, axes, false, Device::cpu);  // sum in the dtype under test
+      auto expected =  // same x, but cast to float32 before the sum
+          sum(astype(x, float32, Device::cpu), axes, false, Device::cpu);
+      auto diff =  // largest absolute gap between out and expected
+          max(abs(subtract(
+                  astype(out, float32, Device::cpu), expected, Device::cpu)),
+              Device::cpu)
+              .item<float>();
+      auto tolerance = dtype == float16 ? 0.5f : 2.0f;  // wider when not float16
+      CHECK_EQ(out.dtype(), dtype);  // result must keep the input dtype
+      CHECK(diff <= tolerance);
+    };
+    check_half_sum(float16, {1000}, {0});
+    check_half_sum(bfloat16, {1000}, {0});
+    check_half_sum(float16, {100, 10}, {0});
+    check_half_sum(bfloat16, {100, 10}, {0});
+    check_half_sum(float16, {100, 10}, {1});
+    check_half_sum(bfloat16, {100, 10}, {1});
   }
 
   // Test unsigned sum