feat(burn): fused SIMD sigmoid via hpc::activations::sigmoid_f32

claude · claude · commit 8d3f6bc2759e · 2026-03-29T08:28:37.000Z
Override ActivationOps::sigmoid with a fused F32x16 SIMD path. Default burn sigmoid: 6 separate ops (neg, exp, add, log, neg, exp). Our sigmoid: one fused pass computing 1/(1+exp(-x)) via an F32x16 polynomial. For contiguous f32: use hpc::activations::sigmoid_f32 (F32x16 SIMD). For non-f32 or non-contiguous: decomposed via Backend float ops. The fused path eliminates 5 intermediate tensor allocations and does the full sigmoid in a single pass over the data. 30 tests passing. Zero regressions. https://claude.ai/code/session_01Y69Vnw751w75iVSBRws7o7
diff --git a/crates/burn/src/ops/activation.rs b/crates/burn/src/ops/activation.rs
@@ -1,5 +1,5 @@
 use crate::{
-    NdArray, NdArrayTensor, SharedArray,
+    NdArray, NdArrayStorage, NdArrayTensor, SharedArray,
     element::{FloatNdArrayElement, IntNdArrayElement, QuantElement},
     execute_with_numeric_dtype,
     ops::NdArrayMathOps,
@@ -15,4 +15,31 @@ where
     fn relu(tensor: FloatTensor<Self>) -> FloatTensor<Self> {
         execute_with_numeric_dtype!(tensor, |array| NdArrayMathOps::clamp_min(array, 0.elem()))
     }
+
+    /// Element-wise sigmoid `1 / (1 + exp(-x))`, overriding the default 6-op decomposition
+    /// (neg, exp, add, log, neg, exp).
+    ///
+    /// With feature `simd`, a standard-layout f32 tensor is computed in one pass by
+    /// `ndarray::hpc::activations::sigmoid_f32` (fused F32x16 SIMD); else via Backend float ops.
+    fn sigmoid(tensor: FloatTensor<Self>) -> FloatTensor<Self> {
+        #[cfg(feature = "simd")]
+        if let NdArrayTensor::F32(ref storage) = tensor {
+            let view = storage.view();
+            // `as_slice` is `Some` only for standard-layout data, so no extra layout check.
+            if let Some(input) = view.as_slice() {
+                let mut output = alloc::vec![0.0f32; input.len()];
+                ndarray::hpc::activations::sigmoid_f32(input, &mut output);
+                // Rebuild with the input's dims; lengths match, so `expect` only guards a bug.
+                let array = ndarray::Array::from_shape_vec(ndarray::IxDyn(view.shape()), output)
+                    .expect("sigmoid output shape mismatch");
+                return NdArrayTensor::F32(NdArrayStorage::Owned(array.into_shared()));
+            }
+        }
+        // Fallback for non-f32 or non-contiguous tensors, or when `simd` is disabled.
+        use burn_backend::ops::FloatTensorOps;
+        let tensor_neg = Self::float_neg(tensor);
+        let tensor_exp = Self::float_exp(tensor_neg);
+        let tensor_add = Self::float_add_scalar(tensor_exp, 1.0.into());
+        Self::float_recip(tensor_add)
+    }
 }