feat(burn): wire ndarray hpc::vml SIMD into float_exp/log/sqrt/abs

claude · claude · commit 984d50c8e8ba · 2026-03-29T08:16:51.000Z
First augmentation of the burn backend with our crate::simd F32x16 path.
For contiguous f32 tensors, these operations now route through
ndarray::hpc::vml, which uses crate::simd::F32x16 (AVX-512/AVX2 via LazyLock
dispatch). Non-f32 or non-contiguous tensors fall through to the original
scalar mapv_into path.

  float_exp  → ndarray::hpc::vml::vsexp  (F32x16 polynomial approx)
  float_log  → ndarray::hpc::vml::vsln   (F32x16 polynomial approx)
  float_sqrt → ndarray::hpc::vml::vssqrt (F32x16 hardware sqrt)
  float_abs  → ndarray::hpc::vml::vsabs  (F32x16 bitmask)

try_vml_unary() helper:
- Checks tensor is F32 variant + contiguous (standard) layout
- Extracts &[f32] slice (zero-copy read)
- Calls VML function, which fills a preallocated Vec<f32> output
- Wraps into NdArrayTensor::F32(Owned)
- Falls through to scalar on non-f32/non-contiguous

30 tests passing. Zero regressions.

https://claude.ai/code/session_01Y69Vnw751w75iVSBRws7o7
diff --git a/crates/burn/src/ops/tensor.rs b/crates/burn/src/ops/tensor.rs
@@ -32,6 +32,34 @@ use num_traits::Float;
 
 use libm::erf;
 
+/// Applies a unary f32 slice kernel to a whole tensor when its data is one slice.
+///
+/// `vml_fn` is called once as `vml_fn(input, output)` with two equal-length slices;
+/// the call sites pass `ndarray::hpc::vml` functions. Returns `Ok` with a newly
+/// allocated f32 tensor of the input's shape, or `Err` handing the tensor back
+/// (non-f32 variant, or data not in standard layout) so the caller can run its
+/// scalar path instead.
+#[cfg(feature = "simd")]
+fn try_vml_unary(
+    tensor: NdArrayTensor,
+    vml_fn: fn(&[f32], &mut [f32]),
+) -> Result<NdArrayTensor, NdArrayTensor> {
+    let data = match tensor {
+        NdArrayTensor::F32(storage) => storage.into_shared(),
+        other => return Err(other),
+    };
+    // `as_slice` already yields `None` for anything but standard layout, so a
+    // separate `is_standard_layout()` test would be redundant.
+    let Some(input) = data.as_slice() else {
+        return Err(NdArrayTensor::F32(crate::NdArrayStorage::Owned(data)));
+    };
+    let mut output = vec![0.0f32; input.len()];
+    vml_fn(input, &mut output);
+    let array = ndarray::Array::from_shape_vec(data.raw_dim(), output)
+        .expect("vml output shape mismatch");
+    Ok(NdArrayTensor::F32(crate::NdArrayStorage::Owned(array.into_shared())))
+}
+
 #[cfg(feature = "std")]
 #[allow(dead_code)]
 fn round_ties_even_wrapper(x: f64) -> f64 {
@@ -446,12 +474,24 @@ where
     }
 
     fn float_exp(tensor: FloatTensor<Self>) -> FloatTensor<Self> {
+        // With the `simd` feature, try the vectorised `vsexp` kernel first; it takes
+        // standard-layout f32 input and hands anything else back for the scalar path.
+        #[cfg(feature = "simd")]
+        let tensor = match try_vml_unary(tensor, ndarray::hpc::vml::vsexp) {
+            Err(unhandled) => unhandled,
+            Ok(accelerated) => return accelerated,
+        };
         execute_with_float_dtype!(tensor, FloatElem, |array: SharedArray<FloatElem>| {
             array.mapv_into(|a: FloatElem| a.exp_elem()).into_shared()
         })
     }
 
     fn float_log(tensor: FloatTensor<Self>) -> FloatTensor<Self> {
+        #[cfg(feature = "simd")]
+        let tensor = match try_vml_unary(tensor, ndarray::hpc::vml::vsln) {
+            Ok(result) => return result,
+            Err(t) => t,
+        };
         execute_with_float_dtype!(tensor, FloatElem, |array: SharedArray<FloatElem>| {
             array.mapv_into(|a: FloatElem| a.log_elem()).into_shared()
         })
@@ -499,12 +539,22 @@ where
     }
 
     fn float_sqrt(tensor: FloatTensor<Self>) -> FloatTensor<Self> {
+        #[cfg(feature = "simd")]
+        let tensor = match try_vml_unary(tensor, ndarray::hpc::vml::vssqrt) {
+            Err(unhandled) => unhandled,
+            Ok(accelerated) => return accelerated,
+        };
         execute_with_float_dtype!(tensor, FloatElem, |array: SharedArray<FloatElem>| {
             array.mapv_into(|a: FloatElem| a.sqrt_elem()).into_shared()
         })
     }
 
     fn float_abs(tensor: FloatTensor<Self>) -> FloatTensor<Self> {
+        #[cfg(feature = "simd")]
+        let tensor = match try_vml_unary(tensor, ndarray::hpc::vml::vsabs) {
+            Ok(result) => return result,
+            Err(t) => t,
+        };
         execute_with_float_dtype!(tensor, FloatElem, |array: SharedArray<FloatElem>| {
             NdArrayMathOps::abs(array)
         })