NEON SIMD for ARM

andrewkern · andrewkern · commit 650a38140af6 · 2025-12-10T18:53:02.000-08:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -291,9 +291,19 @@ endif()
 #
 
 # Option to disable SIMD entirely
-option(USE_SIMD "Enable SIMD optimizations (SSE4.2/AVX2)" ON)
+option(USE_SIMD "Enable SIMD optimizations (SSE4.2/AVX2 on x86_64, NEON on ARM64)" ON)
+
+# Check architecture
+# CMAKE_SYSTEM_PROCESSOR is "x86_64" on Intel Macs and x86_64 Linux ("AMD64" on Windows), "arm64" on Apple Silicon Macs, and "aarch64" on ARM64 Linux
+set(IS_X86_64 FALSE)
+set(IS_ARM64 FALSE)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i686|i386")
+    set(IS_X86_64 TRUE)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|ARM64")
+    set(IS_ARM64 TRUE)
+endif()
 
-if(USE_SIMD AND NOT WIN32)
+if(USE_SIMD AND NOT WIN32 AND IS_X86_64)
     include(CheckCXXCompilerFlag)
 
     # Check for AVX2 support
@@ -315,8 +325,14 @@ if(USE_SIMD AND NOT WIN32)
         add_compile_definitions(EIDOS_HAS_SSE42=1)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
     else()
-        message(STATUS "SIMD: No SIMD support detected, using scalar fallback")
+        message(STATUS "SIMD: No x86 SIMD support detected, using scalar fallback")
     endif()
+elseif(USE_SIMD AND NOT WIN32 AND IS_ARM64)
+    # NEON (Advanced SIMD) is part of the baseline AArch64 target, so no compiler flag is needed
+    message(STATUS "SIMD: ARM64 NEON support enabled")
+    add_compile_definitions(EIDOS_HAS_NEON=1)
+elseif(USE_SIMD AND NOT WIN32)
+    message(STATUS "SIMD: Unknown architecture (${CMAKE_SYSTEM_PROCESSOR}), using scalar fallback")
 elseif(USE_SIMD AND WIN32)
     # Windows/MSVC detection not yet implemented
     message(STATUS "SIMD: Windows SIMD detection not yet implemented, using scalar fallback")
diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h
@@ -22,7 +22,10 @@
  SIMD acceleration for Eidos math operations, independent of OpenMP.
 
  This header provides vectorized implementations of common math operations
- using SSE4.2 or AVX2 intrinsics when available, with scalar fallbacks.
+ using platform-specific SIMD intrinsics when available:
+   - x86_64: SSE4.2 or AVX2 via <immintrin.h>
+   - ARM64: NEON via <arm_neon.h>
+ Falls back to scalar code when no SIMD is available.
 
  */
 
@@ -42,6 +45,10 @@
     #include <smmintrin.h>
     #define EIDOS_SIMD_WIDTH 2          // 2 doubles per SSE register
     #define EIDOS_SIMD_FLOAT_WIDTH 4    // 4 floats per SSE register
+#elif defined(EIDOS_HAS_NEON)
+    #include <arm_neon.h>
+    #define EIDOS_SIMD_WIDTH 2          // 2 doubles per NEON register
+    #define EIDOS_SIMD_FLOAT_WIDTH 4    // 4 floats per NEON register
 #else
     #define EIDOS_SIMD_WIDTH 1          // Scalar fallback
     #define EIDOS_SIMD_FLOAT_WIDTH 1
@@ -78,6 +85,14 @@ inline void sqrt_float64(const double *input, double *output, int64_t count)
         __m128d r = _mm_sqrt_pd(v);
         _mm_storeu_pd(&output[i], r);
     }
+#elif defined(EIDOS_HAS_NEON)
+    // Process 2 doubles at a time
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vsqrtq_f64(v);
+        vst1q_f64(&output[i], r);
+    }
 #endif
 
     // Scalar remainder
@@ -109,6 +124,13 @@ inline void abs_float64(const double *input, double *output, int64_t count)
         __m128d r = _mm_andnot_pd(sign_mask, v);
         _mm_storeu_pd(&output[i], r);
     }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vabsq_f64(v);
+        vst1q_f64(&output[i], r);
+    }
 #endif
 
     for (; i < count; i++)
@@ -136,6 +158,13 @@ inline void floor_float64(const double *input, double *output, int64_t count)
         __m128d r = _mm_floor_pd(v);
         _mm_storeu_pd(&output[i], r);
     }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vrndmq_f64(v);  // Round toward minus infinity (floor)
+        vst1q_f64(&output[i], r);
+    }
 #endif
 
     for (; i < count; i++)
@@ -163,6 +192,13 @@ inline void ceil_float64(const double *input, double *output, int64_t count)
         __m128d r = _mm_ceil_pd(v);
         _mm_storeu_pd(&output[i], r);
     }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vrndpq_f64(v);  // Round toward plus infinity (ceil)
+        vst1q_f64(&output[i], r);
+    }
 #endif
 
     for (; i < count; i++)
@@ -190,6 +226,13 @@ inline void trunc_float64(const double *input, double *output, int64_t count)
         __m128d r = _mm_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
         _mm_storeu_pd(&output[i], r);
     }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vrndq_f64(v);  // Round toward zero (truncate)
+        vst1q_f64(&output[i], r);
+    }
 #endif
 
     for (; i < count; i++)
@@ -217,6 +260,13 @@ inline void round_float64(const double *input, double *output, int64_t count)
         __m128d r = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
         _mm_storeu_pd(&output[i], r);
     }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vrndaq_f64(v);  // Round to nearest, ties away from zero
+        vst1q_f64(&output[i], r);
+    }
 #endif
 
     for (; i < count; i++)
@@ -298,6 +348,15 @@ inline double sum_float64(const double *input, int64_t count)
     __m128d shuf = _mm_shuffle_pd(vsum, vsum, 1);
     vsum = _mm_add_sd(vsum, shuf);
     sum = _mm_cvtsd_f64(vsum);
+#elif defined(EIDOS_HAS_NEON)
+    float64x2_t vsum = vdupq_n_f64(0.0);
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        vsum = vaddq_f64(vsum, v);
+    }
+    // Horizontal sum of 2 doubles
+    sum = vaddvq_f64(vsum);
 #endif
 
     // Scalar remainder
@@ -339,6 +398,15 @@ inline double product_float64(const double *input, int64_t count)
     __m128d shuf = _mm_shuffle_pd(vprod, vprod, 1);
     vprod = _mm_mul_sd(vprod, shuf);
     prod = _mm_cvtsd_f64(vprod);
+#elif defined(EIDOS_HAS_NEON)
+    float64x2_t vprod = vdupq_n_f64(1.0);
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        vprod = vmulq_f64(vprod, v);
+    }
+    // Horizontal product of 2 doubles
+    prod = vgetq_lane_f64(vprod, 0) * vgetq_lane_f64(vprod, 1);
 #endif
 
     for (; i < count; i++)