|
22 | 22 | SIMD acceleration for Eidos math operations, independent of OpenMP. |
23 | 23 |
|
24 | 24 | This header provides vectorized implementations of common math operations |
25 | | - using SSE4.2 or AVX2 intrinsics when available, with scalar fallbacks. |
| 25 | + using platform-specific SIMD intrinsics when available: |
| 26 | + - x86_64: SSE4.2 or AVX2 via <immintrin.h> |
| 27 | + - ARM64: NEON via <arm_neon.h> |
| 28 | + Falls back to scalar code when no SIMD is available. |
26 | 29 |
|
27 | 30 | */ |
28 | 31 |
|
|
42 | 45 | #include <smmintrin.h> |
43 | 46 | #define EIDOS_SIMD_WIDTH 2 // 2 doubles per SSE register |
44 | 47 | #define EIDOS_SIMD_FLOAT_WIDTH 4 // 4 floats per SSE register |
| 48 | +#elif defined(EIDOS_HAS_NEON) |
| 49 | + #include <arm_neon.h> |
| 50 | + #define EIDOS_SIMD_WIDTH 2 // 2 doubles per NEON register |
| 51 | + #define EIDOS_SIMD_FLOAT_WIDTH 4 // 4 floats per NEON register |
45 | 52 | #else |
46 | 53 | #define EIDOS_SIMD_WIDTH 1 // Scalar fallback |
47 | 54 | #define EIDOS_SIMD_FLOAT_WIDTH 1 |
@@ -78,6 +85,14 @@ inline void sqrt_float64(const double *input, double *output, int64_t count) |
78 | 85 | __m128d r = _mm_sqrt_pd(v); |
79 | 86 | _mm_storeu_pd(&output[i], r); |
80 | 87 | } |
| 88 | +#elif defined(EIDOS_HAS_NEON) |
| 89 | + // Process 2 doubles at a time |
| 90 | + for (; i + 2 <= count; i += 2) |
| 91 | + { |
| 92 | + float64x2_t v = vld1q_f64(&input[i]); |
| 93 | + float64x2_t r = vsqrtq_f64(v); |
| 94 | + vst1q_f64(&output[i], r); |
| 95 | + } |
81 | 96 | #endif |
82 | 97 |
|
83 | 98 | // Scalar remainder |
@@ -109,6 +124,13 @@ inline void abs_float64(const double *input, double *output, int64_t count) |
109 | 124 | __m128d r = _mm_andnot_pd(sign_mask, v); |
110 | 125 | _mm_storeu_pd(&output[i], r); |
111 | 126 | } |
| 127 | +#elif defined(EIDOS_HAS_NEON) |
| 128 | + for (; i + 2 <= count; i += 2) |
| 129 | + { |
| 130 | + float64x2_t v = vld1q_f64(&input[i]); |
| 131 | + float64x2_t r = vabsq_f64(v); |
| 132 | + vst1q_f64(&output[i], r); |
| 133 | + } |
112 | 134 | #endif |
113 | 135 |
|
114 | 136 | for (; i < count; i++) |
@@ -136,6 +158,13 @@ inline void floor_float64(const double *input, double *output, int64_t count) |
136 | 158 | __m128d r = _mm_floor_pd(v); |
137 | 159 | _mm_storeu_pd(&output[i], r); |
138 | 160 | } |
| 161 | +#elif defined(EIDOS_HAS_NEON) |
| 162 | + for (; i + 2 <= count; i += 2) |
| 163 | + { |
| 164 | + float64x2_t v = vld1q_f64(&input[i]); |
| 165 | + float64x2_t r = vrndmq_f64(v); // Round toward minus infinity (floor) |
| 166 | + vst1q_f64(&output[i], r); |
| 167 | + } |
139 | 168 | #endif |
140 | 169 |
|
141 | 170 | for (; i < count; i++) |
@@ -163,6 +192,13 @@ inline void ceil_float64(const double *input, double *output, int64_t count) |
163 | 192 | __m128d r = _mm_ceil_pd(v); |
164 | 193 | _mm_storeu_pd(&output[i], r); |
165 | 194 | } |
| 195 | +#elif defined(EIDOS_HAS_NEON) |
| 196 | + for (; i + 2 <= count; i += 2) |
| 197 | + { |
| 198 | + float64x2_t v = vld1q_f64(&input[i]); |
| 199 | + float64x2_t r = vrndpq_f64(v); // Round toward plus infinity (ceil) |
| 200 | + vst1q_f64(&output[i], r); |
| 201 | + } |
166 | 202 | #endif |
167 | 203 |
|
168 | 204 | for (; i < count; i++) |
@@ -190,6 +226,13 @@ inline void trunc_float64(const double *input, double *output, int64_t count) |
190 | 226 | __m128d r = _mm_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); |
191 | 227 | _mm_storeu_pd(&output[i], r); |
192 | 228 | } |
| 229 | +#elif defined(EIDOS_HAS_NEON) |
| 230 | + for (; i + 2 <= count; i += 2) |
| 231 | + { |
| 232 | + float64x2_t v = vld1q_f64(&input[i]); |
| 233 | + float64x2_t r = vrndq_f64(v); // Round toward zero (truncate) |
| 234 | + vst1q_f64(&output[i], r); |
| 235 | + } |
193 | 236 | #endif |
194 | 237 |
|
195 | 238 | for (; i < count; i++) |
@@ -217,6 +260,13 @@ inline void round_float64(const double *input, double *output, int64_t count) |
217 | 260 | __m128d r = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); |
218 | 261 | _mm_storeu_pd(&output[i], r); |
219 | 262 | } |
| 263 | +#elif defined(EIDOS_HAS_NEON) |
| 264 | + for (; i + 2 <= count; i += 2) |
| 265 | + { |
| 266 | + float64x2_t v = vld1q_f64(&input[i]); |
| 267 | + float64x2_t r = vrndaq_f64(v); // Round to nearest, ties away from zero |
| 268 | + vst1q_f64(&output[i], r); |
| 269 | + } |
220 | 270 | #endif |
221 | 271 |
|
222 | 272 | for (; i < count; i++) |
@@ -298,6 +348,15 @@ inline double sum_float64(const double *input, int64_t count) |
298 | 348 | __m128d shuf = _mm_shuffle_pd(vsum, vsum, 1); |
299 | 349 | vsum = _mm_add_sd(vsum, shuf); |
300 | 350 | sum = _mm_cvtsd_f64(vsum); |
| 351 | +#elif defined(EIDOS_HAS_NEON) |
| 352 | + float64x2_t vsum = vdupq_n_f64(0.0); |
| 353 | + for (; i + 2 <= count; i += 2) |
| 354 | + { |
| 355 | + float64x2_t v = vld1q_f64(&input[i]); |
| 356 | + vsum = vaddq_f64(vsum, v); |
| 357 | + } |
| 358 | + // Horizontal sum of 2 doubles |
| 359 | + sum = vaddvq_f64(vsum); |
301 | 360 | #endif |
302 | 361 |
|
303 | 362 | // Scalar remainder |
@@ -339,6 +398,15 @@ inline double product_float64(const double *input, int64_t count) |
339 | 398 | __m128d shuf = _mm_shuffle_pd(vprod, vprod, 1); |
340 | 399 | vprod = _mm_mul_sd(vprod, shuf); |
341 | 400 | prod = _mm_cvtsd_f64(vprod); |
| 401 | +#elif defined(EIDOS_HAS_NEON) |
| 402 | + float64x2_t vprod = vdupq_n_f64(1.0); |
| 403 | + for (; i + 2 <= count; i += 2) |
| 404 | + { |
| 405 | + float64x2_t v = vld1q_f64(&input[i]); |
| 406 | + vprod = vmulq_f64(vprod, v); |
| 407 | + } |
| 408 | + // Horizontal product of 2 doubles |
| 409 | + prod = vgetq_lane_f64(vprod, 0) * vgetq_lane_f64(vprod, 1); |
342 | 410 | #endif |
343 | 411 |
|
344 | 412 | for (; i < count; i++) |
|
0 commit comments