Skip to content

Commit 6ed869e

Browse files
authored
Merge pull request #578 from andrewkern/simd
eidos function SIMD optimizations
2 parents e793829 + a9c364f commit 6ed869e

9 files changed

Lines changed: 821 additions & 19 deletions

CMakeLists.txt

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,60 @@ if(BUILD_LTO)
286286
endif()
287287
endif()
288288

289+
#
290+
# SIMD SUPPORT (independent of OpenMP)
291+
#
292+
293+
# Option to disable SIMD entirely
294+
option(USE_SIMD "Enable SIMD optimizations (SSE4.2/AVX2 on x86_64, NEON on ARM64)" ON)
295+
296+
# Check architecture
297+
# CMAKE_SYSTEM_PROCESSOR is "x86_64" on Intel Macs and Linux x86_64, "arm64"/"aarch64" on ARM
298+
set(IS_X86_64 FALSE)
299+
set(IS_ARM64 FALSE)
300+
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i686|i386")
301+
set(IS_X86_64 TRUE)
302+
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|ARM64")
303+
set(IS_ARM64 TRUE)
304+
endif()
305+
306+
if(USE_SIMD AND NOT WIN32 AND IS_X86_64)
307+
include(CheckCXXCompilerFlag)
308+
309+
# Check for AVX2 support
310+
check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)
311+
check_cxx_compiler_flag("-msse4.2" COMPILER_SUPPORTS_SSE42)
312+
check_cxx_compiler_flag("-mfma" COMPILER_SUPPORTS_FMA)
313+
314+
if(COMPILER_SUPPORTS_AVX2)
315+
message(STATUS "SIMD: AVX2 support detected")
316+
add_compile_definitions(EIDOS_HAS_AVX2=1)
317+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
318+
if(COMPILER_SUPPORTS_FMA)
319+
message(STATUS "SIMD: FMA support detected")
320+
add_compile_definitions(EIDOS_HAS_FMA=1)
321+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
322+
endif()
323+
elseif(COMPILER_SUPPORTS_SSE42)
324+
message(STATUS "SIMD: SSE4.2 support detected (no AVX2)")
325+
add_compile_definitions(EIDOS_HAS_SSE42=1)
326+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
327+
else()
328+
message(STATUS "SIMD: No x86 SIMD support detected, using scalar fallback")
329+
endif()
330+
elseif(USE_SIMD AND NOT WIN32 AND IS_ARM64)
331+
# ARM64 NEON is always available on ARM64, no compiler flag needed
332+
message(STATUS "SIMD: ARM64 NEON support enabled")
333+
add_compile_definitions(EIDOS_HAS_NEON=1)
334+
elseif(USE_SIMD AND NOT WIN32)
335+
message(STATUS "SIMD: Unknown architecture (${CMAKE_SYSTEM_PROCESSOR}), using scalar fallback")
336+
elseif(USE_SIMD AND WIN32)
337+
# Windows/MSVC detection not yet implemented
338+
message(STATUS "SIMD: Windows SIMD detection not yet implemented, using scalar fallback")
339+
else()
340+
message(STATUS "SIMD: Disabled by user")
341+
endif()
342+
289343
# GSL - adding /usr/local/include so all targets that use GSL_INCLUDES get omp.h
290344
set(TARGET_NAME_GSL gsl)
291345
file(GLOB_RECURSE GSL_SOURCES ${PROJECT_SOURCE_DIR}/gsl/*.c ${PROJECT_SOURCE_DIR}/gsl/*/*.c)

eidos/eidos_functions_math.cpp

Lines changed: 48 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020

2121
#include "eidos_functions.h"
22+
#include "eidos_simd.h"
2223

2324
#include <utility>
2425
#include <string>
@@ -88,10 +89,15 @@ EidosValue_SP Eidos_ExecuteFunction_abs(const std::vector<EidosValue_SP> &p_argu
8889
double *float_result_data = float_result->data_mutable();
8990
result_SP = EidosValue_SP(float_result);
9091

92+
#ifdef _OPENMP
93+
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
9194
EIDOS_THREAD_COUNT(gEidos_OMP_threads_ABS_FLOAT);
92-
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ABS_FLOAT) num_threads(thread_count)
95+
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ABS_FLOAT) num_threads(thread_count)
9396
for (int value_index = 0; value_index < x_count; ++value_index)
9497
float_result_data[value_index] = fabs(float_data[value_index]);
98+
#else
99+
Eidos_SIMD::abs_float64(float_data, float_result_data, x_count);
100+
#endif
95101
}
96102

97103
result_SP->CopyDimensionsFromValue(x_value);
@@ -198,10 +204,15 @@ EidosValue_SP Eidos_ExecuteFunction_ceil(const std::vector<EidosValue_SP> &p_arg
198204
double *float_result_data = float_result->data_mutable();
199205
result_SP = EidosValue_SP(float_result);
200206

207+
#ifdef _OPENMP
208+
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
201209
EIDOS_THREAD_COUNT(gEidos_OMP_threads_CEIL);
202-
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_CEIL) num_threads(thread_count)
210+
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_CEIL) num_threads(thread_count)
203211
for (int value_index = 0; value_index < x_count; ++value_index)
204212
float_result_data[value_index] = ceil(float_data[value_index]);
213+
#else
214+
Eidos_SIMD::ceil_float64(float_data, float_result_data, x_count);
215+
#endif
205216

206217
result_SP->CopyDimensionsFromValue(x_value);
207218

@@ -344,6 +355,7 @@ EidosValue_SP Eidos_ExecuteFunction_exp(const std::vector<EidosValue_SP> &p_argu
344355
double *float_result_data = float_result->data_mutable();
345356
result_SP = EidosValue_SP(float_result);
346357

358+
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
347359
EIDOS_THREAD_COUNT(gEidos_OMP_threads_EXP_FLOAT);
348360
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_EXP_FLOAT) num_threads(thread_count)
349361
for (int value_index = 0; value_index < x_count; ++value_index)
@@ -367,10 +379,15 @@ EidosValue_SP Eidos_ExecuteFunction_floor(const std::vector<EidosValue_SP> &p_ar
367379
double *float_result_data = float_result->data_mutable();
368380
result_SP = EidosValue_SP(float_result);
369381

382+
#ifdef _OPENMP
383+
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
370384
EIDOS_THREAD_COUNT(gEidos_OMP_threads_FLOOR);
371-
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_FLOOR) num_threads(thread_count)
385+
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_FLOOR) num_threads(thread_count)
372386
for (int value_index = 0; value_index < x_count; ++value_index)
373387
float_result_data[value_index] = floor(float_data[value_index]);
388+
#else
389+
Eidos_SIMD::floor_float64(float_data, float_result_data, x_count);
390+
#endif
374391

375392
result_SP->CopyDimensionsFromValue(x_value);
376393

@@ -661,6 +678,7 @@ EidosValue_SP Eidos_ExecuteFunction_log(const std::vector<EidosValue_SP> &p_argu
661678
double *float_result_data = float_result->data_mutable();
662679
result_SP = EidosValue_SP(float_result);
663680

681+
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
664682
EIDOS_THREAD_COUNT(gEidos_OMP_threads_LOG_FLOAT);
665683
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_LOG_FLOAT) num_threads(thread_count)
666684
for (int value_index = 0; value_index < x_count; ++value_index)
@@ -696,6 +714,7 @@ EidosValue_SP Eidos_ExecuteFunction_log10(const std::vector<EidosValue_SP> &p_ar
696714
double *float_result_data = float_result->data_mutable();
697715
result_SP = EidosValue_SP(float_result);
698716

717+
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
699718
EIDOS_THREAD_COUNT(gEidos_OMP_threads_LOG10_FLOAT);
700719
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_LOG10_FLOAT) num_threads(thread_count)
701720
for (int value_index = 0; value_index < x_count; ++value_index)
@@ -731,6 +750,7 @@ EidosValue_SP Eidos_ExecuteFunction_log2(const std::vector<EidosValue_SP> &p_arg
731750
double *float_result_data = float_result->data_mutable();
732751
result_SP = EidosValue_SP(float_result);
733752

753+
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
734754
EIDOS_THREAD_COUNT(gEidos_OMP_threads_LOG2_FLOAT);
735755
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_LOG2_FLOAT) num_threads(thread_count)
736756
for (int value_index = 0; value_index < x_count; ++value_index)
@@ -788,10 +808,7 @@ EidosValue_SP Eidos_ExecuteFunction_product(const std::vector<EidosValue_SP> &p_
788808
else if (x_type == EidosValueType::kValueFloat)
789809
{
790810
const double *float_data = x_value->FloatData();
791-
double product = 1;
792-
793-
for (int value_index = 0; value_index < x_count; ++value_index)
794-
product *= float_data[value_index];
811+
double product = Eidos_SIMD::product_float64(float_data, x_count);
795812

796813
result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Float(product));
797814
}
@@ -811,10 +828,15 @@ EidosValue_SP Eidos_ExecuteFunction_round(const std::vector<EidosValue_SP> &p_ar
811828
double *float_result_data = float_result->data_mutable();
812829
result_SP = EidosValue_SP(float_result);
813830

831+
#ifdef _OPENMP
832+
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
814833
EIDOS_THREAD_COUNT(gEidos_OMP_threads_ROUND);
815-
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ROUND) num_threads(thread_count)
834+
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ROUND) num_threads(thread_count)
816835
for (int value_index = 0; value_index < x_count; ++value_index)
817836
float_result_data[value_index] = round(float_data[value_index]);
837+
#else
838+
Eidos_SIMD::round_float64(float_data, float_result_data, x_count);
839+
#endif
818840

819841
result_SP->CopyDimensionsFromValue(x_value);
820842

@@ -2427,10 +2449,15 @@ EidosValue_SP Eidos_ExecuteFunction_sqrt(const std::vector<EidosValue_SP> &p_arg
24272449
double *float_result_data = float_result->data_mutable();
24282450
result_SP = EidosValue_SP(float_result);
24292451

2452+
#ifdef _OPENMP
2453+
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
24302454
EIDOS_THREAD_COUNT(gEidos_OMP_threads_SQRT_FLOAT);
2431-
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_SQRT_FLOAT) num_threads(thread_count)
2455+
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_SQRT_FLOAT) num_threads(thread_count)
24322456
for (int value_index = 0; value_index < x_count; ++value_index)
24332457
float_result_data[value_index] = sqrt(float_data[value_index]);
2458+
#else
2459+
Eidos_SIMD::sqrt_float64(float_data, float_result_data, x_count);
2460+
#endif
24342461
}
24352462

24362463
result_SP->CopyDimensionsFromValue(x_value);
@@ -2518,10 +2545,15 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector<EidosValue_SP> &p_argu
25182545
const double *float_data = x_value->FloatData();
25192546
double sum = 0;
25202547

2548+
#ifdef _OPENMP
2549+
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
25212550
EIDOS_THREAD_COUNT(gEidos_OMP_threads_SUM_FLOAT);
2522-
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data) reduction(+: sum) if(parallel:x_count >= EIDOS_OMPMIN_SUM_FLOAT) num_threads(thread_count)
2551+
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data) reduction(+: sum) if(parallel:x_count >= EIDOS_OMPMIN_SUM_FLOAT) num_threads(thread_count)
25232552
for (int value_index = 0; value_index < x_count; ++value_index)
25242553
sum += float_data[value_index];
2554+
#else
2555+
sum = Eidos_SIMD::sum_float64(float_data, x_count);
2556+
#endif
25252557

25262558
result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Float(sum));
25272559
}
@@ -2595,10 +2627,15 @@ EidosValue_SP Eidos_ExecuteFunction_trunc(const std::vector<EidosValue_SP> &p_ar
25952627
double *float_result_data = float_result->data_mutable();
25962628
result_SP = EidosValue_SP(float_result);
25972629

2630+
#ifdef _OPENMP
2631+
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
25982632
EIDOS_THREAD_COUNT(gEidos_OMP_threads_TRUNC);
2599-
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_TRUNC) num_threads(thread_count)
2633+
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_TRUNC) num_threads(thread_count)
26002634
for (int value_index = 0; value_index < x_count; ++value_index)
26012635
float_result_data[value_index] = trunc(float_data[value_index]);
2636+
#else
2637+
Eidos_SIMD::trunc_float64(float_data, float_result_data, x_count);
2638+
#endif
26022639

26032640
result_SP->CopyDimensionsFromValue(x_value);
26042641

0 commit comments

Comments
 (0)