added credit; added readme to bench; tried to clean whitespace

andrewkern · andrewkern · commit 0952b2bdbd7e · 2025-12-10T17:57:47.000-08:00
diff --git a/eidos/eidos_functions_math.cpp b/eidos/eidos_functions_math.cpp
@@ -88,7 +88,7 @@ EidosValue_SP Eidos_ExecuteFunction_abs(const std::vector<EidosValue_SP> &p_argu
 		EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 		double *float_result_data = float_result->data_mutable();
 		result_SP = EidosValue_SP(float_result);
-
+		
 #ifdef _OPENMP
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_ABS_FLOAT);
 		#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ABS_FLOAT) num_threads(thread_count)
@@ -98,9 +98,9 @@ EidosValue_SP Eidos_ExecuteFunction_abs(const std::vector<EidosValue_SP> &p_argu
 		Eidos_SIMD::abs_float64(float_data, float_result_data, x_count);
 #endif
 	}
-
+	
 	result_SP->CopyDimensionsFromValue(x_value);
-
+	
 	return result_SP;
 }
 
@@ -195,14 +195,14 @@ EidosValue_SP Eidos_ExecuteFunction_atan2(const std::vector<EidosValue_SP> &p_ar
 EidosValue_SP Eidos_ExecuteFunction_ceil(const std::vector<EidosValue_SP> &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter)
 {
 	EidosValue_SP result_SP(nullptr);
-
+	
 	EidosValue *x_value = p_arguments[0].get();
 	int x_count = x_value->Count();
 	const double *float_data = x_value->FloatData();
 	EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 	double *float_result_data = float_result->data_mutable();
 	result_SP = EidosValue_SP(float_result);
-
+	
 #ifdef _OPENMP
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_CEIL);
 	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_CEIL) num_threads(thread_count)
@@ -211,9 +211,9 @@ EidosValue_SP Eidos_ExecuteFunction_ceil(const std::vector<EidosValue_SP> &p_arg
 #else
 	Eidos_SIMD::ceil_float64(float_data, float_result_data, x_count);
 #endif
-
+	
 	result_SP->CopyDimensionsFromValue(x_value);
-
+	
 	return result_SP;
 }
 
@@ -368,14 +368,14 @@ EidosValue_SP Eidos_ExecuteFunction_exp(const std::vector<EidosValue_SP> &p_argu
 EidosValue_SP Eidos_ExecuteFunction_floor(const std::vector<EidosValue_SP> &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter)
 {
 	EidosValue_SP result_SP(nullptr);
-
+	
 	EidosValue *x_value = p_arguments[0].get();
 	int x_count = x_value->Count();
 	const double *float_data = x_value->FloatData();
 	EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 	double *float_result_data = float_result->data_mutable();
 	result_SP = EidosValue_SP(float_result);
-
+	
 #ifdef _OPENMP
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_FLOOR);
 	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_FLOOR) num_threads(thread_count)
@@ -384,9 +384,9 @@ EidosValue_SP Eidos_ExecuteFunction_floor(const std::vector<EidosValue_SP> &p_ar
 #else
 	Eidos_SIMD::floor_float64(float_data, float_result_data, x_count);
 #endif
-
+	
 	result_SP->CopyDimensionsFromValue(x_value);
-
+	
 	return result_SP;
 }
 
@@ -802,25 +802,25 @@ EidosValue_SP Eidos_ExecuteFunction_product(const std::vector<EidosValue_SP> &p_
 	{
 		const double *float_data = x_value->FloatData();
 		double product = Eidos_SIMD::product_float64(float_data, x_count);
-
+		
 		result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Float(product));
 	}
-
+	
 	return result_SP;
 }
 
 //	(float)round(float x)
 EidosValue_SP Eidos_ExecuteFunction_round(const std::vector<EidosValue_SP> &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter)
 {
 	EidosValue_SP result_SP(nullptr);
-
+	
 	EidosValue *x_value = p_arguments[0].get();
 	int x_count = x_value->Count();
 	const double *float_data = x_value->FloatData();
 	EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 	double *float_result_data = float_result->data_mutable();
 	result_SP = EidosValue_SP(float_result);
-
+	
 #ifdef _OPENMP
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_ROUND);
 	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ROUND) num_threads(thread_count)
@@ -829,9 +829,9 @@ EidosValue_SP Eidos_ExecuteFunction_round(const std::vector<EidosValue_SP> &p_ar
 #else
 	Eidos_SIMD::round_float64(float_data, float_result_data, x_count);
 #endif
-
+	
 	result_SP->CopyDimensionsFromValue(x_value);
-
+	
 	return result_SP;
 }
 
@@ -2440,7 +2440,7 @@ EidosValue_SP Eidos_ExecuteFunction_sqrt(const std::vector<EidosValue_SP> &p_arg
 		EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 		double *float_result_data = float_result->data_mutable();
 		result_SP = EidosValue_SP(float_result);
-
+		
 #ifdef _OPENMP
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_SQRT_FLOAT);
 		#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_SQRT_FLOAT) num_threads(thread_count)
@@ -2450,9 +2450,9 @@ EidosValue_SP Eidos_ExecuteFunction_sqrt(const std::vector<EidosValue_SP> &p_arg
 		Eidos_SIMD::sqrt_float64(float_data, float_result_data, x_count);
 #endif
 	}
-
+	
 	result_SP->CopyDimensionsFromValue(x_value);
-
+	
 	return result_SP;
 }
 
@@ -2514,12 +2514,12 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector<EidosValue_SP> &p_argu
 			// case across multiple threads seems excessively complex; instead we look for an overflow afterwards
 			const int64_t *int_data = x_value->IntData();
 			double sum_d = 0;
-
+			
 			EIDOS_THREAD_COUNT(gEidos_OMP_threads_SUM_INTEGER);
 #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(int_data) reduction(+: sum_d) if(parallel:x_count >= EIDOS_OMPMIN_SUM_INTEGER) num_threads(thread_count)
 			for (int value_index = 0; value_index < x_count; ++value_index)
 				sum_d += int_data[value_index];
-
+				
 			// 2^53 is the largest integer such that it and all smaller integers can be represented in double losslessly
 			int64_t sum = (int64_t)sum_d;
 			bool fits_in_integer = (((double)sum == sum_d) && (sum < 9007199254740992L) && (sum > -9007199254740992L));
@@ -2609,14 +2609,14 @@ EidosValue_SP Eidos_ExecuteFunction_tan(const std::vector<EidosValue_SP> &p_argu
 EidosValue_SP Eidos_ExecuteFunction_trunc(const std::vector<EidosValue_SP> &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter)
 {
 	EidosValue_SP result_SP(nullptr);
-
+	
 	EidosValue *x_value = p_arguments[0].get();
 	int x_count = x_value->Count();
 	const double *float_data = x_value->FloatData();
 	EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 	double *float_result_data = float_result->data_mutable();
 	result_SP = EidosValue_SP(float_result);
-
+	
 #ifdef _OPENMP
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_TRUNC);
 	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_TRUNC) num_threads(thread_count)
@@ -2625,9 +2625,9 @@ EidosValue_SP Eidos_ExecuteFunction_trunc(const std::vector<EidosValue_SP> &p_ar
 #else
 	Eidos_SIMD::trunc_float64(float_data, float_result_data, x_count);
 #endif
-
+	
 	result_SP->CopyDimensionsFromValue(x_value);
-
+	
 	return result_SP;
 }
 
diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h
@@ -2,7 +2,7 @@
 //  eidos_simd.h
 //  Eidos
 //
-//  Created by Ben Haller on 11/26/2024.
+//  Created by Andrew Kern on 11/26/2025.
 //  Copyright (c) 2024-2025 Philipp Messer.  All rights reserved.
 //	A product of the Messer Lab, http://messerlab.org/slim/
 //
diff --git a/eidos/eidos_test_functions_other.cpp b/eidos/eidos_test_functions_other.cpp
@@ -378,7 +378,7 @@ void _RunFunctionMatrixArrayTests(void)
 	EidosAssertScriptSuccess_L("x = (rbinom(100, 1, 0.4) == 1); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true);
 	EidosAssertScriptSuccess_L("x = rdunif(100, -1000, 1000); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true);
 	EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); all(abs(rowSums(y) - apply(y, 0, 'sum(applyValue);')) < 1e-10);", true);  // tolerance for SIMD
-
+	
 	// colSums()
 	EidosAssertScriptSuccess_L("x = c(T,T,F,F,T,F,F,T,T,F,F,T); y = matrix(x, nrow=3); identical(colSums(y), c(2, 1, 2, 1));", true);
 	EidosAssertScriptSuccess_L("x = 1:12; y = matrix(x, nrow=3); identical(colSums(y), c(6, 15, 24, 33));", true);
diff --git a/simd_benchmarks/README.md b/simd_benchmarks/README.md
@@ -0,0 +1,87 @@
+# SIMD Benchmarks
+
+This directory contains benchmark scripts used during the development of SIMD optimizations for SLiM. These files are provided for internal development use and are **not used in the build of SLiM**.
+
+## Contents
+
+- **`run_benchmarks.sh`** - Shell script that builds SLiM with and without SIMD, runs both benchmark scripts, and reports speedup comparisons.
+
+- **`simd_benchmark.eidos`** - Eidos script that benchmarks SIMD-optimized math functions (`sqrt`, `abs`, `floor`, `ceil`, `round`, `trunc`, `sum`, `product`) on large arrays.
+
+- **`slim_benchmark.slim`** - SLiM simulation benchmark (N=5000, 1Mb chromosome, 5000 generations with selection) for measuring overall simulation performance.
+
+## Author
+
+These benchmarks were developed by Andrew Kern as part of SIMD optimization work for SLiM.
+
+## Usage
+
+These files are not part of the SLiM build system. To run the benchmarks:
+
+```bash
+cd simd_benchmarks
+./run_benchmarks.sh [num_runs]
+```
+
+This will build both SIMD-enabled and scalar versions of SLiM, run the benchmarks, and report the speedup.
+
+## Results
+
+Benchmark results look like the following (example output):
+
+```
+$ simd_benchmarks/run_benchmarks.sh 
+============================================
+SIMD Benchmark Runner
+============================================
+SLiM root: /home/adkern/SLiM
+Runs per benchmark: 3
+
+Building with SIMD enabled...
+  Done.
+Building with SIMD disabled...
+  Done.
+
+============================================
+Eidos Math Function Benchmarks
+============================================
+
+SIMD Build:
+  Running Eidos benchmark (SIMD)...
+    sqrt():    0.105 sec
+    abs():     0.171 sec
+    floor():   0.164 sec
+    ceil():    0.166 sec
+    round():   0.164 sec
+    trunc():   0.165 sec
+    sum():     0.032 sec
+    product(): 0.003 sec (1000 elements, 10000 iters)
+
+Scalar Build:
+  Running Eidos benchmark (Scalar)...
+    sqrt():    0.108 sec
+    abs():     0.166 sec
+    floor():   0.231 sec
+    ceil():    0.246 sec
+    round():   0.473 sec
+    trunc():   0.246 sec
+    sum():     0.166 sec
+    product(): 0.017 sec (1000 elements, 10000 iters)
+
+============================================
+SLiM Simulation Benchmark
+(N=5000, 5000 generations, selection)
+============================================
+
+Running 3 iterations each...
+
+SIMD Build:   12.756s (avg)
+Scalar Build: 12.316s (avg)
+
+Speedup: .96x
+
+============================================
+Benchmark complete
+============================================
+```
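+The takeaway is that SIMD provided significant speedups for several Eidos math functions (`floor()`, `ceil()`, `round()`, `trunc()`, `sum()`, and `product()`), while `sqrt()` and `abs()` were essentially unchanged, and overall SLiM simulation performance showed no improvement (0.96x) in this specific benchmark scenario.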