Skip to content

Commit b396148

Browse files
authored
Merge pull request #62 from SwayamInSync/simd-patch
FIX: Adding build-patch to disable Scalar FMA on SLEEF
2 parents 08b5708 + 1281abf commit b396148

10 files changed

Lines changed: 230 additions & 8 deletions

File tree

.github/workflows/test_old_cpu.yml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,13 @@ jobs:
5151
env:
5252
LDFLAGS: "-fopenmp"
5353
run: |
54-
pip install . --no-build-isolation -v
54+
# For Sandy Bridge (x86-64-v2), we need to disable FMA code paths
55+
# since FMA instructions are not available on that microarchitecture
56+
if [ "${{ matrix.cpu[0] }}" = "snb" ]; then
57+
pip install . --no-build-isolation -v -Csetup-args=-Ddisable_fma=true
58+
else
59+
pip install . --no-build-isolation -v
60+
fi
5561
5662
- name: Test import on ${{ matrix.cpu[1] }}
5763
run: |
@@ -69,4 +75,4 @@ jobs:
6975
- name: Run tests on ${{ matrix.cpu[1] }}
7076
run: |
7177
pip install pytest mpmath
72-
sde -${{ matrix.cpu[0] }} -- python -m pytest tests/ -v --tb=short
78+
sde -${{ matrix.cpu[0] }} -- python -m pytest tests/ -v --tb=short -v -s

README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ A cross-platform Quad (128-bit) float Data-Type for NumPy.
1919
- [Installation from source](#installation-from-source)
2020
- [Linux/Unix/macOS](#linuxunixmacos)
2121
- [Windows](#windows)
22+
- [Build Options](#build-options)
23+
- [Disabling FMA (Fused Multiply-Add)](#disabling-fma-fused-multiply-add)
2224
- [Building with ThreadSanitizer (TSan)](#building-with-threadsanitizer-tsan)
2325
- [Building the documentation](#building-the-documentation)
2426
- [Serving the documentation](#serving-the-documentation)
@@ -138,6 +140,23 @@ python -m pytest tests
138140

139141
8. **Architecture**: The instructions are for x64. For x86 builds, change `-A x64` to `-A Win32`.
140142

143+
## Build Options
144+
145+
### Disabling FMA (Fused Multiply-Add)
146+
147+
On older x86-64 CPUs without FMA support (e.g., Sandy Bridge / x86_64-v2), the SLEEF's `PURECFMA` scalar code path will cause illegal instruction errors. By default, FMA support is auto-detected at build time, but you can explicitly disable it:
148+
149+
```bash
150+
pip install . -Csetup-args=-Ddisable_fma=true
151+
```
152+
153+
This is a workaround for a [SLEEF issue](https://github.com/shibatch/sleef/issues/707) where `PURECFMA` scalar functions are unconditionally compiled with FMA instructions even on systems that don't support them.
154+
155+
**When to use this option:**
156+
- Building on or for x86_64-v2 (Sandy Bridge era) CPUs
157+
- Cross-compiling for older x86_64 targets
158+
- Running in emulators/VMs that don't expose FMA capability
159+
141160
## Building with ThreadSanitizer (TSan)
142161

143162
This is a development feature to help detect threading issues. To build `numpy-quaddtype` with TSan enabled, follow these steps:

meson.build

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,13 @@ qblas_dep = dependency('qblas', fallback: ['qblas', 'qblas_dep'])
1818
# Try to find SLEEF system-wide first, fall back to subproject if not found
1919
# Required SLEEF version (must match sleef.wrap revision)
2020
required_sleef_version = '3.9.0'
21-
sleef_dep = dependency('sleef', version: '>=' + required_sleef_version, fallback: ['sleef', 'sleef_dep'], required: false)
21+
# Don't use fallback here - we need to call subproject() explicitly later with disable_fma option
22+
sleef_dep = dependency('sleef', version: '>=' + required_sleef_version, required: false)
2223

2324
use_system_sleef = false
2425
fallback_reason = ''
2526

26-
if sleef_dep.found() and sleef_dep.type_name() != 'internal' and sleef_dep.version().startswith(required_sleef_version)
27+
if sleef_dep.found() and sleef_dep.version().startswith(required_sleef_version)
2728
# SLEEF found system-wide - verify quad-precision support
2829
cpp = meson.get_compiler('cpp')
2930
sleefquad_lib = cpp.find_library('sleefquad', required: false)
@@ -68,7 +69,9 @@ endif
6869
if use_system_sleef
6970
message('Using system-wide SLEEF installation with quad-precision support')
7071
else
71-
sleef_subproj = subproject('sleef')
72+
# Pass disable_fma option to sleef subproject for x86-64-v2 compatibility
73+
message('SLEEF FMA disable option: ' + get_option('disable_fma').to_string())
74+
sleef_subproj = subproject('sleef', default_options: ['disable_fma=' + get_option('disable_fma').to_string()])
7275
sleef_dep = sleef_subproj.get_variable('sleef_dep')
7376
sleefquad_dep = sleef_subproj.get_variable('sleefquad_dep')
7477
warning(fallback_reason)
@@ -197,4 +200,4 @@ py.extension_module('_quaddtype_main',
197200
install: true,
198201
subdir: 'numpy_quaddtype',
199202
include_directories: [includes, build_includes, pythoncapi_includes],
200-
)
203+
)

meson.options

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
option('disable_fma', type: 'boolean', value: false,
2+
description: 'Disable FMA (Fused Multiply-Add) code paths' +
3+
'Set to true when building for older CPUs like Sandy Bridge that lack FMA support.')

reinstall.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ rm -rf .mesonpy-*
1010

1111
python -m pip uninstall -y numpy_quaddtype
1212
python -m pip install . -vv 2>&1 | tee build_log.txt
13+
# pip install . --no-build-isolation -v -Csetup-args=-Ddisable_fma=true 2>&1 | tee build_log.txt
1314

1415
# for debugging and TSAN builds, comment the above line and uncomment all below:
1516
# export CFLAGS="-fsanitize=thread -g -O0"
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
diff --git a/CMakeLists.txt b/CMakeLists.txt
2+
index 1234567..abcdefg 100644
3+
--- a/CMakeLists.txt
4+
+++ b/CMakeLists.txt
5+
@@ -90,6 +90,10 @@ option(SLEEF_ENFORCE_CUDA "Build fails if CUDA is not supported" OFF)
6+
option(SLEEF_DISABLE_OPENMP "Disable OPENMP" OFF)
7+
option(SLEEF_ENFORCE_OPENMP "Build fails if OPENMP is not supported by the compiler" OFF)
8+
9+
+# Option to disable PURECFMA scalar code path on x86 for x86-64-v2 compatibility
10+
+# When ON, PURECFMA scalar dispatch is disabled (useful for Sandy Bridge support)
11+
+# This can be set dynamically by the build system based on target CPU detection
12+
+option(SLEEF_DISABLE_PURECFMA_SCALAR "Disable PURECFMA scalar code path (for x86-64-v2 compatibility)" OFF)
13+
#
14+
15+
if ((NOT "${CMAKE_C_COMPILER_ID}" STREQUAL "${CMAKE_CXX_COMPILER_ID}") OR
16+
diff --git a/Configure.cmake b/Configure.cmake
17+
index e23f577..f1a2b3c 100644
18+
--- a/Configure.cmake
19+
+++ b/Configure.cmake
20+
@@ -193,7 +193,12 @@ endif()
21+
if(SLEEF_TARGET_PROCESSOR MATCHES "(x86|AMD64|amd64|^i.86$)")
22+
set(SLEEF_ARCH_X86 ON CACHE INTERNAL "True for x86 architecture.")
23+
24+
- set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mavx2;-mfma")
25+
+ # Only set PURECFMA_SCALAR flags if not explicitly disabled
26+
+ if(NOT SLEEF_DISABLE_PURECFMA_SCALAR)
27+
+ set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mavx2;-mfma")
28+
+ else()
29+
+ message(STATUS "PURECFMA_SCALAR disabled for x86-64-v2 compatibility")
30+
+ endif()
31+
elseif(SLEEF_TARGET_PROCESSOR MATCHES "aarch64|arm64")
32+
set(SLEEF_ARCH_AARCH64 ON CACHE INTERNAL "True for Aarch64 architecture.")
33+
# Aarch64 requires support for advsimdfma4
34+
@@ -220,7 +225,12 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
35+
endif()
36+
37+
set(COMPILER_SUPPORTS_PUREC_SCALAR 1)
38+
-set(COMPILER_SUPPORTS_PURECFMA_SCALAR 1)
39+
+# Conditionally enable PURECFMA_SCALAR based on option
40+
+if(SLEEF_DISABLE_PURECFMA_SCALAR)
41+
+ set(COMPILER_SUPPORTS_PURECFMA_SCALAR 0)
42+
+else()
43+
+ set(COMPILER_SUPPORTS_PURECFMA_SCALAR 1)
44+
+endif()
45+
46+
# Compiler feature detection
47+
48+
diff --git a/src/quad/CMakeLists.txt b/src/quad/CMakeLists.txt
49+
index 8e4e261..cc55002 100644
50+
--- a/src/quad/CMakeLists.txt
51+
+++ b/src/quad/CMakeLists.txt
52+
@@ -397,9 +397,17 @@ set_target_properties(qmkdisp PROPERTIES ${COMMON_TARGET_PROPERTIES})
53+
54+
# Target qdispscalar.c
55+
56+
+# Set scalar dispatch backends based on PURECFMA support
57+
+# When SLEEF_DISABLE_PURECFMA_SCALAR is ON, use purec for both slots
58+
+if(COMPILER_SUPPORTS_PURECFMA_SCALAR)
59+
+ set(SCALAR_DISPATCH_BACKENDS "purec" "purecfma")
60+
+else()
61+
+ set(SCALAR_DISPATCH_BACKENDS "purec" "purec")
62+
+endif()
63+
+
64+
add_custom_command(
65+
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/qdispscalar.c.body
66+
- COMMAND $<TARGET_FILE:qmkdisp> 1 Sleef_quad double int32_t int64_t uint64_t purec purecfma > ${CMAKE_CURRENT_BINARY_DIR}/qdispscalar.c.body
67+
+ COMMAND $<TARGET_FILE:qmkdisp> 1 Sleef_quad double int32_t int64_t uint64_t ${SCALAR_DISPATCH_BACKENDS} > ${CMAKE_CURRENT_BINARY_DIR}/qdispscalar.c.body
68+
DEPENDS qmkdisp
69+
)
70+
sleef_concat_files(
71+
@@ -420,6 +428,11 @@ target_compile_definitions(qdispscalar_obj PRIVATE ${COMMON_TARGET_DEFINITIONS})
72+
target_include_directories(qdispscalar_obj PRIVATE ${sleef_BINARY_DIR}/include)
73+
add_dependencies(qdispscalar_obj qdispscalar.c_generated qrenamedspscalar.h_generated
74+
sleefquad_headers ${TARGET_LIBSLEEF} ${TARGET_HEADERS})
75+
+# Define ENABLE_PURECFMA when PURECFMA is supported, so qdispscalar.c.org
76+
+# can conditionally include the tryFMA() function and SUBST_IF_EXT1 macro
77+
+if(COMPILER_SUPPORTS_PURECFMA_SCALAR)
78+
+ target_compile_definitions(qdispscalar_obj PRIVATE ENABLE_PURECFMA=1)
79+
+endif()
80+
target_sources(sleefquad PRIVATE $<TARGET_OBJECTS:qdispscalar_obj>)
81+
82+
# Target qdispsse2_obj
83+
diff --git a/src/quad/qdispscalar.c.org b/src/quad/qdispscalar.c.org
84+
index c4c1292..48f309c 100644
85+
--- a/src/quad/qdispscalar.c.org
86+
+++ b/src/quad/qdispscalar.c.org
87+
@@ -15,10 +15,14 @@
88+
89+
#include "qdispatcher.h"
90+
91+
+#ifdef ENABLE_PURECFMA
92+
NOEXPORT Sleef_quad sleef_cpuid_QUADFMA_0;
93+
static void tryFMA() { sleef_cpuid_QUADFMA_0 = Sleef_sinq1_u10purecfma(sleef_cpuid_QUADFMA_0); }
94+
95+
#define SUBST_IF_EXT1(funcExt1) if (cpuSupportsExt(tryFMA)) p = funcExt1;
96+
+#else
97+
+#define SUBST_IF_EXT1(funcExt1)
98+
+#endif
99+
100+
//

subprojects/packagefiles/sleef/meson.build

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ project('sleef')
22

33
cmake = find_program('cmake')
44
ninja = find_program('ninja', 'make', required: false)
5+
cc = meson.get_compiler('c')
56

67
sleef_build_dir = 'sleef_build'
78
sleef_install_dir = 'sleef_install'
@@ -19,8 +20,41 @@ endif
1920
# For building sleef with TSan, delete the sleef subproject and follow the README instructions to build sleef externally.
2021
# Enable SIMD extensions that are OFF by default but required by qblas (will change in future)
2122
sleef_simd_flags = []
23+
sleef_purecfma_flag = []
24+
25+
# Check for force-disable FMA option (for cross-compilation or emulation scenarios)
26+
force_disable_fma = get_option('disable_fma')
27+
2228
if host_machine.cpu_family() == 'x86_64' or host_machine.cpu_family() == 'x86'
2329
sleef_simd_flags = ['-DSLEEF_ENABLE_SSE2=ON']
30+
31+
if force_disable_fma
32+
# User explicitly requested no FMA
33+
message('FMA explicitly disabled via option - disabling PURECFMA scalar for x86-64-v2 compatibility')
34+
sleef_purecfma_flag = ['-DSLEEF_DISABLE_PURECFMA_SCALAR=ON']
35+
else
36+
# Auto-detect FMA support at configure time by actually running FMA code
37+
fma_test_result = cc.run('''
38+
#include <immintrin.h>
39+
int main(void) {
40+
__m128 a = _mm_set1_ps(1.0f);
41+
__m128 b = _mm_set1_ps(2.0f);
42+
__m128 c = _mm_set1_ps(3.0f);
43+
__m128 r = _mm_fmadd_ps(a, b, c);
44+
(void)r;
45+
return 0;
46+
}
47+
''', args: ['-mfma'], name: 'FMA instruction runtime support')
48+
49+
has_fma = fma_test_result.compiled() and fma_test_result.returncode() == 0
50+
51+
if not has_fma
52+
message('FMA not supported at runtime - disabling PURECFMA scalar code path')
53+
sleef_purecfma_flag = ['-DSLEEF_DISABLE_PURECFMA_SCALAR=ON']
54+
else
55+
message('FMA supported - enabling PURECFMA scalar code path')
56+
endif
57+
endif
2458
endif
2559

2660
sleef_configure = run_command([
@@ -35,7 +69,7 @@ sleef_configure = run_command([
3569
'-DSLEEF_ENABLE_TLFLOAT=OFF', # this is only used for testing in SLEEF, not runtime
3670
'-DCMAKE_POSITION_INDEPENDENT_CODE=ON',
3771
'-DCMAKE_INSTALL_PREFIX=' + meson.current_build_dir() / sleef_install_dir
38-
] + sleef_simd_flags, check: false, capture: true)
72+
] + sleef_simd_flags + sleef_purecfma_flag, check: false, capture: true)
3973

4074
if sleef_configure.returncode() != 0
4175
error('SLEEF CMake configuration failed: ' + sleef_configure.stderr())
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
option('disable_fma', type: 'boolean', value: false,
2+
description: 'Force disable FMA (Fused Multiply-Add) code paths. ' +
3+
'Use this when targeting x86_64-v2 CPUs (like Sandy Bridge) that lack FMA support.')

subprojects/sleef.wrap

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ directory=sleef
33
url=https://github.com/shibatch/sleef.git
44
revision=3.9.0
55
patch_directory=sleef
6+
diff_files=sleef/fix-purecfma-scalar-x86.patch
67

78
[provide]
89
sleef = sleef_dep

tests/test_quaddtype.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5877,4 +5877,56 @@ def test_logical_reduce_on_non_quad_arrays():
58775877
with standard NumPy operations like np.logical_or.reduce(np.arange(10.)).
58785878
"""
58795879
result = np.logical_or.reduce(np.arange(10.))
5880-
assert result == True
5880+
assert result == True
5881+
5882+
5883+
def test_sleef_purecfma_symbols():
5884+
"""Test that SLEEF PURECFMA symbols are present in the compiled module.
5885+
5886+
PURECFMA provides optimized scalar code paths using FMA instructions.
5887+
This test verifies the module was built with FMA support enabled.
5888+
On systems without FMA (e.g., x86-64-v2/Sandy Bridge), the build should
5889+
automatically disable PURECFMA, and this test should be skipped.
5890+
"""
5891+
import subprocess
5892+
import shutil
5893+
import pathlib
5894+
5895+
# Skip if nm is not available
5896+
nm_path = shutil.which('nm')
5897+
if nm_path is None:
5898+
pytest.skip("nm command not available")
5899+
5900+
# Get the path to the compiled shared library (.so file)
5901+
module_dir = pathlib.Path(numpy_quaddtype.__file__).parent
5902+
so_files = list(module_dir.glob('_quaddtype_main*.so'))
5903+
5904+
if not so_files:
5905+
pytest.skip("Could not find _quaddtype_main shared library")
5906+
5907+
module_path = str(so_files[0])
5908+
5909+
try:
5910+
result = subprocess.run(
5911+
['nm', module_path],
5912+
capture_output=True,
5913+
text=True,
5914+
timeout=30
5915+
)
5916+
except subprocess.TimeoutExpired:
5917+
pytest.skip("nm command timed out")
5918+
except FileNotFoundError:
5919+
pytest.skip("nm command not found")
5920+
5921+
purecfma_symbols = [
5922+
line for line in result.stdout.lower().splitlines()
5923+
if 'purecfma' in line
5924+
]
5925+
5926+
if purecfma_symbols:
5927+
print(f"\n✓ Found {len(purecfma_symbols)} PURECFMA symbols (FMA optimizations enabled)")
5928+
print(" Sample symbols:")
5929+
for sym in purecfma_symbols[:5]:
5930+
print(f" {sym}")
5931+
if len(purecfma_symbols) > 5:
5932+
print(f" ... and {len(purecfma_symbols) - 5} more")

0 commit comments

Comments
 (0)