Skip to content

Commit c4fcb5e

Browse files
authored
Merge pull request InsightSoftwareConsortium#6007 from hjmjohnson/fftw-simd-redistributable
PERF: Use ABI-guaranteed SIMD baselines for redistribution-safe FFTW builds
2 parents 0bc61e0 + 934faaa commit c4fcb5e

1 file changed

Lines changed: 117 additions & 52 deletions

File tree

CMake/itkExternal_FFTW.cmake

Lines changed: 117 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,56 @@
11
#
22
# Encapsulates building FFTW as an External Project.
33
#
4-
# SIMD codelet selection
5-
# ----------------------
6-
# FFTW SIMD codelets are hand-written assembly routines baked into the
7-
# library at compile time. Passing -march=native to the ITK build does
8-
# NOT activate them; they must be requested explicitly via FFTW's own
9-
# CMake options (ENABLE_NEON, ENABLE_SSE, ENABLE_SSE2, ENABLE_AVX, ENABLE_AVX2).
4+
# SIMD codelet selection and binary redistribution policy
5+
# -------------------------------------------------------
6+
# FFTW SIMD codelets are generated, intrinsics-based C routines compiled INTO the
7+
# library at build time. ITK-side compiler flags such as -march=native do not
8+
# select them directly; they must be requested via FFTW's own CMake options
9+
# (ENABLE_NEON, ENABLE_SSE, ENABLE_SSE2, ENABLE_AVX, ENABLE_AVX2).
10+
#
11+
# For redistributable binary packages (conda, pip/PyPI, manylinux Docker
12+
# images, etc.) SIMD codelets must only be enabled when the resulting binary
13+
# will run correctly on ALL machines in the target distribution. The ISA
14+
# baseline mandated by each architecture ABI is universally safe:
15+
#
16+
# x86_64 / AMD64 : SSE and SSE2 are required by the AMD64 ABI. Every
17+
# 64-bit x86 CPU (including all manylinux2014 /
18+
# manylinux_2_28 targets) supports them. DEFAULT ON.
1019
#
11-
# This file detects appropriate defaults at cmake configure time:
20+
# aarch64 / arm64 : NEON is required by the AArch64 ABI. Every 64-bit ARM
21+
# CPU (Apple Silicon, all Linux aarch64 targets) supports
22+
# it. DEFAULT ON.
1223
#
13-
# Native builds (CMAKE_CROSSCOMPILING is false):
14-
# - ARM64 (aarch64/arm64/ARM64): NEON=ON (mandatory in ARMv8); x86 SIMD off.
15-
# - x86/x86_64 with GCC/Clang: each of SSE, SSE2, AVX, AVX2 is probed
16-
# individually via __builtin_cpu_supports() / CheckCSourceRuns so that
17-
# the detected flags match the actual build-host CPU. A pre-AVX
18-
# Sandy Bridge gets SSE+SSE2 only; a Haswell or later gets all four.
19-
# On MSVC the probes are skipped (intrinsic unavailable) and SIMD
20-
# defaults to off; users can override via FFTW_ENABLE_* options.
21-
# - Other architectures: all SIMD off (conservative fallback).
24+
# AVX / AVX2 : Not part of the baseline ABI; AVX is present only on Sandy
25+
# Bridge (2011) and newer CPUs, and AVX2 only on Haswell (2013)
26+
# and newer CPUs. Enabling them by default would produce
27+
# binaries that SIGILL on older (but spec-compliant)
28+
# x86_64 CPUs. DEFAULT OFF unless the compiler is already
29+
# targeting a micro-architecture that includes them.
2230
#
23-
# Cross-compiled builds (CMAKE_CROSSCOMPILING is true):
24-
# - ARM64: NEON=ON (mandatory); x86 SIMD off.
25-
# - x86_64: SSE+SSE2 only (baseline; AVX/AVX2 not assumed for target).
26-
# - Other: all SIMD off.
31+
# Opt-in to AVX / AVX2
32+
# ---------------------
33+
# If the user's toolchain is already generating AVX/AVX2 instructions
34+
# (because they passed -march=native, -mavx2, -march=haswell, or an
35+
# equivalent MSVC /arch: flag), the compiler pre-defines __AVX__ / __AVX2__.
36+
# This file detects those macros at cmake configure time via
37+
# check_c_source_compiles (compile-time, NOT runtime — no build-host CPU
38+
# probe is performed) and auto-enables the matching FFTW codelets so that
39+
# FFTW's generated code aligns with the rest of the ITK build.
40+
# Users who accept an AVX2 minimum CPU for a redistributed package can set:
41+
# cmake -DFFTW_ENABLE_AVX2=ON ...
42+
#
43+
# macOS universal binary
44+
# ----------------------
45+
# When CMAKE_OSX_ARCHITECTURES lists more than one value (e.g. "arm64;x86_64")
46+
# a single FFTW configure/build pass cannot correctly serve both slices.
47+
# SIMD defaults are set to OFF in this case; use ITK_USE_SYSTEM_FFTW with a
48+
# proper universal FFTW installation (e.g., per-arch builds merged with lipo) if SIMD
49+
# performance is required in a macOS universal build.
2750
#
28-
# Every flag is an individually overridable cache option, e.g.:
29-
# cmake -DFFTW_ENABLE_AVX2=OFF ...
51+
# Every flag remains individually overridable, e.g.:
52+
# cmake -DFFTW_ENABLE_AVX2=ON # opt in to AVX2 for a non-redistributed build
53+
# cmake -DFFTW_ENABLE_SSE2=OFF # opt out of SSE2 (unusual)
3054
# Note: option() defaults are only applied on the first configure.
3155
# To re-detect after a toolchain change, delete the CMake cache or use
3256
# cmake --fresh, or pass explicit -DFFTW_ENABLE_*= overrides.
@@ -84,70 +108,111 @@ if(NOT ITK_USE_SYSTEM_FFTW)
84108
set(FFTW_STAGED_INSTALL_PREFIX "${ITK_BINARY_DIR}/fftw")
85109

86110
# Detect SIMD defaults (see file header for full policy description).
87-
# CheckCSourceRuns results are cached after the first cmake configure run.
88-
include(CheckCSourceRuns)
111+
#
112+
# Architecture-guaranteed ISA baselines (no runtime probe needed):
113+
# - x86_64 mandates SSE + SSE2 in the AMD64 ABI.
114+
# - arm64/aarch64 mandates NEON in the AArch64 ABI.
115+
#
116+
# AVX/AVX2 opt-in via compiler predefined macros:
117+
# check_c_source_compiles (not _runs) reflects what the compiler is
118+
# generating for the TARGET architecture, not what the BUILD HOST's CPU
119+
# can execute. This is safe for cross-compilation and redistribution.
120+
include(CheckCSourceCompiles)
89121

90122
set(_fftw_default_neon OFF)
91123
set(_fftw_default_sse OFF)
92124
set(_fftw_default_sse2 OFF)
93125
set(_fftw_default_avx OFF)
94126
set(_fftw_default_avx2 OFF)
95127

96-
if(NOT CMAKE_CROSSCOMPILING)
97-
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
98-
# NEON is mandatory in ARMv8/AArch64 — every arm64 CPU has it.
99-
set(_fftw_default_neon ON)
100-
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|i686")
101-
# Probe each x86 SIMD level individually via CPUID so the defaults
102-
# are accurate for the actual build-host CPU (e.g. pre-AVX Sandy Bridge
103-
# or pre-AVX2 Ivy Bridge get only the levels their hardware supports).
104-
# __builtin_cpu_supports is a GCC/Clang intrinsic; skip on MSVC.
105-
if(CMAKE_C_COMPILER_ID MATCHES "GNU|Clang|AppleClang")
106-
foreach(_fftw_simd IN ITEMS sse sse2 avx avx2)
107-
check_c_source_runs(
108-
"int main(void){return __builtin_cpu_supports(\"${_fftw_simd}\")?0:1;}"
109-
_fftw_cpu_has_${_fftw_simd}
110-
)
111-
if(_fftw_cpu_has_${_fftw_simd})
112-
set(_fftw_default_${_fftw_simd} ON)
113-
endif()
114-
endforeach()
115-
endif()
128+
# Detect macOS universal binary build: a single configure+build pass cannot
129+
# simultaneously produce correct SIMD for both arm64 and x86_64 slices.
130+
set(_fftw_is_universal FALSE)
131+
if(APPLE AND CMAKE_OSX_ARCHITECTURES)
132+
list(LENGTH CMAKE_OSX_ARCHITECTURES _fftw_arch_count)
133+
if(_fftw_arch_count GREATER 1)
134+
set(_fftw_is_universal TRUE)
135+
message(
136+
STATUS
137+
"FFTW: macOS universal binary (${CMAKE_OSX_ARCHITECTURES}): "
138+
"per-architecture SIMD defaults disabled. "
139+
"Use ITK_USE_SYSTEM_FFTW with a universal FFTW to enable SIMD."
140+
)
116141
endif()
117-
else()
118-
# Cross-compiling: conservative architecture-level fallback.
142+
endif()
143+
144+
if(NOT _fftw_is_universal)
119145
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|ARM64")
146+
# NEON is mandatory in the AArch64 ABI — every arm64 CPU has it.
147+
# Safe for all conda/pip arm64 packages and manylinux aarch64.
120148
set(_fftw_default_neon ON)
121149
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64")
122-
# SSE/SSE2 are baseline on all 64-bit x86 CPUs; AVX/AVX2 not assumed.
150+
# SSE and SSE2 are required by the AMD64 ABI — universally present on
151+
# every 64-bit x86 CPU, including the oldest manylinux build targets.
152+
# Safe for all conda/pip x86_64 packages.
123153
set(_fftw_default_sse ON)
124154
set(_fftw_default_sse2 ON)
155+
# AVX and AVX2 are NOT part of the AMD64 baseline. Auto-enable them
156+
# only when the compiler is already producing those instructions — i.e.
157+
# when the user explicitly asked for a specific micro-architecture via
158+
# -march=native, -mavx2, /arch:AVX2, etc. This compile-time check
159+
# mirrors the approach recommended by seanm in ITK PR #6006:
160+
# "the compiler knows what CPU it's compiling for."
161+
#
162+
# check_c_source_compiles caches its result by variable name. Unset
163+
# the cache entry first so the probe always re-runs against the current
164+
# CMAKE_C_FLAGS; this ensures that adding -march=native on a subsequent
165+
# configure is correctly reflected in the auto-detected default.
166+
# Note: FFTW_ENABLE_AVX / FFTW_ENABLE_AVX2 follow standard option()
167+
# caching — they are only auto-set from the detected default when not
168+
# already present in the cache. To force re-evaluation of the option
169+
# after a FLAGS change, delete those entries from the CMake cache or
170+
# pass -DFFTW_ENABLE_AVX2=ON explicitly.
171+
unset(_fftw_compiler_targets_avx CACHE)
172+
check_c_source_compiles(
173+
"#ifndef __AVX__\n#error AVX not enabled\n#endif\nint main(void){return 0;}"
174+
_fftw_compiler_targets_avx
175+
)
176+
if(_fftw_compiler_targets_avx)
177+
set(_fftw_default_avx ON)
178+
endif()
179+
unset(_fftw_compiler_targets_avx2 CACHE)
180+
check_c_source_compiles(
181+
"#ifndef __AVX2__\n#error AVX2 not enabled\n#endif\nint main(void){return 0;}"
182+
_fftw_compiler_targets_avx2
183+
)
184+
if(_fftw_compiler_targets_avx2)
185+
set(_fftw_default_avx2 ON)
186+
endif()
187+
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686|i386")
188+
# 32-bit x86 ABI does not mandate SSE/SSE2. Leave defaults OFF;
189+
# users may opt in explicitly if their minimum target CPU supports them.
125190
endif()
126191
endif()
127192

128193
option(
129194
FFTW_ENABLE_NEON
130-
"Enable FFTW NEON SIMD codelets (ARM64)"
195+
"Enable FFTW NEON SIMD codelets (ARM64; ON by default on aarch64/arm64)"
131196
${_fftw_default_neon}
132197
)
133198
option(
134199
FFTW_ENABLE_SSE
135-
"Enable FFTW SSE SIMD codelets (x86)"
200+
"Enable FFTW SSE SIMD codelets (x86; ON by default on x86_64 — required by AMD64 ABI)"
136201
${_fftw_default_sse}
137202
)
138203
option(
139204
FFTW_ENABLE_SSE2
140-
"Enable FFTW SSE2 SIMD codelets (x86)"
205+
"Enable FFTW SSE2 SIMD codelets (x86; ON by default on x86_64 — required by AMD64 ABI)"
141206
${_fftw_default_sse2}
142207
)
143208
option(
144209
FFTW_ENABLE_AVX
145-
"Enable FFTW AVX SIMD codelets (x86)"
210+
"Enable FFTW AVX SIMD codelets (Sandy Bridge+; OFF by default for redistribution safety)"
146211
${_fftw_default_avx}
147212
)
148213
option(
149214
FFTW_ENABLE_AVX2
150-
"Enable FFTW AVX2 SIMD codelets (x86)"
215+
"Enable FFTW AVX2 SIMD codelets (Haswell+; OFF by default for redistribution safety)"
151216
${_fftw_default_avx2}
152217
)
153218

0 commit comments

Comments
 (0)