|
26 | 26 |
|
27 | 27 | namespace TiledArray { |
28 | 28 |
|
29 | | -/// Alignment of in-arena element storage, in bytes. Sized to cover the |
30 | | -/// widest common SIMD register (AVX-512 ZMM = 64 B) and a single x86_64 |
31 | | -/// cache line. Override at configure time by defining |
32 | | -/// TILEDARRAY_INNER_SIMD_ALIGN to a larger power-of-two (e.g. 128 for |
33 | | -/// two-cache-line floor / Apple-Silicon L1 line size). |
34 | | -#ifndef TILEDARRAY_INNER_SIMD_ALIGN |
35 | | -#define TILEDARRAY_INNER_SIMD_ALIGN 64 |
36 | | -#endif |
37 | | - |
38 | | -inline constexpr std::size_t kInnerSimdAlign = TILEDARRAY_INNER_SIMD_ALIGN; |
39 | | -static_assert((kInnerSimdAlign & (kInnerSimdAlign - 1)) == 0, |
40 | | - "kInnerSimdAlign must be a power of two"); |
| 29 | +/// Alignment of in-arena element storage, in bytes. Supplied via CMake |
| 30 | +/// (cache variable `TA_ARENATENSOR_SIMD_ALIGN`, propagated through |
| 31 | +/// `TiledArray/config.h`). The default (32 B) matches the SSE/AVX/AVX2 |
| 32 | +/// family — AVX2's 256-bit YMM registers being the most common x86_64 |
| 33 | +/// SIMD target today. Override at configure time with |
| 34 | +/// `-DTA_ARENATENSOR_SIMD_ALIGN=<N>` for another power of two: |
| 35 | +/// - 64 for AVX-512 ZMM (also matches an x86_64 cache line); |
| 36 | +/// - 16 for NEON-only targets — NEON has no wider register (Apple |
| 37 | +/// Silicon does not implement SVE), so 16 is sufficient there; |
| 38 | +/// - 128 for a two-cache-line / Apple-Silicon L1-line floor (useful |
| 39 | +/// only if cells need that as a false-sharing boundary). |
| 40 | +/// Each ArenaTensor cell pads from `sizeof(Cell)` up to this alignment |
| 41 | +/// before its element storage, so lowering the value cuts per-cell |
| 42 | +/// padding, but SIMD accesses wider than it lose guaranteed alignment. |
| 43 | +inline constexpr std::size_t kArenaTensorSimdAlign = TA_ARENATENSOR_SIMD_ALIGN; |
| 44 | +static_assert((kArenaTensorSimdAlign & (kArenaTensorSimdAlign - 1)) == 0, |
| 45 | + "TA_ARENATENSOR_SIMD_ALIGN must be a power of two"); |
41 | 46 |
|
42 | 47 | template <typename T, typename Range_ = ::btas::zb::RangeNd<>> |
43 | 48 | class ArenaTensor; |
@@ -79,7 +84,8 @@ class ArenaTensor { |
79 | 84 | /// arena slots must honour this so SIMD loads/stores on `data()` are |
80 | 85 | /// aligned without an extra runtime check. |
81 | 86 | static constexpr size_type data_alignment() noexcept { |
82 | | - return alignof(T) > kInnerSimdAlign ? alignof(T) : kInnerSimdAlign; |
| 87 | + return alignof(T) > kArenaTensorSimdAlign ? alignof(T) |
| 88 | + : kArenaTensorSimdAlign; |
83 | 89 | } |
84 | 90 |
|
85 | 91 | /// Offset (in bytes) of the first element past the cell header. |
|
0 commit comments