Skip to content

Commit 2414e71

Browse files
committed
feat: compile-time AVX-512 dispatch + target-cpu=x86-64-v4
- simd.rs: F64x8/F32x16 now use native __m512d/__m512 when target_feature="avx512f" is enabled (compile-time, not runtime)
- .cargo/config.toml: rustflags target-cpu=x86-64-v4 ensures all BF16 SIMD paths emit native AVX-512 instructions
- No scalar fallback, no hand-rolled constants, Rust 1.94 stable

https://claude.ai/code/session_01M3at4EuHVvQ8S95mSnKgtK
1 parent 66289d8 commit 2414e71

2 files changed

Lines changed: 16 additions & 12 deletions

File tree

.cargo/config.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
[build]
2+
# x86-64-v4 = AVX-512 baseline (AVX-512F/BW/CD/DQ/VL). All BF16 SIMD paths use native __m512/__m512d; binaries built with this flag require an AVX-512 capable CPU.
3+
# Rust 1.94 stable. No nightly.
4+
rustflags = ["-C", "target-cpu=x86-64-v4"]

src/simd.rs

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,19 +30,19 @@ fn tier() -> Tier { *TIER }
3030
#[cfg(target_arch = "x86_64")]
3131
pub use crate::simd_avx512::{F32x8, F64x4, f32x8, f64x4};
3232

33-
// 512-bit types: tier selects which implementation backs them.
34-
// On AVX-512 machines: simd_avx512 types (__m512 native).
35-
// On AVX2 machines: simd_avx2 types (2× __m256 composed).
36-
// The tier is detected once via LazyLock. After that it's a frozen enum match.
37-
//
38-
// PROBLEM: Rust can't switch `pub use` at runtime.
39-
// SOLUTION: re-export the AVX2 versions (safe on all x86_64).
40-
// On AVX-512 machines, the AVX2 composed types still work correctly —
41-
// just 2 instructions instead of 1. The BLAS hot paths in native.rs
42-
// already dispatch to kernels_avx512 via their own tier() check.
43-
// The SIMD types are for HPC consumer code, not inner BLAS loops.
33+
// 512-bit types: compile-time dispatch via target_feature.
34+
// With target-cpu=x86-64-v4 (or native on AVX-512 hardware),
35+
// avx512f is enabled at compile time → native __m512 types.
36+
// Otherwise the re-exports fall back to the AVX2 composed types (2× __m256).
4437

45-
#[cfg(target_arch = "x86_64")]
38+
#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
39+
pub use crate::simd_avx512::{
40+
F32x16, F64x8, U8x64, I32x16, I64x8, U32x16, U64x8,
41+
F32Mask16, F64Mask8,
42+
f32x16, f64x8, u8x64, i32x16, i64x8, u32x16, u64x8,
43+
};
44+
45+
#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
4646
pub use crate::simd_avx2::{
4747
F32x16, F64x8, U8x64, I32x16, I64x8, U32x16, U64x8,
4848
F32Mask16, F64Mask8,

0 commit comments

Comments
 (0)