Skip to content

Commit de52a44

Browse files
committed
fix(simd): SimdProfile::detect() consults amx_available() — Risk #3 closure
Integration plan risk #3 ("Detection robustness across hypervisors"): CPUID may advertise AMX-TILE while the OS/hypervisor has not enabled the tile XSAVE state. Without the OS-level check, the dispatch table routes to AMX kernels that SIGILL at first use.

Fix: SimdProfile::detect() now reads `simd_amx::amx_available()` (the existing 4-step gate: CPUID → OSXSAVE → XCR0[17,18] → arch_prctl XCOMP_PERM on Linux 5.19+) and demotes when CPUID and OS disagree. The GraniteRapids and SapphireRapids arms now require both the CPUID bits AND `amx_usable`; the Zen4Avx512 arm catches SPR-class CPUID with locked-down hypervisor XSAVE so dispatch falls to the AVX-512 BF16/FP16 path instead.

Verified on the build host (Sapphire Rapids silicon, kernel 6.18.5):
- CPUID reports amx_tile=1, amx_int8=1, amx_bf16=1 (all true)
- simd_amx::amx_available() returns false (hypervisor masks XCR0[17,18] or the arch_prctl(XCOMP_PERM) request fails)
- SimdProfile::detect() correctly resolves to Zen4Avx512, not SapphireRapids — the AMX kernels are not reachable from dispatch on this OS state.

Without this fix, the e40f3a3 detect path would have resolved to SapphireRapids on this exact silicon/OS combination, then SIGILL'd the first time a dispatch table called an AMX kernel. Bug closed before any consumer was wired to the dispatch table.

The probe binary (examples/simd_profile_probe.rs) gains a new "AMX gating (CPUID vs OS)" section so the CPUID-vs-OS gap is visible without reading source. Format mirrors how the matrix-doc cell summary appears: terse, two lines plus an optional demotion note when the bits disagree.

Pinned mode (cpu-* cargo features) intentionally bypasses this gate since pinning is a build-time assertion that the target OS supports the chosen variant — pinned binaries are non-portable by design.

Tests: 2077/2077 lib pass. cargo clippy --lib clean under default and --features cpu-spr.
Behaviour on hardware with proper AMX enablement (full prctl path success) is unchanged: SapphireRapids still resolves to SapphireRapids when amx_available() returns true.
1 parent 03b30e5 commit de52a44

2 files changed

Lines changed: 37 additions & 4 deletions

File tree

examples/simd_profile_probe.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,27 @@ fn main() {
4949
print_caps(&caps);
5050
println!();
5151

52+
// ── AMX OS-state probe (Risk #3 from integration plan) ────────
53+
// SimdCaps reports raw CPUID. SimdProfile::detect() additionally
54+
// consults `simd_amx::amx_available()` which gates on
55+
// OSXSAVE + XCR0[17,18] + arch_prctl(XCOMP_PERM). If CPUID says
56+
// AMX-TILE but the OS/hypervisor doesn't enable the XSAVE state,
57+
// dispatch demotes from SPR/GNR to Zen4Avx512 (AVX-512 BF16 path
58+
// instead of AMX tiles). Surfacing the gap here lets a reviewer
59+
// see when CPUID-vs-OS disagree without reading source.
60+
#[cfg(target_arch = "x86_64")]
61+
{
62+
let cpuid_says_amx = caps.amx_tile && caps.amx_int8;
63+
let os_allows_amx = ndarray::simd_amx::amx_available();
64+
println!("AMX gating (CPUID vs OS):");
65+
println!(" CPUID amx_tile+amx_int8: {}", cpuid_says_amx);
66+
println!(" OS XSAVE/prctl gate: {}", os_allows_amx);
67+
if cpuid_says_amx && !os_allows_amx {
68+
println!(" → CPUID-reported AMX is OS-DEMOTED — dispatch falls back to AVX-512 path");
69+
}
70+
println!();
71+
}
72+
5273
// ── ARM-specific sub-profile (heuristic; deployment-pragmatic) ──
5374
let arm = caps.arm_profile();
5475
if !matches!(arm, ArmProfile::NotArm) {

src/hpc/simd_profile.rs

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,22 +86,34 @@ impl SimdProfile {
8686
#[cfg(target_arch = "x86_64")]
8787
{
8888
let caps = simd_caps();
89+
// Risk #3 from the integration plan ("Detection robustness
90+
// across hypervisors"): CPUID may report AMX-TILE while the
91+
// OS has not enabled the tile XSAVE state. In that case AMX
92+
// instructions SIGILL despite the CPUID bit being set.
93+
// `simd_amx::amx_available()` runs the full 4-step gate
94+
// (CPUID + OSXSAVE + XCR0 bits 17/18 + arch_prctl
95+
// XCOMP_PERM). Demote to the no-AMX dispatch branch when
96+
// the OS check fails — typically resolves as Zen4Avx512 on
97+
// SPR-class CPUID with locked-down hypervisor XSAVE state.
98+
let amx_usable = caps.amx_tile && crate::simd_amx::amx_available();
8999
// GraniteRapids: AMX-FP16 (CPUID 7,1 EAX bit 21). Must be
90100
// checked first because GNR is a strict superset of SPR.
91-
if caps.has_amx_fp16() {
101+
if amx_usable && caps.amx_fp16 {
92102
return SimdProfile::GraniteRapids;
93103
}
94104
// SapphireRapids / EmeraldRapids: AMX-TILE + AMX-BF16 +
95105
// AVX-512-FP16. EmeraldRapids has identical ISA — same variant.
96-
if caps.amx_tile && caps.amx_bf16 && caps.avx512fp16 {
106+
if amx_usable && caps.amx_bf16 && caps.avx512fp16 {
97107
return SimdProfile::SapphireRapids;
98108
}
99-
// Zen4 / Zen5: AVX-512 + VBMI + BF16 + FP16, but no AMX.
109+
// Zen4 / Zen5: AVX-512 + VBMI + BF16 + FP16, but no usable
110+
// AMX. The `!amx_usable` guard also catches OS-demoted SPR
111+
// silicon so it resolves here instead of as SapphireRapids.
100112
if caps.avx512f
101113
&& caps.avx512vbmi
102114
&& caps.avx512bf16
103115
&& caps.avx512fp16
104-
&& !caps.amx_tile
116+
&& !amx_usable
105117
{
106118
return SimdProfile::Zen4Avx512;
107119
}

0 commit comments

Comments
 (0)