feat(simd): Phase 2 — wire simd_nightly into crate::simd::* dispatch

claude · claude · commit f857a812d1a9 · 2026-05-20T12:13:55.000Z
Phase 2 of the integration plan in
`.claude/knowledge/simd-dispatch-architecture.md`.

simd.rs
-------

Adds a top-priority `feature = "nightly-simd"` dispatch arm that
re-exports the full `simd_nightly::*` portable-SIMD type set through
`crate::simd::*`. No `target_arch` constraint — `core::simd` is portable,
so the same arm catches wasm32 / riscv / aarch64 / x86_64.

Tightens the predicate on every other type-re-export arm to
`not(feature = "nightly-simd")`:
  * AVX-512 (avx512f)
  * AVX-512BF16 (BF16x8/16 types)
  * AVX2 baseline (the v3 default arm)
  * U8x32 (cross-tier export)
  * aarch64 NEON
  * non-x86/non-aarch64 scalar fallback
  * the inline `mod scalar` declaration itself

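Result: when `cargo +nightly --features nightly-simd ...` is used, every
`use crate::simd::F32x16` call site routes to the portable-SIMD
implementation — and miri can actually execute it (miri treats `_mm*`
intrinsics as opaque, but `core::simd::*` runs fine). Caveat: the nightly
arm added here re-exports only the CamelCase types; the lowercase names
(`f32x16`, `u8x32`, ...) that the other arms also export are not
re-exported through `crate::simd` in this configuration.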

BF16 conversion FUNCTIONS (bf16_to_f32_batch etc.) are NOT gated on
`not(feature = "nightly-simd")`: they're scalar/intrinsic functions
taking primitive slices, not the SIMD types, so they stay exported
alongside the nightly arm and coexist cleanly with the portable backend.

architecture doc
----------------

Parity matrix updated to reflect what `src/simd_avx2.rs` actually ships.
Previous matrix marked U8x64 / I8x64 / I16x32 / I32x16 / I64x8 /
U16x32 / U32x16 / U64x8 as ❌ in the AVX2 column. On survey those types
exist via the `avx2_int_type!` macro — full API-parity structs with
`[$elem; $lanes]` scalar storage (align 64). New 🟠 marker introduced
to distinguish "struct exists with API, storage is scalar" from "true
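two-half SIMD composite" (🟡). I8x32 / I16x16 also corrected: they
share the AVX-512 `__m256i` definition (re-exported through
`simd_avx2`'s `pub use crate::simd_avx512::{i16x16, i8x32, ...}`).
Known gap: `F32x8` / `F64x4` appear to be in the same situation (the
AVX2 arm of `simd.rs` re-exports them from `simd_avx512`), but their
AVX2 cells are still ❌ in this revision of the matrix.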

The remaining AVX2 vectorization gap (filling 🟠 → 🟡 with real
two-half SIMD ops) is tracked separately as TD-SIMD-3.
diff --git a/.claude/knowledge/simd-dispatch-architecture.md b/.claude/knowledge/simd-dispatch-architecture.md
@@ -128,26 +128,36 @@ chooses the source; the cargo config chooses how `simd.rs` chooses.
 
 ## 4. Parity matrix — typed lane primitives per backend
 
-Legend: ✅ native, 🟡 composed wrapper (two-half / four-quarter), 🔵
-scalar polyfill via `core::simd`, ❌ missing, ⛔ N/A for this arch.
+Legend: ✅ native, 🟡 composed wrapper (two-half / four-quarter), 🟠
+scalar polyfill (struct exists with full API but storage is
+`[$elem; $lanes]` — no SIMD execution), 🔵 portable-SIMD polyfill via
+`core::simd`, ❌ missing, ⛔ N/A for this arch.
+
+(Reality check 2026-05-20: many AVX2 int rows previously marked ❌ are
+actually 🟠 — `simd_avx2.rs` ships them via the `avx2_int_type!` macro
+as scalar-storage structs that match the AVX-512 API surface. The
+arithmetic is plain Rust under the hood; only the FLOAT wrappers in
+this column are true two-half SIMD composites. Filling in real AVX2
+vectorization for the int wrappers is its own piece of tech debt
+tracked as TD-SIMD-3.)
 
 | Lane type | `simd_avx512` (v4) | `simd_avx2` (v3) | `simd_neon` (aarch64) | `simd_nightly` | `scalar` |
 |---|---|---|---|---|---|
 | `F32x16` | ✅ `__m512` | 🟡 `(f32x8, f32x8)` | 🟡 `[float32x4_t; 4]` | 🔵 `core::simd::f32x16` | ✅ `[f32; 16]` |
 | `F32x8` | ✅ `__m256` | ❌ | ⛔ | 🔵 | ✅ |
 | `F64x8` | ✅ `__m512d` | 🟡 `(f64x4, f64x4)` | 🟡 `[float64x2_t; 4]` | 🔵 | ✅ |
 | `F64x4` | ✅ `__m256d` | ❌ | ⛔ | 🔵 | ✅ |
-| `U8x64` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
+| `U8x64` | ✅ `__m512i` | 🟠 `[u8; 64]` polyfill | ❌ | 🔵 | ✅ |
 | `U8x32` | ✅ `__m256i` | ✅ `__m256i` | ❌ | 🔵 | ✅ |
-| `U16x32` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
-| `U32x16` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
-| `U64x8` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
-| `I8x32` | ✅ `__m256i` | ❌ | ❌ | 🔵 | ✅ |
-| `I8x64` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
-| `I16x16` | ✅ `__m256i` | ❌ | ❌ | 🔵 | ✅ |
-| `I16x32` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
-| `I32x16` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
-| `I64x8` | ✅ `__m512i` | ❌ | ❌ | 🔵 | ✅ |
+| `U16x32` | ✅ `__m512i` | 🟠 `[u16; 32]` polyfill | ❌ | 🔵 | ✅ |
+| `U32x16` | ✅ `__m512i` | 🟠 `[u32; 16]` polyfill | ❌ | 🔵 | ✅ |
+| `U64x8` | ✅ `__m512i` | 🟠 `[u64; 8]` polyfill | ❌ | 🔵 | ✅ |
+| `I8x32` | ✅ `__m256i` | ✅ `__m256i` (in `simd_avx512`) | ❌ | 🔵 | ✅ |
+| `I8x64` | ✅ `__m512i` | 🟠 `[i8; 64]` polyfill | ❌ | 🔵 | ✅ |
+| `I16x16` | ✅ `__m256i` | ✅ `__m256i` (in `simd_avx512`) | ❌ | 🔵 | ✅ |
+| `I16x32` | ✅ `__m512i` | 🟠 `[i16; 32]` polyfill | ❌ | 🔵 | ✅ |
+| `I32x16` | ✅ `__m512i` | 🟠 `[i32; 16]` polyfill | ❌ | 🔵 | ✅ |
+| `I64x8` | ✅ `__m512i` | 🟠 `[i64; 8]` polyfill | ❌ | 🔵 | ✅ |
 | `BF16x8` | ✅ `__m128bh` | ❌ | ❌ | 🔵 | ✅ |
 | `BF16x16` | ✅ `__m256bh` | ❌ | ❌ | 🔵 | ✅ |
 | `F16x16` | ❌ | 🟡 `F16Scaler` (scalar) | ❌ | 🔵 | ✅ |
diff --git a/src/simd.rs b/src/simd.rs
@@ -210,23 +210,21 @@ pub const PREFERRED_I16_LANES: usize = 16;
 //   * aarch64 → simd_neon backend.
 //   * everything else (wasm32, riscv, etc.) → scalar fallback.
 
-// Note on the `nightly-simd` feature: it adds the `crate::simd_nightly`
-// module (a portable-simd backend wrapping `core::simd`) but does NOT
-// replace the intrinsics dispatch below. The polyfill ships full
-// type-parity with production (PR #146): 24 types covering F32x8/16,
-// F64x4/8, BF16x8/16, F16x16, I8x32/64, I16x16/32, I32x16, I64x8,
-// U8x32/64, U16x32, U32x8/16, U64x4/8, plus the F32/F64 mask types —
-// matches the 24 types defined in `simd_avx2.rs` + `simd_avx512.rs`.
-// Consumers who want miri-runnable SIMD code import from `simd_nightly`
-// explicitly today (e.g. `use ndarray::simd_nightly::F32x16`).
-//
-// The remaining work for Miri-clean coverage of `hpc::*` is wiring this
-// file's `pub use crate::simd_{avx512,avx2,neon}::*` re-exports to
-// route through `simd_nightly` under `cfg(miri)`. Once that lands,
-// every `use crate::simd::F32x16` call site becomes miri-checkable
-// without source changes. The polyfill itself is no longer the bottleneck.
+// Nightly-simd dispatch — when `feature = "nightly-simd"` is on, the
+// `crate::simd_nightly` portable backend (wrapping `core::simd::*`)
+// REPLACES the type re-export arms below (each of them is gated on
+// `not(feature = "nightly-simd")`). Opt in at compile time via
+// `cargo +nightly --features nightly-simd ...` and the same
+// `use crate::simd::F32x16` call sites become miri-runnable. No
+// target_arch constraint — `core::simd` is portable, so this arm is the
+// only type backend on wasm32 / riscv / aarch64 / x86_64 alike.
+#[cfg(feature = "nightly-simd")]
+pub use crate::simd_nightly::{
+    BF16x16, BF16x8, F16x16, F32Mask16, F32Mask8, F32x16, F32x8, F64Mask4, F64Mask8, F64x4, F64x8, I16x16, I16x32,
+    I32x16, I64x8, I8x32, I8x64, U16x32, U32x16, U32x8, U64x4, U64x8, U8x32, U8x64,
+};
 
-#[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512f", not(feature = "nightly-simd")))]
 pub use crate::simd_avx512::{
     f32x16,
     f32x8,
@@ -276,7 +274,7 @@ pub use crate::simd_avx512::{bf16_to_f32_batch, bf16_to_f32_scalar, f32_to_bf16_
 #[cfg(target_arch = "x86_64")]
 pub use crate::simd_avx512::{f32_to_bf16_batch_rne, f32_to_bf16_scalar_rne};
 // BF16 SIMD types only available when avx512bf16 is enabled at compile time
-#[cfg(all(target_arch = "x86_64", target_feature = "avx512bf16"))]
+#[cfg(all(target_arch = "x86_64", target_feature = "avx512bf16", not(feature = "nightly-simd")))]
 pub use crate::simd_avx512::{BF16x16, BF16x8};
 
 // AVX2 baseline arm — selected by the `x86-64-v3` cargo default. The
@@ -290,10 +288,10 @@ pub use crate::simd_avx512::{BF16x16, BF16x8};
 // `RUSTFLAGS="-D warnings"` env, which overrides our v3 config.toml,
 // landing on x86-64 baseline → the previous tighter `avx2` predicate
 // left no matching arm).
-#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f"), not(feature = "nightly-simd")))]
 pub use crate::simd_avx512::{f32x8, f64x4, i16x16, i8x32, F32x8, F64x4, I16x16, I8x32};
 
-#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f")))]
+#[cfg(all(target_arch = "x86_64", not(target_feature = "avx512f"), not(feature = "nightly-simd")))]
 pub use crate::simd_avx2::{
     f32x16, f64x8, i16x32, i32x16, i64x8, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16, F64Mask8, F64x8, I16x32,
     I32x16, I64x8, I8x64, U16x32, U32x16, U64x8, U8x64,
@@ -304,14 +302,14 @@ pub use crate::simd_avx2::{
 // AVX2 ops, and on AVX-512 builds it's the half-register companion to
 // U8x64. Lives in simd_avx2.rs (single source of truth) and is re-exported
 // from both tier branches.
-#[cfg(target_arch = "x86_64")]
+#[cfg(all(target_arch = "x86_64", not(feature = "nightly-simd")))]
 pub use crate::simd_avx2::{u8x32, U8x32};
 
 // ============================================================================
 // Non-x86: scalar fallback types with identical API
 // ============================================================================
 
-#[cfg(not(target_arch = "x86_64"))]
+#[cfg(all(not(target_arch = "x86_64"), not(feature = "nightly-simd")))]
 pub(crate) mod scalar {
     use core::fmt;
     use core::ops::{
@@ -1587,15 +1585,15 @@ pub(crate) mod scalar {
 // in simd_neon::aarch64_simd (verified 2026-04-30, agent A7 — burn parity item 9).
 // Integer + 256-bit float types still come from the scalar fallback; they're
 // not on the critical path for f32 BLAS-1 / VML kernels.
-#[cfg(target_arch = "aarch64")]
+#[cfg(all(target_arch = "aarch64", not(feature = "nightly-simd")))]
 pub use crate::simd_neon::aarch64_simd::{f32x16, f64x8, F32Mask16, F32x16, F64Mask8, F64x8};
-#[cfg(target_arch = "aarch64")]
+#[cfg(all(target_arch = "aarch64", not(feature = "nightly-simd")))]
 pub use scalar::{
     f32x8, f64x4, i32x16, i64x8, u32x16, u64x8, u8x64, F32x8, F64x4, I32x16, I64x8, U16x32, U32x16, U64x8, U8x64,
 };
 
 // Other non-x86 targets (wasm, riscv, etc.): full scalar fallback.
-#[cfg(all(not(target_arch = "x86_64"), not(target_arch = "aarch64")))]
+#[cfg(all(not(target_arch = "x86_64"), not(target_arch = "aarch64"), not(feature = "nightly-simd")))]
 pub use scalar::{
     f32x16, f32x8, f64x4, f64x8, i16x16, i16x32, i32x16, i64x8, i8x32, i8x64, u32x16, u64x8, u8x64, F32Mask16, F32x16,
     F32x8, F64Mask8, F64x4, F64x8, I16x16, I16x32, I32x16, I64x8, I8x32, I8x64, U16x32, U32x16, U64x8, U8x64,