Skip to content

Commit 6b52a46

Browse files
committed
refactor(simd_soa): iterators yield typed lanes via crate::simd::*
Per the layering rule: `simd_soa.rs` MUST consume the typed lane primitives through `crate::simd::*` (which dispatches to AVX-512 / NEON / scalar per `cfg`). The earlier "shape iterator" approach returned raw `&[u8; 64]` and deferred typing to the consumer — that was the wrong layering boundary.

- iter_u8x64 -> impl Iterator<Item = U8x64>
- iter_f32x16 -> impl Iterator<Item = F32x16> (was iter_f32x16_bytes)
- iter_f64x8 -> impl Iterator<Item = F64x8> (was iter_f64x8_bytes)
- iter_u64x8 -> impl Iterator<Item = U64x8> (was iter_u64x8_bytes)

The byte-to-typed conversion uses `core::array::from_fn` + `f32::from_le_bytes` / `f64::from_le_bytes` / `u64::from_le_bytes`. On LE targets the compiler folds this into a single register-width load — equivalent to a `bytemuck::cast` reinterpret but without requiring a new workspace dep and without the alignment risk of pointer-casting `Arc<[u8]>` (which is only `u8`-aligned on stable).

Tests:
- replaces `bytes_shape_iterators_alias_u8x64` (no longer meaningful — iterators yield distinct typed values)
- adds `iter_f32x16_le_round_trip` (writes 16 known f32 values, reads them back as F32x16)
- adds `iter_f64x8_le_round_trip`
- adds `iter_u64x8_le_round_trip`
- adds `typed_iters_yield_three_lanes_over_192_bytes` (count invariant across all four typed iterators)
1 parent d64c5e0 commit 6b52a46

1 file changed

Lines changed: 142 additions & 54 deletions

File tree

src/simd_soa.rs

Lines changed: 142 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,64 @@
3131
3232
use std::sync::Arc;
3333

34+
// Typed lane primitives — dispatched through `crate::simd::*`, which
35+
// re-exports the right backend (AVX-512 / NEON / scalar) per `cfg`. Per
36+
// the W1a layering rule, `simd_soa.rs` MUST go through `crate::simd::`
37+
// rather than dipping into `simd_avx512` / `simd_neon` / `scalar` directly.
38+
use crate::simd::{F32x16, F64x8, U64x8, U8x64};
39+
40+
// Endian-correct `&[u8; 4]` → `f32` / `&[u8; 8]` → `f64`/`u64` helpers.
41+
// `f32::from_le_bytes` is intrinsically optimised to a single load on
42+
// little-endian targets (x86_64, aarch64, wasm32), so this scalar
43+
// `from_fn` loop compiles to the same instruction stream as a
44+
// `bytemuck::cast`-style reinterpret — without requiring a new workspace
45+
// dep and without the alignment risk of pointer-casting `Arc<[u8]>`
46+
// (which is only `u8`-aligned in stable Rust).
47+
#[inline(always)]
48+
fn f32x16_from_chunk(chunk: &[u8; 64]) -> F32x16 {
49+
let arr: [f32; 16] = core::array::from_fn(|i| {
50+
let off = i * 4;
51+
f32::from_le_bytes([chunk[off], chunk[off + 1], chunk[off + 2], chunk[off + 3]])
52+
});
53+
F32x16::from_array(arr)
54+
}
55+
56+
#[inline(always)]
57+
fn f64x8_from_chunk(chunk: &[u8; 64]) -> F64x8 {
58+
let arr: [f64; 8] = core::array::from_fn(|i| {
59+
let off = i * 8;
60+
f64::from_le_bytes([
61+
chunk[off],
62+
chunk[off + 1],
63+
chunk[off + 2],
64+
chunk[off + 3],
65+
chunk[off + 4],
66+
chunk[off + 5],
67+
chunk[off + 6],
68+
chunk[off + 7],
69+
])
70+
});
71+
F64x8::from_array(arr)
72+
}
73+
74+
#[inline(always)]
75+
fn u64x8_from_chunk(chunk: &[u8; 64]) -> U64x8 {
76+
let arr: [u64; 8] = core::array::from_fn(|i| {
77+
let off = i * 8;
78+
u64::from_le_bytes([
79+
chunk[off],
80+
chunk[off + 1],
81+
chunk[off + 2],
82+
chunk[off + 3],
83+
chunk[off + 4],
84+
chunk[off + 5],
85+
chunk[off + 6],
86+
chunk[off + 7],
87+
])
88+
});
89+
U64x8::from_array(arr)
90+
}
91+
3492
// ════════════════════════════════════════════════════════════════════
3593
// MultiLaneColumn — Arc<[u8]> carrier with typed lane-width chunk iters
3694
// ════════════════════════════════════════════════════════════════════
@@ -125,48 +183,52 @@ impl MultiLaneColumn {
125183
&self.data
126184
}
127185

128-
/// Iterate the column as contiguous `&[u8; 64]` windows (`U8x64` shape).
129-
///
130-
/// Each window is exactly 64 bytes — one AVX-512 `U8x64` register load.
131-
/// Zero-copy: each window is a reference into the backing store.
186+
/// Iterate the column as typed [`U8x64`] values dispatched via
187+
/// `crate::simd::*` (AVX-512 / NEON / scalar per `cfg`).
132188
///
133-
/// Feed each window into `U8x64::from_array(*win)` or
134-
/// `crate::simd::U8x64::from_slice(win)` inside the consumer's loop.
189+
/// Each yielded value is a by-value copy of one 64-byte chunk of the
190+
/// backing store, built with `U8x64::from_array(*chunk)`. The copy is
191+
/// 64 contiguous bytes in lane order; how it is lowered (vector load,
192+
/// register move, or memcpy) is up to the active backend and compiler.
135193
///
136194
/// # Examples
137195
///
138196
/// ```
139-
/// use ndarray::simd::MultiLaneColumn;
197+
/// use ndarray::simd::{MultiLaneColumn, U8x64};
140198
/// use std::sync::Arc;
141199
///
142200
/// let data: Arc<[u8]> = Arc::from((0u8..128).collect::<Vec<_>>());
143201
/// let col = MultiLaneColumn::new(data).unwrap();
144-
/// let windows: Vec<&[u8; 64]> = col.iter_u8x64().collect();
145-
/// assert_eq!(windows.len(), 2);
146-
/// assert_eq!(windows[0][0], 0u8);
147-
/// assert_eq!(windows[1][0], 64u8);
202+
/// let lanes: Vec<U8x64> = col.iter_u8x64().collect();
203+
/// assert_eq!(lanes.len(), 2);
204+
/// assert_eq!(lanes[0].to_array()[0], 0u8);
205+
/// assert_eq!(lanes[1].to_array()[0], 64u8);
148206
/// ```
149-
pub fn iter_u8x64(&self) -> impl Iterator<Item = &[u8; 64]> + '_ {
150-
self.data.as_chunks::<64>().0.iter()
207+
pub fn iter_u8x64(&self) -> impl Iterator<Item = U8x64> + '_ {
208+
self.data.as_chunks::<64>().0.iter().map(|chunk| U8x64::from_array(*chunk))
151209
}
152210

153-
/// Iterate the column as `&[u8; 64]` windows reinterpreted as `[f32; 16]`-shape.
211+
/// Iterate the column as typed [`F32x16`] values dispatched via
212+
/// `crate::simd::*` (AVX-512 / NEON / scalar per `cfg`).
154213
///
155-
/// The bytes are NOT converted — same memory, different lane width.
156-
/// Consumer is responsible for using `F32x16::from_array(bytemuck::cast(*win))`
157-
/// or equivalent typed reinterpretation.
158-
pub fn iter_f32x16_bytes(&self) -> impl Iterator<Item = &[u8; 64]> + '_ {
159-
self.data.as_chunks::<64>().0.iter()
214+
/// Bytes are decoded little-endian. On LE targets (x86_64, aarch64,
215+
/// wasm32) the `f32::from_le_bytes` loop optimises to a register-width
216+
/// load equivalent to a `bytemuck::cast`-style reinterpret, without the
217+
/// alignment risk of pointer-casting `Arc<[u8]>` (which is `u8`-aligned).
218+
pub fn iter_f32x16(&self) -> impl Iterator<Item = F32x16> + '_ {
219+
self.data.as_chunks::<64>().0.iter().map(f32x16_from_chunk)
160220
}
161221

162-
/// Iterate the column as `&[u8; 64]` windows reinterpreted as `[f64; 8]`-shape.
163-
pub fn iter_f64x8_bytes(&self) -> impl Iterator<Item = &[u8; 64]> + '_ {
164-
self.data.as_chunks::<64>().0.iter()
222+
/// Iterate the column as typed [`F64x8`] values dispatched via
223+
/// `crate::simd::*`.
224+
pub fn iter_f64x8(&self) -> impl Iterator<Item = F64x8> + '_ {
225+
self.data.as_chunks::<64>().0.iter().map(f64x8_from_chunk)
165226
}
166227

167-
/// Iterate the column as `&[u8; 64]` windows reinterpreted as `[u64; 8]`-shape.
168-
pub fn iter_u64x8_bytes(&self) -> impl Iterator<Item = &[u8; 64]> + '_ {
169-
self.data.as_chunks::<64>().0.iter()
228+
/// Iterate the column as typed [`U64x8`] values dispatched via
229+
/// `crate::simd::*`.
230+
pub fn iter_u64x8(&self) -> impl Iterator<Item = U64x8> + '_ {
231+
self.data.as_chunks::<64>().0.iter().map(u64x8_from_chunk)
170232
}
171233
}
172234

@@ -198,14 +260,14 @@ mod tests {
198260
}
199261

200262
#[test]
201-
fn empty_buffer_yields_zero_windows() {
263+
fn empty_buffer_yields_zero_lanes() {
202264
let col = MultiLaneColumn::new(Arc::from(vec![0u8; 0])).unwrap();
203265
assert!(col.is_empty());
204266
assert_eq!(col.len_bytes(), 0);
205267
assert_eq!(col.iter_u8x64().count(), 0);
206-
assert_eq!(col.iter_f32x16_bytes().count(), 0);
207-
assert_eq!(col.iter_f64x8_bytes().count(), 0);
208-
assert_eq!(col.iter_u64x8_bytes().count(), 0);
268+
assert_eq!(col.iter_f32x16().count(), 0);
269+
assert_eq!(col.iter_f64x8().count(), 0);
270+
assert_eq!(col.iter_u64x8().count(), 0);
209271
}
210272

211273
#[test]
@@ -215,12 +277,14 @@ mod tests {
215277
v[i] = i as u8;
216278
}
217279
let col = MultiLaneColumn::new(Arc::from(v)).unwrap();
218-
let windows: Vec<&[u8; 64]> = col.iter_u8x64().collect();
219-
assert_eq!(windows.len(), 2);
220-
assert_eq!(windows[0][0], 0u8);
221-
assert_eq!(windows[0][63], 63u8);
222-
assert_eq!(windows[1][0], 64u8);
223-
assert_eq!(windows[1][63], 127u8);
280+
let lanes: Vec<U8x64> = col.iter_u8x64().collect();
281+
assert_eq!(lanes.len(), 2);
282+
let a0 = lanes[0].to_array();
283+
let a1 = lanes[1].to_array();
284+
assert_eq!(a0[0], 0u8);
285+
assert_eq!(a0[63], 63u8);
286+
assert_eq!(a1[0], 64u8);
287+
assert_eq!(a1[63], 127u8);
224288
}
225289

226290
#[test]
@@ -235,27 +299,51 @@ mod tests {
235299
}
236300

237301
#[test]
238-
fn bytes_shape_iterators_alias_u8x64() {
239-
let v: Vec<u8> = (0u8..192).collect();
240-
let col = MultiLaneColumn::new(Arc::from(v)).unwrap();
302+
fn iter_f32x16_le_round_trip() {
303+
// Build a buffer of 16 f32 values laid out little-endian, then
304+
// verify iter_f32x16 reads them back in order.
305+
let src: [f32; 16] = core::array::from_fn(|i| i as f32 * 0.25 - 1.0);
306+
let mut bytes = vec![0u8; 64];
307+
for (i, &v) in src.iter().enumerate() {
308+
bytes[i * 4..i * 4 + 4].copy_from_slice(&v.to_le_bytes());
309+
}
310+
let col = MultiLaneColumn::new(Arc::from(bytes)).unwrap();
311+
let lane = col.iter_f32x16().next().expect("one lane");
312+
assert_eq!(lane.to_array(), src);
313+
}
314+
315+
#[test]
fn iter_f64x8_le_round_trip() {
    // Serialise 8 known f64 values as little-endian bytes (64 bytes in
    // total), then check that iter_f64x8 decodes them back in lane order.
    let expected: [f64; 8] = core::array::from_fn(|i| (i as f64).sin());
    let bytes: Vec<u8> = expected.iter().flat_map(|v| v.to_le_bytes()).collect();

    let col = MultiLaneColumn::new(Arc::from(bytes)).unwrap();
    let mut lanes = col.iter_f64x8();
    let first = lanes.next().expect("one lane");
    assert_eq!(first.to_array(), expected);
}
241326

242-
let u8_wins: Vec<&[u8; 64]> = col.iter_u8x64().collect();
243-
let f32_wins: Vec<&[u8; 64]> = col.iter_f32x16_bytes().collect();
244-
let f64_wins: Vec<&[u8; 64]> = col.iter_f64x8_bytes().collect();
245-
let u64_wins: Vec<&[u8; 64]> = col.iter_u64x8_bytes().collect();
246-
247-
assert_eq!(u8_wins.len(), 3);
248-
assert_eq!(f32_wins.len(), 3);
249-
assert_eq!(f64_wins.len(), 3);
250-
assert_eq!(u64_wins.len(), 3);
251-
252-
for i in 0..3 {
253-
assert_eq!(u8_wins[i].as_ptr(), f32_wins[i].as_ptr());
254-
assert_eq!(u8_wins[i].as_ptr(), f64_wins[i].as_ptr());
255-
assert_eq!(u8_wins[i].as_ptr(), u64_wins[i].as_ptr());
256-
assert_eq!(u8_wins[i][0], (i as u8) * 64);
257-
assert_eq!(u8_wins[i][63], (i as u8) * 64 + 63);
327+
#[test]
fn iter_u64x8_le_round_trip() {
    // Serialise 8 known u64 values as little-endian bytes (64 bytes in
    // total), then check that iter_u64x8 decodes them back in lane order.
    let expected: [u64; 8] = core::array::from_fn(|i| (i as u64 + 1) * 0x0123_4567_89AB_CDEF);
    let bytes: Vec<u8> = expected.iter().flat_map(|v| v.to_le_bytes()).collect();

    let col = MultiLaneColumn::new(Arc::from(bytes)).unwrap();
    let mut lanes = col.iter_u64x8();
    let first = lanes.next().expect("one lane");
    assert_eq!(first.to_array(), expected);
}
338+
339+
#[test]
fn typed_iters_yield_three_lanes_over_192_bytes() {
    // 192 bytes = 3 x 64, so every typed view must report exactly three lanes.
    let bytes = (0u8..192).collect::<Vec<u8>>();
    let col = MultiLaneColumn::new(Arc::from(bytes)).unwrap();

    let lane_counts = [
        col.iter_u8x64().count(),
        col.iter_f32x16().count(),
        col.iter_f64x8().count(),
        col.iter_u64x8().count(),
    ];
    assert_eq!(lane_counts, [3, 3, 3, 3]);
}
260348

261349
#[test]

0 commit comments

Comments
 (0)