3131
3232use std:: sync:: Arc ;
3333
34+ // Typed lane primitives — dispatched through `crate::simd::*`, which
35+ // re-exports the right backend (AVX-512 / NEON / scalar) per `cfg`. Per
36+ // the W1a layering rule, `simd_soa.rs` MUST go through `crate::simd::`
37+ // rather than dipping into `simd_avx512` / `simd_neon` / `scalar` directly.
38+ use crate :: simd:: { F32x16 , F64x8 , U64x8 , U8x64 } ;
39+
40+ // Endian-correct `&[u8; 4]` → `f32` / `&[u8; 8]` → `f64`/`u64` helpers.
41+ // `f32::from_le_bytes` is intrinsically optimised to a single load on
42+ // little-endian targets (x86_64, aarch64, wasm32), so this scalar
43+ // `from_fn` loop compiles to the same instruction stream as a
44+ // `bytemuck::cast`-style reinterpret — without requiring a new workspace
45+ // dep and without the alignment risk of pointer-casting `Arc<[u8]>`
46+ // (which is only `u8`-aligned in stable Rust).
47+ #[ inline( always) ]
48+ fn f32x16_from_chunk ( chunk : & [ u8 ; 64 ] ) -> F32x16 {
49+ let arr: [ f32 ; 16 ] = core:: array:: from_fn ( |i| {
50+ let off = i * 4 ;
51+ f32:: from_le_bytes ( [ chunk[ off] , chunk[ off + 1 ] , chunk[ off + 2 ] , chunk[ off + 3 ] ] )
52+ } ) ;
53+ F32x16 :: from_array ( arr)
54+ }
55+
56+ #[ inline( always) ]
57+ fn f64x8_from_chunk ( chunk : & [ u8 ; 64 ] ) -> F64x8 {
58+ let arr: [ f64 ; 8 ] = core:: array:: from_fn ( |i| {
59+ let off = i * 8 ;
60+ f64:: from_le_bytes ( [
61+ chunk[ off] ,
62+ chunk[ off + 1 ] ,
63+ chunk[ off + 2 ] ,
64+ chunk[ off + 3 ] ,
65+ chunk[ off + 4 ] ,
66+ chunk[ off + 5 ] ,
67+ chunk[ off + 6 ] ,
68+ chunk[ off + 7 ] ,
69+ ] )
70+ } ) ;
71+ F64x8 :: from_array ( arr)
72+ }
73+
74+ #[ inline( always) ]
75+ fn u64x8_from_chunk ( chunk : & [ u8 ; 64 ] ) -> U64x8 {
76+ let arr: [ u64 ; 8 ] = core:: array:: from_fn ( |i| {
77+ let off = i * 8 ;
78+ u64:: from_le_bytes ( [
79+ chunk[ off] ,
80+ chunk[ off + 1 ] ,
81+ chunk[ off + 2 ] ,
82+ chunk[ off + 3 ] ,
83+ chunk[ off + 4 ] ,
84+ chunk[ off + 5 ] ,
85+ chunk[ off + 6 ] ,
86+ chunk[ off + 7 ] ,
87+ ] )
88+ } ) ;
89+ U64x8 :: from_array ( arr)
90+ }
91+
3492// ════════════════════════════════════════════════════════════════════
3593// MultiLaneColumn — Arc<[u8]> carrier with typed lane-width chunk iters
3694// ════════════════════════════════════════════════════════════════════
@@ -125,48 +183,52 @@ impl MultiLaneColumn {
125183 & self . data
126184 }
127185
128- /// Iterate the column as contiguous `&[u8; 64]` windows (`U8x64` shape).
129- ///
130- /// Each window is exactly 64 bytes — one AVX-512 `U8x64` register load.
131- /// Zero-copy: each window is a reference into the backing store.
186+ /// Iterate the column as typed [`U8x64`] values dispatched via
187+ /// `crate::simd::*` (AVX-512 / NEON / scalar per `cfg`).
132188 ///
133- /// Feed each window into `U8x64::from_array(*win)` or
134- /// `crate::simd::U8x64::from_slice(win)` inside the consumer's loop.
189+ /// Each yielded value is one register-width load over a 64-byte chunk
190+ /// of the backing store. The construction is zero-cost on every backend:
191+ /// `U8x64::from_array(*chunk)` is a single move on AVX-512, a paired
192+ /// LD2 on NEON, and a memcpy on the scalar fallback.
135193 ///
136194 /// # Examples
137195 ///
138196 /// ```
139- /// use ndarray::simd::MultiLaneColumn;
197+ /// use ndarray::simd::{ MultiLaneColumn, U8x64} ;
140198 /// use std::sync::Arc;
141199 ///
142200 /// let data: Arc<[u8]> = Arc::from((0u8..128).collect::<Vec<_>>());
143201 /// let col = MultiLaneColumn::new(data).unwrap();
144- /// let windows : Vec<&[u8; 64] > = col.iter_u8x64().collect();
145- /// assert_eq!(windows .len(), 2);
146- /// assert_eq!(windows [0][0], 0u8);
147- /// assert_eq!(windows [1][0], 64u8);
202+ /// let lanes : Vec<U8x64 > = col.iter_u8x64().collect();
203+ /// assert_eq!(lanes .len(), 2);
204+ /// assert_eq!(lanes [0].to_array() [0], 0u8);
205+ /// assert_eq!(lanes [1].to_array() [0], 64u8);
148206 /// ```
149- pub fn iter_u8x64 ( & self ) -> impl Iterator < Item = & [ u8 ; 64 ] > + ' _ {
150- self . data . as_chunks :: < 64 > ( ) . 0 . iter ( )
207+ pub fn iter_u8x64 ( & self ) -> impl Iterator < Item = U8x64 > + ' _ {
208+ self . data . as_chunks :: < 64 > ( ) . 0 . iter ( ) . map ( |chunk| U8x64 :: from_array ( * chunk ) )
151209 }
152210
153- /// Iterate the column as `&[u8; 64]` windows reinterpreted as `[f32; 16]`-shape.
211+ /// Iterate the column as typed [`F32x16`] values dispatched via
212+ /// `crate::simd::*` (AVX-512 / NEON / scalar per `cfg`).
154213 ///
155- /// The bytes are NOT converted — same memory, different lane width.
156- /// Consumer is responsible for using `F32x16::from_array(bytemuck::cast(*win))`
157- /// or equivalent typed reinterpretation.
158- pub fn iter_f32x16_bytes ( & self ) -> impl Iterator < Item = & [ u8 ; 64 ] > + ' _ {
159- self . data . as_chunks :: < 64 > ( ) . 0 . iter ( )
214+ /// Bytes are decoded little-endian. On LE targets (x86_64, aarch64,
215+ /// wasm32) the `f32::from_le_bytes` loop optimises to a register-width
216+ /// load equivalent to a `bytemuck::cast`-style reinterpret, without the
217+ /// alignment risk of pointer-casting `Arc<[u8]>` (which is `u8`-aligned).
218+ pub fn iter_f32x16 ( & self ) -> impl Iterator < Item = F32x16 > + ' _ {
219+ self . data . as_chunks :: < 64 > ( ) . 0 . iter ( ) . map ( f32x16_from_chunk)
160220 }
161221
162- /// Iterate the column as `&[u8; 64]` windows reinterpreted as `[f64; 8]`-shape.
163- pub fn iter_f64x8_bytes ( & self ) -> impl Iterator < Item = & [ u8 ; 64 ] > + ' _ {
164- self . data . as_chunks :: < 64 > ( ) . 0 . iter ( )
222+ /// Iterate the column as typed [`F64x8`] values dispatched via
223+ /// `crate::simd::*`.
224+ pub fn iter_f64x8 ( & self ) -> impl Iterator < Item = F64x8 > + ' _ {
225+ self . data . as_chunks :: < 64 > ( ) . 0 . iter ( ) . map ( f64x8_from_chunk)
165226 }
166227
167- /// Iterate the column as `&[u8; 64]` windows reinterpreted as `[u64; 8]`-shape.
168- pub fn iter_u64x8_bytes ( & self ) -> impl Iterator < Item = & [ u8 ; 64 ] > + ' _ {
169- self . data . as_chunks :: < 64 > ( ) . 0 . iter ( )
228+ /// Iterate the column as typed [`U64x8`] values dispatched via
229+ /// `crate::simd::*`.
230+ pub fn iter_u64x8 ( & self ) -> impl Iterator < Item = U64x8 > + ' _ {
231+ self . data . as_chunks :: < 64 > ( ) . 0 . iter ( ) . map ( u64x8_from_chunk)
170232 }
171233}
172234
@@ -198,14 +260,14 @@ mod tests {
198260 }
199261
#[test]
fn empty_buffer_yields_zero_lanes() {
    // A zero-length buffer is accepted by `new` and must produce no lanes
    // from any of the typed iterators.
    let no_bytes: Vec<u8> = Vec::new();
    let col = MultiLaneColumn::new(Arc::from(no_bytes)).unwrap();

    assert!(col.is_empty());
    assert_eq!(col.len_bytes(), 0);

    assert_eq!(col.iter_u8x64().count(), 0);
    assert_eq!(col.iter_f32x16().count(), 0);
    assert_eq!(col.iter_f64x8().count(), 0);
    assert_eq!(col.iter_u64x8().count(), 0);
}
210272
211273 #[ test]
@@ -215,12 +277,14 @@ mod tests {
215277 v[ i] = i as u8 ;
216278 }
217279 let col = MultiLaneColumn :: new ( Arc :: from ( v) ) . unwrap ( ) ;
218- let windows: Vec < & [ u8 ; 64 ] > = col. iter_u8x64 ( ) . collect ( ) ;
219- assert_eq ! ( windows. len( ) , 2 ) ;
220- assert_eq ! ( windows[ 0 ] [ 0 ] , 0u8 ) ;
221- assert_eq ! ( windows[ 0 ] [ 63 ] , 63u8 ) ;
222- assert_eq ! ( windows[ 1 ] [ 0 ] , 64u8 ) ;
223- assert_eq ! ( windows[ 1 ] [ 63 ] , 127u8 ) ;
280+ let lanes: Vec < U8x64 > = col. iter_u8x64 ( ) . collect ( ) ;
281+ assert_eq ! ( lanes. len( ) , 2 ) ;
282+ let a0 = lanes[ 0 ] . to_array ( ) ;
283+ let a1 = lanes[ 1 ] . to_array ( ) ;
284+ assert_eq ! ( a0[ 0 ] , 0u8 ) ;
285+ assert_eq ! ( a0[ 63 ] , 63u8 ) ;
286+ assert_eq ! ( a1[ 0 ] , 64u8 ) ;
287+ assert_eq ! ( a1[ 63 ] , 127u8 ) ;
224288 }
225289
226290 #[ test]
@@ -235,27 +299,51 @@ mod tests {
235299 }
236300
#[test]
fn iter_f32x16_le_round_trip() {
    // Serialise sixteen f32 values as little-endian bytes (64 bytes in
    // total), then check that iter_f32x16 decodes them back in lane order.
    let src: [f32; 16] = core::array::from_fn(|i| i as f32 * 0.25 - 1.0);
    let bytes: Vec<u8> = src.iter().flat_map(|v| v.to_le_bytes()).collect();

    let col = MultiLaneColumn::new(Arc::from(bytes)).unwrap();
    let mut lanes = col.iter_f32x16();
    let lane = lanes.next().expect("one lane");
    assert_eq!(lane.to_array(), src);
}
314+
#[test]
fn iter_f64x8_le_round_trip() {
    // Serialise eight f64 values as little-endian bytes (64 bytes in
    // total), then check that iter_f64x8 decodes them back in lane order.
    let src: [f64; 8] = core::array::from_fn(|i| (i as f64).sin());
    let bytes: Vec<u8> = src.iter().flat_map(|v| v.to_le_bytes()).collect();

    let col = MultiLaneColumn::new(Arc::from(bytes)).unwrap();
    let mut lanes = col.iter_f64x8();
    let lane = lanes.next().expect("one lane");
    assert_eq!(lane.to_array(), src);
}
241326
242- let u8_wins: Vec < & [ u8 ; 64 ] > = col. iter_u8x64 ( ) . collect ( ) ;
243- let f32_wins: Vec < & [ u8 ; 64 ] > = col. iter_f32x16_bytes ( ) . collect ( ) ;
244- let f64_wins: Vec < & [ u8 ; 64 ] > = col. iter_f64x8_bytes ( ) . collect ( ) ;
245- let u64_wins: Vec < & [ u8 ; 64 ] > = col. iter_u64x8_bytes ( ) . collect ( ) ;
246-
247- assert_eq ! ( u8_wins. len( ) , 3 ) ;
248- assert_eq ! ( f32_wins. len( ) , 3 ) ;
249- assert_eq ! ( f64_wins. len( ) , 3 ) ;
250- assert_eq ! ( u64_wins. len( ) , 3 ) ;
251-
252- for i in 0 ..3 {
253- assert_eq ! ( u8_wins[ i] . as_ptr( ) , f32_wins[ i] . as_ptr( ) ) ;
254- assert_eq ! ( u8_wins[ i] . as_ptr( ) , f64_wins[ i] . as_ptr( ) ) ;
255- assert_eq ! ( u8_wins[ i] . as_ptr( ) , u64_wins[ i] . as_ptr( ) ) ;
256- assert_eq ! ( u8_wins[ i] [ 0 ] , ( i as u8 ) * 64 ) ;
257- assert_eq ! ( u8_wins[ i] [ 63 ] , ( i as u8 ) * 64 + 63 ) ;
#[test]
fn iter_u64x8_le_round_trip() {
    // Serialise eight u64 values as little-endian bytes (64 bytes in
    // total), then check that iter_u64x8 decodes them back in lane order.
    // The multiplier gives every byte position a distinct value, so a
    // byte-order mistake would change the decoded lanes.
    let src: [u64; 8] = core::array::from_fn(|i| (i as u64 + 1) * 0x0123_4567_89AB_CDEF);
    let bytes: Vec<u8> = src.iter().flat_map(|v| v.to_le_bytes()).collect();

    let col = MultiLaneColumn::new(Arc::from(bytes)).unwrap();
    let mut lanes = col.iter_u64x8();
    let lane = lanes.next().expect("one lane");
    assert_eq!(lane.to_array(), src);
}
338+
#[test]
fn typed_iters_yield_three_lanes_over_192_bytes() {
    // 192 bytes is three 64-byte chunks, so every typed iterator must
    // yield the same number of lanes regardless of lane width.
    const EXPECTED_LANES: usize = 3;

    let bytes: Vec<u8> = (0u8..192).collect();
    let col = MultiLaneColumn::new(Arc::from(bytes)).unwrap();

    assert_eq!(col.iter_u8x64().count(), EXPECTED_LANES);
    assert_eq!(col.iter_f32x16().count(), EXPECTED_LANES);
    assert_eq!(col.iter_f64x8().count(), EXPECTED_LANES);
    assert_eq!(col.iter_u64x8().count(), EXPECTED_LANES);
}
260348
261349 #[ test]
0 commit comments