AdaWorldAPI
diff --git a/src/hpc/array_window.rs b/src/hpc/array_window.rs
Lines changed: 35 additions & 18 deletions
diff --git a/src/hpc/column.rs b/src/hpc/column.rs
Lines changed: 125 additions & 38 deletions
@@ -8,14 +8,19 @@
 //!
 //! # Layering
 //!
-//! Lives in `hpc::array_window`, re-exported from `crate::simd::*` per the
-//! W1a consumer contract at
-//! `.claude/knowledge/vertical-simd-consumer-contract.md`.
+//! Lives in `hpc::array_window`; the `crate::simd::*` re-export lands in the
+//! PR-X1 re-export sweep (see `.claude/knowledge/pr-x1-design.md` § 4).
+//! Doctests therefore use the canonical `ndarray::hpc::array_window` path
+//! until the sweep ships.
 //!
 //! # Design reference
 //!
-//! `.claude/knowledge/pr-x1-design.md` § "3. `array_window`" — verbatim API
-//! surface; this file is the commented-out final form for the PR-X1 sprint.
+//! `.claude/knowledge/pr-x1-design.md` § "3. `array_window`". This module
+//! ships the **iterator-shape** variant (whole-buffer walk yielding all
+//! const-size windows). The design doc sketches a singular-window form
+//! (`array_window(slice, offset) -> &[T; N]`); the maintainer-blessed final
+//! shape is the iterator form here, which composes directly with SIMD-staged
+//! consumer loops and avoids per-call panic surface in tight inner loops.
 
 /// Walk `data` as a sequence of non-overlapping const-size windows.
 ///
@@ -29,7 +34,7 @@
 /// # Examples
 ///
 /// ```
-/// use ndarray::simd::array_window;
+/// use ndarray::hpc::array_window::array_window;
 /// let data: Vec<u8> = (0..16).collect();
 /// let windows: Vec<&[u8; 4]> = array_window::<u8, 4>(&data).collect();
 /// assert_eq!(windows.len(), 4);
@@ -40,18 +45,15 @@
 /// # Examples — tail discarded
 ///
 /// ```
-/// use ndarray::simd::array_window;
+/// use ndarray::hpc::array_window::array_window;
 /// let data: Vec<u8> = (0..7).collect();
 /// let windows: Vec<&[u8; 4]> = array_window::<u8, 4>(&data).collect();
 /// // 7 / 4 = 1 window; the trailing 3 items are dropped.
 /// assert_eq!(windows.len(), 1);
 /// ```
 #[inline]
-pub fn array_window<T, const N: usize>(_data: &[T]) -> impl Iterator<Item = &[T; N]> + '_ {
-    // Skeleton: `data.as_chunks::<N>().0.iter()` once that API stabilises,
-    // or a manual chunks loop yielding `<&[T] as TryInto<&[T; N]>>::try_into`.
-    // Implementation lands in the uncomment sprint.
-    core::iter::empty::<&[T; N]>()
+pub fn array_window<T, const N: usize>(data: &[T]) -> impl Iterator<Item = &[T; N]> + '_ {
+    data.as_chunks::<N>().0.iter() // `.0`: full `[T; N]` chunks; `.1` (the short tail) is dropped
 }
 
 /// Walk `data` as `&[T; N]` windows, returning `Err(())` if `data.len()`
@@ -64,7 +66,7 @@ pub fn array_window<T, const N: usize>(_data: &[T]) -> impl Iterator<Item = &[T;
 /// # Examples
 ///
 /// ```
-/// use ndarray::simd::array_window_checked;
+/// use ndarray::hpc::array_window::array_window_checked;
 /// let data: Vec<u8> = (0..16).collect();
 /// let it = array_window_checked::<u8, 4>(&data).expect("16 is a multiple of 4");
 /// assert_eq!(it.count(), 4);
@@ -93,30 +95,45 @@ mod tests {
     /// 16-element buffer yields four 4-wide windows.
     #[test]
     fn array_window_4_over_16() {
-        unimplemented!("PR-X1 test: collect into Vec<&[u8;4]>; assert windows.len() == 4 and contents match 0..16")
+        let data: Vec<u8> = (0u8..16).collect();
+        let windows: Vec<&[u8; 4]> = array_window::<u8, 4>(&data).collect();
+        assert_eq!(windows.len(), 4); // 16 / 4 with nothing left over
+        assert_eq!(windows[0], &[0, 1, 2, 3]); // windows follow input order ...
+        assert_eq!(windows[1], &[4, 5, 6, 7]); // ... each starting where the last one ended
+        assert_eq!(windows[2], &[8, 9, 10, 11]);
+        assert_eq!(windows[3], &[12, 13, 14, 15]); // last window ends on the final element
     }
 
     /// Tail items are silently discarded by `array_window`.
     #[test]
     fn array_window_drops_tail() {
-        unimplemented!("PR-X1 test: 7-element buffer over N=4 → 1 window; trailing 3 items dropped")
+        let data: Vec<u8> = (0u8..7).collect();
+        let windows: Vec<&[u8; 4]> = array_window::<u8, 4>(&data).collect();
+        assert_eq!(windows.len(), 1); // 7 / 4 = 1 full window; values 4, 5, 6 never appear
+        assert_eq!(windows[0], &[0, 1, 2, 3]); // the surviving window is the leading four items
     }
 
     /// Mismatched length surfaces as Err in the checked variant.
     #[test]
     fn array_window_checked_rejects_mismatch() {
-        unimplemented!("PR-X1 test: assert!(array_window_checked::<u8,4>(&[0u8;7]).is_err())")
+        assert!(array_window_checked::<u8, 4>(&[0u8; 7]).is_err()); // 7 % 4 == 3
+        assert!(array_window_checked::<u8, 4>(&[0u8; 5]).is_err()); // 5 % 4 == 1
+        assert!(array_window_checked::<u8, 4>(&[0u8; 1]).is_err()); // shorter than one window
     }
 
     /// Aligned length succeeds in the checked variant.
     #[test]
     fn array_window_checked_accepts_aligned() {
-        unimplemented!("PR-X1 test: array_window_checked::<u8,4>(&[0u8;16]) returns Ok iterator yielding 4 windows")
+        let data = [0u8; 16];
+        let it = array_window_checked::<u8, 4>(&data).expect("16 is a multiple of 4");
+        assert_eq!(it.count(), 4); // 16 / 4
     }
 
     /// Empty buffer yields zero windows (not an error in either variant).
     #[test]
     fn array_window_empty_buffer() {
-        unimplemented!("PR-X1 test: array_window::<u8,4>(&[]).count() == 0; array_window_checked is Ok (0 % 4 == 0)")
+        assert_eq!(array_window::<u8, 4>(&[]).count(), 0); // unchecked variant: no windows
+        let it = array_window_checked::<u8, 4>(&[]).expect("0 % 4 == 0, should be Ok");
+        assert_eq!(it.count(), 0); // checked variant: `Ok`, but still no windows
     }
 }
@@ -10,9 +10,10 @@
 //!
 //! # Layering
 //!
-//! Lives in `hpc::column`, re-exported from `crate::simd::*` per the
-//! W1a consumer contract at
-//! `.claude/knowledge/vertical-simd-consumer-contract.md`.
+//! Lives in `hpc::column`; the `crate::simd::*` re-export lands in the PR-X1
+//! re-export sweep (see `.claude/knowledge/pr-x1-design.md` § 4). Doctests in
+//! this file therefore use the canonical `ndarray::hpc::column` path until
+//! the sweep ships.
 //!
 //! # Distance typing
 //!
@@ -21,12 +22,14 @@
 //!
 //! # Design reference
 //!
-//! `.claude/knowledge/pr-x1-design.md` § "1. `MultiLaneColumn`" — verbatim
-//! API surface; this file is the commented-out final form (preflight
-//! skeleton) for the PR-X1 sprint.
+//! `.claude/knowledge/pr-x1-design.md` § "1. `MultiLaneColumn`". The
+//! `iter_*_bytes` family deliberately returns `&[u8; 64]` "shape" iterators
+//! (the consumer applies the typed reinterpret at the call site) — this is
+//! the maintainer-blessed deviation from the design doc's typed-iterator
+//! sketch, centralising the one allowed `unsafe` cast at the consumer rather
+//! than per-iterator here.
 
-extern crate alloc;
-use alloc::sync::Arc;
+use std::sync::Arc;
 
 /// Multi-lane (N-wide) typed column view over a shared `Arc<[u8]>` buffer.
 ///
@@ -42,10 +45,10 @@ use alloc::sync::Arc;
 /// # Examples
 ///
 /// ```
-/// use ndarray::simd::MultiLaneColumn;
-/// use alloc::sync::Arc;
+/// use ndarray::hpc::column::MultiLaneColumn;
+/// use std::sync::Arc;
 ///
-/// let data: Arc<[u8]> = vec![0u8; 128].into();
+/// let data: Arc<[u8]> = Arc::from(vec![0u8; 128]);
 /// let col = MultiLaneColumn::new(data).unwrap();
 /// assert_eq!(col.len_bytes(), 128);
 /// assert_eq!(col.len_u8x64(), 2);
@@ -67,52 +70,55 @@ impl MultiLaneColumn {
     /// # Examples
     ///
     /// ```
-    /// use ndarray::simd::MultiLaneColumn;
-    /// use alloc::sync::Arc;
+    /// use ndarray::hpc::column::MultiLaneColumn;
+    /// use std::sync::Arc;
     ///
-    /// let ok: Arc<[u8]> = vec![1u8; 64].into();
+    /// let ok: Arc<[u8]> = Arc::from(vec![1u8; 64]);
     /// assert!(MultiLaneColumn::new(ok).is_ok());
     ///
-    /// let bad: Arc<[u8]> = vec![0u8; 100].into();
+    /// let bad: Arc<[u8]> = Arc::from(vec![0u8; 100]);
     /// assert!(MultiLaneColumn::new(bad).is_err());
     /// ```
-    pub fn new(_data: Arc<[u8]>) -> Result<Self, ()> {
-        unimplemented!("PR-X1: MultiLaneColumn::new — multiple-of-64 check + Arc wrap")
+    pub fn new(data: Arc<[u8]>) -> Result<Self, ()> {
+        if data.len() % 64 != 0 { // an empty buffer (0 % 64 == 0) passes this check
+            return Err(()); // length is not a whole number of 64-byte chunks
+        }
+        Ok(Self { data }) // stores the caller's `Arc` as-is; the bytes are not copied
     }
 
     /// Total byte length of the backing store.
     pub fn len_bytes(&self) -> usize {
-        unimplemented!("PR-X1: MultiLaneColumn::len_bytes — returns self.data.len()")
+        self.data.len() // element count of the `Arc<[u8]>`, i.e. a byte count
     }
 
     /// Returns `true` if the column has zero bytes.
     pub fn is_empty(&self) -> bool {
-        unimplemented!("PR-X1: MultiLaneColumn::is_empty — returns self.data.is_empty()")
+        self.data.is_empty() // same condition as `len_bytes() == 0`
     }
 
     /// Number of 64-byte (`U8x64`) chunks in this column.
     pub fn len_u8x64(&self) -> usize {
-        unimplemented!("PR-X1: MultiLaneColumn::len_u8x64 — returns self.data.len() / 64")
+        self.data.len() / 64 // no remainder for columns built via `new` (it rejects len % 64 != 0)
     }
 
     /// Number of `F32x16`-shaped (16 × f32 = 64-byte) chunks.
     pub fn len_f32x16(&self) -> usize {
-        unimplemented!("PR-X1: MultiLaneColumn::len_f32x16 — returns self.data.len() / 64")
+        self.data.len() / 64 // same 64-byte divisor as `len_u8x64`, so the counts agree
     }
 
     /// Number of `F64x8`-shaped (8 × f64 = 64-byte) chunks.
     pub fn len_f64x8(&self) -> usize {
-        unimplemented!("PR-X1: MultiLaneColumn::len_f64x8 — returns self.data.len() / 64")
+        self.data.len() / 64 // chunk size is 64 bytes here too; value equals `len_u8x64()`
     }
 
     /// Number of `U64x8`-shaped (8 × u64 = 64-byte) chunks.
     pub fn len_u64x8(&self) -> usize {
-        unimplemented!("PR-X1: MultiLaneColumn::len_u64x8 — returns self.data.len() / 64")
+        self.data.len() / 64 // chunk size is 64 bytes here too; value equals `len_u8x64()`
     }
 
     /// View the backing store as a raw byte slice.
     pub fn as_bytes(&self) -> &[u8] {
-        unimplemented!("PR-X1: MultiLaneColumn::as_bytes — returns &self.data")
+        &self.data // `&Arc<[u8]>` deref-coerces to `&[u8]`; borrows, does not copy
     }
 
     /// Iterate the column as contiguous `&[u8; 64]` windows (`U8x64` shape).
@@ -126,20 +132,18 @@ impl MultiLaneColumn {
     /// # Examples
     ///
     /// ```
-    /// use ndarray::simd::MultiLaneColumn;
-    /// use alloc::sync::Arc;
+    /// use ndarray::hpc::column::MultiLaneColumn;
+    /// use std::sync::Arc;
     ///
-    /// let data: Arc<[u8]> = (0u8..128).collect::<Vec<_>>().into();
+    /// let data: Arc<[u8]> = Arc::from((0u8..128).collect::<Vec<_>>());
     /// let col = MultiLaneColumn::new(data).unwrap();
     /// let windows: Vec<&[u8; 64]> = col.iter_u8x64().collect();
     /// assert_eq!(windows.len(), 2);
     /// assert_eq!(windows[0][0], 0u8);
     /// assert_eq!(windows[1][0], 64u8);
     /// ```
     pub fn iter_u8x64(&self) -> impl Iterator<Item = &[u8; 64]> + '_ {
-        // Skeleton: as_chunks::<64>() over &self.data, yielding &[u8;64].
-        // Implementation lands in the uncomment sprint.
-        core::iter::empty::<&[u8; 64]>()
+        self.data.as_chunks::<64>().0.iter() // `.0`: full 64-byte chunks (`.1` is the remainder)
     }
 
     /// Iterate the column as `&[u8; 64]` windows reinterpreted as `[f32; 16]`-shape.
@@ -148,17 +152,17 @@ impl MultiLaneColumn {
     /// Consumer is responsible for using `F32x16::from_array(bytemuck::cast(*win))`
     /// or equivalent typed reinterpretation.
     pub fn iter_f32x16_bytes(&self) -> impl Iterator<Item = &[u8; 64]> + '_ {
-        core::iter::empty::<&[u8; 64]>()
+        self.data.as_chunks::<64>().0.iter() // same expression as `iter_u8x64`: raw bytes, no cast
     }
 
     /// Iterate the column as `&[u8; 64]` windows reinterpreted as `[f64; 8]`-shape.
     pub fn iter_f64x8_bytes(&self) -> impl Iterator<Item = &[u8; 64]> + '_ {
-        core::iter::empty::<&[u8; 64]>()
+        self.data.as_chunks::<64>().0.iter() // same expression as `iter_u8x64`: raw bytes, no cast
     }
 
     /// Iterate the column as `&[u8; 64]` windows reinterpreted as `[u64; 8]`-shape.
     pub fn iter_u64x8_bytes(&self) -> impl Iterator<Item = &[u8; 64]> + '_ {
-        core::iter::empty::<&[u8; 64]>()
+        self.data.as_chunks::<64>().0.iter() // same expression as `iter_u8x64`: raw bytes, no cast
     }
 }
 
@@ -173,30 +177,113 @@ mod tests {
     /// Construction with a 64-byte buffer succeeds; len_bytes round-trips.
     #[test]
     fn new_64byte_buffer_succeeds() {
-        unimplemented!("PR-X1 test: assert_eq!(MultiLaneColumn::new(Arc::from(vec![0u8;64])).unwrap().len_bytes(), 64)")
+        let col = MultiLaneColumn::new(Arc::from(vec![0u8; 64])).unwrap();
+        assert_eq!(col.len_bytes(), 64);
+        assert_eq!(col.len_u8x64(), 1); // 64 bytes is exactly one chunk ...
+        assert_eq!(col.len_f32x16(), 1); // ... and every lane-shape counter reports the same
+        assert_eq!(col.len_f64x8(), 1);
+        assert_eq!(col.len_u64x8(), 1);
     }
 
     /// Construction with a non-multiple-of-64 buffer returns Err.
     #[test]
     fn new_non_multiple_of_64_errors() {
-        unimplemented!("PR-X1 test: assert!(MultiLaneColumn::new(Arc::from(vec![0u8;100])).is_err())")
+        assert!(MultiLaneColumn::new(Arc::from(vec![0u8; 100])).is_err()); // between 1 and 2 chunks
+        assert!(MultiLaneColumn::new(Arc::from(vec![0u8; 63])).is_err()); // one byte short of a chunk
+        assert!(MultiLaneColumn::new(Arc::from(vec![0u8; 65])).is_err()); // one byte past a chunk
     }
 
     /// Empty buffer is accepted; is_empty == true; iterators yield 0 windows.
     #[test]
     fn empty_buffer_yields_zero_windows() {
-        unimplemented!("PR-X1 test: empty Arc → is_empty true + iter_u8x64.count() == 0")
+        let col = MultiLaneColumn::new(Arc::from(vec![0u8; 0])).unwrap(); // 0 % 64 == 0: accepted
+        assert!(col.is_empty());
+        assert_eq!(col.len_bytes(), 0);
+        assert_eq!(col.iter_u8x64().count(), 0); // no bytes, so no 64-byte windows ...
+        assert_eq!(col.iter_f32x16_bytes().count(), 0); // ... from any of the shape iterators
+        assert_eq!(col.iter_f64x8_bytes().count(), 0);
+        assert_eq!(col.iter_u64x8_bytes().count(), 0);
     }
 
     /// Two-chunk buffer yields exactly 2 windows of 64 bytes each.
     #[test]
     fn iter_u8x64_two_chunks() {
-        unimplemented!("PR-X1 test: 128-byte Arc → iter_u8x64 yields 2 windows starting at byte 0 + byte 64")
+        let mut v = vec![0u8; 128];
+        for i in 0..128 {
+            v[i] = i as u8;
+        }
+        let col = MultiLaneColumn::new(Arc::from(v)).unwrap();
+        let windows: Vec<&[u8; 64]> = col.iter_u8x64().collect();
+        assert_eq!(windows.len(), 2);
+        assert_eq!(windows[0][0], 0u8);
+        assert_eq!(windows[0][63], 63u8);
+        assert_eq!(windows[1][0], 64u8);
+        assert_eq!(windows[1][63], 127u8);
     }
 
     /// Clone shares the same backing Arc (no copy).
     #[test]
     fn clone_shares_backing() {
-        unimplemented!("PR-X1 test: Arc::strong_count after clone == 2")
+        let col = MultiLaneColumn::new(Arc::from(vec![0u8; 64])).unwrap();
+        let col2 = col.clone(); // NOTE(review): `Clone` impl is not visible in this hunk
+        // Both columns reference the same underlying allocation: pointer equality
+        // is the observable contract without accessing private Arc internals.
+        assert_eq!(
+            col.as_bytes().as_ptr(),
+            col2.as_bytes().as_ptr(),
+            "clone must share the same Arc backing, not copy"
+        );
+    }
+
+    /// Bytes-shape iterators all yield the same chunk count and content as
+    /// `iter_u8x64` — they are pure aliasing views, not separate buffers.
+    #[test]
+    fn bytes_shape_iterators_alias_u8x64() {
+        let v: Vec<u8> = (0u8..192).collect(); // 3 × 64 bytes; byte `i` holds the value `i`
+        let col = MultiLaneColumn::new(Arc::from(v)).unwrap();
+
+        let u8_wins: Vec<&[u8; 64]> = col.iter_u8x64().collect();
+        let f32_wins: Vec<&[u8; 64]> = col.iter_f32x16_bytes().collect();
+        let f64_wins: Vec<&[u8; 64]> = col.iter_f64x8_bytes().collect();
+        let u64_wins: Vec<&[u8; 64]> = col.iter_u64x8_bytes().collect();
+
+        assert_eq!(u8_wins.len(), 3); // 192 / 64
+        assert_eq!(f32_wins.len(), 3);
+        assert_eq!(f64_wins.len(), 3);
+        assert_eq!(u64_wins.len(), 3);
+
+        // Each shape iterator yields references into the same backing bytes:
+        // pointer equality across the four iterators on every chunk.
+        for i in 0..3 {
+            assert_eq!(u8_wins[i].as_ptr(), f32_wins[i].as_ptr());
+            assert_eq!(u8_wins[i].as_ptr(), f64_wins[i].as_ptr());
+            assert_eq!(u8_wins[i].as_ptr(), u64_wins[i].as_ptr());
+            assert_eq!(u8_wins[i][0], (i as u8) * 64); // first byte of chunk `i`
+            assert_eq!(u8_wins[i][63], (i as u8) * 64 + 63); // last byte of chunk `i` (max 191)
+        }
+    }
+
+    /// `as_bytes()` returns the full backing slice and aliases the Arc storage.
+    #[test]
+    fn as_bytes_returns_full_backing_slice() {
+        let v: Vec<u8> = (0u8..64).collect();
+        let arc: Arc<[u8]> = Arc::from(v);
+        let arc_ptr = arc.as_ptr(); // captured before `arc` is moved into `new` below
+        let col = MultiLaneColumn::new(arc).unwrap();
+        let bytes = col.as_bytes();
+        assert_eq!(bytes.len(), 64);
+        assert_eq!(bytes.as_ptr(), arc_ptr, "as_bytes must alias the Arc backing, not copy");
+        for (i, &b) in bytes.iter().enumerate() {
+            assert_eq!(b, i as u8); // contents and order survive the round trip
+        }
+    }
+
+    /// Static assertion: `MultiLaneColumn` is `Send + Sync`, so it can cross
+    /// thread boundaries — required for cognitive-shader-stack multi-consumer
+    /// access patterns.
+    #[test]
+    fn multilane_column_is_send_sync() {
+        fn assert_send_sync<T: Send + Sync>() {} // the trait bound is the entire check
+        assert_send_sync::<MultiLaneColumn>(); // an unmet bound is a compile error, not a test failure
     }
 }