|
| 1 | +//! SoA envelope little-endian contract. |
| 2 | +//! |
| 3 | +//! # Why this module exists |
| 4 | +//! |
| 5 | +//! Column-level LE knowledge is not enough. ndarray's `MultiLaneColumn` |
| 6 | +//! (the column carrier) already decodes its own bytes little-endian, and |
| 7 | +//! `CausalEdge64` / `EpisodicEdges64` each know their own `to_le_bytes` / |
| 8 | +//! `from_le_bytes`. But the **SoA envelope as a whole** — the thing a Lance |
| 9 | +//! version snapshots, the thing `simd_soa` sweeps, the thing a future reader |
| 10 | +//! decodes — has no contract describing how those columns *assemble* into one |
| 11 | +//! row-strided packet. The parts know the LE contract; the envelope did not. |
| 12 | +//! |
| 13 | +//! [`SoaEnvelope`] is that missing contract. It makes one SoA snapshot a |
| 14 | +//! **self-describing little-endian packet per cycle**: a stable column |
| 15 | +//! ordering, a fixed row byte stride, a `cycle` version stamp, and a |
| 16 | +//! [`ENVELOPE_LAYOUT_VERSION`]. With it, a Lance version IS a coherent LE |
| 17 | +//! packet at cycle N — not a loose collection of independently-correct |
| 18 | +//! columns. |
| 19 | +//! |
| 20 | +//! # Layering (read before adding an ndarray dependency here) |
| 21 | +//! |
| 22 | +//! This module is **zero-dep, byte-geometry only**. It describes *where* |
| 23 | +//! columns sit in a row packet and *what* LE element each holds — as data |
| 24 | +//! ([`ColumnDescriptor`]), never as ndarray generic bounds. That keeps |
| 25 | +//! `lance-graph-contract` featherweight for its non-HPC consumers |
| 26 | +//! (`crewai-rust`, `n8n-rs`), and it keeps ndarray usable standalone by any |
| 27 | +//! pure-SIMD consumer. |
| 28 | +//! |
| 29 | +//! The split is deliberate and complementary, not duplicated: |
| 30 | +//! |
| 31 | +//! | Level | Home | Answers | |
| 32 | +//! |-------|------|---------| |
| 33 | +//! | Column LE contract | `ndarray::simd::MultiLaneColumn` | "how do I sweep one typed column" | |
| 34 | +//! | Envelope LE contract | this module | "where do columns sit in the row packet, what cycle is this" | |
| 35 | +//! | Composition | `lance-graph` (always has both deps) | carve envelope columns → wrap each in `MultiLaneColumn` | |
| 36 | +//! |
| 37 | +//! ndarray never learns the envelope exists; this crate never learns ndarray |
| 38 | +//! exists; `lance-graph` binds them. |
| 39 | +
|
/// Version number of the envelope's byte geometry.
///
/// This value is incremented every time the interpretation of
/// [`ColumnDescriptor`] offsets or strides changes. Readers MUST reject any
/// packet stamped with a version they do not understand (per
/// `I-LEGACY-API-FEATURE-GATED`: layout reclaim is paired with a version gate
/// on the serialization path).
pub const ENVELOPE_LAYOUT_VERSION: u8 = 1;
| 47 | + |
/// Little-endian element type stored in a single column.
///
/// This describes element *width* only: it carries no distance semantics and
/// no domain meaning (cf. ndarray's no-umbrella rule). Decoding the bytes
/// (`from_le_bytes`) is left to the consumer's `MultiLaneColumn` lane
/// iterator.
///
/// The enum is `repr(u8)` with explicit discriminants, so each variant has a
/// fixed one-byte tag.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum ColumnKind {
    /// Unsigned 8-bit element (tag 0).
    U8 = 0,
    /// Signed 8-bit element (tag 1).
    I8 = 1,
    /// Unsigned 16-bit element (tag 2).
    U16 = 2,
    /// Signed 16-bit element (tag 3).
    I16 = 3,
    /// Unsigned 32-bit element (tag 4).
    U32 = 4,
    /// 32-bit float element (tag 5).
    F32 = 5,
    /// Unsigned 64-bit element (tag 6).
    U64 = 6,
    /// 64-bit float element (tag 7).
    F64 = 7,
}

impl ColumnKind {
    /// Width in bytes of one element of this kind.
    ///
    /// The match is exhaustive with one arm per variant, so adding a variant
    /// forces this function to be revisited at compile time.
    pub const fn elem_bytes(self) -> usize {
        match self {
            Self::U8 => 1,
            Self::I8 => 1,
            Self::U16 => 2,
            Self::I16 => 2,
            Self::U32 => 4,
            Self::F32 => 4,
            Self::U64 => 8,
            Self::F64 => 8,
        }
    }
}
| 77 | + |
| 78 | +/// One column's placement within a single row packet. |
| 79 | +/// |
| 80 | +/// `Copy` and `repr(C)` so a descriptor table is itself a stable LE artifact. |
| 81 | +/// `name_id` is a stable column ordinal (an enum discriminant on the consumer |
| 82 | +/// side), NOT a string — keeping this crate alloc-free and the descriptor |
| 83 | +/// `Copy`. |
| 84 | +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] |
| 85 | +#[repr(C)] |
| 86 | +pub struct ColumnDescriptor { |
| 87 | + /// Stable column identity (consumer-side enum ordinal). |
| 88 | + pub name_id: u16, |
| 89 | + /// LE element kind. |
| 90 | + pub kind: ColumnKind, |
| 91 | + /// Elements of `kind` per row for this column (e.g. content = 256 × u64, |
| 92 | + /// energy = 1 × f32). |
| 93 | + pub elems_per_row: u16, |
| 94 | + /// Byte offset of this column within one row packet. |
| 95 | + pub row_offset: u32, |
| 96 | +} |
| 97 | + |
| 98 | +impl ColumnDescriptor { |
| 99 | + /// Bytes this column occupies in one row. |
| 100 | + pub const fn col_bytes_per_row(&self) -> usize { |
| 101 | + self.kind.elem_bytes() * self.elems_per_row as usize |
| 102 | + } |
| 103 | + |
| 104 | + /// Byte range `[start, end)` of this column within a row packet. |
| 105 | + pub const fn row_byte_range(&self) -> (usize, usize) { |
| 106 | + let start = self.row_offset as usize; |
| 107 | + (start, start + self.col_bytes_per_row()) |
| 108 | + } |
| 109 | +} |
| 110 | + |
/// What can go wrong validating an envelope's byte geometry.
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`] so it can be
/// reported to users and propagated with `?` into boxed/dynamic error types.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EnvelopeError {
    /// The stamped layout version is not the one this build understands.
    LayoutVersionMismatch { expected: u8, found: u8 },
    /// Sum of column byte-widths does not equal the declared row stride.
    StrideMismatch { declared: usize, summed: usize },
    /// Two columns overlap, or a gap/ordering violation was found.
    ColumnOverlap { col_a: u16, col_b: u16 },
    /// `as_le_bytes().len()` is not `row_stride * n_rows`.
    PacketSizeMismatch { expected: usize, found: usize },
    /// A requested row or column index is out of bounds.
    OutOfBounds,
}

impl std::fmt::Display for EnvelopeError {
    /// Writes a one-line, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Exhaustive match: a new variant must be given a message here.
        match *self {
            EnvelopeError::LayoutVersionMismatch { expected, found } => write!(
                f,
                "envelope layout version mismatch: expected {}, found {}",
                expected, found
            ),
            EnvelopeError::StrideMismatch { declared, summed } => write!(
                f,
                "row stride mismatch: declared {} bytes, columns sum to {} bytes",
                declared, summed
            ),
            EnvelopeError::ColumnOverlap { col_a, col_b } => write!(
                f,
                "columns {} and {} overlap within the row packet",
                col_a, col_b
            ),
            EnvelopeError::PacketSizeMismatch { expected, found } => write!(
                f,
                "packet size mismatch: expected {} bytes, found {} bytes",
                expected, found
            ),
            EnvelopeError::OutOfBounds => f.write_str("row or column index out of bounds"),
        }
    }
}

impl std::error::Error for EnvelopeError {}
| 125 | + |
| 126 | +/// A self-describing little-endian SoA packet for one cycle. |
| 127 | +/// |
| 128 | +/// Implemented by the owner of the backing store (e.g. the mailbox SoA). The |
| 129 | +/// envelope is read-only here; mutation lives on the owner type, never on this |
| 130 | +/// view (mirrors `MailboxSoaView` vs `MailboxSoaOwner`). |
| 131 | +pub trait SoaEnvelope { |
| 132 | + /// Layout version this implementor's geometry conforms to. |
| 133 | + const LAYOUT_VERSION: u8 = ENVELOPE_LAYOUT_VERSION; |
| 134 | + |
| 135 | + /// Stable, ordered column placement table. Ordering is part of the |
| 136 | + /// contract: a reader walks columns in this order. |
| 137 | + fn columns(&self) -> &[ColumnDescriptor]; |
| 138 | + |
| 139 | + /// Total bytes per row across all columns. |
| 140 | + fn row_stride(&self) -> usize; |
| 141 | + |
| 142 | + /// Number of rows in this snapshot. |
| 143 | + fn n_rows(&self) -> usize; |
| 144 | + |
| 145 | + /// The version stamp this snapshot carries (the cycle whose committed |
| 146 | + /// state these bytes are). This is what turns a Lance version into a |
| 147 | + /// coherent "packet at cycle N". |
| 148 | + fn cycle(&self) -> u32; |
| 149 | + |
| 150 | + /// The whole packet as contiguous LE bytes, zero-copy. Length MUST be |
| 151 | + /// `row_stride() * n_rows()`. |
| 152 | + fn as_le_bytes(&self) -> &[u8]; |
| 153 | + |
| 154 | + /// Zero-copy LE view of one full row. |
| 155 | + fn row_le(&self, row: usize) -> Option<&[u8]> { |
| 156 | + let stride = self.row_stride(); |
| 157 | + let start = row.checked_mul(stride)?; |
| 158 | + let end = start.checked_add(stride)?; |
| 159 | + self.as_le_bytes().get(start..end) |
| 160 | + } |
| 161 | + |
| 162 | + /// Zero-copy LE view of one column within one row. |
| 163 | + fn column_le(&self, row: usize, col: &ColumnDescriptor) -> Option<&[u8]> { |
| 164 | + let r = self.row_le(row)?; |
| 165 | + let (start, end) = col.row_byte_range(); |
| 166 | + r.get(start..end) |
| 167 | + } |
| 168 | + |
| 169 | + /// Validate that the declared geometry is internally consistent and that |
| 170 | + /// the backing packet matches. Call this at the Lance read boundary — a |
| 171 | + /// v1 packet under a v2 reader (or a torn snapshot) is refused here rather |
| 172 | + /// than silently mis-decoded downstream. |
| 173 | + fn verify_layout(&self) -> Result<(), EnvelopeError> { |
| 174 | + // 1. Version gate. |
| 175 | + if Self::LAYOUT_VERSION != ENVELOPE_LAYOUT_VERSION { |
| 176 | + return Err(EnvelopeError::LayoutVersionMismatch { |
| 177 | + expected: ENVELOPE_LAYOUT_VERSION, |
| 178 | + found: Self::LAYOUT_VERSION, |
| 179 | + }); |
| 180 | + } |
| 181 | + // 2. Columns are non-overlapping and their widths sum to the stride. |
| 182 | + let cols = self.columns(); |
| 183 | + let mut summed = 0usize; |
| 184 | + for (i, a) in cols.iter().enumerate() { |
| 185 | + let (a_start, a_end) = a.row_byte_range(); |
| 186 | + summed += a.col_bytes_per_row(); |
| 187 | + for b in &cols[i + 1..] { |
| 188 | + let (b_start, b_end) = b.row_byte_range(); |
| 189 | + let overlap = a_start < b_end && b_start < a_end; |
| 190 | + if overlap { |
| 191 | + return Err(EnvelopeError::ColumnOverlap { |
| 192 | + col_a: a.name_id, |
| 193 | + col_b: b.name_id, |
| 194 | + }); |
| 195 | + } |
| 196 | + } |
| 197 | + } |
| 198 | + let stride = self.row_stride(); |
| 199 | + if summed != stride { |
| 200 | + return Err(EnvelopeError::StrideMismatch { |
| 201 | + declared: stride, |
| 202 | + summed, |
| 203 | + }); |
| 204 | + } |
| 205 | + // 3. Backing packet size matches stride × rows. |
| 206 | + let expected = stride.saturating_mul(self.n_rows()); |
| 207 | + let found = self.as_le_bytes().len(); |
| 208 | + if expected != found { |
| 209 | + return Err(EnvelopeError::PacketSizeMismatch { expected, found }); |
| 210 | + } |
| 211 | + Ok(()) |
| 212 | + } |
| 213 | +} |
| 214 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// Row width of the two-column fixture: 4 bytes (f32) + 8 bytes (u64).
    const FIXTURE_STRIDE: usize = 12;

    /// Minimal in-memory implementor whose geometry fields can be tampered
    /// with freely by individual tests.
    struct TestEnvelope {
        cols: Vec<ColumnDescriptor>,
        stride: usize,
        rows: usize,
        bytes: Vec<u8>,
        cycle: u32,
    }

    impl SoaEnvelope for TestEnvelope {
        fn columns(&self) -> &[ColumnDescriptor] {
            self.cols.as_slice()
        }

        fn row_stride(&self) -> usize {
            self.stride
        }

        fn n_rows(&self) -> usize {
            self.rows
        }

        fn cycle(&self) -> u32 {
            self.cycle
        }

        fn as_le_bytes(&self) -> &[u8] {
            self.bytes.as_slice()
        }
    }

    /// Descriptor for a single-element column of `kind` at `row_offset`.
    fn scalar_column(name_id: u16, kind: ColumnKind, row_offset: u32) -> ColumnDescriptor {
        ColumnDescriptor {
            name_id,
            kind,
            elems_per_row: 1,
            row_offset,
        }
    }

    /// Zero-filled fixture with `rows` rows laid out as:
    /// col 0 = 1 × f32 at offset 0, col 1 = 1 × u64 at offset 4.
    fn two_col_envelope(rows: usize) -> TestEnvelope {
        TestEnvelope {
            cols: vec![
                scalar_column(0, ColumnKind::F32, 0),
                scalar_column(1, ColumnKind::U64, 4),
            ],
            stride: FIXTURE_STRIDE,
            rows,
            bytes: vec![0u8; FIXTURE_STRIDE * rows],
            cycle: 7,
        }
    }

    #[test]
    fn kind_widths() {
        let expectations = [
            (ColumnKind::U8, 1usize),
            (ColumnKind::F32, 4),
            (ColumnKind::U64, 8),
        ];
        for &(kind, width) in expectations.iter() {
            assert_eq!(kind.elem_bytes(), width);
        }
    }

    #[test]
    fn descriptor_byte_range() {
        let wide = ColumnDescriptor {
            name_id: 0,
            kind: ColumnKind::U64,
            elems_per_row: 256,
            row_offset: 16,
        };
        let width = 256 * 8;
        assert_eq!(wide.col_bytes_per_row(), width);
        assert_eq!(wide.row_byte_range(), (16, 16 + width));
    }

    #[test]
    fn valid_envelope_passes() {
        let env = two_col_envelope(4);
        assert_eq!(env.cycle(), 7);
        assert!(env.verify_layout().is_ok());
    }

    #[test]
    fn stride_mismatch_caught() {
        // Columns still sum to 12 bytes while the declared stride says 16.
        let mut env = two_col_envelope(4);
        env.stride = 16;
        env.bytes = vec![0u8; 16 * 4];

        let expected = EnvelopeError::StrideMismatch {
            declared: 16,
            summed: 12,
        };
        assert_eq!(env.verify_layout(), Err(expected));
    }

    #[test]
    fn overlap_caught() {
        // Moving the u64 to offset 2 makes it collide with the f32 at [0, 4).
        let mut env = two_col_envelope(1);
        env.cols[1].row_offset = 2;
        env.stride = 10;
        env.bytes = vec![0u8; 10];

        match env.verify_layout() {
            Err(EnvelopeError::ColumnOverlap { .. }) => {}
            other => panic!("expected ColumnOverlap, got {:?}", other),
        }
    }

    #[test]
    fn packet_size_mismatch_caught() {
        // Drop the final row from the backing bytes.
        let mut env = two_col_envelope(4);
        env.bytes.truncate(FIXTURE_STRIDE * 3);

        let expected = EnvelopeError::PacketSizeMismatch {
            expected: 48,
            found: 36,
        };
        assert_eq!(env.verify_layout(), Err(expected));
    }

    #[test]
    fn row_and_column_views_are_zero_copy_slices() {
        let mut env = two_col_envelope(2);

        // Store 0x0102030405060708 (LE) into row 1, col 1 (the u64).
        let value: u64 = 0x0102_0304_0506_0708;
        let at = FIXTURE_STRIDE + 4;
        env.bytes[at..at + 8].copy_from_slice(&value.to_le_bytes());

        let second_row = env.row_le(1).unwrap();
        assert_eq!(second_row.len(), 12);

        let cell = env.column_le(1, &env.cols[1]).unwrap();
        assert_eq!(cell.len(), 8);
        assert_eq!(u64::from_le_bytes(cell.try_into().unwrap()), value);

        // Row index past the end of the snapshot.
        assert!(env.row_le(2).is_none());
    }
}
0 commit comments