|
| 1 | +// SPDX-License-Identifier: Apache-2.0 |
| 2 | +// SPDX-FileCopyrightText: Copyright The Lance Authors |
| 3 | + |
| 4 | +//! `content_store` — content-addressed cold text/blob store contract (zero-dep). |
| 5 | +//! |
| 6 | +//! The episodic/OSINT **text table**: `ContentId` (the `fnv1a` hash of the bytes) |
| 7 | +//! → bytes, resolved **cold, at the membrane** — never in the hot path. This is |
| 8 | +//! the typed surface for the rule the OGAR canon + `I-VSA-IDENTITIES` Test 0 |
| 9 | +//! (register laziness) demand: *the reference is the identity, never a serialized |
| 10 | +//! pointer/offset inlined in the SoA*. |
| 11 | +//! |
| 12 | +//! ## Three invariants this encodes |
| 13 | +//! |
| 14 | +//! 1. **The join key IS the identity.** Nothing variable-length enters the 512 B |
| 15 | +//! node. The node carries only a fixed-size [`ContentId`] (a value tenant); |
| 16 | +//! the text lives in a columnar table next to it and joins by id. No pointer |
| 17 | +//! field, no budget break. |
| 18 | +//! 2. **Content-address, not raw GUID.** OSINT sources are shared (one document |
| 19 | +//! backs many observations). [`ContentId::of`] hashes the bytes, so identical |
| 20 | +//! sources dedup (many episodic edges → one source row). Uses [`crate::hash::fnv1a`] |
| 21 | +//! — **stable across versions/platforms** (unlike `DefaultHasher`, which must |
| 22 | +//! never key a content address; see `TECH_DEBT` re `WitnessEntry::tie_break_hash`). |
| 23 | +//! 3. **Hot/cold firewall (ADR-022).** [`ContentStore::resolve`] is the COLD / |
| 24 | +//! membrane surface: bytes are materialized only when genuinely needed (LLM |
| 25 | +//! hydration, rendering, citing). The hot path (SIMD sweep, resonance, |
| 26 | +//! AriGraph edge traversal, family-basin routing) touches only the fixed-size |
| 27 | +//! [`ContentId`] + [`SourceSpan`] — the fingerprint is the hot-path stand-in |
| 28 | +//! for the text; this trait is never called during computation. |
| 29 | +//! |
| 30 | +//! ## Provenance: `SourceSpan` is the typed `(source_id, start, end)` |
| 31 | +//! |
| 32 | +//! The merged `template-equivalence` provenance model uses |
| 33 | +//! `source_spans: Vec<(String, usize, usize)>` = `(source_id, start, end)`. |
| 34 | +//! [`SourceSpan`] is its fixed-size typed form: `source_id` IS a [`ContentId`] |
| 35 | +//! (the content-table key), `start`/`end` index into the resolved bytes. The |
| 36 | +//! gate "no source span → no claim" is literally [`SourceSpan::is_cited`]. |
| 37 | +
|
| 38 | +use crate::hash::fnv1a; |
| 39 | + |
| 40 | +/// A content address: the `fnv1a`-64 hash of the stored bytes. |
| 41 | +/// |
| 42 | +/// Identical bytes ⇒ identical id ⇒ natural dedup. `ContentId(0)` is the |
| 43 | +/// reserved **empty/sentinel** (no content), mirroring the canon's zero-fallback |
| 44 | +/// ladder (a zero tier = "not consulted", never a valid address). |
| 45 | +/// |
| 46 | +/// Note: 64-bit fnv1a is the workspace-canonical hash and is sufficient for |
| 47 | +/// OSINT-corpus scale; if a corpus ever approaches birthday-collision range |
| 48 | +/// (~2^32 distinct sources), widen to a 128-bit content address — the upgrade |
| 49 | +/// is local to this type. |
| 50 | +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, Default)] |
| 51 | +pub struct ContentId(pub u64); |
| 52 | + |
| 53 | +impl ContentId { |
| 54 | + /// Content-address arbitrary bytes. |
| 55 | + #[must_use] |
| 56 | + pub fn of(bytes: &[u8]) -> Self { |
| 57 | + Self(fnv1a(bytes)) |
| 58 | + } |
| 59 | + |
| 60 | + /// Content-address a string slice. |
| 61 | + #[must_use] |
| 62 | + pub fn of_str(s: &str) -> Self { |
| 63 | + Self(fnv1a(s.as_bytes())) |
| 64 | + } |
| 65 | + |
| 66 | + /// The reserved empty/sentinel address (no content). |
| 67 | + #[must_use] |
| 68 | + pub fn is_sentinel(self) -> bool { |
| 69 | + self.0 == 0 |
| 70 | + } |
| 71 | +} |
| 72 | + |
| 73 | +/// A provenance reference: which content, and the `[start, end)` byte span within |
| 74 | +/// it. Fixed-size and `Copy` — it lives on the episodic node (a value tenant); |
| 75 | +/// the bytes resolve cold via [`ContentStore`]. The typed form of |
| 76 | +/// `template-equivalence`'s `(source_id, start, end)`. |
| 77 | +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, Default)] |
| 78 | +pub struct SourceSpan { |
| 79 | + /// The content-table key (the source the span cites). |
| 80 | + pub content: ContentId, |
| 81 | + /// Inclusive start byte offset into the resolved content. |
| 82 | + pub start: u32, |
| 83 | + /// Exclusive end byte offset. |
| 84 | + pub end: u32, |
| 85 | +} |
| 86 | + |
| 87 | +impl SourceSpan { |
| 88 | + /// New span; `end` is clamped to be `>= start`. |
| 89 | + #[must_use] |
| 90 | + pub fn new(content: ContentId, start: u32, end: u32) -> Self { |
| 91 | + Self { content, start, end: end.max(start) } |
| 92 | + } |
| 93 | + |
| 94 | + /// Span length in bytes. |
| 95 | + #[must_use] |
| 96 | + pub fn len(self) -> u32 { |
| 97 | + self.end - self.start |
| 98 | + } |
| 99 | + |
| 100 | + /// Whether the span covers zero bytes. |
| 101 | + #[must_use] |
| 102 | + pub fn is_empty(self) -> bool { |
| 103 | + self.end <= self.start |
| 104 | + } |
| 105 | + |
| 106 | + /// "No source span → no claim": a claim is cited iff it carries a non-empty |
| 107 | + /// span into real (non-sentinel) content. The provenance gate's predicate. |
| 108 | + #[must_use] |
| 109 | + pub fn is_cited(self) -> bool { |
| 110 | + !self.content.is_sentinel() && !self.is_empty() |
| 111 | + } |
| 112 | +} |
| 113 | + |
/// Why a cold-store lookup failed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ContentError {
    /// Nothing is stored under the requested id.
    NotFound,
    /// The span's `[start, end)` range does not fit inside the resolved content.
    SpanOutOfBounds,
}

impl core::fmt::Display for ContentError {
    /// Writes a fixed, human-readable message for each variant.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Pick the message first, then emit it with a single write.
        let message = match *self {
            Self::NotFound => "content-store: id not found",
            Self::SpanOutOfBounds => "content-store: span out of bounds",
        };
        f.write_str(message)
    }
}
| 131 | + |
| 132 | +/// The content-addressed **cold** store (read side). |
| 133 | +/// |
| 134 | +/// Lives in the zero-dep contract so any consumer can declare it without pulling |
| 135 | +/// Arrow/Lance. Implemented downstream by a Lance text table (and, in-RAM, by the |
| 136 | +/// AriGraph `EpisodicMemory` / `WitnessCorpus` acting as the cold tier). |
| 137 | +/// `resolve` returns a borrow into the backing store (mmap'd Lance buffer or |
| 138 | +/// in-RAM `Bytes`), so reads are zero-copy at the membrane. |
| 139 | +pub trait ContentStore { |
| 140 | + /// Resolve the full content bytes for an id. `None` if absent. COLD path only. |
| 141 | + fn resolve(&self, id: ContentId) -> Option<&[u8]>; |
| 142 | + |
| 143 | + /// Resolve a span's bytes (cold). Default composes [`resolve`](Self::resolve) |
| 144 | + /// with a bounds check. |
| 145 | + fn resolve_span(&self, span: SourceSpan) -> Result<&[u8], ContentError> { |
| 146 | + let bytes = self.resolve(span.content).ok_or(ContentError::NotFound)?; |
| 147 | + bytes |
| 148 | + .get(span.start as usize..span.end as usize) |
| 149 | + .ok_or(ContentError::SpanOutOfBounds) |
| 150 | + } |
| 151 | + |
| 152 | + /// Whether an id is present without committing to a borrow shape. |
| 153 | + fn contains(&self, id: ContentId) -> bool { |
| 154 | + self.resolve(id).is_some() |
| 155 | + } |
| 156 | +} |
| 157 | + |
| 158 | +/// The content-addressed store (write side, membrane-only). |
| 159 | +/// |
| 160 | +/// Ingest is idempotent by construction: identical bytes ⇒ same [`ContentId`] ⇒ |
| 161 | +/// dedup (the many-episodes → one-source rule). Writing happens at the cold |
| 162 | +/// membrane during ingestion, never on the hot path. |
| 163 | +pub trait ContentSink { |
| 164 | + /// Store `bytes`, returning their content address. Idempotent. |
| 165 | + fn put(&mut self, bytes: &[u8]) -> ContentId; |
| 166 | + |
| 167 | + /// Store a string slice. |
| 168 | + fn put_str(&mut self, s: &str) -> ContentId { |
| 169 | + self.put(s.as_bytes()) |
| 170 | + } |
| 171 | +} |
| 172 | + |
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    /// Reference in-RAM impl (the cold tier mirror) used to exercise the
    /// contract: a plain map from content address to owned bytes.
    #[derive(Default)]
    struct MemStore {
        map: HashMap<ContentId, Vec<u8>>,
    }

    impl ContentStore for MemStore {
        fn resolve(&self, id: ContentId) -> Option<&[u8]> {
            self.map.get(&id).map(Vec::as_slice)
        }
    }

    impl ContentSink for MemStore {
        fn put(&mut self, bytes: &[u8]) -> ContentId {
            let id = ContentId::of(bytes);
            // First write wins; a repeat of the same bytes leaves the row alone.
            self.map.entry(id).or_insert_with(|| bytes.to_vec());
            id
        }
    }

    #[test]
    fn content_address_is_stable_and_dedups() {
        let a = ContentId::of_str("the same source document");
        let b = ContentId::of_str("the same source document");
        assert_eq!(a, b); // identical bytes ⇒ identical id (dedup key)
        assert_ne!(a, ContentId::of_str("a different document"));
    }

    #[test]
    fn str_and_byte_addressing_agree() {
        assert_eq!(ContentId::of_str("same bytes"), ContentId::of(b"same bytes"));
    }

    #[test]
    fn put_is_idempotent_one_row_per_source() {
        let mut s = MemStore::default();
        let id1 = s.put_str("shared OSINT source");
        let id2 = s.put_str("shared OSINT source"); // many episodes → one source
        assert_eq!(id1, id2);
        assert_eq!(s.map.len(), 1);
    }

    #[test]
    fn resolve_span_returns_the_cited_slice() {
        let mut s = MemStore::default();
        let id = s.put_str("Alice met Bob in Paris.");
        let span = SourceSpan::new(id, 10, 13); // "Bob"
        assert_eq!(s.resolve_span(span).unwrap(), b"Bob");
        assert_eq!(span.len(), 3);
        assert!(span.is_cited());
    }

    #[test]
    fn empty_span_resolves_to_an_empty_slice_but_is_not_a_citation() {
        let mut s = MemStore::default();
        let id = s.put_str("short");
        let span = SourceSpan::new(id, 2, 2);
        assert_eq!(s.resolve_span(span), Ok(&b""[..]));
        assert!(span.is_empty());
        assert!(!span.is_cited());
    }

    #[test]
    fn out_of_bounds_and_missing_fail() {
        let mut s = MemStore::default();
        let id = s.put_str("short");
        assert_eq!(
            s.resolve_span(SourceSpan::new(id, 0, 999)),
            Err(ContentError::SpanOutOfBounds)
        );
        assert_eq!(
            s.resolve_span(SourceSpan::new(ContentId(123), 0, 1)),
            Err(ContentError::NotFound)
        );
    }

    #[test]
    fn contains_tracks_presence() {
        let mut s = MemStore::default();
        let id = s.put_str("present");
        assert!(s.contains(id));
        assert!(!s.contains(ContentId(123)));
    }

    #[test]
    fn new_clamps_inverted_bounds_to_an_empty_span() {
        let span = SourceSpan::new(ContentId(7), 10, 3);
        assert_eq!((span.start, span.end), (10, 10));
        assert_eq!(span.len(), 0);
        assert!(span.is_empty());
        assert!(!span.is_cited());
    }

    #[test]
    fn default_values_are_sentinel_and_uncited() {
        assert!(ContentId::default().is_sentinel());
        assert!(!ContentId(1).is_sentinel());
        assert!(SourceSpan::default().is_empty());
        assert!(!SourceSpan::default().is_cited());
    }

    #[test]
    fn uncited_span_is_rejected_by_the_gate() {
        // sentinel content, or empty span ⇒ not a citation
        assert!(!SourceSpan::new(ContentId(0), 0, 5).is_cited());
        assert!(!SourceSpan::new(ContentId(7), 5, 5).is_cited());
        assert!(SourceSpan::new(ContentId(7), 0, 5).is_cited());
    }

    #[test]
    fn error_messages_are_stable() {
        assert_eq!(ContentError::NotFound.to_string(), "content-store: id not found");
        assert_eq!(
            ContentError::SpanOutOfBounds.to_string(),
            "content-store: span out of bounds"
        );
    }
}
0 commit comments