|
| 1 | +//! P6 · D-DNV-2 — the SPO → `CausalEdge64` → 2³ Pearl decomposition, driven by a |
| 2 | +//! REAL deepnsm COCA FSM parse (not synthetic indices). |
| 3 | +//! |
| 4 | +//! The prior probes exercised the pieces on manufactured input: P2 packed PRNG |
| 5 | +//! palette indices into `CausalEdge64`; P3b ran `all_projections` on hand-set |
| 6 | +//! table cells. This probe closes the loop **end-to-end from text**: |
| 7 | +//! |
| 8 | +//! ```text |
| 9 | +//! real COCA text |
| 10 | +//! → Vocabulary::tokenize (real COCA word ranks) |
| 11 | +//! → Parser::parse (6-state FSM) → real SpoTriple (S, P, O ranks) |
| 12 | +//! → rank → centroid index → CausalEdge64::pack_v2 (the edge) |
| 13 | +//! → SpoHead → SpoDistances::all_projections (2³) |
| 14 | +//! ``` |
| 15 | +//! |
| 16 | +//! The only stand-in is the distance palette — the trained centroid codebook is |
| 17 | +//! future work (`E-V3-DEEPNSM-IS-THE-ENCODER-NOT-A-MIGRATION-1`); the S/P/O |
| 18 | +//! **identity landing is real**, produced by the shipped FSM on real vocabulary. |
| 19 | +//! DeepNSM → V3 D-DNV-2 (`.claude/plans/deepnsm-v3-convergence-v1.md`). Extends |
| 20 | +//! #624 P2 (edge round-trip) and P3b (2³ amortization) onto real input. |
| 21 | +//! |
| 22 | +//! Integer-exact: the palette is a deterministic SplitMix64 fill, the parse is |
| 23 | +//! deterministic, no seed entropy. Real COCA CSVs are committed under |
| 24 | +//! `crates/deepnsm/word_frequency/`, so the load is CI-safe. |
| 25 | +
|
| 26 | +use causal_edge::{CausalEdge64, CausalMask, PlasticityState}; |
| 27 | +use deepnsm::parser::Parser; |
| 28 | +use deepnsm::Vocabulary; |
| 29 | +use lance_graph_planner::cache::nars_engine::{ |
| 30 | + SpoDistances, SpoHead, ALL_MASKS, MASK_NONE, MASK_O, MASK_P, MASK_PO, MASK_S, MASK_SO, MASK_SPO, |
| 31 | +}; |
| 32 | +use std::path::Path; |
| 33 | + |
| 34 | +mod common; |
| 35 | +use common::splitmix64; |
| 36 | + |
| 37 | +/// A deterministic symmetric 256×256 palette — the codebook stand-in, same |
| 38 | +/// shape p2/p3b use. Real trained centroids are future work (D-DNV-2 §stand-in). |
| 39 | +/// Zero on the diagonal (a centroid is distance 0 from itself), positive off it. |
| 40 | +fn synth_palette(seed: u64) -> Vec<u16> { |
| 41 | + let mut t = vec![0u16; 256 * 256]; |
| 42 | + let mut s = seed; |
| 43 | + for a in 0..256usize { |
| 44 | + for b in (a + 1)..256usize { |
| 45 | + let v = 1 + (splitmix64(&mut s) % 60_000) as u16; |
| 46 | + t[a * 256 + b] = v; |
| 47 | + t[b * 256 + a] = v; |
| 48 | + } |
| 49 | + } |
| 50 | + t |
| 51 | +} |
| 52 | + |
| 53 | +fn nars() -> SpoDistances { |
| 54 | + SpoDistances { |
| 55 | + s_table: synth_palette(0x0700_0001), |
| 56 | + p_table: synth_palette(0x0700_0002), |
| 57 | + o_table: synth_palette(0x0700_0003), |
| 58 | + } |
| 59 | +} |
| 60 | + |
/// Maps a COCA word rank to a centroid (palette) index by keeping only the
/// rank's low byte.
///
/// This is the placeholder for the trained codebook lookup
/// (`code = codebook(word)[k]`), which is future work. It is deterministic and
/// defined for every `u16`; ranks that differ by a multiple of 256 share an
/// index.
fn centroid(rank: u16) -> u8 {
    let [low, _high] = rank.to_le_bytes();
    low
}
| 67 | + |
| 68 | +/// Bridge a packed edge into the distance engine's head — the SAME palette |
| 69 | +/// indices, nothing else (the P2 join point, reused verbatim). |
| 70 | +fn head_of(e: CausalEdge64) -> SpoHead { |
| 71 | + let mut h = SpoHead::zero(); |
| 72 | + h.s_idx = e.s_idx(); |
| 73 | + h.p_idx = e.p_idx(); |
| 74 | + h.o_idx = e.o_idx(); |
| 75 | + h |
| 76 | +} |
| 77 | + |
| 78 | +/// Parse real text through the shipped COCA FSM and return the first SPO triple |
| 79 | +/// that carries an object, as `(subject, predicate, object)` real word ranks. |
| 80 | +fn first_real_triple(vocab: &Vocabulary, text: &str) -> (u16, u16, u16) { |
| 81 | + let toks = vocab.tokenize(text); |
| 82 | + let parsed = Parser::new().parse(&toks); |
| 83 | + let t = parsed |
| 84 | + .triples |
| 85 | + .iter() |
| 86 | + .find(|t| t.has_object()) |
| 87 | + .expect("a real COCA sentence yields at least one SPO triple with an object"); |
| 88 | + (t.subject(), t.predicate(), t.object()) |
| 89 | +} |
| 90 | + |
| 91 | +fn load_vocab() -> Vocabulary { |
| 92 | + // osint CARGO_MANIFEST_DIR is crates/lance-graph-osint; the COCA CSVs live |
| 93 | + // in the sibling deepnsm crate. |
| 94 | + let dir = Path::new(env!("CARGO_MANIFEST_DIR")).join("../deepnsm/word_frequency"); |
| 95 | + Vocabulary::load(&dir).expect("load committed COCA word_frequency CSVs") |
| 96 | +} |
| 97 | + |
| 98 | +fn idx(m: u8) -> usize { |
| 99 | + ALL_MASKS.iter().position(|&x| x == m).unwrap() |
| 100 | +} |
| 101 | + |
/// P6a — the S/P/O identity of a real COCA parse survives the `CausalEdge64`
/// carrier unchanged: whatever is packed with `pack_v2` is read back by the
/// accessors (extends P2's PRNG round-trip onto real, FSM-derived words).
#[test]
fn p6_real_coca_spo_round_trips_the_edge_carrier() {
    let vocab = load_vocab();
    let (subject, predicate, object) =
        first_real_triple(&vocab, "The system resolves the reference.");
    // Rank → centroid index for all three roles at once.
    let [si, pi, oi] = [subject, predicate, object].map(centroid);

    let edge = CausalEdge64::pack_v2(
        si,
        pi,
        oi,
        200, // frequency byte (200/255 ≈ 0.78) — fixed stand-in truth value
        163, // confidence byte
        CausalMask::SPO,
        0,
        PlasticityState::ALL_FROZEN,
    );

    // Identity fields first, then the remaining packed fields.
    assert_eq!(edge.s_idx(), si, "real subject centroid round-trips");
    assert_eq!(edge.p_idx(), pi, "real predicate centroid round-trips");
    assert_eq!(edge.o_idx(), oi, "real object centroid round-trips");
    assert_eq!(edge.causal_mask(), CausalMask::SPO);
    assert_eq!(edge.frequency_u8(), 200);
    assert_eq!(edge.confidence_u8(), 163);
}
| 128 | + |
/// P6b — runs the 8 Pearl projections over heads built from real COCA
/// identities and checks that the 2³ ladder is monotone with the counterfactual
/// (SPO) on top (extends P3b's hand-set-cell amortization onto real,
/// FSM-derived heads).
#[test]
fn p6_real_coca_2cube_ladder_holds_on_a_real_parse() {
    let vocab = load_vocab();

    // Both heads share every packed field except the three role ranks, so one
    // closure builds either of them.
    let pack_head = |s_rank: u16, p_rank: u16, o_rank: u16| {
        head_of(CausalEdge64::pack_v2(
            centroid(s_rank),
            centroid(p_rank),
            centroid(o_rank),
            200,
            163,
            CausalMask::SPO,
            0,
            PlasticityState::ALL_FROZEN,
        ))
    };

    // Candidate: a real COCA FSM parse.
    let (cand_s, cand_p, cand_o) =
        first_real_triple(&vocab, "The system resolves the reference.");
    // Context: three high-frequency COCA words looked up by rank directly, so
    // no second (more fragile) parse is needed.
    let rank_for = |word: &str| {
        vocab
            .rank_of(word)
            .expect("common COCA word is in the top-4096 vocab")
    };
    let (ctx_s, ctx_p, ctx_o) = (rank_for("time"), rank_for("people"), rank_for("world"));

    let cand = pack_head(cand_s, cand_p, cand_o);
    let ctx = pack_head(ctx_s, ctx_p, ctx_o);

    let distances = nars();
    let proj = distances.all_projections(&cand, &ctx);

    // Amortization + taxonomy: the empty mask is the prior (0) and the full
    // mask equals the three single-plane projections added together.
    assert_eq!(proj[idx(MASK_NONE)], 0, "MASK_NONE = prior");
    assert_eq!(
        proj[idx(MASK_SPO)],
        proj[idx(MASK_S)] + proj[idx(MASK_P)] + proj[idx(MASK_O)],
        "MASK_SPO = counterfactual = the three marginals summed"
    );

    // Monotone lattice: along each checked chain, adding a plane never lowers
    // the projection.
    assert!(proj[idx(MASK_NONE)] <= proj[idx(MASK_S)]);
    assert!(proj[idx(MASK_S)] <= proj[idx(MASK_SO)]);
    assert!(proj[idx(MASK_SO)] <= proj[idx(MASK_SPO)]);
    assert!(proj[idx(MASK_O)] <= proj[idx(MASK_PO)]);
    assert!(proj[idx(MASK_PO)] <= proj[idx(MASK_SPO)]);
    assert_eq!(
        proj[idx(MASK_SPO)],
        *proj.iter().max().unwrap(),
        "counterfactual dominates every sub-question on the real-derived head"
    );

    // Strictness is only checked when candidate and context sit on different
    // centroids on BOTH the S and the O plane; if either plane coincides the
    // two assertions below are skipped. In the checked case Association (S+O)
    // must strictly exceed each single plane — the rung decomposition is doing
    // real work, not collapsing to one number.
    // NOTE(review): presumably this holds because the palette is positive off
    // the diagonal — confirm against `all_projections`.
    if cand.s_idx != ctx.s_idx && cand.o_idx != ctx.o_idx {
        assert!(
            proj[idx(MASK_SO)] > proj[idx(MASK_S)],
            "Association adds the Object plane on top of the Subject plane"
        );
        assert!(
            proj[idx(MASK_SO)] > proj[idx(MASK_O)],
            "Association adds the Subject plane on top of the Object plane"
        );
    }
}
0 commit comments