11//! META-AGENT: add `pub mod markov_bundle;` to lib.rs.
2+ //!
3+ //! Slice coordinates are imported from `lance_graph_contract::grammar::role_keys`
4+ //! so that this module and the rest of the workspace agree on the [start:stop)
5+ //! boundaries of every grammatical role inside the 16384-dim VSA carrier.
6+ //! The previously hard-coded equal-partition layout (`16384 / 5 = 3277` per
7+ //! role) was incompatible with the domain-specific widths in role_keys (e.g.
8+ //! SUBJECT owns [0..2000), TEMPORAL owns [9000..9200)) — see PR #279 review,
9+ //! CRITICAL #1.
210
311use crate :: trajectory:: Trajectory ;
412
13+ use lance_graph_contract:: grammar:: role_keys:: {
14+ CONTEXT_SLICE , INSTRUMENT_SLICE , KAUSAL_SLICE , LOKAL_SLICE , MODAL_SLICE ,
15+ MODIFIER_SLICE , OBJECT_SLICE , PREDICATE_SLICE , RoleKeySlice , SUBJECT_SLICE ,
16+ TEMPORAL_SLICE , VSA_DIMS ,
17+ } ;
18+
519#[ derive( Debug , Clone , Copy , PartialEq , Eq , Default ) ]
620pub enum Kernel {
721 Uniform ,
@@ -36,20 +50,26 @@ pub enum GrammaticalRole {
3650}
3751
3852impl GrammaticalRole {
39- /// Slice of the 16384-dim VSA carrier that owns this role.
40- pub fn slice ( & self ) -> ( usize , usize ) {
53+ /// Canonical [start:stop) slice of the 16384-dim VSA carrier that owns
54+ /// this role, sourced from `lance_graph_contract::grammar::role_keys`.
55+ /// This is the single source of truth — the constants below are simple
56+ /// re-exports of the contract crate's `RoleKeySlice` descriptors so that
57+ /// every consumer (markov bundler, role-key catalogue, slice-aware
58+ /// codecs) agrees on the same boundaries.
59+ pub fn slice ( & self ) -> RoleKeySlice {
4160 match self {
42- Self :: Subject => ( 0 , 3277 ) ,
43- Self :: Predicate => ( 3277 , 6554 ) ,
44- Self :: Object => ( 6554 , 9830 ) ,
45- Self :: Modifier => ( 9830 , 13107 ) ,
46- Self :: Context => ( 13107 , 16384 ) ,
47- // TEKAMOLO sub-slices inside Context band.
48- Self :: Temporal => ( 13107 , 13762 ) ,
49- Self :: Kausal => ( 13762 , 14418 ) ,
50- Self :: Modal => ( 14418 , 15074 ) ,
51- Self :: Lokal => ( 15074 , 15729 ) ,
52- Self :: Instrument => ( 15729 , 16384 ) ,
61+ Self :: Subject => SUBJECT_SLICE ,
62+ Self :: Predicate => PREDICATE_SLICE ,
63+ Self :: Object => OBJECT_SLICE ,
64+ Self :: Modifier => MODIFIER_SLICE ,
65+ Self :: Context => CONTEXT_SLICE ,
66+ // TEKAMOLO sub-slices (NOT inside Context — they live in their
67+ // own [9000..9650) post-context band per role_keys.rs layout).
68+ Self :: Temporal => TEMPORAL_SLICE ,
69+ Self :: Kausal => KAUSAL_SLICE ,
70+ Self :: Modal => MODAL_SLICE ,
71+ Self :: Lokal => LOKAL_SLICE ,
72+ Self :: Instrument => INSTRUMENT_SLICE ,
5373 }
5474 }
5575}
@@ -77,7 +97,9 @@ impl MarkovBundler {
7797 Self {
7898 radius,
7999 kernel,
80- dims : 16_384 ,
100+ // Width of the canonical VSA carrier — kept in lock-step with
101+ // `lance_graph_contract::grammar::role_keys::VSA_DIMS` (16_384).
102+ dims : VSA_DIMS ,
81103 buffer : std:: collections:: VecDeque :: with_capacity ( ( 2 * radius + 1 ) as usize ) ,
82104 }
83105 }
@@ -101,18 +123,33 @@ impl MarkovBundler {
101123 let delta = ( i as i32 ) - focal;
102124 let weight = self . kernel . weight ( delta, self . radius ) ;
103125 for tok in & sent. tokens {
104- let ( start, stop) = tok. role . slice ( ) ;
105- let len = ( stop - start) . min ( tok. content_fp . len ( ) ) ;
126+ let slice = tok. role . slice ( ) ;
127+ // Use the canonical role_keys width (NOT an equal partition).
128+ let len = slice. len ( ) . min ( tok. content_fp . len ( ) ) ;
106129 for k in 0 ..len {
107- acc[ start + k] += weight * tok. content_fp [ k] ;
130+ acc[ slice . start + k] += weight * tok. content_fp [ k] ;
108131 }
109132 }
110133 }
111- // permute by position offset (rotate_right)
112- if !acc. is_empty ( ) {
113- let k = ( self . radius as usize ) % acc. len ( ) ;
114- acc. rotate_right ( k) ;
134+ // REMOVED: post-bundle acc.rotate_right(k) — corrupted role-slice alignment.
135+ // Plan called for per-sentence pre-bundle vsa_permute; that's a follow-up.
136+ // Until then, no permutation = aligned bundle.
137+
138+ // Bundle normalization (HIGH item from PR #279 review): divide by the
139+ // sum of |kernel weights| so bundle magnitudes (norms, dot products) are
140+ // comparable across kernel choices. Cosine similarity is already
141+ // scale-invariant, so this matters for magnitude-sensitive comparisons.
142+ // Without this, MexicanHat bundles have systematically smaller norms than
143+ // Uniform bundles simply because the kernel weights peak at 1 and decay.
143+ let radius_i = self . radius as i32 ;
144+ let total_abs_weight: f32 = ( -radius_i..=radius_i)
145+ . map ( |d| self . kernel . weight ( d, self . radius ) . abs ( ) )
146+ . sum ( ) ;
147+ if total_abs_weight > 1e-9 {
148+ for v in acc. iter_mut ( ) {
149+ * v /= total_abs_weight;
150+ }
115151 }
152+
116153 Trajectory {
117154 fingerprint : acc,
118155 radius : self . radius ,
@@ -158,8 +195,180 @@ mod tests {
158195 }
#[test]
fn role_slices_disjoint() {
    // Checks the Subject/Predicate boundary only: the two slices must be
    // adjacent, i.e. Subject ends exactly where Predicate begins, so they
    // neither overlap nor leave a gap.
    let subject = GrammaticalRole::Subject.slice();
    let predicate = GrammaticalRole::Predicate.slice();
    assert_eq!(
        subject.stop, predicate.start,
        "SUBJECT slice must end exactly where PREDICATE slice begins"
    );
}
204+
#[test]
fn role_slice_widths_match_role_keys_canonical() {
    // Pins the width of every role's slice as reported by
    // `GrammaticalRole::slice`, guarding against a regression to the old
    // equal-partition layout (16384 / 5 = 3277 per role).
    let width = |role: GrammaticalRole| role.slice().len();

    // Core roles.
    assert_eq!(width(GrammaticalRole::Subject), 2000);
    assert_eq!(width(GrammaticalRole::Predicate), 2000);
    assert_eq!(width(GrammaticalRole::Object), 2000);
    assert_eq!(width(GrammaticalRole::Modifier), 1500);
    assert_eq!(width(GrammaticalRole::Context), 1500);

    // TEKAMOLO roles.
    assert_eq!(width(GrammaticalRole::Temporal), 200);
    assert_eq!(width(GrammaticalRole::Kausal), 200);
    assert_eq!(width(GrammaticalRole::Modal), 100);
    assert_eq!(width(GrammaticalRole::Lokal), 150);
    assert_eq!(width(GrammaticalRole::Instrument), 100);
}
220+
221+ /// Helper: fill a bundler's window so a single push triggers `bundle_current`.
222+ fn fill_and_bundle (
223+ kernel : Kernel ,
224+ radius : u32 ,
225+ sent : WindowedSentence ,
226+ ) -> Trajectory {
227+ let mut b = MarkovBundler :: new ( radius, kernel) ;
228+ let cap = ( 2 * radius + 1 ) as usize ;
229+ let mut last: Option < Trajectory > = None ;
230+ for _ in 0 ..cap {
231+ last = b. push ( sent. clone ( ) ) ;
232+ }
233+ last. expect ( "bundler should emit a trajectory once window is full" )
234+ }
235+
236+ /// Helper: push a sequence of distinct sentences so per-position
237+ /// kernel weights actually shape the bundle. Returns the trajectory
238+ /// emitted on the final push (window saturated).
239+ fn bundle_sequence (
240+ kernel : Kernel ,
241+ radius : u32 ,
242+ sentences : Vec < WindowedSentence > ,
243+ ) -> Trajectory {
244+ let mut b = MarkovBundler :: new ( radius, kernel) ;
245+ let cap = ( 2 * radius + 1 ) as usize ;
246+ assert_eq ! ( sentences. len( ) , cap, "sequence must fill exactly one window" ) ;
247+ let mut last: Option < Trajectory > = None ;
248+ for s in sentences {
249+ last = b. push ( s) ;
250+ }
251+ last. expect ( "bundler should emit on the saturating push" )
252+ }
253+
/// REGRESSION (PR #279 CRITICAL #2): the removed post-bundle `rotate_right`
/// shifted SUBJECT-slice content out of the SUBJECT slice. After the fix, a
/// SUBJECT-only window must keep all non-zero content inside the SUBJECT
/// slice returned by `GrammaticalRole::Subject.slice()` and be ~zero in every
/// other dimension, both before and after that slice.
#[test]
fn bundle_does_not_rotate_subject_dims_outside_subject_slice() {
    let subject = GrammaticalRole::Subject.slice();

    // SUBJECT-only window: every sentence carries a single SUBJECT token
    // whose content_fp is all 1.0 across the full SUBJECT width.
    let sent = WindowedSentence {
        tokens: vec![TokenWithRole {
            content_fp: vec![1.0; subject.len()],
            role: GrammaticalRole::Subject,
        }],
    };
    let traj = fill_and_bundle(Kernel::Uniform, 5, sent);

    // The SUBJECT slice itself must carry non-trivial content.
    let subject_sum: f32 = traj.fingerprint[subject.start..subject.stop]
        .iter()
        .sum();
    assert!(
        subject_sum > 1.0,
        "expected non-trivial SUBJECT content, got sum={subject_sum}"
    );

    // Every dimension outside the SUBJECT slice must be ~0. Both sides are
    // inspected so the check stays valid even if the slice does not start
    // at index 0.
    let outside_max: f32 = traj.fingerprint[..subject.start]
        .iter()
        .chain(&traj.fingerprint[subject.stop..])
        .fold(0.0f32, |acc, v| acc.max(v.abs()));
    assert!(
        outside_max < 1e-6,
        "rotation leaked SUBJECT content past slice boundary: \
         max |outside| = {outside_max}"
    );
}
292+
/// The MexicanHat and Uniform kernels must yield measurably different
/// bundles for the same window; otherwise selecting a kernel would have no
/// runtime effect.
///
/// The window is deliberately lopsided: only the sentence at position 1
/// carries content (all 1.0 across the SUBJECT width), every other sentence
/// is all zeros. The bundle then reflects the weight each kernel assigns to
/// that single off-centre position, divided by that kernel's own Σ|w|.
/// NOTE(review): the test presumes MexicanHat weights this position
/// differently from Uniform after normalization; the kernel weight function
/// is not shown here, so confirm against `Kernel::weight`.
#[test]
fn mexican_hat_bundle_differs_from_uniform_bundle() {
    let subject = GrammaticalRole::Subject.slice();
    let subject_len = subject.stop - subject.start;
    let radius = 5u32;
    let window_len = (2 * radius + 1) as usize;
    let outlier_pos = 1usize;

    let mut sentences: Vec<WindowedSentence> = Vec::with_capacity(window_len);
    for pos in 0..window_len {
        let fill = if pos == outlier_pos { 1.0 } else { 0.0 };
        sentences.push(WindowedSentence {
            tokens: vec![TokenWithRole {
                content_fp: vec![fill; subject_len],
                role: GrammaticalRole::Subject,
            }],
        });
    }

    let uni = bundle_sequence(Kernel::Uniform, radius, sentences.clone());
    let mex = bundle_sequence(Kernel::MexicanHat, radius, sentences);
    assert_eq!(uni.fingerprint.len(), mex.fingerprint.len());

    // Euclidean distance between the two fingerprints.
    let squared_distance = uni
        .fingerprint
        .iter()
        .zip(mex.fingerprint.iter())
        .map(|(a, b)| (a - b) * (a - b))
        .sum::<f32>();
    let l2: f32 = squared_distance.sqrt();
    assert!(
        l2 > 1e-3,
        "MexicanHat bundle should differ from Uniform bundle, l2={l2}"
    );
}
337+
/// Bundle normalization (HIGH from PR #279) keeps the L2 norm of a bundle on
/// a comparable scale across kernels. This is a loose bound, not strict
/// invariance: for each of the three kernels, the L2 norm divided by
/// `sqrt(subject_len)` must land in `[0.5, 1.5]` on a controlled SUBJECT-only
/// window.
#[test]
fn bundle_l2_norm_invariant_to_kernel() {
    let subject_len = GrammaticalRole::Subject.slice().len();
    let sent = WindowedSentence {
        tokens: vec![TokenWithRole {
            content_fp: vec![1.0; subject_len],
            role: GrammaticalRole::Subject,
        }],
    };
    // With every sentence identical and all-ones, each SUBJECT dim ends up as
    // (Σ_i w_i) / (Σ_i |w_i|). When all kernel weights are positive that is
    // exactly 1.0 per dim, so L2 = sqrt(subject_len) (≈ 44.7 for the
    // 2000-wide SUBJECT slice). A kernel with negative weights lowers the
    // per-dim value because positive and negative contributions partly
    // cancel. NOTE(review): which kernels have negative lobes depends on
    // `Kernel::weight`, which is not shown here; confirm the band still fits.
    let scale = (subject_len as f32).sqrt();
    for k in [Kernel::Uniform, Kernel::MexicanHat, Kernel::Gaussian] {
        let traj = fill_and_bundle(k, 5, sent.clone());
        let l2: f32 = traj
            .fingerprint
            .iter()
            .map(|v| v * v)
            .sum::<f32>()
            .sqrt();
        let norm_l2 = l2 / scale;
        assert!(
            (0.5..=1.5).contains(&norm_l2),
            "kernel {k:?}: normalized L2 {norm_l2} (raw {l2}) out of [0.5, 1.5]"
        );
    }
}
165374}
0 commit comments