Skip to content

Commit 218c2cf

Browse files
authored
Merge pull request #282 from AdaWorldAPI/claude/grammar-fixes-r2-2026-04-29
fix: Grammar/Markov hardening — slice unification, kernel wiring, parser tests, triangle distance
2 parents 63c3f5b + eeddfe9 commit 218c2cf

10 files changed

Lines changed: 1206 additions & 129 deletions

File tree

crates/deepnsm/Cargo.toml

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@ version = "0.1.0"
44
edition = "2021"
55
license = "Apache-2.0"
66
publish = false
7+
8+
# Empty `[workspace]` so cargo treats this crate as standalone when invoked
9+
# via `--manifest-path` (deepnsm is `exclude`d from the parent workspace,
10+
# but in nested git-worktree directories cargo's auto-discovery would
11+
# otherwise walk further up and pick up the outer workspace root).
12+
[workspace]
713
description = """
814
DeepNSM: Distributional semantic transformer replacement.
915
4,096 words × 12 bits × 8MB distance matrix = complete semantic engine.
@@ -20,12 +26,17 @@ No GPU. No learned weights. Same decision boundaries as cosine.
2026
# never touches backend optimization files.
2127
[features]
2228
default = []
23-
contract-ticket = ["dep:lance-graph-contract"]
29+
contract-ticket = []
2430
grammar-triangle = ["dep:lance-graph-cognitive"]
2531

2632
[dependencies]
2733
ndarray = { path = "../../../ndarray", default-features = false, features = ["std"] }
28-
lance-graph-contract = { path = "../lance-graph-contract", optional = true }
34+
# `lance-graph-contract` is now a hard dep: markov_bundle imports the canonical
35+
# RoleKeySlice constants (SUBJECT_SLICE..) from `grammar::role_keys` so that
36+
# slice coordinates are unified with the rest of the workspace (CRITICAL fix
37+
# from PR #279 review — the equal-partition 16384/5 layout in markov_bundle
38+
# disagreed with the domain-specific widths in role_keys).
39+
lance-graph-contract = { path = "../lance-graph-contract" }
2940
lance-graph-cognitive = { path = "../lance-graph-cognitive", optional = true }
3041

3142
[dev-dependencies]

crates/deepnsm/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ pub mod vocabulary;
6363

6464
pub mod trajectory;
6565
pub mod markov_bundle;
66+
pub mod nsm_primes;
6667

6768
#[cfg(feature = "contract-ticket")]
6869
pub mod ticket_emit;

crates/deepnsm/src/markov_bundle.rs

Lines changed: 231 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,21 @@
11
//! META-AGENT: add `pub mod markov_bundle;` to lib.rs.
2+
//!
3+
//! Slice coordinates are imported from `lance_graph_contract::grammar::role_keys`
4+
//! so that this module and the rest of the workspace agree on the [start:stop)
5+
//! boundaries of every grammatical role inside the 16384-dim VSA carrier.
6+
//! The previously hard-coded equal-partition layout (`16384 / 5 = 3277` per
7+
//! role) was incompatible with the domain-specific widths in role_keys (e.g.
8+
//! SUBJECT owns [0..2000), TEMPORAL owns [9000..9200)) — see PR #279 review,
9+
//! CRITICAL #1.
210
311
use crate::trajectory::Trajectory;
412

13+
use lance_graph_contract::grammar::role_keys::{
14+
CONTEXT_SLICE, INSTRUMENT_SLICE, KAUSAL_SLICE, LOKAL_SLICE, MODAL_SLICE,
15+
MODIFIER_SLICE, OBJECT_SLICE, PREDICATE_SLICE, RoleKeySlice, SUBJECT_SLICE,
16+
TEMPORAL_SLICE, VSA_DIMS,
17+
};
18+
519
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
620
pub enum Kernel {
721
Uniform,
@@ -36,20 +50,26 @@ pub enum GrammaticalRole {
3650
}
3751

3852
impl GrammaticalRole {
39-
/// Slice of the 16384-dim VSA carrier that owns this role.
40-
pub fn slice(&self) -> (usize, usize) {
53+
/// Canonical [start:stop) slice of the 16384-dim VSA carrier that owns
54+
/// this role, sourced from `lance_graph_contract::grammar::role_keys`.
55+
/// This is the single source of truth — each arm below returns the
56+
/// contract crate's imported `RoleKeySlice` descriptor unchanged, so that
57+
/// every consumer (markov bundler, role-key catalogue, slice-aware
58+
/// codecs) agrees on the same boundaries.
59+
pub fn slice(&self) -> RoleKeySlice {
4160
match self {
42-
Self::Subject => (0, 3277),
43-
Self::Predicate => (3277, 6554),
44-
Self::Object => (6554, 9830),
45-
Self::Modifier => (9830, 13107),
46-
Self::Context => (13107, 16384),
47-
// TEKAMOLO sub-slices inside Context band.
48-
Self::Temporal => (13107, 13762),
49-
Self::Kausal => (13762, 14418),
50-
Self::Modal => (14418, 15074),
51-
Self::Lokal => (15074, 15729),
52-
Self::Instrument => (15729, 16384),
61+
Self::Subject => SUBJECT_SLICE,
62+
Self::Predicate => PREDICATE_SLICE,
63+
Self::Object => OBJECT_SLICE,
64+
Self::Modifier => MODIFIER_SLICE,
65+
Self::Context => CONTEXT_SLICE,
66+
// TEKAMOLO sub-slices (NOT inside Context — they live in their
67+
// own [9000..9650) post-context band per role_keys.rs layout).
68+
Self::Temporal => TEMPORAL_SLICE,
69+
Self::Kausal => KAUSAL_SLICE,
70+
Self::Modal => MODAL_SLICE,
71+
Self::Lokal => LOKAL_SLICE,
72+
Self::Instrument => INSTRUMENT_SLICE,
5373
}
5474
}
5575
}
@@ -77,7 +97,9 @@ impl MarkovBundler {
7797
Self {
7898
radius,
7999
kernel,
80-
dims: 16_384,
100+
// Width of the canonical VSA carrier — kept in lock-step with
101+
// `lance_graph_contract::grammar::role_keys::VSA_DIMS` (16_384).
102+
dims: VSA_DIMS,
81103
buffer: std::collections::VecDeque::with_capacity((2 * radius + 1) as usize),
82104
}
83105
}
@@ -101,18 +123,33 @@ impl MarkovBundler {
101123
let delta = (i as i32) - focal;
102124
let weight = self.kernel.weight(delta, self.radius);
103125
for tok in &sent.tokens {
104-
let (start, stop) = tok.role.slice();
105-
let len = (stop - start).min(tok.content_fp.len());
126+
let slice = tok.role.slice();
127+
// Use the canonical role_keys width (NOT an equal partition).
128+
let len = slice.len().min(tok.content_fp.len());
106129
for k in 0..len {
107-
acc[start + k] += weight * tok.content_fp[k];
130+
acc[slice.start + k] += weight * tok.content_fp[k];
108131
}
109132
}
110133
}
111-
// permute by position offset (rotate_right)
112-
if !acc.is_empty() {
113-
let k = (self.radius as usize) % acc.len();
114-
acc.rotate_right(k);
134+
// REMOVED: post-bundle acc.rotate_right(k) — corrupted role-slice alignment.
135+
// Plan called for per-sentence pre-bundle vsa_permute; that's a follow-up.
136+
// Until then, no permutation = aligned bundle.
137+
138+
// Bundle normalization (HIGH item from PR #279 review): divide by the
139+
// sum of |kernel weights| so cosine comparisons across kernel choices
140+
// are invariant to kernel-shape magnitude. Without this, MexicanHat
141+
// bundles have systematically smaller norms than Uniform bundles
142+
// simply because the kernel weights peak at 1 and decay.
143+
let radius_i = self.radius as i32;
144+
let total_abs_weight: f32 = (-radius_i..=radius_i)
145+
.map(|d| self.kernel.weight(d, self.radius).abs())
146+
.sum();
147+
if total_abs_weight > 1e-9 {
148+
for v in acc.iter_mut() {
149+
*v /= total_abs_weight;
150+
}
115151
}
152+
116153
Trajectory {
117154
fingerprint: acc,
118155
radius: self.radius,
@@ -158,8 +195,180 @@ mod tests {
158195
}
159196
#[test]
160197
fn role_slices_disjoint() {
198+
// SPO core slices are contiguous: SUBJECT.stop == PREDICATE.start by
199+
// construction in `role_keys.rs` (0..2000, 2000..4000, ...).
161200
let s = GrammaticalRole::Subject.slice();
162201
let p = GrammaticalRole::Predicate.slice();
163-
assert_eq!(s.1, p.0);
202+
assert_eq!(s.stop, p.start);
203+
}
204+
205+
#[test]
206+
fn role_slice_widths_match_role_keys_canonical() {
207+
// Spot-check that `GrammaticalRole::slice` returns the role_keys-canonical
208+
// widths (NOT the old equal-partition 16384/5 = 3277 layout).
209+
assert_eq!(GrammaticalRole::Subject.slice().len(), 2000);
210+
assert_eq!(GrammaticalRole::Predicate.slice().len(), 2000);
211+
assert_eq!(GrammaticalRole::Object.slice().len(), 2000);
212+
assert_eq!(GrammaticalRole::Modifier.slice().len(), 1500);
213+
assert_eq!(GrammaticalRole::Context.slice().len(), 1500);
214+
assert_eq!(GrammaticalRole::Temporal.slice().len(), 200);
215+
assert_eq!(GrammaticalRole::Kausal.slice().len(), 200);
216+
assert_eq!(GrammaticalRole::Modal.slice().len(), 100);
217+
assert_eq!(GrammaticalRole::Lokal.slice().len(), 150);
218+
assert_eq!(GrammaticalRole::Instrument.slice().len(), 100);
219+
}
220+
221+
/// Helper: fill a bundler's window so a single push triggers `bundle_current`.
222+
fn fill_and_bundle(
223+
kernel: Kernel,
224+
radius: u32,
225+
sent: WindowedSentence,
226+
) -> Trajectory {
227+
let mut b = MarkovBundler::new(radius, kernel);
228+
let cap = (2 * radius + 1) as usize;
229+
let mut last: Option<Trajectory> = None;
230+
for _ in 0..cap {
231+
last = b.push(sent.clone());
232+
}
233+
last.expect("bundler should emit a trajectory once window is full")
234+
}
235+
236+
/// Helper: push a sequence of distinct sentences so per-position
237+
/// kernel weights actually shape the bundle. Returns the trajectory
238+
/// emitted on the final push (window saturated).
239+
fn bundle_sequence(
240+
kernel: Kernel,
241+
radius: u32,
242+
sentences: Vec<WindowedSentence>,
243+
) -> Trajectory {
244+
let mut b = MarkovBundler::new(radius, kernel);
245+
let cap = (2 * radius + 1) as usize;
246+
assert_eq!(sentences.len(), cap, "sequence must fill exactly one window");
247+
let mut last: Option<Trajectory> = None;
248+
for s in sentences {
249+
last = b.push(s);
250+
}
251+
last.expect("bundler should emit on the saturating push")
252+
}
253+
254+
/// REGRESSION (PR #279 CRITICAL #2): the removed `rotate_right` shifted
255+
/// SUBJECT-slice content into the PREDICATE slice (or worse, the
256+
/// CONTEXT band). After the fix, a SUBJECT-only window must keep all
257+
/// non-zero content inside the canonical SUBJECT slice `[0, 2000)` and have ~zero everywhere else.
258+
#[test]
259+
fn bundle_does_not_rotate_subject_dims_outside_subject_slice() {
260+
// SUBJECT-only window: every sentence has a single SUBJECT token
261+
// whose content_fp is all 1.0 across the SUBJECT slice.
262+
let subject_len = GrammaticalRole::Subject.slice().stop
263+
- GrammaticalRole::Subject.slice().start;
264+
let sent = WindowedSentence {
265+
tokens: vec![TokenWithRole {
266+
content_fp: vec![1.0; subject_len],
267+
role: GrammaticalRole::Subject,
268+
}],
269+
};
270+
let traj = fill_and_bundle(Kernel::Uniform, 5, sent);
271+
272+
let _slice = GrammaticalRole::Subject.slice();
273+
let s_start = _slice.start;
274+
let s_stop = _slice.stop;
275+
// SUBJECT slice should be non-zero (positive after normalization).
276+
let subject_sum: f32 =
277+
traj.fingerprint[s_start..s_stop].iter().sum();
278+
assert!(
279+
subject_sum > 1.0,
280+
"expected non-trivial SUBJECT content, got sum={subject_sum}"
281+
);
282+
// Outside the SUBJECT slice every dim must be ~0 (no rotation).
283+
let outside_max: f32 = traj.fingerprint[s_stop..]
284+
.iter()
285+
.fold(0.0f32, |acc, v| acc.max(v.abs()));
286+
assert!(
287+
outside_max < 1e-6,
288+
"rotation leaked SUBJECT content past slice boundary: \
289+
max |outside| = {outside_max}"
290+
);
291+
}
292+
293+
/// MexicanHat and Uniform kernels must produce materially different
294+
/// bundles on the same window — otherwise the kernel selector is
295+
/// ineffective at runtime. Uses an asymmetric heterogeneous window
296+
/// (one outlier position carries content; others are blank) so that
297+
/// per-position kernel weights reshape the accumulated bundle in a
298+
/// way symmetric kernels can't equalize.
299+
#[test]
300+
fn mexican_hat_bundle_differs_from_uniform_bundle() {
301+
let subject_len = GrammaticalRole::Subject.slice().stop
302+
- GrammaticalRole::Subject.slice().start;
303+
let radius = 5u32;
304+
let cap = (2 * radius + 1) as usize;
305+
// Single outlier at position 1 (delta = -4). Uniform weights this
306+
// identically to focal; MexicanHat strongly attenuates it
307+
// (w(-4, 5) ≈ 0.26 vs w(0, 5) = 1.0). Normalization divides each
308+
// by its own Σ|w|, so the per-dim values differ across the
309+
// SUBJECT slice.
310+
let outlier_pos = 1usize;
311+
let sentences: Vec<WindowedSentence> = (0..cap)
312+
.map(|i| WindowedSentence {
313+
tokens: vec![TokenWithRole {
314+
content_fp: vec![
315+
if i == outlier_pos { 1.0 } else { 0.0 };
316+
subject_len
317+
],
318+
role: GrammaticalRole::Subject,
319+
}],
320+
})
321+
.collect();
322+
let uni = bundle_sequence(Kernel::Uniform, radius, sentences.clone());
323+
let mex = bundle_sequence(Kernel::MexicanHat, radius, sentences);
324+
assert_eq!(uni.fingerprint.len(), mex.fingerprint.len());
325+
let l2: f32 = uni
326+
.fingerprint
327+
.iter()
328+
.zip(mex.fingerprint.iter())
329+
.map(|(a, b)| (a - b) * (a - b))
330+
.sum::<f32>()
331+
.sqrt();
332+
assert!(
333+
l2 > 1e-3,
334+
"MexicanHat bundle should differ from Uniform bundle, l2={l2}"
335+
);
336+
}
337+
338+
/// Bundle normalization (HIGH from PR #279) makes the L2 norm
339+
/// invariant to kernel-shape magnitude. We assert all three kernels
340+
/// land in a loose [0.5, 1.5] band on a controlled SUBJECT-only window.
341+
#[test]
342+
fn bundle_l2_norm_invariant_to_kernel() {
343+
let subject_len = GrammaticalRole::Subject.slice().stop
344+
- GrammaticalRole::Subject.slice().start;
345+
let sent = WindowedSentence {
346+
tokens: vec![TokenWithRole {
347+
content_fp: vec![1.0; subject_len],
348+
role: GrammaticalRole::Subject,
349+
}],
350+
};
351+
for k in [Kernel::Uniform, Kernel::MexicanHat, Kernel::Gaussian] {
352+
let traj = fill_and_bundle(k, 5, sent.clone());
353+
// Per-dim mean of |v| × sqrt(N_subj) ≈ L2 norm; we test L2 directly.
354+
let l2: f32 = traj
355+
.fingerprint
356+
.iter()
357+
.map(|v| v * v)
358+
.sum::<f32>()
359+
.sqrt();
360+
// Each SUBJECT dim sums to (Σ_i w_i) / (Σ_i |w_i|). For Uniform
361+
// and Gaussian (all-positive weights) this is exactly 1.0 per dim,
362+
// so L2 = sqrt(subject_len) = sqrt(2000) ≈ 44.7. For Mexican-hat the negative
363+
// brim cancels part of the positive core, dropping the per-dim
364+
// value but keeping it within the same order of magnitude.
365+
// We loose-bound on L2 / sqrt(subject_len) ∈ [0.5, 1.5].
366+
let scale = (subject_len as f32).sqrt();
367+
let norm_l2 = l2 / scale;
368+
assert!(
369+
(0.5..=1.5).contains(&norm_l2),
370+
"kernel {k:?}: normalized L2 {norm_l2} (raw {l2}) out of [0.5, 1.5]"
371+
);
372+
}
164373
}
165374
}

0 commit comments

Comments
 (0)