Skip to content

Commit dc22033

Browse files
committed
feat(shader-driver): wire /v1/shader/encode endpoint with DeepNSM
Adds a text → fingerprint → BindSpace encode pipeline to the lab server. Uses DeepNSM (zero-dep, <10μs/sentence) as the encode path: text → COCA tokenize → 512-bit VSA encode → 16K-bit content row. New endpoint: POST /v1/shader/encode {"text": "..."}. Returns: token_count, fingerprint_hex, bits_set, row_written. https://claude.ai/code/session_01SbYsmmbPf9YQuYbHZN52Zh
1 parent 56ad7da commit dc22033

4 files changed

Lines changed: 164 additions & 5 deletions

File tree

crates/cognitive-shader-driver/Cargo.lock

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/cognitive-shader-driver/Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@ tonic = { version = "0.12", optional = true }
6060
base64 = { version = "0.22", optional = true }
6161
# D0.1 — bytemuck for the 64-byte-aligned decode target consumed by F32x16::from_slice.
6262
bytemuck = { version = "1", optional = true, features = ["derive"] }
63+
# Encode endpoint — DeepNSM (zero-dep, 4096-word COCA vocabulary, 512-bit VSA).
64+
deepnsm = { path = "../deepnsm", optional = true }
6365

6466
[build-dependencies]
6567
tonic-build = { version = "0.12", optional = true }
@@ -72,7 +74,7 @@ with-planner = ["dep:lance-graph-planner"]
7274
# + token_agreement use these regardless of whether the transport is REST
7375
# (serve) or gRPC (grpc). Both features pull this set.
7476
_lab-dtos = ["dep:serde", "dep:serde_json", "dep:base64", "dep:bytemuck"]
75-
serve = ["_lab-dtos", "dep:axum", "dep:tokio"]
77+
serve = ["_lab-dtos", "dep:axum", "dep:tokio", "dep:deepnsm"]
7678
grpc = ["_lab-dtos", "dep:prost", "dep:tonic", "dep:tonic-build", "dep:tokio"]
7779

7880
# `lab` — umbrella switch for the single shader-lab binary. Enables every

crates/cognitive-shader-driver/src/serve.rs

Lines changed: 130 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,10 @@ use crate::driver::ShaderDriver;
4747
use crate::engine_bridge::{self, unified_style, UNIFIED_STYLES};
4848
use crate::token_agreement::{ReferenceModel, TokenAgreementHarness};
4949
use crate::wire::{
50-
WireCalibrateRequest, WireCalibrateResponse, WireCrystal, WireDispatch, WireHealth,
51-
WireIngest, WirePlanRequest, WirePlanResponse, WireProbeRequest, WireProbeResponse,
52-
WireQualia, WireRunbookRequest, WireRunbookResponse, WireRunbookStep,
53-
WireRunbookStepResult, WireStepResult, WireStyleInfo, WireSweepRequest,
50+
WireCalibrateRequest, WireCalibrateResponse, WireCrystal, WireDispatch, WireEncode,
51+
WireEncodeResponse, WireHealth, WireIngest, WirePlanRequest, WirePlanResponse,
52+
WireProbeRequest, WireProbeResponse, WireQualia, WireRunbookRequest, WireRunbookResponse,
53+
WireRunbookStep, WireRunbookStepResult, WireStepResult, WireStyleInfo, WireSweepRequest,
5454
WireSweepResponse, WireSweepResult, WireTensorsRequest, WireTensorsResponse,
5555
WireTokenAgreement, WireTokenAgreementResult, WireUnifiedStep,
5656
};
@@ -116,6 +116,8 @@ pub fn router(driver: ShaderDriver) -> Router {
116116
// Generic OrchestrationBridge gateway — route any UnifiedStep by step_type.
117117
// Composed bridges cover lg.* (planner) + nd.* (codec research).
118118
.route("/v1/shader/route", post(route_handler))
119+
// JIT lens encode pipeline — text → DeepNSM → 512-bit VSA → 16Kbit BindSpace row.
120+
.route("/v1/shader/encode", post(encode_handler))
119121
.with_state(state)
120122
}
121123

@@ -447,6 +449,130 @@ fn run_plan(
447449
))
448450
}
449451

452+
// ─── Encode handler ─────────────────────────────────────────────────────────
453+
454+
/// `POST /v1/shader/encode` — text → DeepNSM → 512-bit VSA → 16Kbit BindSpace row.
455+
///
456+
/// Pipeline:
457+
/// 1. Split text into words on whitespace and ASCII punctuation (apostrophes are kept inside words).
458+
/// 2. Hash each word to a 12-bit vocabulary rank via 32-bit FNV-1a, masked to the low 12 bits
459+
/// (deterministic; no data files required — DeepNSM's `VsaVec::from_rank`
460+
/// accepts any u16 rank and produces a stable pseudo-random 512-bit vector).
461+
/// 3. XOR-bind each word vector with a position vector so word order matters:
462+
/// `word_fp = VsaVec::from_rank(hash(word)) XOR VsaVec::random(pos * PHI)`.
463+
/// 4. Majority-bundle all word-position vectors → 512-bit sentence fingerprint.
464+
/// 5. Expand 8 × u64 (512-bit) → 256 × u64 (16 Kbit) by tiling: each source
465+
/// u64 occupies a 32-word run in the content plane.
466+
/// 6. Write the content row into BindSpace at write_cursor and advance the cursor (skipped, with `row_written` = `None`, once the cursor has reached `bindspace.len`).
467+
/// 7. Return hex fingerprint + token_count + bits_set + row_written.
468+
///
469+
/// Why hash-based ranks instead of Vocabulary::load?
470+
/// The vocabulary requires CSV data files on disk; the encode endpoint is
471+
/// intended to be stateless and zero-I/O. `VsaVec::from_rank` is pure and
472+
/// deterministic — hashing word strings to u16 rank seeds gives the same
473+
/// VSA vectors on every call without loading any external table. When the
474+
/// data files are available, upgrade to Vocabulary::load + parser::parse for
475+
/// full SPO triple extraction.
476+
async fn encode_handler(
477+
State(state): State<AppState>,
478+
Json(req): Json<WireEncode>,
479+
) -> Result<Json<WireEncodeResponse>, (StatusCode, Json<Value>)> {
480+
use deepnsm::encoder::{bundle, VsaVec, VSA_WORDS};
481+
482+
// ── 1. Word tokenisation (zero-I/O, no CSV needed) ───────────────────
483+
let words: Vec<&str> = req
484+
.text
485+
.split(|c: char| c.is_whitespace() || (c.is_ascii_punctuation() && c != '\''))
486+
.filter(|s| !s.is_empty())
487+
.collect();
488+
let token_count = words.len();
489+
490+
// ── 2 + 3. Hash word → rank, XOR-bind with position vector ───────────
491+
//
492+
// Rank derivation: FNV-1a-style fold into 12 bits.
493+
// hash = words[i].bytes().fold(2166136261u32, |h, b| {
494+
// (h ^ b as u32).wrapping_mul(16777619)
495+
// }) & 0x0FFF
496+
//
497+
// Position braid: XOR with VsaVec::random(pos * PHI) so
498+
// "dog bites man" ≠ "man bites dog".
499+
const PHI: u64 = 0x9E3779B97F4A7C15; // golden-ratio multiplier
500+
501+
let word_vecs: Vec<VsaVec> = words
502+
.iter()
503+
.enumerate()
504+
.map(|(pos, word)| {
505+
// FNV-1a → 12-bit rank
506+
let hash = word
507+
.bytes()
508+
.fold(2166136261u32, |h, b| (h ^ b as u32).wrapping_mul(16777619));
509+
let rank = (hash & 0x0FFF) as u16;
510+
511+
// Position seed: pos × PHI (wrapping). PHI is odd, so distinct positions map to distinct seeds.
512+
let pos_seed = (pos as u64).wrapping_mul(PHI);
513+
let pos_vec = VsaVec::random(pos_seed);
514+
515+
// word_fp = from_rank(rank) XOR pos_vec
516+
VsaVec::from_rank(rank).bind(&pos_vec)
517+
})
518+
.collect();
519+
520+
// ── 4. Bundle → 512-bit sentence fingerprint ─────────────────────────
521+
let sentence_vec = if word_vecs.is_empty() {
522+
VsaVec::ZERO
523+
} else {
524+
bundle(&word_vecs)
525+
};
526+
527+
// ── 4b. Build fingerprint hex and popcount ────────────────────────────
528+
let vsa_words = sentence_vec.as_words(); // &[u64; VSA_WORDS] (VSA_WORDS = 8)
529+
let fingerprint_hex: String = vsa_words
530+
.iter()
531+
.map(|w| format!("{:016x}", w))
532+
.collect();
533+
let bits_set = sentence_vec.popcount() as usize;
534+
535+
// ── 5. Expand 8 × u64 → 256 × u64 (16 Kbit) ─────────────────────────
536+
//
537+
// Tiling strategy: content_fp[i] = vsa_words[i / TILE_FACTOR]
538+
// TILE_FACTOR = CONTENT_WORDS / VSA_WORDS = 256 / 8 = 32.
539+
// Every source u64 occupies 32 consecutive words in the content plane.
540+
// This preserves all 512 VSA bits at stable positions; the dispatch
541+
// sweep correlates against them via Hamming distance.
542+
const CONTENT_WORDS: usize = 256; // WORDS_PER_FP in bindspace.rs
543+
const TILE_FACTOR: usize = CONTENT_WORDS / VSA_WORDS; // = 32
544+
let mut content_fp = [0u64; CONTENT_WORDS];
545+
for (i, w) in content_fp.iter_mut().enumerate() {
546+
*w = vsa_words[i / TILE_FACTOR];
547+
}
548+
549+
// ── 6. Write to BindSpace, advance write_cursor ───────────────────────
550+
let row_written = {
551+
let mut st = state.lock().map_err(|_| {
552+
(StatusCode::INTERNAL_SERVER_ERROR, Json(json!({"error": "lock poisoned"})))
553+
})?;
554+
let cursor = st.write_cursor;
555+
if cursor >= st.driver.bindspace.len {
556+
None
557+
} else {
558+
let bs = Arc::get_mut(&mut st.driver.bindspace).ok_or_else(|| {
559+
(StatusCode::CONFLICT, Json(json!({"error": "bindspace has multiple references"})))
560+
})?;
561+
bs.fingerprints.set_content(cursor, &content_fp);
562+
st.write_cursor = cursor + 1;
563+
Some(cursor as u32)
564+
}
565+
};
566+
567+
Ok(Json(WireEncodeResponse {
568+
text: req.text,
569+
token_count,
570+
fingerprint_hex,
571+
bits_set,
572+
row_written,
573+
}))
574+
}
575+
450576
/// Runbook-step dispatcher for Plan. Maps the shared planner state +
451577
/// request into a runbook step result, yielding an error string on the
452578
/// with-planner=off build to flow through the runbook's error channel.

crates/cognitive-shader-driver/src/wire.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,29 @@ pub struct WireIngest {
9999
pub timestamp: u64,
100100
}
101101

102+
// ═══════════════════════════════════════════════════════════════════════════
103+
// Encode endpoint DTOs — text → fingerprint → BindSpace
104+
//
105+
// POST /v1/shader/encode: accepts raw text, splits it into words, hashes each
105+
// word to a DeepNSM rank, encodes to a 512-bit VSA fingerprint, expands to a 16Kbit
107+
// content row, ingests into BindSpace at the current write cursor, and
108+
// returns the hex fingerprint + row index.
109+
// ═══════════════════════════════════════════════════════════════════════════
110+
111+
#[derive(Debug, Clone, Serialize, Deserialize)]
112+
pub struct WireEncode {
113+
pub text: String,
114+
}
115+
116+
#[derive(Debug, Clone, Serialize, Deserialize)]
117+
pub struct WireEncodeResponse {
118+
pub text: String,
119+
pub token_count: usize,
120+
pub fingerprint_hex: String,
121+
pub bits_set: usize,
122+
pub row_written: Option<u32>,
123+
}
124+
102125
// ═══════════════════════════════════════════════════════════════════════════
103126
// Codec research DTOs (for remote-controlled codec benchmarking)
104127
//

0 commit comments

Comments
 (0)