|
| 1 | +# 24_BLAKE3_INT8_BUNDLE_ENCODING.md |
| 2 | + |
| 3 | +## Text → Binary → Bundle → Think. No Model. No Float. No GPU. Ever. |
| 4 | + |
| 5 | +**Target:** hexagon/crates/hexagon-core/, hexagon/crates/hexagon-cam/ |
| 6 | +**Replaces:** 5 separate text→fingerprint encoders with ONE path |
| 7 | + |
| 8 | +--- |
| 9 | + |
| 10 | +## The Pipeline |
| 11 | + |
| 12 | +``` |
| 13 | +Text "Alice loves Bob" |
| 14 | + ↓ |
| 15 | +Tokenize: ["Alice", "loves", "Bob"] |
| 16 | + ↓ |
| 17 | +Per token: BLAKE3(token) → LFSR expansion → 16384 deterministic bits |
| 18 | + ↓ |
| 19 | +Bundle into i8 accumulator: [i8; 16384] |
| 20 | + per bit k: input=1 → acc[k]+=1, input=0 → acc[k]-=1 |
| 21 | + ↓ |
| 22 | +Extract, with threshold f(N) scaled by bundle count N: |
| 23 | + sign: acc[k] > 0 → data bit |
| 24 | + magnitude: |acc[k]| > f(N) → alpha bit |
| 25 | + ↓ |
| 26 | +Output: (data: [u64; 256], alpha: [u64; 256]) = 4 KB per concept |
| 27 | +``` |
| 28 | + |
| 29 | +## The i8 Accumulator |
| 30 | + |
| 31 | +```rust |
| 32 | +/// Signed per-bit vote counter that bundles token hashes into one (data, alpha) pair. |
| 33 | +struct Accumulator { |
| 34 | +    weights: [i8; 16384], // 16 KB — fits L1 cache |
| 35 | +    count: u32,           // how many inputs bundled |
| 36 | +} |
| 37 | + |
| 38 | +impl Accumulator { |
| 39 | +    /// Casts one vote per bit: a set hash bit adds 1, a clear bit subtracts 1. |
| 40 | +    fn bundle_token(&mut self, token: &str) { |
| 41 | +        let hash = blake3::hash(token.as_bytes()); |
| 42 | +        let bits = expand_hash_to_bits(hash); // BLAKE3 → LFSR → 16384 bits |
| 43 | +        for (k, weight) in self.weights.iter_mut().enumerate() { |
| 44 | +            // Saturating: a weight clamps at the i8 bounds instead of wrapping. |
| 45 | +            *weight = if bits[k] { |
| 46 | +                weight.saturating_add(1) |
| 47 | +            } else { |
| 48 | +                weight.saturating_sub(1) |
| 49 | +            }; |
| 50 | +        } |
| 51 | +        // Saturate the counter too, so `+= 1` can never trip the overflow check. |
| 52 | +        self.count = self.count.saturating_add(1); |
| 53 | +    } |
| 54 | + |
| 55 | +    /// Packs each weight's sign into `data` and its confidence into `alpha`. |
| 56 | +    fn extract(&self) -> (Data, Alpha) { |
| 57 | +        let threshold = self.alpha_threshold(); |
| 58 | +        let mut data = [0u64; 256]; |
| 59 | +        let mut alpha = [0u64; 256]; |
| 60 | +        for (k, &weight) in self.weights.iter().enumerate() { |
| 61 | +            let (word, mask) = (k / 64, 1u64 << (k % 64)); |
| 62 | +            if weight > 0 { |
| 63 | +                data[word] |= mask; // sign → data |
| 64 | +            } |
| 65 | +            if weight.unsigned_abs() > threshold { |
| 66 | +                alpha[word] |= mask; // magnitude → alpha |
| 67 | +            } |
| 68 | +        } |
| 69 | +        (Data(data), Alpha(alpha)) |
| 70 | +    } |
| 71 | + |
| 72 | +    /// Magnitude a weight must exceed to be marked defined; integer-only. |
| 73 | +    /// N=1: 0 (everything defined), N=5: 2, N=10: 4, N=100: 30. |
| 74 | +    /// Capped at 126: the uncapped 30% rule reaches 127 at N=424, which a |
| 75 | +    /// weight saturated at +127 cannot exceed, so its alpha bit would be lost. |
| 76 | +    fn alpha_threshold(&self) -> u8 { |
| 77 | +        let n = u64::from(self.count); // widened so `n * 3` cannot overflow |
| 78 | +        let raw = match n { |
| 79 | +            0..=1 => 0, |
| 80 | +            2..=5 => n / 2, |
| 81 | +            6..=20 => n * 2 / 5, |
| 82 | +            _ => n * 3 / 10, |
| 83 | +        }; |
| 84 | +        raw.min(126) as u8 // lossless: value is at most 126 |
| 85 | +    } |
| 86 | +} |
| 87 | +``` |
| 88 | + |
| 89 | +## Why i8 |
| 90 | + |
| 91 | +``` |
| 92 | +i8 range: -128 to +127 |
| 93 | +Saturating arithmetic: never overflows, clamps at bounds |
| 94 | +16384 × 1 byte = 16 KB accumulator → fits L1 cache entirely |
| 95 | +Compare: f32 accumulator = 64 KB → spills to L2 (4x slower) |
| 96 | +Compare: i16 accumulator = 32 KB → borderline L1 |
| 97 | +
|
| 98 | +i8 handles up to ~127 bundles before saturation. |
| 99 | +That covers: sentences (5-20 words), paragraphs (50-100 words), |
| 100 | +concepts accumulated over many encounters. |
| 101 | +
|
| 102 | +For corpus-scale accumulation (1000+ encounters): |
| 103 | + Option A: use i16 (32 KB, still L1/L2 boundary) |
| 104 | + Option B: periodically extract → rebundle at half magnitude |
| 105 | + Option C: accept saturation as "maximum confidence" |
| 106 | + Recommend B for production, C for simplicity. |
| 107 | +``` |
| 108 | + |
| 109 | +## Semantic Quality Without Embeddings |
| 110 | + |
| 111 | +``` |
| 112 | +"The king rules the kingdom" |
| 113 | + → bundle(BLAKE3("the"), BLAKE3("king"), BLAKE3("rules"), |
| 114 | + BLAKE3("the"), BLAKE3("kingdom")) |
| 115 | +
|
| 116 | +"The queen rules the kingdom" |
| 117 | + → bundle(BLAKE3("the"), BLAKE3("queen"), BLAKE3("rules"), |
| 118 | + BLAKE3("the"), BLAKE3("kingdom")) |
| 119 | +
|
| 120 | +Shared: "the"(×2), "rules", "kingdom" = 4/5 terms identical |
| 121 | +Different: "king" vs "queen" = 1/5 terms |
| 122 | +
|
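| 123 | +Bundle Hamming distance: ~10-15% (very similar! unrelated bundles sit at ~50%). A data bit differs only when the four shared votes tie (1 in 4) and "king"/"queen" disagree on that bit (1 in 2): 1/8 = 12.5%. |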
| 124 | +The CONTEXT overlap makes the BUNDLES similar. |
| 125 | +
|
| 126 | +"You shall know a word by the company it keeps" — Firth 1957 |
| 127 | +Implemented as majority voting on BLAKE3 hashes. No training. |
| 128 | +``` |
| 129 | + |
| 130 | +## SPO Encoding With i8 Bundles |
| 131 | + |
| 132 | +``` |
| 133 | +Triple: "Alice loves Bob" in context "Alice has always loved Bob deeply" |
| 134 | +
|
| 135 | +S_acc: i8 accumulator |
| 136 | + bundle("Alice", "has", "always") // subject + context |
| 137 | + extract → (s_data, s_alpha) |
| 138 | +
|
| 139 | +P_acc: i8 accumulator |
| 140 | + bundle("loves", "loved", "always", "deeply") // predicate + context |
| 141 | + extract → (p_data, p_alpha) |
| 142 | +
|
| 143 | +O_acc: i8 accumulator |
| 144 | + bundle("Bob", "deeply") // object + context |
| 145 | + extract → (o_data, o_alpha) |
| 146 | +
|
| 147 | +Bind: |
| 148 | + triple_data = s_data XOR ROLE_S XOR p_data XOR ROLE_P XOR o_data XOR ROLE_O |
| 149 | + triple_alpha = s_alpha AND p_alpha AND o_alpha |
| 150 | +
|
| 151 | +Store: 4 KB (2 KB data + 2 KB alpha) |
| 152 | +``` |
| 153 | + |
| 154 | +## Partial Triple Encoding |
| 155 | + |
| 156 | +``` |
| 157 | +"Alice loves ???" (unknown object) |
| 158 | +
|
| 159 | +S_acc: bundle("Alice", context...) → (s_data, s_alpha) — fully defined |
| 160 | +P_acc: bundle("loves", context...) → (p_data, p_alpha) — fully defined |
| 161 | +O_acc: empty → (zeros, zeros) — alpha = all zero |
| 162 | +
|
| 163 | +SP_ projection: |
| 164 | + data = s_data XOR ROLE_S XOR p_data XOR ROLE_P |
| 165 | + alpha = s_alpha AND p_alpha = fully defined |
| 166 | +
|
| 167 | +Full SPO attempt: |
| 168 | + alpha = s_alpha AND p_alpha AND o_alpha = s_alpha AND p_alpha AND 0...0 = 0...0 |
| 169 | + → Belichtungsmesser reads zero → early exit → never stored as complete triple |
| 170 | +
|
| 171 | +The alpha channel STRUCTURALLY PREVENTS storing unknowns as knowledge. |
| 172 | +``` |
| 173 | + |
| 174 | +## What This Eliminates |
| 175 | + |
| 176 | +``` |
| 177 | +ELIMINATED: REPLACED BY: |
| 178 | + core/fingerprint.rs from_content() BLAKE3 → i8 bundle |
| 179 | + spo/nsm_substrate.rs encode() BLAKE3 → i8 bundle |
| 180 | + spo/codebook_training.rs encode() BLAKE3 → i8 bundle |
| 181 | + spo/deepnsm_integration.rs encode() BLAKE3 → i8 bundle (training optional) |
| 182 | + spo/crystal_lm.rs encode_clean() BLAKE3 → i8 bundle |
| 183 | + spo/jina_api.rs Not needed |
| 184 | + spo/jina_cache.rs Not needed |
| 185 | + Any Jina API dependency Not needed |
| 186 | + Any float embedding anywhere Not needed |
| 187 | + Any GPU for inference Not needed |
| 188 | +
|
| 189 | +KEPT AS OPTIONAL ENHANCEMENT: |
| 190 | + NSM prime decomposition as preprocessing: |
| 191 | + text → NSM primes → BLAKE3(prime) per prime → i8 bundle |
| 192 | + Better semantics. Same pipeline. Optional. |
| 193 | + |
| 194 | + Codebook for O(1) known-concept lookup: |
| 195 | + If concept already in codebook → return cached fingerprint |
| 196 | + If not → encode via pipeline above, add to codebook |
| 197 | + Codebook is ACCELERATION, not REQUIREMENT. |
| 198 | +``` |
| 199 | + |
| 200 | +## Implementation Location |
| 201 | + |
| 202 | +``` |
| 203 | +hexagon/crates/hexagon-core/src/ |
| 204 | + encode.rs (NEW, ~200 lines) |
| 205 | + Accumulator struct |
| 206 | + bundle_token() |
| 207 | + bundle_tokens() — convenience for word list |
| 208 | + extract() → (Data, Alpha) |
| 209 | + alpha_threshold() |
| 210 | + expand_hash_to_bits() — BLAKE3 output → 16384 bits via LFSR |
| 211 | +
|
| 212 | + Depends on: blake3 crate only. Zero other deps. |
| 213 | + Fits in: hexagon-core (this IS a core instruction: ENCODE) |
| 214 | +
|
| 215 | +hexagon/crates/hexagon-cam/src/ |
| 216 | + codebook.rs — OPTIONAL known-concept cache |
| 217 | + Uses encode.rs for new concepts |
| 218 | + Returns cached (data, alpha) for known concepts |
| 219 | +``` |
| 220 | + |
| 221 | +## The Encoding IS the Seventh Instruction |
| 222 | + |
| 223 | +``` |
| 224 | +XOR.α bind / unbind |
| 225 | +POPCOUNT.α distance / similarity |
| 226 | +MAJORITY.α bundle / superpose |
| 227 | +AND.α/NOT.α 2^3 factorization |
| 228 | +BLAKE3.α seal / verify |
| 229 | +THRESHOLD.α σ-band gating |
| 230 | +ENCODE.α text → (data, alpha) via i8 bundle ← THIS |
| 231 | +
|
| 232 | +Still 6 RISC instructions for COMPUTATION. |
| 233 | +ENCODE is INPUT — how data enters the substrate. |
| 234 | +Not part of the compute loop. Part of the ingestion path. |
| 235 | +``` |
| 236 | + |
| 237 | +--- |
| 238 | + |
| 239 | +*"BLAKE3 gives you bits. Bundling gives you meaning. Alpha gives you honesty. No model needed."* |
0 commit comments