@@ -13,8 +13,20 @@ use std::sync::LazyLock;
1313
1414/// Embedded weight files (compiled into the binary via include_bytes!).
1515/// Zero file I/O at runtime — the weights ARE the binary.
16- static JINA_BASE17 : & [ u8 ] = include_bytes ! ( "weights/jina_base17_20k.bin" ) ;
17- static JINA_PALETTE : & [ u8 ] = include_bytes ! ( "weights/jina_palette_20k.bin" ) ;
16+ ///
17+ /// Naming convention: {model}_{aspect}_{vocab_size}k.bin
18+ /// - aspect = base17 (token embeddings) or palette (256-entry lookup)
19+ /// - vocab_size = approximate token count in thousands
20+ static JINA_V4_BASE17 : & [ u8 ] = include_bytes ! ( "weights/jina_base17_20k.bin" ) ;
21+ static JINA_V4_PALETTE : & [ u8 ] = include_bytes ! ( "weights/jina_palette_20k.bin" ) ;
22+
23+ // TODO(jina-v5-bake): When the bake pipeline produces Jina v5 weights
24+ // (151K Qwen3 BPE tokens, 1024D hidden → 34-byte Base17), add:
25+ // static JINA_V5_BASE17: &[u8] = include_bytes!("weights/jina_v5_base17_151k.bin");
26+ // static JINA_V5_PALETTE: &[u8] = include_bytes!("weights/jina_v5_palette_151k.bin");
27+ // Then swap the `JINA` LazyLock load line below to use JinaV5. See
28+ // the `JINA` / `JINA_V4` statics near end of file for the wiring.
29+
1830static GPT2_BASE17 : & [ u8 ] = include_bytes ! ( "weights/gpt2_base17_50k.bin" ) ;
1931static GPT2_PALETTE : & [ u8 ] = include_bytes ! ( "weights/gpt2_palette_50k.bin" ) ;
2032static BERT_BASE17 : & [ u8 ] = include_bytes ! ( "weights/bert_base17_30k.bin" ) ;
@@ -23,9 +35,91 @@ static BERT_PALETTE: &[u8] = include_bytes!("weights/bert_palette_30k.bin");
2335/// Which model's weights to use.
2436#[ derive( Clone , Copy , Debug , PartialEq , Eq ) ]
2537pub enum ModelSource {
26- /// Jina v4 text-retrieval (20K tokens, 2048D original).
38+ /// Jina v4 text-retrieval (20K tokens, 2048D original, XLM-R base).
39+ /// LEGACY route. Kept for backward compatibility and direct-access callers
40+ /// that specifically need v4 behavior. Weights pre-baked at
41+ /// `weights/jina_base17_20k.bin` + `weights/jina_palette_20k.bin`.
2742 JinaV4 ,
28- /// GPT-2 small (50K tokens, 768D original). Same BPE as Jina.
43+ /// Jina v5 small (151K tokens, 1024D hidden, Qwen 3.5 base, SiLU activation).
44+ /// Also known as **Reader-LM v3** (same model, alternate name — BERT 3.x
45+ /// architecture lineage; NOT the older Qwen2-based Reader-LM 1.5B/v1/v2).
46+ ///
47+ /// **MAIN ROUTE** per AdaWorldAPI model registry (`lance-graph/CLAUDE.md`
48+ /// → Model Registry → Production models): Jina v5 is the canonical
49+ /// ground-truth anchor. Same Qwen 3.x BPE as Reranker v3, Qwopus.
50+ ///
51+ /// # Storage format on disk (verified by probe)
52+ ///
53+ /// The downloaded safetensors at
54+ /// `lance-graph/crates/thinking-engine/data/jina-v5-onnx/model.safetensors`
55+ /// is **BF16**, not F16. Every tensor in that 1.19 GB file is stored as
56+ /// BF16 per the safetensors JSON header, verified by
57+ /// `crates/thinking-engine/examples/probe_jina_v5_safetensors.rs`. The
58+ /// embedding matrix is `embed_tokens.weight` shape `[151936, 1024]`
59+ /// (311 MB BF16). Earlier canonical notes that said "Jina v5 is published
60+ /// in F16 only" were incorrect for this specific export; other Jina v5
61+ /// exports (ONNX, GGUF) may use different dtypes.
62+ ///
63+ /// The tokenizer lives at `data/jina-v5-tokenizer.json` (flat under the
64+ /// `data/` directory — NOT under `data/jina-v5-onnx/`). The tokenizer
65+ /// reports vocab size = 151669, while the safetensors embedding matrix
66+ /// has 151936 rows. Rows `[151669, 151936)` are ghost/unreachable
67+ /// (fine-tune-trimmed vocabulary kept aligned for hardware efficiency).
68+ /// Pair samplers MUST use `min(tokenizer_vocab, embed_rows) = 151669`.
69+ ///
70+ /// # Precision hierarchy (workspace-wide rule, Jina v5 specifics)
71+ ///
72+ /// 1. **Ground truth is the source file, losslessly upcast on demand.**
73+ /// For this file, BF16 source → F32 via the trivial shift
74+ /// [`crate::hpc::quantized::BF16`] scalar method. No F32 Vec is
75+ /// materialized. No F32 "buffer" persists. F32 is a *method*, not a
76+ /// storage format — it lives in registers or a small stack window
77+ /// during computation and is discarded with the consumer.
78+ ///
79+ /// 2. **Atomic-clock F16 → F32 method** at
80+ /// [`crate::hpc::gguf::f16_to_f32`] (`src/hpc/gguf.rs:417`) is proven
81+ /// lossless bit-exact over all 65,536 F16 patterns (including
82+ /// subnormals, ±0, ±∞, and NaN payloads with correct IEEE 754 quiet
83+ /// bit). Used by any F16 source (other Jina exports, GGUF files,
84+ /// reranker weights). Not on the Jina v5 safetensors path since that
85+ /// file is BF16.
86+ ///
87+ /// 3. **Compute precision is BF16 with fused `mul_add`** via
88+ /// [`crate::hpc::quantized::bf16_gemm_f32`] (`src/hpc/quantized.rs:108`).
89+ /// F32-precision accumulation is a property of the hardware FMA
90+ /// (`VDPBF16PS` on AVX-512-BF16, `BFMMLA` on ARM SVE, AMX on Apple),
91+ /// invisible to the caller. The `F32x16::mul_add` / `F32x8::mul_add`
92+ /// lane types in [`crate::simd`] compile to the appropriate
93+ /// instruction for the target CPU.
94+ ///
95+ /// 4. **F16 → BF16 has no exponent-range issue.** BF16 has MORE exponent
96+ /// bits than F16 (8 vs 5), so every F16 value fits inside BF16 range
97+ /// with ~33 orders of magnitude of headroom. The lossy step of
98+ /// F16 → BF16 is a 3-bit mantissa truncation (10 → 7 bits), not an
99+ /// exponent-range violation. Earlier notes that said "F16 max ~65504
100+ /// overflows before reaching BF16 range" were backwards.
101+ ///
102+ /// 5. **F64 constants** (π, e, φ, Euler-γ from `std::f64::consts`) are
103+ /// used for calibration math (GammaProfile log/exp), preserved at full
104+ /// 52-bit mantissa precision, and converted to BF16 exactly once per
105+ /// profile as a splatted value. The calibration result is 28 bytes.
106+ ///
107+ /// 6. **Storage after calibration**: Base17 i16 fixed-point (34-byte
108+ /// plane) or palette u8 index. Certification against the BF16 source
109+ /// goes through a streaming harness that reads the source once per
110+ /// pass, upcasts in registers, and reports Pearson / Spearman /
111+ /// Cronbach α to 4 decimal places.
112+ ///
113+ /// # Weight baking status
114+ ///
115+ /// Compile-time embedded weights at `weights/jina_v5_*.bin` are not yet
116+ /// produced. Until they are, the `JINA` main-route LazyLock falls back
117+ /// to v4 bytes. When the certification harness proves lab BF16 at
118+ /// ≥ 0.9999 and bgz-hhtl-d at ≥ 0.9980 on the three metrics, the
119+ /// Jina v5 runtime artifacts can be produced from the certified
120+ /// derivation pipeline. See the `TODO(jina-v5-bake)` block below `JINA_V4_PALETTE`.
121+ JinaV5 ,
122+ /// GPT-2 small (50K tokens, 768D original). Same BPE as Jina v4.
29123 Gpt2 ,
30124 /// BERT base uncased (30K tokens, 768D original). WordPiece tokenizer.
31125 Bert ,
@@ -190,9 +284,33 @@ fn build_similarity_table(palette: &JinaPalette) -> [f32; 256] {
190284// Global LazyLock runtimes — loaded once, used forever
191285// ============================================================================
192286
193- /// Jina v4 runtime (20K tokens). LazyLock: zero cost after first access.
287+ /// Jina **main route**. LazyLock: zero cost after first access.
288+ ///
289+ /// Today this loads Jina v4 bytes (20K tokens) because v5 weights are not yet
290+ /// baked into `weights/`. When the v5 bake pipeline produces
291+ /// `weights/jina_v5_base17_151k.bin` + `weights/jina_v5_palette_151k.bin`,
292+ /// swap the load line below to:
293+ ///
294+ /// ```ignore
295+ /// ModelRuntime::load(ModelSource::JinaV5, JINA_V5_BASE17, JINA_V5_PALETTE)
296+ /// ```
297+ ///
298+ /// Callers should use `JINA` for default behavior. Only use `JINA_V4`
299+ /// explicitly when v4-specific behavior is required (e.g., backward-compat
300+ /// tests).
194301pub static JINA : LazyLock < ModelRuntime > = LazyLock :: new ( || {
195- ModelRuntime :: load ( ModelSource :: JinaV4 , JINA_BASE17 , JINA_PALETTE )
302+ // TODO(jina-v5-bake): swap to JinaV5 when v5 weights exist.
303+ ModelRuntime :: load ( ModelSource :: JinaV4 , JINA_V4_BASE17 , JINA_V4_PALETTE )
304+ } ) ;
305+
306+ /// Jina **v4 explicit route** (20K tokens, XLM-R base). LEGACY.
307+ ///
308+ /// Use this when a caller specifically needs v4 behavior and should NOT be
309+ /// silently upgraded to v5 when the main route is swapped. Today this is
310+ /// functionally identical to `JINA` (both load v4 bytes), but after the v5
311+ /// bake `JINA` will load v5 while `JINA_V4` keeps loading v4.
312+ pub static JINA_V4 : LazyLock < ModelRuntime > = LazyLock :: new ( || {
313+ ModelRuntime :: load ( ModelSource :: JinaV4 , JINA_V4_BASE17 , JINA_V4_PALETTE )
196314} ) ;
197315
198316/// GPT-2 runtime (50K tokens). Same BPE as Jina v4 → interoperable palettes.
@@ -211,12 +329,24 @@ mod tests {
211329
212330 #[ test]
213331 fn test_jina_runtime_loads ( ) {
332+ // Main route. Today this is v4; when v5 is baked, update this test to
333+ // assert source == JinaV5 and vocab_size == ~151000.
214334 let rt = & * JINA ;
215335 assert_eq ! ( rt. source, ModelSource :: JinaV4 ) ;
216336 assert_eq ! ( rt. vocab_size( ) , 20000 ) ;
217337 assert ! ( ( rt. similarity[ 0 ] - 1.0 ) . abs( ) < 0.01 , "self-similarity should be ~1.0" ) ;
218338 }
219339
340+ #[ test]
341+ fn test_jina_v4_explicit_route ( ) {
342+ // Legacy v4-specific accessor. After v5 bake, this test MUST still
343+ // pass (v4 is the backward-compat guarantee — never deleted).
344+ let rt = & * JINA_V4 ;
345+ assert_eq ! ( rt. source, ModelSource :: JinaV4 ) ;
346+ assert_eq ! ( rt. vocab_size( ) , 20000 ) ;
347+ assert ! ( ( rt. similarity[ 0 ] - 1.0 ) . abs( ) < 0.01 , "self-similarity should be ~1.0" ) ;
348+ }
349+
220350 #[ test]
221351 fn test_gpt2_runtime_loads ( ) {
222352 let rt = & * GPT2 ;
0 commit comments