@@ -788,3 +788,239 @@ mod tests {
788788 eprintln ! ( "══════════════════════════════════════════════════════════\n " ) ;
789789 }
790790}
791+
// ═══════════════════════════════════════════════════════════════════════════
// LENS ICC PROFILE — characterize encoding distortion vs. ground truth
// ═══════════════════════════════════════════════════════════════════════════
795+
/// Identifies the encoding pipeline that a distance table came from.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EncodingPath {
    /// Ground truth: f32 cosine from burn+GGUF (expensive to compute).
    RawF32,
    /// HDR CDF quantized to unsigned u8: the sign is lost, the distribution
    /// shape is gained.
    HdrCdfU8,
    /// Linear quantization to signed i8: the sign is preserved.
    SignedI8,
    /// Gamma+phi redistribution into u8 (nonlinear, role-aware).
    GammaPhiU8,
    /// Gamma+phi redistribution into signed i8 (combines both advantages).
    GammaPhiI8,
}

811+ /// Lens ICC Profile: characterizes the distortion of one encoding path
812+ /// relative to ground truth (burn+GGUF f32 cosine).
813+ ///
814+ /// Like a camera lens profile in Lightroom: measures the transfer function
815+ /// between "what the weights actually say" and "what the table encodes."
816+ /// The γ offset partially corrects it. The ICC captures the residual.
817+ ///
818+ /// Size: ~2KB per lens per role. Total for 6 models × 6 roles = ~72KB.
819+ #[ derive( Debug , Clone ) ]
820+ pub struct LensProfile {
821+ /// Which model this profile describes.
822+ pub model_name : String ,
823+ /// Which role (Q, K, V, Gate, Up, Down).
824+ pub role : String ,
825+ /// Which encoding path.
826+ pub encoding : EncodingPath ,
827+ /// Transfer function: 256 sample points from cos=-1.0 to cos=+1.0.
828+ /// Maps ground_truth_cos → encoded_value.
829+ pub transfer_curve : Vec < f32 > ,
830+ /// Inverse: encoded_value → estimated_cos.
831+ pub inverse_curve : Vec < f32 > ,
832+ /// Per-centroid bias: systematic over/under-estimation per row.
833+ pub centroid_bias : Vec < f32 > ,
834+ /// Noise floor: below this absolute cosine, the encoding can't distinguish.
835+ pub noise_floor : f32 ,
836+ /// Effective dynamic range in bits (higher = more discrimination).
837+ pub effective_bits : f32 ,
838+ /// Signed ratio: fraction of negative entries in the raw cosine matrix.
839+ /// ~0.5 = symmetric (reranker), ~0.1 = positive-skewed (Jina v3).
840+ pub signed_ratio : f32 ,
841+ }
842+
843+ impl LensProfile {
844+ /// Build a profile by comparing encoded table against ground truth cosines.
845+ ///
846+ /// `ground_truth`: f32 cosine matrix (n×n, from burn+GGUF or rten)
847+ /// `encoded`: u8 or i8 distance table (n×n, from our encoding pipeline)
848+ /// `n`: number of centroids
849+ pub fn build (
850+ model_name : & str ,
851+ role : & str ,
852+ encoding : EncodingPath ,
853+ ground_truth : & [ f32 ] ,
854+ encoded : & [ u8 ] ,
855+ n : usize ,
856+ ) -> Self {
857+ // Build transfer curve: sample 256 points from cos range
858+ let mut transfer_curve = vec ! [ 0.0f32 ; 256 ] ;
859+ let mut inverse_curve = vec ! [ 0.0f32 ; 256 ] ;
860+ let mut centroid_bias = vec ! [ 0.0f32 ; n] ;
861+
862+ // Collect (cos, encoded) pairs
863+ let mut pairs: Vec < ( f32 , u8 ) > = Vec :: new ( ) ;
864+ let mut negative_count = 0usize ;
865+ let mut total_count = 0usize ;
866+
867+ for i in 0 ..n {
868+ let mut row_error = 0.0f32 ;
869+ let mut row_count = 0 ;
870+ for j in 0 ..n {
871+ if i == j { continue ; }
872+ let cos = ground_truth[ i * n + j] ;
873+ let enc = encoded[ i * n + j] ;
874+ pairs. push ( ( cos, enc) ) ;
875+ if cos < 0.0 { negative_count += 1 ; }
876+ total_count += 1 ;
877+ // Bias: expected encoded vs actual
878+ let expected = ( ( cos + 1.0 ) / 2.0 * 255.0 ) as u8 ; // linear mapping
879+ row_error += ( enc as f32 - expected as f32 ) . abs ( ) ;
880+ row_count += 1 ;
881+ }
882+ if row_count > 0 {
883+ centroid_bias[ i] = row_error / row_count as f32 ;
884+ }
885+ }
886+
887+ // Sort pairs by cosine value
888+ pairs. sort_by ( |a, b| a. 0 . partial_cmp ( & b. 0 ) . unwrap ( ) ) ;
889+
890+ // Sample transfer curve at 256 equidistant cosine points
891+ let n_pairs = pairs. len ( ) ;
892+ for k in 0 ..256 {
893+ let target_cos = -1.0 + k as f32 * 2.0 / 255.0 ;
894+ // Find nearest pair
895+ let idx = pairs. partition_point ( |p| p. 0 < target_cos) . min ( n_pairs - 1 ) ;
896+ transfer_curve[ k] = pairs[ idx] . 1 as f32 ;
897+ inverse_curve[ pairs[ idx] . 1 as usize ] = target_cos;
898+ }
899+
900+ // Noise floor: smallest cosine difference that produces different encoded values
901+ let mut noise_floor = 2.0f32 ;
902+ for w in pairs. windows ( 2 ) {
903+ if w[ 0 ] . 1 != w[ 1 ] . 1 {
904+ let delta = ( w[ 1 ] . 0 - w[ 0 ] . 0 ) . abs ( ) ;
905+ if delta < noise_floor { noise_floor = delta; }
906+ }
907+ }
908+
909+ // Effective bits: log2 of distinct encoded values
910+ let mut seen = [ false ; 256 ] ;
911+ for & ( _, e) in & pairs { seen[ e as usize ] = true ; }
912+ let distinct = seen. iter ( ) . filter ( |& & v| v) . count ( ) ;
913+ let effective_bits = ( distinct as f32 ) . log2 ( ) ;
914+
915+ let signed_ratio = if total_count > 0 {
916+ negative_count as f32 / total_count as f32
917+ } else { 0.0 } ;
918+
919+ Self {
920+ model_name : model_name. to_string ( ) ,
921+ role : role. to_string ( ) ,
922+ encoding,
923+ transfer_curve,
924+ inverse_curve,
925+ centroid_bias,
926+ noise_floor,
927+ effective_bits,
928+ signed_ratio,
929+ }
930+ }
931+ }
932+
933+ /// Standardized lens configuration for the 6-lane pipeline.
934+ #[ derive( Debug , Clone ) ]
935+ pub struct LensConfig {
936+ /// Model name (e.g., "jina-v3", "reranker-v3", "qwopus-27b").
937+ pub name : & ' static str ,
938+ /// Model family.
939+ pub family : LensFamily ,
940+ /// Vocabulary size.
941+ pub vocab_size : usize ,
942+ /// Number of centroids in the baked table.
943+ pub n_centroids : usize ,
944+ /// Tokenizer family (determines which tokenizer.json to load).
945+ pub tokenizer : TokenizerFamily ,
946+ /// Raw cosine range observed in the weight matrix.
947+ pub cos_range : ( f32 , f32 ) ,
948+ /// Gamma offset for HDR re-encoding (higher = more resolution near zero).
949+ pub gamma_offset : f32 ,
950+ /// Whether this lens uses signed i8 tables.
951+ pub is_signed : bool ,
952+ /// Whether this is a truth anchor for cross-model evaluation.
953+ pub is_truth_anchor : bool ,
954+ }
955+
/// Broad category a lens model falls into.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LensFamily {
    /// Embedding model: symmetric similarity.
    Embedding,
    /// Reranker: asymmetric relevance scoring.
    Reranker,
    /// Reader model: turns HTML into text.
    Reader,
    /// Language model: token generation.
    LanguageModel,
    /// Mixture-of-Experts language model.
    MoE,
}

/// Family of tokenizer a lens model is paired with.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenizerFamily {
    /// XLM-RoBERTa-style tokenizer.
    XlmRoberta,
    /// Qwen2-style tokenizer.
    Qwen2,
    /// Llama-style tokenizer.
    Llama,
    /// SentencePiece-style tokenizer.
    SentencePiece,
}

980+ /// The 6-lane lens registry.
981+ pub static LENS_REGISTRY : & [ LensConfig ] = & [
982+ LensConfig {
983+ name : "jina-v3" , family : LensFamily :: Embedding ,
984+ vocab_size : 250_002 , n_centroids : 256 ,
985+ tokenizer : TokenizerFamily :: XlmRoberta ,
986+ cos_range : ( -0.067 , 0.234 ) , gamma_offset : 0.37 ,
987+ is_signed : false , is_truth_anchor : true ,
988+ } ,
989+ LensConfig {
990+ name : "bge-m3" , family : LensFamily :: Embedding ,
991+ vocab_size : 250_002 , n_centroids : 256 ,
992+ tokenizer : TokenizerFamily :: XlmRoberta ,
993+ cos_range : ( -0.07 , 0.23 ) , gamma_offset : 0.40 ,
994+ is_signed : false , is_truth_anchor : false ,
995+ } ,
996+ LensConfig {
997+ name : "reranker-v3" , family : LensFamily :: Reranker ,
998+ vocab_size : 151_936 , n_centroids : 256 ,
999+ tokenizer : TokenizerFamily :: Qwen2 ,
1000+ cos_range : ( -0.886 , 0.826 ) , gamma_offset : 1.50 ,
1001+ is_signed : false , // best candidate FOR signed
1002+ is_truth_anchor : false ,
1003+ } ,
1004+ LensConfig {
1005+ name : "reader-lm-1.5b" , family : LensFamily :: Reader ,
1006+ vocab_size : 151_936 , n_centroids : 256 ,
1007+ tokenizer : TokenizerFamily :: Qwen2 ,
1008+ cos_range : ( -0.095 , 0.336 ) , gamma_offset : 0.12 ,
1009+ is_signed : false , is_truth_anchor : false ,
1010+ } ,
1011+ LensConfig {
1012+ name : "qwopus-27b" , family : LensFamily :: LanguageModel ,
1013+ vocab_size : 248_320 , n_centroids : 4096 ,
1014+ tokenizer : TokenizerFamily :: Qwen2 ,
1015+ cos_range : ( -0.23 , 0.18 ) , gamma_offset : 1.50 ,
1016+ is_signed : false , is_truth_anchor : false ,
1017+ } ,
1018+ LensConfig {
1019+ name : "maverick-128e" , family : LensFamily :: MoE ,
1020+ vocab_size : 202_048 , n_centroids : 256 , // TBD: scale to 4096
1021+ tokenizer : TokenizerFamily :: Llama ,
1022+ cos_range : ( 0.0 , 0.0 ) , // TBD: stream and measure
1023+ gamma_offset : 0.0 , // TBD: calibrate
1024+ is_signed : false , is_truth_anchor : false ,
1025+ } ,
1026+ ] ;
0 commit comments