@@ -333,6 +333,180 @@ fn nars_revision(a: NarsTruth, b: NarsTruth) -> NarsTruth {
333333 NarsTruth :: new ( f. clamp ( 0.0 , 1.0 ) , c. clamp ( 0.0 , 0.99 ) )
334334}
335335
336+ // ============================================================================
337+ // MoE gate topology — expert clustering from router weights
338+ // ============================================================================
339+
340+ /// One expert's structural identity from the gate projection.
341+ #[ derive( Clone , Debug ) ]
342+ pub struct ExpertFingerprint {
343+ pub block : u32 ,
344+ pub expert_idx : usize ,
345+ pub base17 : Base17 ,
346+ }
347+
/// Summary of pairwise expert similarity inside one block.
#[derive(Debug, Clone)]
pub struct ExpertCluster {
    /// Block these statistics describe.
    pub block: u32,
    /// Number of expert fingerprints seen for this block.
    pub n_experts: usize,
    /// Mean L1 distance over all expert pairs (lower = more redundant);
    /// `0.0` when the block has fewer than two experts.
    pub mean_pairwise_l1: f64,
    /// Count of expert pairs whose L1 distance is strictly below the
    /// redundancy threshold (structurally interchangeable).
    pub redundant_pairs: usize,
    /// Total number of pairs compared, i.e. `n * (n - 1) / 2`.
    pub total_pairs: usize,
    /// Connected components of the "L1 < threshold" relation that contain
    /// more than one expert. Entries are positions within the block's
    /// fingerprints in input order (not necessarily `expert_idx`).
    pub groups: Vec<Vec<usize>>,
}
362+
363+ /// Extract MoE gate topology from a bgz7 file.
364+ ///
365+ /// Finds all `ffn_gate_inp` tensors (the router gate projections).
366+ /// Each row in the gate tensor = one expert's activation fingerprint.
367+ /// Returns per-block expert fingerprints.
368+ pub fn extract_gate_topology ( bgz7_path : & str ) -> Result < Vec < ExpertFingerprint > , String > {
369+ let tensors = read_bgz7_file ( bgz7_path) ?;
370+ let mut fingerprints = Vec :: new ( ) ;
371+
372+ for t in & tensors {
373+ // Match router gate tensors
374+ if !t. name . contains ( "gate_inp" ) && !t. name . contains ( "gate.weight" ) {
375+ continue ;
376+ }
377+ // Skip expert FFN gates (gate_exps) — we want the ROUTER gate
378+ if t. name . contains ( "_exps" ) {
379+ continue ;
380+ }
381+
382+ let block = extract_block ( & t. name ) . unwrap_or ( 0 ) ;
383+
384+ for ( expert_idx, row) in t. rows . iter ( ) . enumerate ( ) {
385+ fingerprints. push ( ExpertFingerprint {
386+ block,
387+ expert_idx,
388+ base17 : row. clone ( ) ,
389+ } ) ;
390+ }
391+
392+ eprintln ! ( " Gate: {} → {} experts in block {}" ,
393+ t. name, t. rows. len( ) , block) ;
394+ }
395+
396+ Ok ( fingerprints)
397+ }
398+
399+ /// Cluster experts within each block by Base17 L1 distance.
400+ ///
401+ /// `redundancy_threshold`: L1 below which two experts are "structurally interchangeable".
402+ /// Suggested: 500 (conservative), 1000 (aggressive).
403+ pub fn cluster_experts (
404+ fingerprints : & [ ExpertFingerprint ] ,
405+ redundancy_threshold : u32 ,
406+ ) -> Vec < ExpertCluster > {
407+ // Group by block
408+ let mut by_block: HashMap < u32 , Vec < & ExpertFingerprint > > = HashMap :: new ( ) ;
409+ for fp in fingerprints {
410+ by_block. entry ( fp. block ) . or_default ( ) . push ( fp) ;
411+ }
412+
413+ let mut clusters = Vec :: new ( ) ;
414+
415+ for ( block, experts) in & by_block {
416+ let n = experts. len ( ) ;
417+ let mut total_l1 = 0u64 ;
418+ let mut redundant = 0usize ;
419+ let total_pairs = n * ( n - 1 ) / 2 ;
420+
421+ // Pairwise L1
422+ let mut adjacency: Vec < Vec < bool > > = vec ! [ vec![ false ; n] ; n] ;
423+ for i in 0 ..n {
424+ for j in ( i + 1 ) ..n {
425+ let l1 = experts[ i] . base17 . l1 ( & experts[ j] . base17 ) ;
426+ total_l1 += l1 as u64 ;
427+ if l1 < redundancy_threshold {
428+ redundant += 1 ;
429+ adjacency[ i] [ j] = true ;
430+ adjacency[ j] [ i] = true ;
431+ }
432+ }
433+ }
434+
435+ let mean_l1 = if total_pairs > 0 { total_l1 as f64 / total_pairs as f64 } else { 0.0 } ;
436+
437+ // Simple connected-component grouping
438+ let mut visited = vec ! [ false ; n] ;
439+ let mut groups = Vec :: new ( ) ;
440+ for start in 0 ..n {
441+ if visited[ start] { continue ; }
442+ let mut group = vec ! [ start] ;
443+ visited[ start] = true ;
444+ let mut stack = vec ! [ start] ;
445+ while let Some ( node) = stack. pop ( ) {
446+ for neighbor in 0 ..n {
447+ if !visited[ neighbor] && adjacency[ node] [ neighbor] {
448+ visited[ neighbor] = true ;
449+ group. push ( neighbor) ;
450+ stack. push ( neighbor) ;
451+ }
452+ }
453+ }
454+ if group. len ( ) > 1 {
455+ groups. push ( group) ;
456+ }
457+ }
458+
459+ eprintln ! ( " Block {:>2}: {} experts, mean_L1={:.0}, redundant_pairs={}/{} ({:.0}%), groups={}" ,
460+ block, n, mean_l1, redundant, total_pairs,
461+ if total_pairs > 0 { redundant as f64 / total_pairs as f64 * 100.0 } else { 0.0 } ,
462+ groups. len( ) ) ;
463+
464+ clusters. push ( ExpertCluster {
465+ block : * block,
466+ n_experts : n,
467+ mean_pairwise_l1 : mean_l1,
468+ redundant_pairs : redundant,
469+ total_pairs,
470+ groups,
471+ } ) ;
472+ }
473+
474+ clusters. sort_by_key ( |c| c. block ) ;
475+ clusters
476+ }
477+
478+ /// Cross-reference gate topology with attention scaffold.
479+ ///
480+ /// For each scaffold block (where Q+O shifted), check if the gate
481+ /// in that block has high expert redundancy. High redundancy + scaffold
482+ /// = the reasoning change works THROUGH the router, not the experts.
483+ pub fn cross_reference_gate_scaffold (
484+ clusters : & [ ExpertCluster ] ,
485+ scaffold_blocks : & [ u32 ] ,
486+ ) -> Vec < ( u32 , bool , f64 ) > {
487+ let mut results = Vec :: new ( ) ;
488+
489+ for block in scaffold_blocks {
490+ if let Some ( cluster) = clusters. iter ( ) . find ( |c| c. block == * block) {
491+ let redundancy_pct = if cluster. total_pairs > 0 {
492+ cluster. redundant_pairs as f64 / cluster. total_pairs as f64
493+ } else { 0.0 } ;
494+
495+ let is_routing_dominated = redundancy_pct > 0.5 ;
496+ results. push ( ( * block, is_routing_dominated, redundancy_pct) ) ;
497+
498+ eprintln ! ( " Block {:>2}: scaffold={} routing_dominated={} redundancy={:.0}%" ,
499+ block, true , is_routing_dominated, redundancy_pct * 100.0 ) ;
500+ } else {
501+ // No gate in this block (dense layer, not MoE)
502+ results. push ( ( * block, false , 0.0 ) ) ;
503+ eprintln ! ( " Block {:>2}: scaffold={} (dense, no MoE gate)" , block, true ) ;
504+ }
505+ }
506+
507+ results
508+ }
509+
336510// ============================================================================
337511// Tests
338512// ============================================================================
@@ -516,4 +690,107 @@ mod tests {
516690 if truth. frequency > 0.5 { "shifted" } else { "stable" } ) ;
517691 }
518692 }
693+
#[test]
#[ignore] // Requires: Maverick bgz7 outputs from shard indexing
fn test_maverick_gate_topology() {
    // Gather router-gate fingerprints from every Maverick shard that exists
    // on disk, preferring the /tmp copy over the in-repo weights directory.
    let mut all_fingerprints = Vec::new();

    for shard in 1..=18u32 {
        let candidates = [
            format!("/tmp/llama4_maverick_shard{:02}.bgz7", shard),
            format!("src/hpc/openchat/weights/llama4_maverick_shard{:02}.bgz7", shard),
        ];
        let located = candidates
            .iter()
            .find(|candidate| std::fs::metadata(candidate.as_str()).is_ok());

        match located {
            None => eprintln!("SKIP shard {} (not found)", shard),
            Some(found) => match extract_gate_topology(found) {
                Ok(fps) => all_fingerprints.extend(fps),
                Err(e) => eprintln!("WARN shard {}: {}", shard, e),
            },
        }
    }

    eprintln!();
    eprintln!("Total expert fingerprints: {}", all_fingerprints.len());

    if all_fingerprints.is_empty() {
        eprintln!("No gate tensors found — Maverick may not have been indexed yet");
        return;
    }

    // Cluster experts with the conservative threshold.
    let clusters = cluster_experts(&all_fingerprints, 500);

    eprintln!();
    eprintln!("━━━ Maverick Gate Topology ━━━");
    let (total_redundant, total_pairs) = clusters
        .iter()
        .fold((0usize, 0usize), |(red, all), c| (red + c.redundant_pairs, all + c.total_pairs));
    let overall_pct = if total_pairs > 0 {
        total_redundant as f64 / total_pairs as f64 * 100.0
    } else {
        0.0
    };
    eprintln!(" Overall redundancy: {}/{} pairs ({:.0}%)",
        total_redundant, total_pairs, overall_pct);

    // NARS truth for expert redundancy: frequency is the redundant fraction,
    // confidence grows with the number of compared pairs and is capped at 0.99.
    let f = if total_pairs > 0 { total_redundant as f32 / total_pairs as f32 } else { 0.0 };
    let c = (1.0 - 1.0 / (1.0 + total_pairs as f32)).min(0.99);
    eprintln!(" NARS truth: f={:.3} c={:.3}", f, c);
    eprintln!(" Interpretation: {:.0}% of expert pairs are structurally interchangeable", f * 100.0);
}
746+
#[test]
#[ignore] // Requires: both Maverick bgz7 + Qwen3.5 diff results
fn test_cross_reference_gate_scaffold() {
    // This test connects the two analyses:
    // 1. Attention scaffold from Qwen3.5 diff (which blocks have Q+O shift)
    // 2. Gate topology from Maverick (which blocks have redundant experts)

    // Step 1: Run the Qwen3.5 diff to find scaffold blocks.
    let base = "/tmp/qwen35_27b_base.bgz7";
    let dist = "/tmp/qwen35_27b_distilled_v1.bgz7";

    if std::fs::metadata(base).is_err() || std::fs::metadata(dist).is_err() {
        eprintln!("SKIP: Qwen3.5 bgz7 files not found");
        return;
    }

    let (edges, _stats) = causal_diff(base, dist, 100).expect("diff failed");
    let scaffold_blocks = find_reasoning_scaffold(&edges, 0.3);

    // Step 2: Extract Maverick gate topology. Shards are looked up in /tmp
    // first and then in the in-repo weights directory, matching
    // test_maverick_gate_topology; absent shards are skipped quietly, but a
    // shard that exists and fails to load is reported instead of ignored.
    let mut all_fps = Vec::new();
    for shard in 1..=18u32 {
        let candidates = [
            format!("/tmp/llama4_maverick_shard{:02}.bgz7", shard),
            format!("src/hpc/openchat/weights/llama4_maverick_shard{:02}.bgz7", shard),
        ];
        let located = candidates
            .iter()
            .find(|candidate| std::fs::metadata(candidate.as_str()).is_ok());

        if let Some(found) = located {
            match extract_gate_topology(found) {
                Ok(fps) => all_fps.extend(fps),
                Err(e) => eprintln!("WARN shard {}: {}", shard, e),
            }
        }
    }

    if all_fps.is_empty() {
        eprintln!("SKIP: No Maverick gate fingerprints");
        return;
    }

    let clusters = cluster_experts(&all_fps, 500);

    // Step 3: Cross-reference.
    eprintln!();
    eprintln!("━━━ Cross-Reference: Attention Scaffold × Gate Topology ━━━");
    let results = cross_reference_gate_scaffold(&clusters, &scaffold_blocks);

    let routing_dominated: usize = results.iter().filter(|(_, rd, _)| *rd).count();
    eprintln!();
    eprintln!(" Scaffold blocks: {}", scaffold_blocks.len());
    eprintln!(" Routing-dominated: {}/{} ({:.0}%)",
        routing_dominated, results.len(),
        if !results.is_empty() { routing_dominated as f64 / results.len() as f64 * 100.0 } else { 0.0 });
    // Strict majority of scaffold blocks must be routing-dominated for "YES".
    eprintln!(" → {} = reasoning changes work THROUGH the router",
        if routing_dominated > results.len() / 2 { "YES" } else { "PARTIAL" });
}
519796}
0 commit comments