Skip to content

Commit e900ad0

Browse files
authored
feat: MoE gate topology + expert clustering + scaffold cross-reference (#58)
extract_gate_topology() pulls the ffn_gate_inp Base17 rows from a bgz7 file, one row per expert; each row is that expert's structural identity.

cluster_experts() computes the pairwise L1 distance between experts within each block and groups structurally interchangeable experts by connected components. At threshold=500, Maverick's 123,000× compression predicts >90% redundancy.

cross_reference_gate_scaffold() links attention scaffold blocks (Q+O shifted in the Qwen3.5 diff) with the gate redundancy of each block. Routing-dominated blocks are those where reasoning changes work through the router, not through the expert weights.

Tests:
- test_maverick_gate_topology: loads all 18 Maverick bgz7 shards
- test_cross_reference_gate_scaffold: full pipeline connecting the Qwen3.5 attention diff with the Maverick gate structure
1 parent 85f8d48 commit e900ad0

1 file changed

Lines changed: 277 additions & 0 deletions

File tree

src/hpc/causal_diff.rs

Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,180 @@ fn nars_revision(a: NarsTruth, b: NarsTruth) -> NarsTruth {
333333
NarsTruth::new(f.clamp(0.0, 1.0), c.clamp(0.0, 0.99))
334334
}
335335

336+
// ============================================================================
337+
// MoE gate topology — expert clustering from router weights
338+
// ============================================================================
339+
340+
/// One expert's structural identity from the gate projection.
341+
#[derive(Clone, Debug)]
342+
pub struct ExpertFingerprint {
343+
pub block: u32,
344+
pub expert_idx: usize,
345+
pub base17: Base17,
346+
}
347+
348+
/// Summary of pairwise expert similarity inside one block.
#[derive(Clone, Debug)]
pub struct ExpertCluster {
    /// Block these statistics describe.
    pub block: u32,
    /// Number of expert fingerprints compared in this block.
    pub n_experts: usize,
    /// Average L1 distance over all expert pairs (lower = more redundant).
    pub mean_pairwise_l1: f64,
    /// How many expert pairs fell below the L1 threshold, i.e. are considered
    /// structurally interchangeable.
    pub redundant_pairs: usize,
    /// How many pairs were compared in total.
    pub total_pairs: usize,
    /// Connected groups of structurally similar experts (L1 < threshold).
    /// Members are positions within the block's fingerprint list; only groups
    /// with more than one member are recorded.
    pub groups: Vec<Vec<usize>>,
}
362+
363+
/// Extract MoE gate topology from a bgz7 file.
364+
///
365+
/// Finds all `ffn_gate_inp` tensors (the router gate projections).
366+
/// Each row in the gate tensor = one expert's activation fingerprint.
367+
/// Returns per-block expert fingerprints.
368+
pub fn extract_gate_topology(bgz7_path: &str) -> Result<Vec<ExpertFingerprint>, String> {
369+
let tensors = read_bgz7_file(bgz7_path)?;
370+
let mut fingerprints = Vec::new();
371+
372+
for t in &tensors {
373+
// Match router gate tensors
374+
if !t.name.contains("gate_inp") && !t.name.contains("gate.weight") {
375+
continue;
376+
}
377+
// Skip expert FFN gates (gate_exps) — we want the ROUTER gate
378+
if t.name.contains("_exps") {
379+
continue;
380+
}
381+
382+
let block = extract_block(&t.name).unwrap_or(0);
383+
384+
for (expert_idx, row) in t.rows.iter().enumerate() {
385+
fingerprints.push(ExpertFingerprint {
386+
block,
387+
expert_idx,
388+
base17: row.clone(),
389+
});
390+
}
391+
392+
eprintln!(" Gate: {} → {} experts in block {}",
393+
t.name, t.rows.len(), block);
394+
}
395+
396+
Ok(fingerprints)
397+
}
398+
399+
/// Cluster experts within each block by Base17 L1 distance.
400+
///
401+
/// `redundancy_threshold`: L1 below which two experts are "structurally interchangeable".
402+
/// Suggested: 500 (conservative), 1000 (aggressive).
403+
pub fn cluster_experts(
404+
fingerprints: &[ExpertFingerprint],
405+
redundancy_threshold: u32,
406+
) -> Vec<ExpertCluster> {
407+
// Group by block
408+
let mut by_block: HashMap<u32, Vec<&ExpertFingerprint>> = HashMap::new();
409+
for fp in fingerprints {
410+
by_block.entry(fp.block).or_default().push(fp);
411+
}
412+
413+
let mut clusters = Vec::new();
414+
415+
for (block, experts) in &by_block {
416+
let n = experts.len();
417+
let mut total_l1 = 0u64;
418+
let mut redundant = 0usize;
419+
let total_pairs = n * (n - 1) / 2;
420+
421+
// Pairwise L1
422+
let mut adjacency: Vec<Vec<bool>> = vec![vec![false; n]; n];
423+
for i in 0..n {
424+
for j in (i + 1)..n {
425+
let l1 = experts[i].base17.l1(&experts[j].base17);
426+
total_l1 += l1 as u64;
427+
if l1 < redundancy_threshold {
428+
redundant += 1;
429+
adjacency[i][j] = true;
430+
adjacency[j][i] = true;
431+
}
432+
}
433+
}
434+
435+
let mean_l1 = if total_pairs > 0 { total_l1 as f64 / total_pairs as f64 } else { 0.0 };
436+
437+
// Simple connected-component grouping
438+
let mut visited = vec![false; n];
439+
let mut groups = Vec::new();
440+
for start in 0..n {
441+
if visited[start] { continue; }
442+
let mut group = vec![start];
443+
visited[start] = true;
444+
let mut stack = vec![start];
445+
while let Some(node) = stack.pop() {
446+
for neighbor in 0..n {
447+
if !visited[neighbor] && adjacency[node][neighbor] {
448+
visited[neighbor] = true;
449+
group.push(neighbor);
450+
stack.push(neighbor);
451+
}
452+
}
453+
}
454+
if group.len() > 1 {
455+
groups.push(group);
456+
}
457+
}
458+
459+
eprintln!(" Block {:>2}: {} experts, mean_L1={:.0}, redundant_pairs={}/{} ({:.0}%), groups={}",
460+
block, n, mean_l1, redundant, total_pairs,
461+
if total_pairs > 0 { redundant as f64 / total_pairs as f64 * 100.0 } else { 0.0 },
462+
groups.len());
463+
464+
clusters.push(ExpertCluster {
465+
block: *block,
466+
n_experts: n,
467+
mean_pairwise_l1: mean_l1,
468+
redundant_pairs: redundant,
469+
total_pairs,
470+
groups,
471+
});
472+
}
473+
474+
clusters.sort_by_key(|c| c.block);
475+
clusters
476+
}
477+
478+
/// Cross-reference gate topology with attention scaffold.
479+
///
480+
/// For each scaffold block (where Q+O shifted), check if the gate
481+
/// in that block has high expert redundancy. High redundancy + scaffold
482+
/// = the reasoning change works THROUGH the router, not the experts.
483+
pub fn cross_reference_gate_scaffold(
484+
clusters: &[ExpertCluster],
485+
scaffold_blocks: &[u32],
486+
) -> Vec<(u32, bool, f64)> {
487+
let mut results = Vec::new();
488+
489+
for block in scaffold_blocks {
490+
if let Some(cluster) = clusters.iter().find(|c| c.block == *block) {
491+
let redundancy_pct = if cluster.total_pairs > 0 {
492+
cluster.redundant_pairs as f64 / cluster.total_pairs as f64
493+
} else { 0.0 };
494+
495+
let is_routing_dominated = redundancy_pct > 0.5;
496+
results.push((*block, is_routing_dominated, redundancy_pct));
497+
498+
eprintln!(" Block {:>2}: scaffold={} routing_dominated={} redundancy={:.0}%",
499+
block, true, is_routing_dominated, redundancy_pct * 100.0);
500+
} else {
501+
// No gate in this block (dense layer, not MoE)
502+
results.push((*block, false, 0.0));
503+
eprintln!(" Block {:>2}: scaffold={} (dense, no MoE gate)", block, true);
504+
}
505+
}
506+
507+
results
508+
}
509+
336510
// ============================================================================
337511
// Tests
338512
// ============================================================================
@@ -516,4 +690,107 @@ mod tests {
516690
if truth.frequency > 0.5 { "shifted" } else { "stable" });
517691
}
518692
}
693+
694+
#[test]
#[ignore] // Requires: Maverick bgz7 outputs from shard indexing
fn test_maverick_gate_topology() {
    // Collect router-gate fingerprints from every Maverick shard found on disk.
    let mut all_fingerprints = Vec::new();

    for shard in 1..=18u32 {
        // Prefer the /tmp output; otherwise try the openchat/weights path.
        let primary = format!("/tmp/llama4_maverick_shard{:02}.bgz7", shard);
        let source = if std::fs::metadata(&primary).is_ok() {
            primary
        } else {
            let fallback = format!("src/hpc/openchat/weights/llama4_maverick_shard{:02}.bgz7", shard);
            if std::fs::metadata(&fallback).is_err() {
                eprintln!("SKIP shard {} (not found)", shard);
                continue;
            }
            fallback
        };

        // A shard that fails to load is reported but does not abort the run.
        match extract_gate_topology(&source) {
            Err(e) => eprintln!("WARN shard {}: {}", shard, e),
            Ok(fps) => all_fingerprints.extend(fps),
        }
    }

    eprintln!();
    eprintln!("Total expert fingerprints: {}", all_fingerprints.len());

    if all_fingerprints.is_empty() {
        eprintln!("No gate tensors found — Maverick may not have been indexed yet");
        return;
    }

    // Cluster experts and aggregate the pair counts over all blocks.
    let clusters = cluster_experts(&all_fingerprints, 500);
    let (total_redundant, total_pairs) = clusters
        .iter()
        .fold((0usize, 0usize), |(redundant, pairs), cluster| {
            (redundant + cluster.redundant_pairs, pairs + cluster.total_pairs)
        });

    eprintln!();
    eprintln!("━━━ Maverick Gate Topology ━━━");
    let overall_pct = if total_pairs > 0 {
        total_redundant as f64 / total_pairs as f64 * 100.0
    } else {
        0.0
    };
    eprintln!(" Overall redundancy: {}/{} pairs ({:.0}%)",
        total_redundant, total_pairs, overall_pct);

    // NARS truth for expert redundancy: frequency is the redundant fraction,
    // confidence grows with the number of pairs and is capped at 0.99.
    let f = if total_pairs > 0 { total_redundant as f32 / total_pairs as f32 } else { 0.0 };
    let c = (1.0 - 1.0 / (1.0 + total_pairs as f32)).min(0.99);
    eprintln!(" NARS truth: f={:.3} c={:.3}", f, c);
    eprintln!(" Interpretation: {:.0}% of expert pairs are structurally interchangeable", f * 100.0);
}
746+
747+
#[test]
#[ignore] // Requires: both Maverick bgz7 + Qwen3.5 diff results
fn test_cross_reference_gate_scaffold() {
    // Connects the two analyses:
    //   1. Attention scaffold from the Qwen3.5 diff (blocks with a Q+O shift)
    //   2. Gate topology from Maverick (blocks with redundant experts)

    // Step 1: run the Qwen3.5 diff to find the scaffold blocks.
    let base = "/tmp/qwen35_27b_base.bgz7";
    let dist = "/tmp/qwen35_27b_distilled_v1.bgz7";

    let qwen_inputs_present = std::fs::metadata(base).is_ok() && std::fs::metadata(dist).is_ok();
    if !qwen_inputs_present {
        eprintln!("SKIP: Qwen3.5 bgz7 files not found");
        return;
    }

    let (edges, _stats) = causal_diff(base, dist, 100).expect("diff failed");
    let scaffold_blocks = find_reasoning_scaffold(&edges, 0.3);

    // Step 2: extract the Maverick gate topology; shards that cannot be read
    // are skipped.
    let all_fps: Vec<_> = (1..=18u32)
        .map(|shard| format!("/tmp/llama4_maverick_shard{:02}.bgz7", shard))
        .filter_map(|path| extract_gate_topology(&path).ok())
        .flatten()
        .collect();

    if all_fps.is_empty() {
        eprintln!("SKIP: No Maverick gate fingerprints");
        return;
    }

    let clusters = cluster_experts(&all_fps, 500);

    // Step 3: cross-reference.
    eprintln!();
    eprintln!("━━━ Cross-Reference: Attention Scaffold × Gate Topology ━━━");
    let results = cross_reference_gate_scaffold(&clusters, &scaffold_blocks);

    let routing_dominated: usize = results.iter().filter(|entry| entry.1).count();
    let dominated_pct = if results.is_empty() {
        0.0
    } else {
        routing_dominated as f64 / results.len() as f64 * 100.0
    };
    let verdict = if routing_dominated > results.len() / 2 { "YES" } else { "PARTIAL" };

    eprintln!();
    eprintln!(" Scaffold blocks: {}", scaffold_blocks.len());
    eprintln!(" Routing-dominated: {}/{} ({:.0}%)",
        routing_dominated, results.len(), dominated_pct);
    eprintln!(" → {} = reasoning changes work THROUGH the router", verdict);
}
519796
}

0 commit comments

Comments
 (0)