Skip to content

Commit aa9f0af

Browse files
committed
Add all-18-shards streaming index test for Llama 4 Maverick BF16
Processes 801.47 GB (18 shards × ~43-48 GB each) of Maverick 17B-128E sequentially with 256 MB HTTP range chunks, tail-deleting output files to stay within 26 GB disk budget. Keeps last 3 outputs, drops writer handles before cleanup, and accumulates per-type compression stats across the full model. https://claude.ai/code/session_01HmdXNPit7QsTCfhJFef3Ee
1 parent f7dd8d6 commit aa9f0af

1 file changed

Lines changed: 248 additions & 0 deletions

File tree

src/hpc/gguf_indexer.rs

Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -696,4 +696,252 @@ mod tests {
696696
// BF16 dequant to f32 doubles the size, so original_bytes > source size
697697
assert!(stats.original_bytes > 0);
698698
}
699+
700+
/// Streams every BF16 shard of Llama 4 Maverick 17B-128E through
/// `stream_index_gguf`, one shard at a time, and checks the aggregate result.
///
/// Each shard is read over HTTP via `HttpRangeReader` and written to
/// `/tmp/llama4_maverick_shardNN.bgz7`. Only the most recent `keep_recent`
/// output files are kept on disk; older ones are deleted as soon as a newer
/// shard completes, and whatever is left is removed after the last shard.
///
/// Panics if any shard fails to stream, if an output file cannot be created,
/// flushed or stat'ed, if a shard indexes zero tensors, or if the final
/// tensor-count / compressed-size sanity checks fail.
#[test]
#[ignore] // Streams ~801 GB from HuggingFace — takes ~8-10 hours
fn test_stream_index_llama4_maverick_bf16_all_shards() {
    use super::super::http_reader::HttpRangeReader;
    use std::collections::VecDeque;
    use std::io::{BufWriter, Write};

    // Compression ratio with a zero guard, so an empty output prints 0× instead
    // of inf/NaN.
    fn ratio(orig: u64, comp: u64) -> f64 {
        if comp > 0 {
            orig as f64 / comp as f64
        } else {
            0.0
        }
    }

    let repo = "unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF";

    // (shard number, path inside the repo, exact size in bytes)
    let shards: [(u8, &str, u64); 18] = [
        ( 1, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00001-of-00018.gguf", 46_166_870_240),
        ( 2, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00002-of-00018.gguf", 42_949_673_376),
        ( 3, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00003-of-00018.gguf", 42_949_673_376),
        ( 4, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00004-of-00018.gguf", 42_949_673_376),
        ( 5, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00005-of-00018.gguf", 47_943_931_840),
        ( 6, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00006-of-00018.gguf", 42_949_673_376),
        ( 7, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00007-of-00018.gguf", 42_949_673_376),
        ( 8, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00008-of-00018.gguf", 42_949_673_376),
        ( 9, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00009-of-00018.gguf", 47_922_960_288),
        (10, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00010-of-00018.gguf", 42_949_673_376),
        (11, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00011-of-00018.gguf", 42_949_673_376),
        (12, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00012-of-00018.gguf", 47_912_433_568),
        (13, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00013-of-00018.gguf", 42_949_673_376),
        (14, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00014-of-00018.gguf", 42_949_673_376),
        (15, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00015-of-00018.gguf", 42_949_673_376),
        (16, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00016-of-00018.gguf", 47_912_474_624),
        (17, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00017-of-00018.gguf", 42_949_673_376),
        (18, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00018-of-00018.gguf", 48_214_491_296),
    ];
    // Derived from the table so progress output cannot drift from it.
    let shard_count = shards.len();

    // Display labels for `stats.by_type`, by index.
    // NOTE(review): assumes this order matches the indexer's layer-type order — confirm.
    let type_names = [
        "Attention",
        "FeedForward",
        "Conv2D",
        "Norm",
        "Embedding",
        "Skip",
    ];

    let mut grand_total_source: u64 = 0;
    let mut grand_total_compressed: u64 = 0;
    let mut grand_total_original: u64 = 0;
    let mut grand_total_tensors: usize = 0;
    // Per type: (tensor count, original bytes, compressed bytes).
    let mut grand_by_type: [(usize, u64, u64); 6] = [(0, 0, 0); 6];

    // Track output files for tail deletion (keep last 3, delete older).
    // A deque gives FIFO eviction without shifting the remaining entries.
    let mut output_files: VecDeque<String> = VecDeque::new();
    let keep_recent: usize = 3;

    for &(shard_num, filename, size) in &shards {
        let url = format!(
            "https://huggingface.co/{}/resolve/main/{}",
            repo, filename
        );
        let out_path = format!("/tmp/llama4_maverick_shard{:02}.bgz7", shard_num);

        eprintln!();
        eprintln!(
            "━━━ Shard {}/{} ({:.2} GB) ━━━",
            shard_num,
            shard_count,
            size as f64 / 1e9
        );
        eprintln!(" URL: {}", url);
        eprintln!(
            " Free disk target: keep {} most recent output files",
            keep_recent
        );

        // 256 MB chunks — proven chunk size from Scout
        let mut reader = HttpRangeReader::with_chunk_size(url, size, 256 * 1024 * 1024);

        let out = std::fs::File::create(&out_path).expect("create output");
        let mut writer = BufWriter::new(out);

        let stats = stream_index_gguf(
            &mut reader,
            &mut writer,
            Some(&|name, layer_type, orig, comp| {
                let ratio = if comp > 0 {
                    orig as f64 / comp as f64
                } else {
                    0.0
                };
                eprintln!(
                    " {:60} {:12?} {:>12} → {:>8} ({:.0}×)",
                    name, layer_type, orig, comp, ratio
                );
            }),
        )
        .unwrap_or_else(|e| panic!("stream_index_gguf shard {} failed: {}", shard_num, e));

        // Flush explicitly: dropping a BufWriter discards flush errors, which
        // would leave a truncated file and an under-reported size below.
        writer
            .flush()
            .unwrap_or_else(|e| panic!("flush output for shard {} failed: {}", shard_num, e));
        // UNLOCK: drop writer BEFORE any file operations
        drop(writer);
        // A stat failure must not be folded into the totals as a silent 0.
        let out_size = std::fs::metadata(&out_path)
            .map(|m| m.len())
            .unwrap_or_else(|e| panic!("stat {} failed: {}", out_path, e));

        // Per-shard summary
        eprintln!();
        eprintln!(
            " Shard {:02} result: {:.2} GB → {:.2} MB ({:.0}×)",
            shard_num,
            size as f64 / 1e9,
            out_size as f64 / 1e6,
            stats.overall_ratio()
        );
        eprintln!(
            " Tensors: {} indexed, {} skipped",
            stats.tensors_indexed, stats.tensors_skipped
        );
        eprintln!(
            " Downloaded: {:.2} GB",
            reader.bytes_downloaded() as f64 / 1e9
        );

        for (j, name) in type_names.iter().enumerate() {
            let (count, orig, comp) = stats.by_type[j];
            if count > 0 {
                eprintln!(
                    " {:<12} {:>3} tensors: {:>10.2} GB → {:>8.2} MB ({:.0}×)",
                    name,
                    count,
                    orig as f64 / 1e9,
                    comp as f64 / 1e6,
                    ratio(orig, comp)
                );
                grand_by_type[j].0 += count;
                grand_by_type[j].1 += orig;
                grand_by_type[j].2 += comp;
            }
        }

        // Accumulate
        grand_total_source += size;
        grand_total_compressed += out_size;
        grand_total_original += stats.original_bytes;
        grand_total_tensors += stats.tensors_indexed;

        // TAIL DELETION: track this file, delete old ones
        output_files.push_back(out_path);

        while output_files.len() > keep_recent {
            let Some(old_path) = output_files.pop_front() else {
                break;
            };
            // Cleanup is best-effort: a failed delete is reported, not fatal.
            match std::fs::remove_file(&old_path) {
                Ok(()) => eprintln!(
                    " Tail cleanup: deleted {} (keeping last {})",
                    old_path, keep_recent
                ),
                Err(e) => eprintln!(" Tail cleanup warning: {} — {}", old_path, e),
            }
        }

        // Drop reader to release any HTTP/temp state
        drop(reader);

        assert!(
            stats.tensors_indexed > 0,
            "shard {} should have indexed tensors",
            shard_num
        );

        eprintln!(
            " Progress: {}/{} shards complete ({:.1}%)",
            shard_num,
            shard_count,
            f64::from(shard_num) / shard_count as f64 * 100.0
        );
    }

    // Final cleanup: remove remaining output files
    for path in &output_files {
        if let Err(e) = std::fs::remove_file(path) {
            eprintln!(" Final cleanup warning: {} — {}", path, e);
        }
    }

    // Grand total (all 18 shards)
    eprintln!();
    eprintln!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
    eprintln!("LLAMA 4 MAVERICK 17B-128E — FULL MODEL (ALL 18 SHARDS)");
    eprintln!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
    eprintln!(
        " Source (BF16): {:>10.2} GB",
        grand_total_source as f64 / 1e9
    );
    eprintln!(
        " Original (f32): {:>10.2} GB",
        grand_total_original as f64 / 1e9
    );
    eprintln!(
        " Compressed: {:>10.2} MB",
        grand_total_compressed as f64 / 1e6
    );
    eprintln!(
        " Overall ratio: {:>10.0}×",
        ratio(grand_total_original, grand_total_compressed)
    );
    eprintln!(" Tensors indexed: {}", grand_total_tensors);
    eprintln!();

    for (name, &(count, orig, comp)) in type_names.iter().zip(grand_by_type.iter()) {
        if count > 0 {
            eprintln!(
                " {:<12} {:>4} tensors: {:>10.2} GB → {:>8.2} MB ({:.0}×)",
                name,
                count,
                orig as f64 / 1e9,
                comp as f64 / 1e6,
                ratio(orig, comp)
            );
        }
    }
    eprintln!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");

    // Sanity checks — Maverick at 128 experts should have many more tensors
    assert!(
        grand_total_tensors > 500,
        "should have many tensors across all 18 shards: got {}",
        grand_total_tensors
    );
    assert!(
        grand_total_compressed < 500_000_000,
        "full model should be under 500 MB: was {} MB",
        grand_total_compressed / 1_000_000
    );
    assert!(
        grand_total_compressed > 50_000_000,
        "full model should be over 50 MB (sanity): was {} MB",
        grand_total_compressed / 1_000_000
    );
}
699947
}

0 commit comments

Comments
 (0)