@@ -696,4 +696,252 @@ mod tests {
696696 // BF16 dequant to f32 doubles the size, so original_bytes > source size
697697 assert ! ( stats. original_bytes > 0 ) ;
698698 }
699+
/// Streams every BF16 shard of Llama 4 Maverick 17B-128E from HuggingFace through
/// `stream_index_gguf` and prints per-shard and whole-model compression statistics.
///
/// Each shard is written to its own `/tmp/llama4_maverick_shardNN.bgz7` file. While the run is
/// in progress only the `KEEP_RECENT` newest output files stay on disk; whatever is left is
/// removed after the last shard.
///
/// # Panics
///
/// Panics if an output file cannot be created or flushed, if `stream_index_gguf` fails for a
/// shard, if a shard indexes no tensors, or if the accumulated totals fall outside the sanity
/// bounds asserted at the end.
#[test]
#[ignore] // Streams ~801 GB from HuggingFace — takes ~8-10 hours
fn test_stream_index_llama4_maverick_bf16_all_shards() {
    use super::super::http_reader::HttpRangeReader;
    use std::io::{BufWriter, Write};

    /// Number of most recent output files kept on disk during the run.
    const KEEP_RECENT: usize = 3;

    /// Display names for the per-type statistics slots, in slot order.
    const TYPE_NAMES: [&str; 6] = [
        "Attention",
        "FeedForward",
        "Conv2D",
        "Norm",
        "Embedding",
        "Skip",
    ];

    /// Compression ratio `orig / comp`, or 0.0 when nothing was written.
    fn ratio(orig: f64, comp: f64) -> f64 {
        if comp > 0.0 {
            orig / comp
        } else {
            0.0
        }
    }

    let repo = "unsloth/Llama-4-Maverick-17B-128E-Instruct-GGUF";

    // (shard number, path inside the repo, size in bytes)
    let shards: [(u8, &str, u64); 18] = [
        (1, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00001-of-00018.gguf", 46_166_870_240),
        (2, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00002-of-00018.gguf", 42_949_673_376),
        (3, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00003-of-00018.gguf", 42_949_673_376),
        (4, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00004-of-00018.gguf", 42_949_673_376),
        (5, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00005-of-00018.gguf", 47_943_931_840),
        (6, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00006-of-00018.gguf", 42_949_673_376),
        (7, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00007-of-00018.gguf", 42_949_673_376),
        (8, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00008-of-00018.gguf", 42_949_673_376),
        (9, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00009-of-00018.gguf", 47_922_960_288),
        (10, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00010-of-00018.gguf", 42_949_673_376),
        (11, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00011-of-00018.gguf", 42_949_673_376),
        (12, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00012-of-00018.gguf", 47_912_433_568),
        (13, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00013-of-00018.gguf", 42_949_673_376),
        (14, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00014-of-00018.gguf", 42_949_673_376),
        (15, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00015-of-00018.gguf", 42_949_673_376),
        (16, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00016-of-00018.gguf", 47_912_474_624),
        (17, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00017-of-00018.gguf", 42_949_673_376),
        (18, "BF16/Llama-4-Maverick-17B-128E-Instruct-BF16-00018-of-00018.gguf", 48_214_491_296),
    ];
    // Derive the shard count from the table so progress output cannot drift from it.
    let shard_count = shards.len();

    let mut grand_total_source: u64 = 0;
    let mut grand_total_compressed: u64 = 0;
    let mut grand_total_original: u64 = 0;
    let mut grand_total_tensors: usize = 0;
    // Per type: (tensor count, original bytes, compressed bytes), summed over all shards.
    let mut grand_by_type: [(usize, u64, u64); 6] = [(0, 0, 0); 6];

    // Output files still on disk, oldest first (used for tail deletion).
    let mut output_files: Vec<String> = Vec::new();

    for &(shard_num, filename, size) in &shards {
        let url = format!("https://huggingface.co/{}/resolve/main/{}", repo, filename);
        let out_path = format!("/tmp/llama4_maverick_shard{:02}.bgz7", shard_num);
        let size_gb = size as f64 / 1e9;

        eprintln!();
        eprintln!(
            "━━━ Shard {}/{} ({:.2} GB) ━━━",
            shard_num, shard_count, size_gb
        );
        eprintln!(" URL: {}", url);
        eprintln!(
            " Free disk target: keep {} most recent output files",
            KEEP_RECENT
        );

        // 256 MB chunks — proven chunk size from Scout
        let mut reader = HttpRangeReader::with_chunk_size(url, size, 256 * 1024 * 1024);

        let out = std::fs::File::create(&out_path).expect("create output");
        let mut writer = BufWriter::new(out);

        let stats = stream_index_gguf(
            &mut reader,
            &mut writer,
            Some(&|name, layer_type, orig, comp| {
                eprintln!(
                    " {:60} {:12?} {:>12} → {:>8} ({:.0}×)",
                    name,
                    layer_type,
                    orig,
                    comp,
                    ratio(orig as f64, comp as f64)
                );
            }),
        )
        .unwrap_or_else(|e| panic!("stream_index_gguf shard {} failed: {}", shard_num, e));

        // Flush explicitly: BufWriter's Drop ignores flush errors, which would leave a
        // truncated file and a wrong size below. Then drop the writer to close the handle
        // before any further file operations on the path.
        writer.flush().expect("flush output");
        drop(writer);

        // A failed size lookup is reported instead of silently counted as zero bytes; the
        // final sanity assertions still see the resulting total.
        let out_size = match std::fs::metadata(&out_path) {
            Ok(meta) => meta.len(),
            Err(e) => {
                eprintln!(" Output size warning: {} — {}", out_path, e);
                0
            }
        };

        // Per-shard summary
        eprintln!();
        eprintln!(
            " Shard {:02} result: {:.2} GB → {:.2} MB ({:.0}×)",
            shard_num,
            size_gb,
            out_size as f64 / 1e6,
            stats.overall_ratio()
        );
        eprintln!(
            " Tensors: {} indexed, {} skipped",
            stats.tensors_indexed, stats.tensors_skipped
        );
        eprintln!(
            " Downloaded: {:.2} GB",
            reader.bytes_downloaded() as f64 / 1e9
        );

        for (j, name) in TYPE_NAMES.iter().enumerate() {
            let (count, orig, comp) = stats.by_type[j];
            if count == 0 {
                continue;
            }
            eprintln!(
                " {:<12} {:>3} tensors: {:>10.2} GB → {:>8.2} MB ({:.0}×)",
                name,
                count,
                orig as f64 / 1e9,
                comp as f64 / 1e6,
                ratio(orig as f64, comp as f64)
            );
            let total = &mut grand_by_type[j];
            total.0 += count;
            total.1 += orig;
            total.2 += comp;
        }

        // Accumulate
        grand_total_source += size;
        grand_total_compressed += out_size;
        grand_total_original += stats.original_bytes;
        grand_total_tensors += stats.tensors_indexed;

        // TAIL DELETION: track this file, then delete everything older than the newest
        // KEEP_RECENT outputs. Failures are only warned about — cleanup is best-effort.
        output_files.push(out_path);
        let excess = output_files.len().saturating_sub(KEEP_RECENT);
        for old_path in output_files.drain(..excess) {
            match std::fs::remove_file(&old_path) {
                Ok(()) => eprintln!(
                    " Tail cleanup: deleted {} (keeping last {})",
                    old_path, KEEP_RECENT
                ),
                Err(e) => eprintln!(" Tail cleanup warning: {} — {}", old_path, e),
            }
        }

        // Drop reader to release any HTTP/temp state
        drop(reader);

        assert!(
            stats.tensors_indexed > 0,
            "shard {} should have indexed tensors",
            shard_num
        );

        eprintln!(
            " Progress: {}/{} shards complete ({:.1}%)",
            shard_num,
            shard_count,
            shard_num as f64 / shard_count as f64 * 100.0
        );
    }

    // Final cleanup: remove remaining output files
    for path in &output_files {
        if let Err(e) = std::fs::remove_file(path) {
            eprintln!(" Final cleanup warning: {} — {}", path, e);
        }
    }

    // Grand total (all shards)
    let rule = "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━";
    eprintln!();
    eprintln!("{}", rule);
    eprintln!("LLAMA 4 MAVERICK 17B-128E — FULL MODEL (ALL 18 SHARDS)");
    eprintln!("{}", rule);
    eprintln!(
        " Source (BF16): {:>10.2} GB",
        grand_total_source as f64 / 1e9
    );
    eprintln!(
        " Original (f32): {:>10.2} GB",
        grand_total_original as f64 / 1e9
    );
    eprintln!(
        " Compressed: {:>10.2} MB",
        grand_total_compressed as f64 / 1e6
    );
    // Same zero guard as every other ratio in this test (no inf/NaN in the report).
    eprintln!(
        " Overall ratio: {:>10.0}×",
        ratio(grand_total_original as f64, grand_total_compressed as f64)
    );
    eprintln!(" Tensors indexed: {}", grand_total_tensors);
    eprintln!();

    for (name, &(count, orig, comp)) in TYPE_NAMES.iter().zip(grand_by_type.iter()) {
        if count == 0 {
            continue;
        }
        eprintln!(
            " {:<12} {:>4} tensors: {:>10.2} GB → {:>8.2} MB ({:.0}×)",
            name,
            count,
            orig as f64 / 1e9,
            comp as f64 / 1e6,
            ratio(orig as f64, comp as f64)
        );
    }
    eprintln!("{}", rule);

    // Sanity checks — Maverick at 128 experts should have many more tensors
    assert!(
        grand_total_tensors > 500,
        "should have many tensors across all 18 shards: got {}",
        grand_total_tensors
    );
    assert!(
        grand_total_compressed < 500_000_000,
        "full model should be under 500 MB: was {} MB",
        grand_total_compressed / 1_000_000
    );
    assert!(
        grand_total_compressed > 50_000_000,
        "full model should be over 50 MB (sanity): was {} MB",
        grand_total_compressed / 1_000_000
    );
}
699947}
0 commit comments