Skip to content

Commit c21572b

Browse files
committed
test: add Llama 4 Scout BF16 shard 5 streaming integration test
Streams 18.2 GB BF16 shard directly from HuggingFace via HTTP range reader. Zero disk usage for source GGUF. Validates BF16 dequant path and MoE tensor handling on real Llama 4 weights. https://claude.ai/code/session_01Y69Vnw751w75iVSBRws7o7
1 parent 11595a5 commit c21572b

1 file changed

Lines changed: 61 additions & 0 deletions

File tree

src/hpc/gguf_indexer.rs

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,4 +635,65 @@ mod tests {
635635

636636
assert!(stats.tensors_indexed > 0);
637637
}
638+
639+
/// Streams shard 5 of the Llama 4 Scout BF16 GGUF from HuggingFace through
/// `HttpRangeReader`, indexes it with `stream_index_gguf` into a file under
/// `/tmp`, and prints a summary of the resulting statistics.
///
/// The test is `#[ignore]`d because it needs network access and pulls a
/// very large remote file. `size` is a hard-coded approximation of the
/// shard size, not a value queried from the server.
///
/// # Panics
///
/// Panics if the output file cannot be created or flushed, if
/// `stream_index_gguf` returns an error, or if no tensors were indexed.
#[test]
#[ignore] // Streams BF16 shard 5 (18.2 GB) from HuggingFace
fn test_stream_index_llama4_bf16_shard5() {
    use super::super::http_reader::HttpRangeReader;
    use std::io::{BufWriter, Write};

    let repo = "unsloth/Llama-4-Scout-17B-16E-Instruct-GGUF";
    let filename = "BF16/Llama-4-Scout-17B-16E-Instruct-BF16-00005-of-00005.gguf";
    let size: u64 = 18_220_000_000; // ~18.2 GB from metadata

    let url = format!("https://huggingface.co/{}/resolve/main/{}", repo, filename);
    eprintln!("Streaming shard 5: {:.2} GB", size as f64 / 1e9);
    eprintln!(" URL: {}", url);

    // 16 MB chunks for fewer HTTP round-trips
    let mut reader = HttpRangeReader::with_chunk_size(url, size, 16 * 1024 * 1024);

    let out_path = "/tmp/llama4_scout_shard5.bgz7";
    let out = std::fs::File::create(out_path).expect("create output");
    let mut writer = BufWriter::new(out);

    let stats = stream_index_gguf(
        &mut reader,
        &mut writer,
        // Per-tensor progress line: name, layer type, original and compressed sizes.
        Some(&|name, layer_type, orig, comp| {
            // Guard against division by zero when nothing was emitted for a tensor.
            let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
            eprintln!(" {:60} {:12?} {:>12} → {:>8} ({:.0}×)",
                name, layer_type, orig, comp, ratio);
        }),
    ).expect("stream_index_gguf");

    // Flush explicitly: `BufWriter`'s `Drop` ignores write errors, which would
    // let a truncated output file go unnoticed and skew the size reported below.
    writer.flush().expect("flush output");
    // Close the file before reading its on-disk size.
    drop(writer);
    let out_size = std::fs::metadata(out_path).map(|m| m.len()).unwrap_or(0);

    eprintln!();
    eprintln!("=== Llama 4 Scout BF16 Shard 5 → bgz17 ===");
    eprintln!(" Source: {:.2} GB (BF16, streamed from HF)", size as f64 / 1e9);
    eprintln!(" Output: {:.2} MB", out_size as f64 / 1e6);
    eprintln!(" Downloaded: {:.2} GB", reader.bytes_downloaded() as f64 / 1e9);
    eprintln!(" Tensors: {} indexed, {} skipped",
        stats.tensors_indexed, stats.tensors_skipped);
    eprintln!(" Original (f32): {:.2} GB", stats.original_bytes as f64 / 1e9);
    eprintln!(" Compressed: {:.2} MB", stats.compressed_bytes as f64 / 1e6);
    eprintln!(" Ratio: {:.1}×", stats.overall_ratio());
    eprintln!(" Peak tensor: {:.2} MB", stats.peak_tensor_bytes as f64 / 1e6);

    // Labels are matched to `stats.by_type` by position.
    // NOTE(review): assumes `by_type` has at least six entries in this order — confirm
    // against the stats definition.
    let type_names = ["Attention", "FeedForward", "Conv2D", "Norm", "Embedding", "Skip"];
    for (i, name) in type_names.iter().enumerate() {
        let (count, orig, comp) = stats.by_type[i];
        if count > 0 {
            let ratio = if comp > 0 { orig as f64 / comp as f64 } else { 0.0 };
            eprintln!(" {:<12} {:>3} tensors: {:>10.2} GB → {:>8.2} MB ({:.1}×)",
                name, count, orig as f64 / 1e9, comp as f64 / 1e6, ratio);
        }
    }

    assert!(stats.tensors_indexed > 0);
    // `original_bytes` is reported in f32 terms (see the summary label above), so it
    // is not directly comparable to the approximate BF16 source size; only require
    // that some tensor data was actually processed.
    assert!(stats.original_bytes > 0);
}
638699
}

0 commit comments

Comments
 (0)