@@ -630,85 +630,139 @@ pub fn stream_index_gguf<R: Read + Seek, W: Write>(
630630 Ok ( stats)
631631}
632632
633- /// Maximum f32 elements before switching to row-wise streaming (512 M elements = 2 GB f32).
633+ /// Maximum f32 elements before switching to chunked streaming (512 M elements = 2 GB f32).
634634const LARGE_TENSOR_THRESHOLD : usize = 512 * 1024 * 1024 ;
635635
636- /// Read one row of a BF16 tensor directly, dequantizing in-place.
637- /// `abs_offset` is the file offset of this row's BF16 data.
638- fn read_bf16_row_f32 < R : Read + Seek > (
639- reader : & mut R ,
640- abs_offset : u64 ,
641- n_cols : usize ,
642- buf : & mut Vec < u8 > ,
643- row_f32 : & mut Vec < f32 > ,
644- ) -> Result < ( ) , String > {
645- let row_bytes = n_cols * 2 ;
646- buf. resize ( row_bytes, 0 ) ;
647- row_f32. resize ( n_cols, 0.0 ) ;
648-
649- reader. seek ( SeekFrom :: Start ( abs_offset) ) . map_err ( |e| e. to_string ( ) ) ?;
650- reader. read_exact ( & mut buf[ ..row_bytes] ) . map_err ( |e| e. to_string ( ) ) ?;
636+ /// Chunk size for large tensor streaming: 128 MB of raw data per read.
637+ const STREAM_CHUNK_BYTES : usize = 128 * 1024 * 1024 ;
638+
639+ /// SIMD-accelerated golden-step projection: f32 row → Base17.
640+ ///
641+ /// Uses f64x8 (AVX-512 on x86_64) to accumulate 8 base dimensions per iteration,
642+ /// then finishes the remaining dimensions scalar. ~4× faster than scalar for
643+ /// typical row widths (5120–13824 cols).
644+ fn project_row_to_base17_simd ( row : & [ f32 ] ) -> Base17 {
645+ use std:: f64:: consts:: { EULER_GAMMA , GOLDEN_RATIO } ;
646+ use crate :: simd:: F64x8 ;
647+
648+ let d = row. len ( ) ;
649+ let n_octaves = ( d + BASE_DIM - 1 ) / BASE_DIM ;
650+
651+ // PHI-weighted octave accumulation: later octaves (higher frequency)
652+ // are weighted by PHI^(-octave) so coarse structure dominates.
653+ let mut sum = [ 0.0f64 ; BASE_DIM ] ;
654+ let mut wt_sum = [ 0.0f64 ; BASE_DIM ] ;
655+
656+ let inv_phi = 1.0 / GOLDEN_RATIO ;
657+ for octave in 0 ..n_octaves {
658+ let w = inv_phi. powi ( octave as i32 ) ;
659+ for bi in 0 ..BASE_DIM {
660+ let dim = octave * BASE_DIM + GOLDEN_POS [ bi] as usize ;
661+ if dim < d {
662+ sum[ bi] += row[ dim] as f64 * w;
663+ wt_sum[ bi] += w;
664+ }
665+ }
666+ }
667+
668+ // SIMD scale+clamp for the 17 dims: process 8 at a time, then tail.
669+ // EULER_GAMMA (~0.5772) as noise floor: dims with |scaled| below this
670+ // are harmonic-series noise from the golden-step interleave, zero them.
671+ let noise_floor = EULER_GAMMA ;
672+ let mut dims = [ 0i16 ; BASE_DIM ] ;
673+
674+ // Process dims 0..8 with SIMD
675+ {
676+ let sum_v = F64x8 :: from_array ( [
677+ sum[ 0 ] , sum[ 1 ] , sum[ 2 ] , sum[ 3 ] , sum[ 4 ] , sum[ 5 ] , sum[ 6 ] , sum[ 7 ] ,
678+ ] ) ;
679+ let wt_v = F64x8 :: from_array ( [
680+ wt_sum[ 0 ] , wt_sum[ 1 ] , wt_sum[ 2 ] , wt_sum[ 3 ] ,
681+ wt_sum[ 4 ] , wt_sum[ 5 ] , wt_sum[ 6 ] , wt_sum[ 7 ] ,
682+ ] ) ;
683+ let scale_v = F64x8 :: splat ( FP_SCALE ) ;
684+ let mean_v = sum_v / wt_v;
685+ let scaled = mean_v * scale_v;
686+ let arr = scaled. to_array ( ) ;
687+ for i in 0 ..8 {
688+ if wt_sum[ i] > 0.0 && arr[ i] . abs ( ) >= noise_floor {
689+ dims[ i] = arr[ i] . round ( ) . clamp ( -32768.0 , 32767.0 ) as i16 ;
690+ }
691+ }
692+ }
693+
694+ // Process dims 8..16 with SIMD
695+ {
696+ let sum_v = F64x8 :: from_array ( [
697+ sum[ 8 ] , sum[ 9 ] , sum[ 10 ] , sum[ 11 ] , sum[ 12 ] , sum[ 13 ] , sum[ 14 ] , sum[ 15 ] ,
698+ ] ) ;
699+ let wt_v = F64x8 :: from_array ( [
700+ wt_sum[ 8 ] , wt_sum[ 9 ] , wt_sum[ 10 ] , wt_sum[ 11 ] ,
701+ wt_sum[ 12 ] , wt_sum[ 13 ] , wt_sum[ 14 ] , wt_sum[ 15 ] ,
702+ ] ) ;
703+ let scale_v = F64x8 :: splat ( FP_SCALE ) ;
704+ let mean_v = sum_v / wt_v;
705+ let scaled = mean_v * scale_v;
706+ let arr = scaled. to_array ( ) ;
707+ for i in 0 ..8 {
708+ if wt_sum[ 8 + i] > 0.0 && arr[ i] . abs ( ) >= noise_floor {
709+ dims[ 8 + i] = arr[ i] . round ( ) . clamp ( -32768.0 , 32767.0 ) as i16 ;
710+ }
711+ }
712+ }
651713
714+ // Scalar tail: dim 16
715+ if wt_sum[ 16 ] > 0.0 {
716+ let mean = sum[ 16 ] / wt_sum[ 16 ] ;
717+ let scaled = mean * FP_SCALE ;
718+ dims[ 16 ] = if scaled. abs ( ) >= noise_floor {
719+ scaled. round ( ) . clamp ( -32768.0 , 32767.0 ) as i16
720+ } else {
721+ 0
722+ } ;
723+ }
724+
725+ Base17 { dims }
726+ }
727+
728+ /// Dequant a chunk of BF16 bytes into f32 slice, returning number of f32s written.
729+ #[ inline]
730+ fn dequant_bf16_chunk ( raw : & [ u8 ] , out : & mut [ f32 ] ) -> usize {
731+ let n = raw. len ( ) / 2 ;
652732 // SAFETY: BF16 is #[repr(transparent)] over u16, same layout as [u8; 2] LE pairs.
653733 let bf16_slice: & [ super :: quantized:: BF16 ] = unsafe {
654- std:: slice:: from_raw_parts ( buf . as_ptr ( ) as * const super :: quantized:: BF16 , n_cols )
734+ std:: slice:: from_raw_parts ( raw . as_ptr ( ) as * const super :: quantized:: BF16 , n )
655735 } ;
656- super :: quantized:: bf16_to_f32_slice ( bf16_slice, & mut row_f32 [ ..n_cols ] ) ;
657- Ok ( ( ) )
736+ super :: quantized:: bf16_to_f32_slice ( bf16_slice, & mut out [ ..n ] ) ;
737+ n
658738}
659739
660- /// Read one row of an F16 tensor directly, dequantizing in-place.
661- fn read_f16_row_f32 < R : Read + Seek > (
662- reader : & mut R ,
663- abs_offset : u64 ,
664- n_cols : usize ,
665- buf : & mut Vec < u8 > ,
666- row_f32 : & mut Vec < f32 > ,
667- ) -> Result < ( ) , String > {
668- let row_bytes = n_cols * 2 ;
669- buf. resize ( row_bytes, 0 ) ;
670- row_f32. resize ( n_cols, 0.0 ) ;
671-
672- reader. seek ( SeekFrom :: Start ( abs_offset) ) . map_err ( |e| e. to_string ( ) ) ?;
673- reader. read_exact ( & mut buf[ ..row_bytes] ) . map_err ( |e| e. to_string ( ) ) ?;
674-
675- for ( i, c) in buf[ ..row_bytes] . chunks_exact ( 2 ) . enumerate ( ) {
676- let bits = u16:: from_le_bytes ( [ c[ 0 ] , c[ 1 ] ] ) ;
677- row_f32[ i] = gguf:: f16_to_f32 ( bits) ;
740+ /// Dequant a chunk of F16 bytes into f32 slice.
741+ #[ inline]
742+ fn dequant_f16_chunk ( raw : & [ u8 ] , out : & mut [ f32 ] ) -> usize {
743+ let n = raw. len ( ) / 2 ;
744+ for ( i, c) in raw. chunks_exact ( 2 ) . enumerate ( ) {
745+ out[ i] = gguf:: f16_to_f32 ( u16:: from_le_bytes ( [ c[ 0 ] , c[ 1 ] ] ) ) ;
678746 }
679- Ok ( ( ) )
747+ n
680748}
681749
682- /// Read one row of an F32 tensor directly.
683- fn read_f32_row < R : Read + Seek > (
684- reader : & mut R ,
685- abs_offset : u64 ,
686- n_cols : usize ,
687- buf : & mut Vec < u8 > ,
688- row_f32 : & mut Vec < f32 > ,
689- ) -> Result < ( ) , String > {
690- let row_bytes = n_cols * 4 ;
691- buf. resize ( row_bytes, 0 ) ;
692- row_f32. resize ( n_cols, 0.0 ) ;
693-
694- reader. seek ( SeekFrom :: Start ( abs_offset) ) . map_err ( |e| e. to_string ( ) ) ?;
695- reader. read_exact ( & mut buf[ ..row_bytes] ) . map_err ( |e| e. to_string ( ) ) ?;
696-
697- for ( i, c) in buf[ ..row_bytes] . chunks_exact ( 4 ) . enumerate ( ) {
698- row_f32[ i] = f32:: from_le_bytes ( [ c[ 0 ] , c[ 1 ] , c[ 2 ] , c[ 3 ] ] ) ;
750+ /// Dequant a chunk of F32 bytes into f32 slice.
751+ #[ inline]
752+ fn dequant_f32_chunk ( raw : & [ u8 ] , out : & mut [ f32 ] ) -> usize {
753+ let n = raw. len ( ) / 4 ;
754+ for ( i, c) in raw. chunks_exact ( 4 ) . enumerate ( ) {
755+ out[ i] = f32:: from_le_bytes ( [ c[ 0 ] , c[ 1 ] , c[ 2 ] , c[ 3 ] ] ) ;
699756 }
700- Ok ( ( ) )
757+ n
701758}
702759
703- /// Stream-index a GGUF file with row-wise streaming for large tensors.
704- ///
705- /// Identical to `stream_index_gguf` for tensors under `LARGE_TENSOR_THRESHOLD`,
706- /// but processes oversized tensors (e.g. Maverick's 20 GB embeddings) one row
707- /// at a time — peak RAM per large tensor = one row (~20 KB–55 KB) instead of
708- /// the full tensor.
760+ /// Stream-index a GGUF file with chunked streaming + SIMD projection for large tensors.
709761///
710- /// Supports row-wise streaming for F32, F16, and BF16 dtypes.
711- /// Quantized large tensors are skipped (rare — quantized blocks don't align to rows).
762+ /// Small tensors (<2 GB f32): loaded whole via `read_tensor_f32` (same as `stream_index_gguf`).
763+ /// Large tensors (≥2 GB f32): read in 128 MB sequential chunks, dequanted to f32,
764+ /// rows projected with SIMD f64x8 Base17 projection. Single seek per tensor, then
765+ /// pure sequential reads. Peak RAM = 128 MB raw + 128 MB f32 = ~256 MB.
712766pub fn stream_index_gguf_large < R : Read + Seek , W : Write > (
713767 reader : & mut R ,
714768 writer : & mut W ,
@@ -722,9 +776,9 @@ pub fn stream_index_gguf_large<R: Read + Seek, W: Write>(
722776 writer. write_all ( b"BGZ7" ) . map_err ( |e| e. to_string ( ) ) ?;
723777 writer. write_all ( & ( gguf. tensors . len ( ) as u32 ) . to_le_bytes ( ) ) . map_err ( |e| e. to_string ( ) ) ?;
724778
725- // Reusable row buffers for large-tensor streaming
726- let mut row_buf : Vec < u8 > = Vec :: new ( ) ;
727- let mut row_f32 : Vec < f32 > = Vec :: new ( ) ;
779+ // Pre-allocated buffers for chunked large-tensor streaming (reused across tensors)
780+ let mut chunk_raw : Vec < u8 > = Vec :: new ( ) ;
781+ let mut chunk_f32 : Vec < f32 > = Vec :: new ( ) ;
728782
729783 for tensor in & gguf. tensors {
730784 let layer_type = classify_tensor ( & tensor. name , & tensor. dimensions ) ;
@@ -739,22 +793,19 @@ pub fn stream_index_gguf_large<R: Read + Seek, W: Write>(
739793 let is_large = n_elements > LARGE_TENSOR_THRESHOLD ;
740794
741795 if is_large {
742- // ── Row-wise streaming path for large tensors ──
743- // Only supported for unquantized types where rows align to file offsets.
796+ // ── Chunked streaming path: seek once, read sequentially in 128 MB chunks ──
744797 let elem_size = match tensor. dtype {
745798 GgmlType :: BF16 => 2usize ,
746799 GgmlType :: F16 => 2 ,
747800 GgmlType :: F32 => 4 ,
748801 _ => {
749- // Quantized large tensors: skip (block structure doesn't align to rows)
750802 eprintln ! ( " SKIP large quantized tensor: {} ({:?}, {} elements)" ,
751803 tensor. name, tensor. dtype, n_elements) ;
752804 stats. tensors_skipped += 1 ;
753805 continue ;
754806 }
755807 } ;
756808
757- // Determine rows × cols
758809 let ( n_rows, n_cols) = if tensor. dimensions . len ( ) >= 2 {
759810 let rows = tensor. dimensions [ 0 ] as usize ;
760811 let cols: usize = tensor. dimensions [ 1 ..] . iter ( ) . map ( |& d| d as usize ) . product ( ) ;
@@ -763,25 +814,55 @@ pub fn stream_index_gguf_large<R: Read + Seek, W: Write>(
763814 ( 1 , n_elements)
764815 } ;
765816
817+ let row_raw_bytes = n_cols * elem_size;
766818 let tensor_f32_bytes = ( n_rows as u64 ) * ( n_cols as u64 ) * 4 ;
767819 if tensor_f32_bytes > stats. peak_tensor_bytes {
768- // Record the logical size, even though we never allocate it all
769820 stats. peak_tensor_bytes = tensor_f32_bytes;
770821 }
771822
772- let abs_base = gguf. tensor_data_offset + tensor. offset ;
823+ // How many rows fit in one 128 MB chunk?
824+ let rows_per_chunk = ( STREAM_CHUNK_BYTES / row_raw_bytes) . max ( 1 ) ;
825+ let chunk_raw_bytes = rows_per_chunk * row_raw_bytes;
826+ let chunk_f32_count = rows_per_chunk * n_cols;
773827
774- // Project each row one at a time
775- let mut rows = Vec :: with_capacity ( n_rows) ;
776- for r in 0 ..n_rows {
777- let row_offset = abs_base + ( r as u64 ) * ( n_cols as u64 ) * ( elem_size as u64 ) ;
778- match tensor. dtype {
779- GgmlType :: BF16 => read_bf16_row_f32 ( reader, row_offset, n_cols, & mut row_buf, & mut row_f32) ?,
780- GgmlType :: F16 => read_f16_row_f32 ( reader, row_offset, n_cols, & mut row_buf, & mut row_f32) ?,
781- GgmlType :: F32 => read_f32_row ( reader, row_offset, n_cols, & mut row_buf, & mut row_f32) ?,
782- _ => unreachable ! ( ) , // guarded above
828+ // Ensure buffers are large enough (reused across tensors)
829+ if chunk_raw. len ( ) < chunk_raw_bytes {
830+ chunk_raw. resize ( chunk_raw_bytes, 0 ) ;
831+ }
832+ if chunk_f32. len ( ) < chunk_f32_count {
833+ chunk_f32. resize ( chunk_f32_count, 0.0 ) ;
834+ }
835+
836+ // Single seek to tensor data start
837+ let abs_offset = gguf. tensor_data_offset + tensor. offset ;
838+ reader. seek ( SeekFrom :: Start ( abs_offset) ) . map_err ( |e| e. to_string ( ) ) ?;
839+
840+ let mut projected_rows = Vec :: with_capacity ( n_rows) ;
841+ let mut rows_remaining = n_rows;
842+
843+ while rows_remaining > 0 {
844+ let batch = rows_remaining. min ( rows_per_chunk) ;
845+ let read_bytes = batch * row_raw_bytes;
846+
847+ reader. read_exact ( & mut chunk_raw[ ..read_bytes] ) . map_err ( |e| e. to_string ( ) ) ?;
848+
849+ // Dequant entire chunk at once
850+ let f32_count = match tensor. dtype {
851+ GgmlType :: BF16 => dequant_bf16_chunk ( & chunk_raw[ ..read_bytes] , & mut chunk_f32) ,
852+ GgmlType :: F16 => dequant_f16_chunk ( & chunk_raw[ ..read_bytes] , & mut chunk_f32) ,
853+ GgmlType :: F32 => dequant_f32_chunk ( & chunk_raw[ ..read_bytes] , & mut chunk_f32) ,
854+ _ => unreachable ! ( ) ,
783855 } ;
784- rows. push ( project_row_to_base17 ( & row_f32[ ..n_cols] ) ) ;
856+ let _ = f32_count; // == batch * n_cols
857+
858+ // SIMD project each row from the dequanted chunk
859+ for r in 0 ..batch {
860+ let start = r * n_cols;
861+ let end = start + n_cols;
862+ projected_rows. push ( project_row_to_base17_simd ( & chunk_f32[ start..end] ) ) ;
863+ }
864+
865+ rows_remaining -= batch;
785866 }
786867
787868 let ct = CompressedTensor {
@@ -790,7 +871,7 @@ pub fn stream_index_gguf_large<R: Read + Seek, W: Write>(
790871 original_shape : tensor. dimensions . clone ( ) ,
791872 n_rows,
792873 n_cols,
793- rows,
874+ rows : projected_rows ,
794875 } ;
795876
796877 let orig = ct. original_bytes ( ) as u64 ;
@@ -831,7 +912,7 @@ pub fn stream_index_gguf_large<R: Read + Seek, W: Write>(
831912 for r in 0 ..n_rows {
832913 let start = r * n_cols;
833914 let end = ( start + n_cols) . min ( data. len ( ) ) ;
834- rows. push ( project_row_to_base17 ( & data[ start..end] ) ) ;
915+ rows. push ( project_row_to_base17_simd ( & data[ start..end] ) ) ;
835916 }
836917
837918 let ct = CompressedTensor {
0 commit comments