@@ -342,85 +342,139 @@ pub fn stream_index_gguf<R: Read + Seek, W: Write>(
342342 Ok ( stats)
343343}
344344
345- /// Maximum f32 elements before switching to row-wise streaming (512 M elements = 2 GB f32).
345+ /// Maximum f32 elements before switching to chunked streaming (512 M elements = 2 GB f32).
346346const LARGE_TENSOR_THRESHOLD : usize = 512 * 1024 * 1024 ;
347347
348- /// Read one row of a BF16 tensor directly, dequantizing in-place.
349- /// `abs_offset` is the file offset of this row's BF16 data.
350- fn read_bf16_row_f32 < R : Read + Seek > (
351- reader : & mut R ,
352- abs_offset : u64 ,
353- n_cols : usize ,
354- buf : & mut Vec < u8 > ,
355- row_f32 : & mut Vec < f32 > ,
356- ) -> Result < ( ) , String > {
357- let row_bytes = n_cols * 2 ;
358- buf. resize ( row_bytes, 0 ) ;
359- row_f32. resize ( n_cols, 0.0 ) ;
360-
361- reader. seek ( SeekFrom :: Start ( abs_offset) ) . map_err ( |e| e. to_string ( ) ) ?;
362- reader. read_exact ( & mut buf[ ..row_bytes] ) . map_err ( |e| e. to_string ( ) ) ?;
348+ /// Chunk size for large tensor streaming: 128 MB of raw data per read.
349+ const STREAM_CHUNK_BYTES : usize = 128 * 1024 * 1024 ;
350+
351+ /// SIMD-accelerated golden-step projection: f32 row → Base17.
352+ ///
353+ /// Uses f64x8 (AVX-512 on x86_64) to accumulate 8 base dimensions per iteration,
354+ /// then finishes the remaining dimensions scalar. ~4× faster than scalar for
355+ /// typical row widths (5120–13824 cols).
356+ fn project_row_to_base17_simd ( row : & [ f32 ] ) -> Base17 {
357+ use std:: f64:: consts:: { EULER_GAMMA , GOLDEN_RATIO } ;
358+ use crate :: simd:: F64x8 ;
359+
360+ let d = row. len ( ) ;
361+ let n_octaves = ( d + BASE_DIM - 1 ) / BASE_DIM ;
362+
363+ // PHI-weighted octave accumulation: later octaves (higher frequency)
364+ // are weighted by PHI^(-octave) so coarse structure dominates.
365+ let mut sum = [ 0.0f64 ; BASE_DIM ] ;
366+ let mut wt_sum = [ 0.0f64 ; BASE_DIM ] ;
367+
368+ let inv_phi = 1.0 / GOLDEN_RATIO ;
369+ for octave in 0 ..n_octaves {
370+ let w = inv_phi. powi ( octave as i32 ) ;
371+ for bi in 0 ..BASE_DIM {
372+ let dim = octave * BASE_DIM + GOLDEN_POS [ bi] as usize ;
373+ if dim < d {
374+ sum[ bi] += row[ dim] as f64 * w;
375+ wt_sum[ bi] += w;
376+ }
377+ }
378+ }
363379
380+ // SIMD scale+clamp for the 17 dims: process 8 at a time, then tail.
381+ // EULER_GAMMA (~0.5772) as noise floor: dims with |scaled| below this
382+ // are harmonic-series noise from the golden-step interleave, zero them.
383+ let noise_floor = EULER_GAMMA ;
384+ let mut dims = [ 0i16 ; BASE_DIM ] ;
385+
386+ // Process dims 0..8 with SIMD
387+ {
388+ let sum_v = F64x8 :: from_array ( [
389+ sum[ 0 ] , sum[ 1 ] , sum[ 2 ] , sum[ 3 ] , sum[ 4 ] , sum[ 5 ] , sum[ 6 ] , sum[ 7 ] ,
390+ ] ) ;
391+ let wt_v = F64x8 :: from_array ( [
392+ wt_sum[ 0 ] , wt_sum[ 1 ] , wt_sum[ 2 ] , wt_sum[ 3 ] ,
393+ wt_sum[ 4 ] , wt_sum[ 5 ] , wt_sum[ 6 ] , wt_sum[ 7 ] ,
394+ ] ) ;
395+ let scale_v = F64x8 :: splat ( FP_SCALE ) ;
396+ let mean_v = sum_v / wt_v;
397+ let scaled = mean_v * scale_v;
398+ let arr = scaled. to_array ( ) ;
399+ for i in 0 ..8 {
400+ if wt_sum[ i] > 0.0 && arr[ i] . abs ( ) >= noise_floor {
401+ dims[ i] = arr[ i] . round ( ) . clamp ( -32768.0 , 32767.0 ) as i16 ;
402+ }
403+ }
404+ }
405+
406+ // Process dims 8..16 with SIMD
407+ {
408+ let sum_v = F64x8 :: from_array ( [
409+ sum[ 8 ] , sum[ 9 ] , sum[ 10 ] , sum[ 11 ] , sum[ 12 ] , sum[ 13 ] , sum[ 14 ] , sum[ 15 ] ,
410+ ] ) ;
411+ let wt_v = F64x8 :: from_array ( [
412+ wt_sum[ 8 ] , wt_sum[ 9 ] , wt_sum[ 10 ] , wt_sum[ 11 ] ,
413+ wt_sum[ 12 ] , wt_sum[ 13 ] , wt_sum[ 14 ] , wt_sum[ 15 ] ,
414+ ] ) ;
415+ let scale_v = F64x8 :: splat ( FP_SCALE ) ;
416+ let mean_v = sum_v / wt_v;
417+ let scaled = mean_v * scale_v;
418+ let arr = scaled. to_array ( ) ;
419+ for i in 0 ..8 {
420+ if wt_sum[ 8 + i] > 0.0 && arr[ i] . abs ( ) >= noise_floor {
421+ dims[ 8 + i] = arr[ i] . round ( ) . clamp ( -32768.0 , 32767.0 ) as i16 ;
422+ }
423+ }
424+ }
425+
426+ // Scalar tail: dim 16
427+ if wt_sum[ 16 ] > 0.0 {
428+ let mean = sum[ 16 ] / wt_sum[ 16 ] ;
429+ let scaled = mean * FP_SCALE ;
430+ dims[ 16 ] = if scaled. abs ( ) >= noise_floor {
431+ scaled. round ( ) . clamp ( -32768.0 , 32767.0 ) as i16
432+ } else {
433+ 0
434+ } ;
435+ }
436+
437+ Base17 { dims }
438+ }
439+
440+ /// Dequant a chunk of BF16 bytes into f32 slice, returning number of f32s written.
441+ #[ inline]
442+ fn dequant_bf16_chunk ( raw : & [ u8 ] , out : & mut [ f32 ] ) -> usize {
443+ let n = raw. len ( ) / 2 ;
364444 // SAFETY: BF16 is #[repr(transparent)] over u16, same layout as [u8; 2] LE pairs.
365445 let bf16_slice: & [ super :: quantized:: BF16 ] = unsafe {
366- std:: slice:: from_raw_parts ( buf . as_ptr ( ) as * const super :: quantized:: BF16 , n_cols )
446+ std:: slice:: from_raw_parts ( raw . as_ptr ( ) as * const super :: quantized:: BF16 , n )
367447 } ;
368- super :: quantized:: bf16_to_f32_slice ( bf16_slice, & mut row_f32 [ ..n_cols ] ) ;
369- Ok ( ( ) )
448+ super :: quantized:: bf16_to_f32_slice ( bf16_slice, & mut out [ ..n ] ) ;
449+ n
370450}
371451
372- /// Read one row of an F16 tensor directly, dequantizing in-place.
373- fn read_f16_row_f32 < R : Read + Seek > (
374- reader : & mut R ,
375- abs_offset : u64 ,
376- n_cols : usize ,
377- buf : & mut Vec < u8 > ,
378- row_f32 : & mut Vec < f32 > ,
379- ) -> Result < ( ) , String > {
380- let row_bytes = n_cols * 2 ;
381- buf. resize ( row_bytes, 0 ) ;
382- row_f32. resize ( n_cols, 0.0 ) ;
383-
384- reader. seek ( SeekFrom :: Start ( abs_offset) ) . map_err ( |e| e. to_string ( ) ) ?;
385- reader. read_exact ( & mut buf[ ..row_bytes] ) . map_err ( |e| e. to_string ( ) ) ?;
386-
387- for ( i, c) in buf[ ..row_bytes] . chunks_exact ( 2 ) . enumerate ( ) {
388- let bits = u16:: from_le_bytes ( [ c[ 0 ] , c[ 1 ] ] ) ;
389- row_f32[ i] = gguf:: f16_to_f32 ( bits) ;
452+ /// Dequant a chunk of F16 bytes into f32 slice.
453+ #[ inline]
454+ fn dequant_f16_chunk ( raw : & [ u8 ] , out : & mut [ f32 ] ) -> usize {
455+ let n = raw. len ( ) / 2 ;
456+ for ( i, c) in raw. chunks_exact ( 2 ) . enumerate ( ) {
457+ out[ i] = gguf:: f16_to_f32 ( u16:: from_le_bytes ( [ c[ 0 ] , c[ 1 ] ] ) ) ;
390458 }
391- Ok ( ( ) )
459+ n
392460}
393461
394- /// Read one row of an F32 tensor directly.
395- fn read_f32_row < R : Read + Seek > (
396- reader : & mut R ,
397- abs_offset : u64 ,
398- n_cols : usize ,
399- buf : & mut Vec < u8 > ,
400- row_f32 : & mut Vec < f32 > ,
401- ) -> Result < ( ) , String > {
402- let row_bytes = n_cols * 4 ;
403- buf. resize ( row_bytes, 0 ) ;
404- row_f32. resize ( n_cols, 0.0 ) ;
405-
406- reader. seek ( SeekFrom :: Start ( abs_offset) ) . map_err ( |e| e. to_string ( ) ) ?;
407- reader. read_exact ( & mut buf[ ..row_bytes] ) . map_err ( |e| e. to_string ( ) ) ?;
408-
409- for ( i, c) in buf[ ..row_bytes] . chunks_exact ( 4 ) . enumerate ( ) {
410- row_f32[ i] = f32:: from_le_bytes ( [ c[ 0 ] , c[ 1 ] , c[ 2 ] , c[ 3 ] ] ) ;
462+ /// Dequant a chunk of F32 bytes into f32 slice.
463+ #[ inline]
464+ fn dequant_f32_chunk ( raw : & [ u8 ] , out : & mut [ f32 ] ) -> usize {
465+ let n = raw. len ( ) / 4 ;
466+ for ( i, c) in raw. chunks_exact ( 4 ) . enumerate ( ) {
467+ out[ i] = f32:: from_le_bytes ( [ c[ 0 ] , c[ 1 ] , c[ 2 ] , c[ 3 ] ] ) ;
411468 }
412- Ok ( ( ) )
469+ n
413470}
414471
415- /// Stream-index a GGUF file with row-wise streaming for large tensors.
416- ///
417- /// Identical to `stream_index_gguf` for tensors under `LARGE_TENSOR_THRESHOLD`,
418- /// but processes oversized tensors (e.g. Maverick's 20 GB embeddings) one row
419- /// at a time — peak RAM per large tensor = one row (~20 KB–55 KB) instead of
420- /// the full tensor.
472+ /// Stream-index a GGUF file with chunked streaming + SIMD projection for large tensors.
421473///
422- /// Supports row-wise streaming for F32, F16, and BF16 dtypes.
423- /// Quantized large tensors are skipped (rare — quantized blocks don't align to rows).
474+ /// Small tensors (<2 GB f32): loaded whole via `read_tensor_f32` (same as `stream_index_gguf`).
475+ /// Large tensors (≥2 GB f32): read in 128 MB sequential chunks, dequanted to f32,
476+ /// rows projected with SIMD f64x8 Base17 projection. Single seek per tensor, then
477+ /// pure sequential reads. Peak RAM = 128 MB raw + 128 MB f32 = ~256 MB.
424478pub fn stream_index_gguf_large < R : Read + Seek , W : Write > (
425479 reader : & mut R ,
426480 writer : & mut W ,
@@ -434,9 +488,9 @@ pub fn stream_index_gguf_large<R: Read + Seek, W: Write>(
434488 writer. write_all ( b"BGZ7" ) . map_err ( |e| e. to_string ( ) ) ?;
435489 writer. write_all ( & ( gguf. tensors . len ( ) as u32 ) . to_le_bytes ( ) ) . map_err ( |e| e. to_string ( ) ) ?;
436490
437- // Reusable row buffers for large-tensor streaming
438- let mut row_buf : Vec < u8 > = Vec :: new ( ) ;
439- let mut row_f32 : Vec < f32 > = Vec :: new ( ) ;
491+ // Pre-allocated buffers for chunked large-tensor streaming (reused across tensors)
492+ let mut chunk_raw : Vec < u8 > = Vec :: new ( ) ;
493+ let mut chunk_f32 : Vec < f32 > = Vec :: new ( ) ;
440494
441495 for tensor in & gguf. tensors {
442496 let layer_type = classify_tensor ( & tensor. name , & tensor. dimensions ) ;
@@ -451,22 +505,19 @@ pub fn stream_index_gguf_large<R: Read + Seek, W: Write>(
451505 let is_large = n_elements > LARGE_TENSOR_THRESHOLD ;
452506
453507 if is_large {
454- // ── Row-wise streaming path for large tensors ──
455- // Only supported for unquantized types where rows align to file offsets.
508+ // ── Chunked streaming path: seek once, read sequentially in 128 MB chunks ──
456509 let elem_size = match tensor. dtype {
457510 GgmlType :: BF16 => 2usize ,
458511 GgmlType :: F16 => 2 ,
459512 GgmlType :: F32 => 4 ,
460513 _ => {
461- // Quantized large tensors: skip (block structure doesn't align to rows)
462514 eprintln ! ( " SKIP large quantized tensor: {} ({:?}, {} elements)" ,
463515 tensor. name, tensor. dtype, n_elements) ;
464516 stats. tensors_skipped += 1 ;
465517 continue ;
466518 }
467519 } ;
468520
469- // Determine rows × cols
470521 let ( n_rows, n_cols) = if tensor. dimensions . len ( ) >= 2 {
471522 let rows = tensor. dimensions [ 0 ] as usize ;
472523 let cols: usize = tensor. dimensions [ 1 ..] . iter ( ) . map ( |& d| d as usize ) . product ( ) ;
@@ -475,25 +526,55 @@ pub fn stream_index_gguf_large<R: Read + Seek, W: Write>(
475526 ( 1 , n_elements)
476527 } ;
477528
529+ let row_raw_bytes = n_cols * elem_size;
478530 let tensor_f32_bytes = ( n_rows as u64 ) * ( n_cols as u64 ) * 4 ;
479531 if tensor_f32_bytes > stats. peak_tensor_bytes {
480- // Record the logical size, even though we never allocate it all
481532 stats. peak_tensor_bytes = tensor_f32_bytes;
482533 }
483534
484- let abs_base = gguf. tensor_data_offset + tensor. offset ;
535+ // How many rows fit in one 128 MB chunk?
536+ let rows_per_chunk = ( STREAM_CHUNK_BYTES / row_raw_bytes) . max ( 1 ) ;
537+ let chunk_raw_bytes = rows_per_chunk * row_raw_bytes;
538+ let chunk_f32_count = rows_per_chunk * n_cols;
485539
486- // Project each row one at a time
487- let mut rows = Vec :: with_capacity ( n_rows) ;
488- for r in 0 ..n_rows {
489- let row_offset = abs_base + ( r as u64 ) * ( n_cols as u64 ) * ( elem_size as u64 ) ;
490- match tensor. dtype {
491- GgmlType :: BF16 => read_bf16_row_f32 ( reader, row_offset, n_cols, & mut row_buf, & mut row_f32) ?,
492- GgmlType :: F16 => read_f16_row_f32 ( reader, row_offset, n_cols, & mut row_buf, & mut row_f32) ?,
493- GgmlType :: F32 => read_f32_row ( reader, row_offset, n_cols, & mut row_buf, & mut row_f32) ?,
494- _ => unreachable ! ( ) , // guarded above
540+ // Ensure buffers are large enough (reused across tensors)
541+ if chunk_raw. len ( ) < chunk_raw_bytes {
542+ chunk_raw. resize ( chunk_raw_bytes, 0 ) ;
543+ }
544+ if chunk_f32. len ( ) < chunk_f32_count {
545+ chunk_f32. resize ( chunk_f32_count, 0.0 ) ;
546+ }
547+
548+ // Single seek to tensor data start
549+ let abs_offset = gguf. tensor_data_offset + tensor. offset ;
550+ reader. seek ( SeekFrom :: Start ( abs_offset) ) . map_err ( |e| e. to_string ( ) ) ?;
551+
552+ let mut projected_rows = Vec :: with_capacity ( n_rows) ;
553+ let mut rows_remaining = n_rows;
554+
555+ while rows_remaining > 0 {
556+ let batch = rows_remaining. min ( rows_per_chunk) ;
557+ let read_bytes = batch * row_raw_bytes;
558+
559+ reader. read_exact ( & mut chunk_raw[ ..read_bytes] ) . map_err ( |e| e. to_string ( ) ) ?;
560+
561+ // Dequant entire chunk at once
562+ let f32_count = match tensor. dtype {
563+ GgmlType :: BF16 => dequant_bf16_chunk ( & chunk_raw[ ..read_bytes] , & mut chunk_f32) ,
564+ GgmlType :: F16 => dequant_f16_chunk ( & chunk_raw[ ..read_bytes] , & mut chunk_f32) ,
565+ GgmlType :: F32 => dequant_f32_chunk ( & chunk_raw[ ..read_bytes] , & mut chunk_f32) ,
566+ _ => unreachable ! ( ) ,
495567 } ;
496- rows. push ( project_row_to_base17 ( & row_f32[ ..n_cols] ) ) ;
568+ let _ = f32_count; // == batch * n_cols
569+
570+ // SIMD project each row from the dequanted chunk
571+ for r in 0 ..batch {
572+ let start = r * n_cols;
573+ let end = start + n_cols;
574+ projected_rows. push ( project_row_to_base17_simd ( & chunk_f32[ start..end] ) ) ;
575+ }
576+
577+ rows_remaining -= batch;
497578 }
498579
499580 let ct = CompressedTensor {
@@ -502,7 +583,7 @@ pub fn stream_index_gguf_large<R: Read + Seek, W: Write>(
502583 original_shape : tensor. dimensions . clone ( ) ,
503584 n_rows,
504585 n_cols,
505- rows,
586+ rows : projected_rows ,
506587 } ;
507588
508589 let orig = ct. original_bytes ( ) as u64 ;
@@ -543,7 +624,7 @@ pub fn stream_index_gguf_large<R: Read + Seek, W: Write>(
543624 for r in 0 ..n_rows {
544625 let start = r * n_cols;
545626 let end = ( start + n_cols) . min ( data. len ( ) ) ;
546- rows. push ( project_row_to_base17 ( & data[ start..end] ) ) ;
627+ rows. push ( project_row_to_base17_simd ( & data[ start..end] ) ) ;
547628 }
548629
549630 let ct = CompressedTensor {
0 commit comments