1+ use anyhow:: Result ;
2+ use memmap2:: Mmap ;
3+ use std:: fs:: File ;
4+ use std:: time:: Instant ;
5+ use valori_kernel:: adapters:: sift_batch:: SiftBatchLoader ;
6+ use bytemuck:: cast_slice; // The "Senior" way to cast types
7+
/// Scale factor for Q16.16 fixed-point conversion: 2^16 = 65536.
/// Multiplying an f32 by this and truncating to i32 yields 16.16 fixed point.
const Q16_SCALE: f32 = 65536.0;
9+
10+ fn main ( ) -> Result < ( ) > {
11+ println ! ( "🚀 Starting SIFT1M Granular Benchmark..." ) ;
12+
13+ let path = "data/sift/sift/sift_base.fvecs" ;
14+ let file = File :: open ( path) . expect ( "Failed to open SIFT file." ) ;
15+ let mmap = unsafe { Mmap :: map ( & file) ? } ;
16+
17+ // Initialize Loader
18+ let mut loader = SiftBatchLoader :: new ( & mmap)
19+ . ok_or_else ( || anyhow:: anyhow!( "Invalid SIFT format" ) ) ?;
20+
21+ let dim = loader. dim ( ) ;
22+ let total = loader. len ( ) ;
23+ let batch_size = 10_000 ;
24+
25+ // We calculate stride manually to inline the logic and avoid function overhead
26+ // Header (4B) + Data (dim * 4B)
27+ let stride = 4 + ( dim * 4 ) ;
28+
29+ println ! ( "📊 Dataset: {} Vectors | Dim: {}" , total, dim) ;
30+
31+ // ==========================================================
32+ // TEST 1: RAW I/O (Baseline)
33+ // ==========================================================
34+ println ! ( "\n Test 1: Raw Memory Bandwidth (No Parsing)..." ) ;
35+ loader = SiftBatchLoader :: new ( & mmap) . unwrap ( ) ; // Reset cursor
36+ let start_io = Instant :: now ( ) ;
37+ let mut bytes_checksum: u64 = 0 ;
38+
39+ while let Some ( ( raw_bytes, _count) ) = loader. next_batch ( batch_size) {
40+ // Force OS to page-in data by reading every byte.
41+ // We use a simple sum which the compiler can SIMD optimize,
42+ // ensuring we hit memory bandwidth limits, not CPU limits.
43+ let chunk_sum: u64 = raw_bytes. iter ( ) . map ( |& b| b as u64 ) . sum ( ) ;
44+ bytes_checksum = bytes_checksum. wrapping_add ( chunk_sum) ;
45+ }
46+ std:: hint:: black_box ( bytes_checksum) ; // Ensure calculation isn't deleted
47+
48+ let time_io = start_io. elapsed ( ) ;
49+ // approximate bytes read (total file size)
50+ let total_bytes = mmap. len ( ) ;
51+ println ! ( " -> Time: {:.4}s | {:.2} GB/s" ,
52+ time_io. as_secs_f64( ) ,
53+ ( total_bytes as f64 / 1_024.0 / 1_024.0 / 1_024.0 ) / time_io. as_secs_f64( )
54+ ) ;
55+
56+ // ==========================================================
57+ // TEST 2: PARSING COST (Bytemuck Cast)
58+ // ==========================================================
59+ println ! ( "\n Test 2: Structure Cost (Bytes -> &[f32])..." ) ;
60+ loader = SiftBatchLoader :: new ( & mmap) . unwrap ( ) ;
61+ let start_parse = Instant :: now ( ) ;
62+ let mut _check_parse: f32 = 0.0 ;
63+
64+ while let Some ( ( raw_bytes, count) ) = loader. next_batch ( batch_size) {
65+ for i in 0 ..count {
66+ let offset = i * stride;
67+ // Zero-Copy Slice: Skip 4 byte header, take the rest
68+ // Note: f32 requires 4-byte alignment. SIFT stride is (4 + 128*4) = 516.
69+ // 516 is divisible by 4, so address alignment is preserved!
70+ let vec_bytes = & raw_bytes[ offset + 4 .. offset + stride] ;
71+
72+ // bytemuck::cast_slice is SAFE. It checks alignment and length.
73+ // If this panics, your data is corrupt.
74+ let vec_f32: & [ f32 ] = cast_slice ( vec_bytes) ;
75+
76+
77+ // Sum ALL floats to ensure we read all memory, making this comparable to Test 1.
78+ for & val in vec_f32 {
79+ _check_parse += val;
80+ }
81+ }
82+ }
83+
84+ let time_parse = start_parse. elapsed ( ) ;
85+ println ! ( " - Checksum (f32): {:.2} (Ignore)" , _check_parse) ;
86+ println ! ( " -> Time: {:.4}s | Overhead: {:.4}s" ,
87+ time_parse. as_secs_f64( ) ,
88+ time_parse. checked_sub( time_io) . unwrap_or( std:: time:: Duration :: ZERO ) . as_secs_f64( )
89+ ) ;
90+
91+ // ==========================================================
92+ // TEST 3: MATH COST (f32 -> Q16.16)
93+ // ==========================================================
94+ println ! ( "\n Test 3: Determinism Cost (Math Ops)..." ) ;
95+ loader = SiftBatchLoader :: new ( & mmap) . unwrap ( ) ;
96+ let start_math = Instant :: now ( ) ;
97+ let mut check_math: i64 = 0 ;
98+
99+ while let Some ( ( raw_bytes, count) ) = loader. next_batch ( batch_size) {
100+ for i in 0 ..count {
101+ let offset = i * stride;
102+ let vec_bytes = & raw_bytes[ offset + 4 .. offset + stride] ;
103+ let vec_f32: & [ f32 ] = cast_slice ( vec_bytes) ;
104+
105+ // THE HOT LOOP
106+ for & val in vec_f32 {
107+ let fixed = ( val * Q16_SCALE ) as i32 ;
108+ check_math = check_math. wrapping_add ( fixed as i64 ) ;
109+ }
110+ }
111+ }
112+
113+ let time_math = start_math. elapsed ( ) ;
114+
115+ // Fix: Don't subtract Test 2 if Test 3 is faster (due to SIMD)
116+ // Just report the raw math time, which is the "Hot Cache" performance.
117+ println ! ( " -> Time: {:.4}s" , time_math. as_secs_f64( ) ) ;
118+
119+ println ! ( "--------------------------------------------------" ) ;
120+ println ! ( "📉 COST ANALYSIS:" ) ;
121+ println ! ( " - Cold I/O (Disk): {:.4}s" , time_io. as_secs_f64( ) ) ;
122+ println ! ( " - Hot Math (Memory): {:.4}s" , time_math. as_secs_f64( ) ) ;
123+ println ! ( "--------------------------------------------------" ) ;
124+
125+ let total_ops = total as f64 * dim as f64 ;
126+ println ! ( "⚡ Hot Throughput: {:.2} Billion ops/sec" ,
127+ total_ops / time_math. as_secs_f64( ) / 1_000_000_000.0
128+ ) ;
129+
130+ Ok ( ( ) )
131+ }