|
| 1 | +use memmap2::Mmap; |
| 2 | + |
/// A Zero-Copy, Batch-Optimized SIFT1M Loader.
///
/// Efficiently loads SIFT/FVECS format vectors from a memory-mapped file.
/// Format: `[int32 dim] [float32 data...] [int32 dim] [float32 data...]`
///
/// # Lifetimes
/// - `'a`: The lifetime of the underlying `Mmap`. The returned batches leverage zero-copy
///   slicing and thus are tied to this lifetime.
pub struct SiftBatchLoader<'a> {
    // Backing memory map; all batch slices returned by `next_batch` borrow from it.
    mmap: &'a Mmap,
    // Byte offset into `mmap` where the first record starts (e.g. past a shard header).
    base_offset: usize,
    // Index of the next vector to hand out, counted in vectors (not bytes).
    cursor: usize,
    // Vector dimensionality, read from the first record's 4-byte little-endian header.
    dim: usize,
    // Number of whole records that fit between `base_offset` and the end of the mmap.
    total_vectors: usize,
    // Size in bytes of one record: 4 (dim header) + dim * 4 (f32 payload).
    vector_stride: usize,
}
| 19 | + |
| 20 | +impl<'a> SiftBatchLoader<'a> { |
| 21 | + /// Initialize a loader starting from the beginning of the mmap. |
| 22 | + pub fn new(mmap: &'a Mmap) -> Option<Self> { |
| 23 | + Self::with_offset(mmap, 0) |
| 24 | + } |
| 25 | + |
| 26 | + /// Initialize a loader starting from a specific byte offset. |
| 27 | + /// |
| 28 | + /// Useful for skipping headers or processing file shards. |
| 29 | + /// Returns `None` if the offset is out of bounds or the file is too small to contain even a header. |
| 30 | + pub fn with_offset(mmap: &'a Mmap, base_offset: usize) -> Option<Self> { |
| 31 | + if base_offset >= mmap.len() { |
| 32 | + return None; |
| 33 | + } |
| 34 | + |
| 35 | + // Need at least 4 bytes for dimension |
| 36 | + if mmap.len() - base_offset < 4 { |
| 37 | + return None; |
| 38 | + } |
| 39 | + |
| 40 | + // Read dim from [base_offset..base_offset+4] |
| 41 | + // SAFETY: Bounds checked above. |
| 42 | + let dim_slice = &mmap[base_offset..base_offset + 4]; |
| 43 | + let dim = u32::from_le_bytes(dim_slice.try_into().unwrap()) as usize; |
| 44 | + |
| 45 | + // Calculate stride: 4 bytes (dim header) + dim * 4 bytes (f32 data) |
| 46 | + let vector_stride = 4 + dim * 4; |
| 47 | + |
| 48 | + if vector_stride == 0 { |
| 49 | + return None; // Avoiding infinite loops on garbage data |
| 50 | + } |
| 51 | + |
| 52 | + // Calculate total vectors |
| 53 | + let available_bytes = mmap.len() - base_offset; |
| 54 | + let total_vectors = available_bytes / vector_stride; |
| 55 | + |
| 56 | + // Alignment check (Debug only) |
| 57 | + #[cfg(debug_assertions)] |
| 58 | + { |
| 59 | + if vector_stride % 16 != 0 { |
| 60 | + // Log warning or comment. Since we can't easily log in no_std/kernel easily without |
| 61 | + // bringing in `log` or `tracing` (which we have in workspace but maybe not here), |
| 62 | + // we'll just print to stderr if standard generic logging isn't set up. |
| 63 | + // Or better, just a comment here for future SIMD work. |
| 64 | + // println!("WARN: SIFT vector stride {} is not 16-byte aligned. SIMD loads may be unaligned.", vector_stride); |
| 65 | + } |
| 66 | + } |
| 67 | + |
| 68 | + Some(Self { |
| 69 | + mmap, |
| 70 | + base_offset, |
| 71 | + cursor: 0, |
| 72 | + dim, |
| 73 | + total_vectors, |
| 74 | + vector_stride, |
| 75 | + }) |
| 76 | + } |
| 77 | + |
| 78 | + /// Returns the dimension of vectors in this file. |
| 79 | + pub fn dim(&self) -> usize { |
| 80 | + self.dim |
| 81 | + } |
| 82 | + |
| 83 | + /// Returns the number of vectors available. |
| 84 | + pub fn len(&self) -> usize { |
| 85 | + self.total_vectors |
| 86 | + } |
| 87 | + |
| 88 | + /// Returns the next batch of raw bytes containing vectors. |
| 89 | + /// |
| 90 | + /// Returns `Option<(slice, count)>`. |
| 91 | + /// - `slice`: The raw byte slice containing the batch. |
| 92 | + /// - `count`: The number of vectors in this batch. |
| 93 | + pub fn next_batch(&mut self, batch_size: usize) -> Option<(&'a [u8], usize)> { |
| 94 | + if self.cursor >= self.total_vectors { |
| 95 | + return None; |
| 96 | + } |
| 97 | + |
| 98 | + let remaining = self.total_vectors - self.cursor; |
| 99 | + let count = std::cmp::min(batch_size, remaining); |
| 100 | + |
| 101 | + let start_idx = self.cursor; |
| 102 | + let _end_idx = start_idx + count; |
| 103 | + |
| 104 | + let byte_start = self.base_offset + (start_idx * self.vector_stride); |
| 105 | + let byte_len = count * self.vector_stride; |
| 106 | + let byte_end = byte_start + byte_len; |
| 107 | + |
| 108 | + // SAFETY: |
| 109 | + // 1. `base_offset` is validated in `new`. |
| 110 | + // 2. `total_vectors` is calculated based on `mmap.len()` and `vector_stride`. |
| 111 | + // 3. `cursor` is bounded by `total_vectors`. |
| 112 | + // 4. Therefore `byte_end` <= `mmap.len()`. |
| 113 | + let slice = &self.mmap[byte_start..byte_end]; |
| 114 | + |
| 115 | + self.cursor += count; |
| 116 | + |
| 117 | + Some((slice, count)) |
| 118 | + } |
| 119 | + |
| 120 | + /// Helper to parse a raw vector from a slice (skip the 4-byte header). |
| 121 | + /// Returns the f32 slice. |
| 122 | + pub fn parse_vector(data: &[u8]) -> &[f32] { |
| 123 | + let (_header, content) = data.split_at(4); |
| 124 | + // SAFETY: We assume the caller knows this is a valid SIFT record slice |
| 125 | + // generated by this loader. |
| 126 | + unsafe { |
| 127 | + std::slice::from_raw_parts( |
| 128 | + content.as_ptr() as *const f32, |
| 129 | + content.len() / 4 |
| 130 | + ) |
| 131 | + } |
| 132 | + } |
| 133 | +} |
| 134 | + |
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Write `count` fvecs records of dimension `dim`, preceded by `offset_bytes`
    /// of zero padding (simulating a foreign header the loader must skip).
    /// Returns the temp file plus the flat list of floats written, so tests can
    /// verify loader output value-for-value.
    fn create_mock_fvecs(dim: usize, count: usize, offset_bytes: usize) -> (NamedTempFile, Vec<f32>) {
        let mut file = NamedTempFile::new().unwrap();
        let mut all_floats = Vec::new();

        if offset_bytes > 0 {
            file.write_all(&vec![0u8; offset_bytes]).unwrap();
        }

        for i in 0..count {
            // Each record: little-endian i32 dim header followed by dim f32 values.
            file.write_all(&(dim as i32).to_le_bytes()).unwrap();
            for j in 0..dim {
                let val = (i * dim + j) as f32;
                all_floats.push(val);
                file.write_all(&val.to_le_bytes()).unwrap();
            }
        }
        file.flush().unwrap();
        (file, all_floats)
    }

    #[test]
    fn test_sift_loader_basic() {
        let dim = 4;
        let count = 10;
        let (file, expected_data) = create_mock_fvecs(dim, count, 0);

        let mmap = unsafe { Mmap::map(file.as_file()).unwrap() };

        let mut loader = SiftBatchLoader::new(&mmap).expect("Failed to create loader");
        assert_eq!(loader.dim(), dim);
        assert_eq!(loader.len(), count);

        let stride = 4 + dim * 4;
        let mut seen = Vec::new();

        // Read in batches of 3: expect 3, 3, 3, then a final short batch of 1.
        for expected_count in [3usize, 3, 3, 1] {
            let (slice, c) = loader.next_batch(3).unwrap();
            assert_eq!(c, expected_count);
            assert_eq!(slice.len(), expected_count * stride);

            // Verify the payloads, not just the sizes.
            for record in slice.chunks_exact(stride) {
                seen.extend_from_slice(SiftBatchLoader::parse_vector(record));
            }
        }

        assert_eq!(seen, expected_data);
        assert!(loader.next_batch(3).is_none());
    }

    #[test]
    fn test_sift_loader_offset() {
        let dim = 128;
        let count = 5;
        let offset = 123; // Arbitrary (and intentionally misaligned) offset
        let (file, _) = create_mock_fvecs(dim, count, offset);

        let mmap = unsafe { Mmap::map(file.as_file()).unwrap() };

        let mut loader = SiftBatchLoader::with_offset(&mmap, offset).expect("Failed with offset");
        assert_eq!(loader.dim(), dim);
        assert_eq!(loader.len(), count);

        // Asking for more than remain yields everything in one short batch.
        let (_, c) = loader.next_batch(100).unwrap();
        assert_eq!(c, 5);
        assert!(loader.next_batch(100).is_none());
    }
}