Skip to content

Commit 5b98eee

Browse files
committed
perf: use BF16 SIMD batch converter for GGUF dequantization
Replace the scalar bf16_to_f32 loop with the quantized::bf16_to_f32_slice batch path. BF16 has the same representation as u16 (it is a transparent wrapper), so the raw bytes are reinterpreted zero-copy as a BF16 slice and then batch-converted to f32. https://claude.ai/code/session_01Y69Vnw751w75iVSBRws7o7
1 parent c21572b commit 5b98eee

1 file changed

Lines changed: 11 additions & 6 deletions

File tree

src/hpc/gguf.rs

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -215,12 +215,17 @@ pub fn read_tensor_f32<R: Read + Seek>(
215215
GgmlType::BF16 => {
216216
let mut buf = vec![0u8; n_elements * 2];
217217
reader.read_exact(&mut buf).map_err(|e| e.to_string())?;
218-
Ok(buf.chunks_exact(2)
219-
.map(|c| {
220-
let bits = u16::from_le_bytes([c[0], c[1]]);
221-
bf16_to_f32(bits)
222-
})
223-
.collect())
218+
// Reinterpret u8 pairs as BF16 (same repr) and batch-convert via quantized.rs
219+
// SAFETY: BF16 is #[repr(transparent)] over u16, same layout as [u8; 2] LE pairs.
220+
let bf16_slice: &[super::quantized::BF16] = unsafe {
221+
std::slice::from_raw_parts(
222+
buf.as_ptr() as *const super::quantized::BF16,
223+
n_elements,
224+
)
225+
};
226+
let mut result = vec![0.0f32; n_elements];
227+
super::quantized::bf16_to_f32_slice(bf16_slice, &mut result);
228+
Ok(result)
224229
}
225230
GgmlType::Q8_0 => {
226231
dequantize_q8_0(reader, n_elements)

0 commit comments

Comments
 (0)