Skip to content

Commit b00df16

Browse files
committed
fix: remove halftone interpolation — all 17 bins from real data
The halftone path only sampled 9 of 17 golden-step positions and interpolated the other 8 as neighbor averages. Those 8 bins carried no real signal — dead planes in the projection. Fix: stride over octaves (sample only every Nth octave, where N = octave_stride, and skip the octaves in between) but sample ALL 17 golden-step positions in each sampled octave. Every bin gets a direct measurement from actual weight values. Example: stride=16, 5120-col row: 19 octaves × 17 positions = 323 samples across all 17 bins (was 171 samples across 9 real + 8 fake bins). https://claude.ai/code/session_01HmdXNPit7QsTCfhJFef3Ee
1 parent eb93d0d commit b00df16

1 file changed

Lines changed: 25 additions & 73 deletions

File tree

src/hpc/gguf_indexer.rs

Lines changed: 25 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -283,58 +283,28 @@ pub fn project_8rows_bf16_simd(
283283
use crate::simd::F64x8;
284284

285285
let n_octaves = (n_cols + BASE_DIM - 1) / BASE_DIM;
286-
let use_halftone = octave_stride > 1;
287286

288287
let mut sums: [F64x8; BASE_DIM] = [F64x8::splat(0.0); BASE_DIM];
289288
let mut counts: [u32; BASE_DIM] = [0; BASE_DIM];
290289

291-
if use_halftone {
292-
let mut octave = 0;
293-
while octave < n_octaves {
294-
for hi in 0..9 {
295-
let col = octave * BASE_DIM + HALFTONE_POS[hi] as usize;
296-
if col < n_cols {
297-
let bin = HALFTONE_TO_BIN[hi] as usize;
298-
let offsets: [usize; 8] = [
299-
row_starts[0] + col, row_starts[1] + col,
300-
row_starts[2] + col, row_starts[3] + col,
301-
row_starts[4] + col, row_starts[5] + col,
302-
row_starts[6] + col, row_starts[7] + col,
303-
];
304-
sums[bin] += gather_bf16_x8(buf, &offsets);
305-
counts[bin] += 1;
306-
}
307-
}
308-
octave += octave_stride;
309-
}
310-
311-
// Interpolate odd bins from even neighbors (per-lane, still SIMD)
312-
for odd in (1..BASE_DIM).step_by(2) {
313-
let left = sums[odd - 1];
314-
let right = sums[(odd + 1) % BASE_DIM];
315-
let left_c = counts[odd - 1].max(1);
316-
let right_c = counts[(odd + 1) % BASE_DIM].max(1);
317-
let left_mean = left * F64x8::splat(1.0 / left_c as f64);
318-
let right_mean = right * F64x8::splat(1.0 / right_c as f64);
319-
sums[odd] = (left_mean + right_mean) * F64x8::splat(0.5);
320-
counts[odd] = 1;
321-
}
322-
} else {
323-
for octave in 0..n_octaves {
324-
for bi in 0..BASE_DIM {
325-
let col = octave * BASE_DIM + GOLDEN_POS[bi] as usize;
326-
if col < n_cols {
327-
let offsets: [usize; 8] = [
328-
row_starts[0] + col, row_starts[1] + col,
329-
row_starts[2] + col, row_starts[3] + col,
330-
row_starts[4] + col, row_starts[5] + col,
331-
row_starts[6] + col, row_starts[7] + col,
332-
];
333-
sums[bi] += gather_bf16_x8(buf, &offsets);
334-
counts[bi] += 1;
335-
}
290+
// All 17 golden-step positions per sampled octave. Stride skips octaves,
291+
// NOT positions — every bin gets real data from actual weight values.
292+
let mut octave = 0;
293+
while octave < n_octaves {
294+
for bi in 0..BASE_DIM {
295+
let col = octave * BASE_DIM + GOLDEN_POS[bi] as usize;
296+
if col < n_cols {
297+
let offsets: [usize; 8] = [
298+
row_starts[0] + col, row_starts[1] + col,
299+
row_starts[2] + col, row_starts[3] + col,
300+
row_starts[4] + col, row_starts[5] + col,
301+
row_starts[6] + col, row_starts[7] + col,
302+
];
303+
sums[bi] += gather_bf16_x8(buf, &offsets);
304+
counts[bi] += 1;
336305
}
337306
}
307+
octave += octave_stride;
338308
}
339309

340310
// Finalize: mean → scale → clamp → i16, all 8 lanes parallel
@@ -365,39 +335,21 @@ pub fn project_8rows_bf16_simd(
365335
pub fn project_1row_bf16_strided(row: &[u16], octave_stride: usize) -> Base17 {
366336
let d = row.len();
367337
let n_octaves = (d + BASE_DIM - 1) / BASE_DIM;
368-
let use_halftone = octave_stride > 1;
369338

370339
let mut sum = [0.0f64; BASE_DIM];
371340
let mut count = [0u32; BASE_DIM];
372341

373-
if use_halftone {
374-
let mut octave = 0;
375-
while octave < n_octaves {
376-
for hi in 0..9 {
377-
let col = octave * BASE_DIM + HALFTONE_POS[hi] as usize;
378-
if col < d {
379-
sum[HALFTONE_TO_BIN[hi] as usize] += bf16_to_f64(row[col]);
380-
count[HALFTONE_TO_BIN[hi] as usize] += 1;
381-
}
382-
}
383-
octave += octave_stride;
384-
}
385-
for odd in (1..BASE_DIM).step_by(2) {
386-
let lc = count[odd - 1].max(1) as f64;
387-
let rc = count[(odd + 1) % BASE_DIM].max(1) as f64;
388-
sum[odd] = (sum[odd - 1] / lc + sum[(odd + 1) % BASE_DIM] / rc) * 0.5;
389-
count[odd] = 1;
390-
}
391-
} else {
392-
for octave in 0..n_octaves {
393-
for bi in 0..BASE_DIM {
394-
let col = octave * BASE_DIM + GOLDEN_POS[bi] as usize;
395-
if col < d {
396-
sum[bi] += bf16_to_f64(row[col]);
397-
count[bi] += 1;
398-
}
342+
// All 17 positions per sampled octave — no halftone, all bins real
343+
let mut octave = 0;
344+
while octave < n_octaves {
345+
for bi in 0..BASE_DIM {
346+
let col = octave * BASE_DIM + GOLDEN_POS[bi] as usize;
347+
if col < d {
348+
sum[bi] += bf16_to_f64(row[col]);
349+
count[bi] += 1;
399350
}
400351
}
352+
octave += octave_stride;
401353
}
402354

403355
let mut dims = [0i16; BASE_DIM];

0 commit comments

Comments
 (0)