Skip to content

Commit 87fa3ce

Browse files
committed
Make search results deterministic with a rowIndex tie-break
1 parent 40e1a22 commit 87fa3ce

3 files changed

Lines changed: 97 additions & 8 deletions

File tree

src/search/heap.js

Lines changed: 28 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,22 @@ export function isBetter(a, b, metric) {
3232
return a > b
3333
}
3434

35+
/**
 * Total order on scored entries: better score wins; ties broken by lower
 * rowIndex. The tiebreak makes selection independent of the order chunks
 * arrive (parallel reads complete in nondeterministic order), so identical
 * queries always return identical results.
 *
 * @param {{ rowIndex: number, score: number }} a
 * @param {{ rowIndex: number, score: number }} b
 * @param {DistanceMetric} metric
 * @returns {boolean} true when `a` ranks strictly ahead of `b`
 */
function betterEntry(a, b, metric) {
  // Scores differ: the metric alone decides which entry is better.
  if (a.score !== b.score) return isBetter(a.score, b.score, metric)
  // Exact score tie: the entry with the lower rowIndex wins.
  return a.rowIndex < b.rowIndex
}
50+
3551
/**
3652
* Bounded heap by score (uses metric to decide "better"). Linear-scan to
3753
* find the worst entry; fine for the small topK we care about.
@@ -48,17 +64,19 @@ export function pushHeap(heap, candidate, topK, metric) {
4864
}
4965
let worstIdx = 0
5066
for (let i = 1; i < heap.length; i += 1) {
51-
if (isBetter(heap[worstIdx].score, heap[i].score, metric)) {
67+
if (betterEntry(heap[worstIdx], heap[i], metric)) {
5268
worstIdx = i
5369
}
5470
}
55-
if (isBetter(candidate.score, heap[worstIdx].score, metric)) {
71+
if (betterEntry(candidate, heap[worstIdx], metric)) {
5672
heap[worstIdx] = candidate
5773
}
5874
}
5975

6076
/**
61-
* Bounded heap for Hamming candidates (lower hamming is better).
77+
* Bounded heap for Hamming candidates (lower hamming is better). Ties on
78+
* hamming are broken by lower rowIndex so the kept candidate set does not
79+
* depend on the (parallel, nondeterministic) order chunks are scored.
6280
*
6381
* @param {{ rowIndex: number, hamming: number }[]} heap
6482
* @param {{ rowIndex: number, hamming: number }} candidate
@@ -71,21 +89,24 @@ export function pushHammingHeap(heap, candidate, candidatesK) {
7189
}
7290
let worstIdx = 0
7391
for (let i = 1; i < heap.length; i += 1) {
74-
if (heap[i].hamming > heap[worstIdx].hamming) worstIdx = i
92+
const h = heap[i]; const w = heap[worstIdx]
93+
if (h.hamming > w.hamming || (h.hamming === w.hamming && h.rowIndex > w.rowIndex)) worstIdx = i
7594
}
76-
if (candidate.hamming < heap[worstIdx].hamming) {
95+
const w = heap[worstIdx]
96+
if (candidate.hamming < w.hamming || (candidate.hamming === w.hamming && candidate.rowIndex < w.rowIndex)) {
7797
heap[worstIdx] = candidate
7898
}
7999
}
80100

81101
/**
 * Sort results best-first under the chosen metric, ties broken by lower
 * rowIndex for a deterministic order. The input array is not mutated; a
 * sorted copy is returned.
 *
 * @param {{ rowIndex: number, score: number }[]} results
 * @param {DistanceMetric} metric
 * @returns {{ rowIndex: number, score: number }[]}
 */
export function sortHeap(results, metric) {
  // 'euclidean' sorts ascending (lower score first); every other metric
  // sorts descending (higher score first).
  const dir = metric === 'euclidean' ? 1 : -1
  return results.slice().sort((a, b) => {
    const byScore = dir * (a.score - b.score)
    // `||` falls through when byScore is 0, -0 or NaN, so tied scores are
    // ordered by ascending rowIndex.
    return byScore || a.rowIndex - b.rowIndex
  })
}

src/search/rerank.js

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,9 @@ export async function searchRerank({ file, metadata, meta, queryF32, scoringMetr
118118
}))
119119

120120
const dir = reportedMetric === 'euclidean' ? 1 : -1
121-
scored.sort((a, b) => dir * (a.score - b.score))
121+
// Tie-break by rowIndex so the winners are independent of the order the
122+
// parallel candidate reads completed in (deterministic results).
123+
scored.sort((a, b) => dir * (a.score - b.score) || a.rowIndex - b.rowIndex)
122124
const winners = scored.slice(0, topK)
123125

124126
// Phase 3: fetch ids for just the top-K winners.

test/heap.test.js

Lines changed: 66 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,66 @@
1+
import { describe, expect, it } from 'vitest'
2+
import { pushHammingHeap, pushHeap, sortHeap } from '../src/search/heap.js'
3+
4+
/**
 * Run a sequence of candidates through a bounded heap and return the kept
 * rowIndices, sorted ascending for comparison.
 * @param {{ rowIndex: number, hamming: number }[]} candidates
 * @param {number} k
 * @returns {number[]}
 */
function keepHamming(candidates, k) {
  /** @type {{ rowIndex: number, hamming: number }[]} */
  const heap = []
  for (const c of candidates) pushHammingHeap(heap, c, k)
  // Sort numerically so assertions do not depend on the heap's slot order.
  return heap.map(e => e.rowIndex).sort((a, b) => a - b)
}
17+
18+
describe('heap tie-breaking is deterministic', () => {
  it('pushHammingHeap keeps the lowest rowIndices when hamming ties, regardless of insertion order', () => {
    // Five rows all at the same hamming distance; keep 3 -> must keep rows 0,1,2.
    const rows = [0, 1, 2, 3, 4].map(rowIndex => ({ rowIndex, hamming: 7 }))
    const forward = keepHamming(rows, 3)
    const reversed = keepHamming([...rows].reverse(), 3)
    const shuffled = keepHamming([rows[3], rows[0], rows[4], rows[2], rows[1]], 3)
    expect(forward).toEqual([0, 1, 2])
    expect(reversed).toEqual([0, 1, 2])
    expect(shuffled).toEqual([0, 1, 2])
  })

  it('pushHammingHeap prefers strictly nearer candidates over ties', () => {
    const rows = [
      { rowIndex: 5, hamming: 2 },
      { rowIndex: 9, hamming: 9 },
      { rowIndex: 1, hamming: 9 },
      { rowIndex: 8, hamming: 1 },
    ]
    // keep 2 -> the two nearest by hamming (1 then 2): rows 8 and 5.
    expect(keepHamming(rows, 2)).toEqual([5, 8])
    expect(keepHamming([...rows].reverse(), 2)).toEqual([5, 8])
  })

  it('pushHeap keeps best score, breaking ties by lower rowIndex (order-independent)', () => {
    // cosine: higher score is better. Tied at 0.9 -> keep lower rowIndices.
    const rows = [0, 1, 2, 3].map(rowIndex => ({ rowIndex, score: 0.9 }))
    /** @type {{ rowIndex: number, score: number }[]} */
    const a = []
    for (const c of rows) pushHeap(a, c, 2, 'cosine')
    /** @type {{ rowIndex: number, score: number }[]} */
    const b = []
    for (const c of [...rows].reverse()) pushHeap(b, c, 2, 'cosine')
    expect(a.map(e => e.rowIndex).sort((x, y) => x - y)).toEqual([0, 1])
    expect(b.map(e => e.rowIndex).sort((x, y) => x - y)).toEqual([0, 1])
  })

  it('sortHeap orders tied scores by ascending rowIndex', () => {
    const results = [
      { rowIndex: 4, score: 0.5 },
      { rowIndex: 1, score: 0.9 },
      { rowIndex: 7, score: 0.9 },
      { rowIndex: 2, score: 0.9 },
    ]
    expect(sortHeap(results, 'cosine').map(e => e.rowIndex)).toEqual([1, 2, 7, 4])
    // euclidean: lower score is better, ties still ascend by rowIndex.
    expect(sortHeap(results, 'euclidean').map(e => e.rowIndex)).toEqual([4, 1, 2, 7])
  })
})

0 commit comments

Comments
 (0)