Skip to content

Commit ac7b44b

Browse files
authored
Merge pull request #25 from DKB0512/dkb/optimize-synthetic-embedding
perf(embedding): Optimize hash, addFeat and norm function
2 parents 6ac6cd4 + 57a2f7b commit ac7b44b

1 file changed

Lines changed: 30 additions & 10 deletions

File tree

backend/src/embedding/index.ts

Lines changed: 30 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -147,21 +147,36 @@ async function embedWithLocal(t: string, s: string): Promise<number[]> {
147147
}
148148

149149
const hash = (v: string) => {
150-
let h = 0x811c9dc5
151-
for (let i = 0; i < v.length; i++) h = Math.imul(h ^ v.charCodeAt(i), 16777619)
152-
return h >>> 0
150+
let h = 0x811c9dc5 | 0;
151+
const len = v.length | 0;
152+
for (let i = 0; i < len; i++) {
153+
h = Math.imul(h ^ v.charCodeAt(i), 16777619);
154+
}
155+
return h >>> 0;
153156
}
154157

155158
const addFeat = (vec: Float32Array, dim: number, key: string, w: number) => {
156-
const h = hash(key)
157-
vec[h % dim] += w * ((h & 1) ? -1 : 1)
159+
const h = hash(key);
160+
const value = w * (1 - ((h & 1) << 1));
161+
if ((dim > 0) && (dim & (dim - 1)) === 0) {
162+
vec[h & (dim - 1)] += value;
163+
} else {
164+
vec[h % dim] += value;
165+
}
158166
}
159167

160168
const norm = (vec: Float32Array) => {
161-
let n = 0
162-
for (let i = 0; i < vec.length; i++) n += vec[i] * vec[i]
163-
n = Math.sqrt(n)
164-
if (n) for (let i = 0; i < vec.length; i++) vec[i] /= n
169+
let n = 0;
170+
const len = vec.length;
171+
for (let i = 0; i < len; i++) {
172+
const v = vec[i];
173+
n += v * v;
174+
}
175+
if (n === 0) return;
176+
const invSqrt = 1 / Math.sqrt(n);
177+
for (let i = 0; i < len; i++) {
178+
vec[i] *= invSqrt;
179+
}
165180
}
166181

167182
function generateSyntheticEmbedding(t: string, s: string): number[] {
@@ -174,7 +189,11 @@ function generateSyntheticEmbedding(t: string, s: string): number[] {
174189
}
175190
const et = Array.from(addSynonymTokens(ct))
176191
const tc = new Map<string, number>()
177-
et.forEach(tok => tc.set(tok, (tc.get(tok) || 0) + 1))
192+
const etLength: number = et.length;
193+
for (let i = 0; i < etLength; i++) {
194+
const tok = et[i];
195+
tc.set(tok, (tc.get(tok) || 0) + 1)
196+
}
178197

179198
for (const [tok, c] of tc) {
180199
const w = Math.log(1 + c) + 1
@@ -196,6 +215,7 @@ const resizeVector = (v: number[], t: number) => {
196215
if (v.length > t) return v.slice(0, t)
197216
return [...v, ...Array(t - v.length).fill(0)]
198217
}
218+
199219
export async function embedMultiSector(id: string, text: string, sectors: string[], chunks?: Array<{ text: string }>): Promise<EmbeddingResult[]> {
200220
const r: EmbeddingResult[] = []
201221
await q.ins_log.run(id, 'multi-sector', 'pending', Date.now(), null)

0 commit comments

Comments
 (0)