Skip to content

Commit 09e664f

Browse files
authored
Merge pull request #153 from zfaustk/fix-multilingual-dedup-tokenization
fix(openmemory): prevent multilingual simhash collisions across JS and Python
2 parents ca3a711 + 81d19af commit 09e664f

6 files changed

Lines changed: 187 additions & 6 deletions

File tree

packages/openmemory-js/src/memory/hsg.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import crypto from "node:crypto";
2-
import { canonical_token_set } from "../utils/text";
2+
import { canonical_token_set, stable_text_fallback_hash } from "../utils/text";
33
import { inc_q, dec_q, on_query_hit } from "./decay";
44
import { env, tier } from "../core/cfg";
55
import { cos_sim, buf_to_vec, vec_to_buf } from "../utils/index";
@@ -295,6 +295,9 @@ export function boosted_sim(s: number): number {
295295
}
296296
export function compute_simhash(text: string): string {
297297
const tokens = canonical_token_set(text);
298+
if (!tokens.size) {
299+
return stable_text_fallback_hash(text);
300+
}
298301
const hashes = Array.from(tokens).map((t) => {
299302
let h = 0;
300303
for (let i = 0; i < t.length; i++) {

packages/openmemory-js/src/utils/text.ts

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import crypto from "node:crypto";
2+
13
const syn_grps = [
24
["prefer", "like", "love", "enjoy", "favor"],
35
["theme", "mode", "style", "layout"],
@@ -32,13 +34,28 @@ const stem_rules: Array<[RegExp, string]> = [
3234
[/ed$/, ""],
3335
[/s$/, ""],
3436
];
35-
const tok_pat = /[a-z0-9]+/gi;
37+
const cjk_pat = /[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\u3040-\u30ff\uac00-\ud7af]+/u;
38+
const tok_pat = /[a-z0-9]+|[\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\u3040-\u30ff\uac00-\ud7af]+/giu;
39+
40+
const expand_cjk_token = (tok: string): string[] => {
41+
if (tok.length <= 1) return [tok];
42+
const expanded: string[] = [];
43+
for (let i = 0; i < tok.length - 1; i++) {
44+
expanded.push(tok.slice(i, i + 2));
45+
}
46+
return expanded;
47+
};
3648

3749
export const tokenize = (text: string): string[] => {
3850
const toks: string[] = [];
3951
let m: RegExpExecArray | null;
4052
while ((m = tok_pat.exec(text))) {
41-
toks.push(m[0].toLowerCase());
53+
const tok = m[0];
54+
if (cjk_pat.test(tok)) {
55+
toks.push(...expand_cjk_token(tok));
56+
continue;
57+
}
58+
toks.push(tok.toLowerCase());
4259
}
4360
return toks;
4461
};
@@ -102,6 +119,10 @@ export const canonical_token_set = (text: string): Set<string> => {
102119
return new Set(canonical_tokens_from_text(text));
103120
};
104121

122+
export const stable_text_fallback_hash = (text: string): string => {
123+
return crypto.createHash("blake2b512").update(text, "utf8").digest("hex").slice(0, 16);
124+
};
125+
105126
export const add_synonym_tokens = (toks: Iterable<string>): Set<string> => {
106127
const res = new Set<string>();
107128
for (const tok of toks) {
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import assert from "node:assert/strict";
2+
import { canonical_tokens_from_text, tokenize } from "../src/utils/text";
3+
import { compute_simhash } from "../src/memory/hsg";
4+
5+
const left = "我喜欢健身";
6+
const right = "我喜欢普洱茶";
7+
8+
assert.deepEqual(tokenize(right), ["我喜", "喜欢", "欢普", "普洱", "洱茶"]);
9+
10+
const leftTokens = canonical_tokens_from_text(left);
11+
const rightTokens = canonical_tokens_from_text(right);
12+
13+
assert.ok(leftTokens.length > 0);
14+
assert.ok(rightTokens.length > 0);
15+
assert.notDeepEqual(new Set(leftTokens), new Set(rightTokens));
16+
assert.notEqual(compute_simhash(left), compute_simhash(right));
17+
assert.notEqual(compute_simhash("!!!"), compute_simhash("???"));
18+
assert.equal(compute_simhash("!!!"), compute_simhash("!!!"));
19+
20+
console.log("test_multilingual_dedup.ts passed");

packages/openmemory-py/src/openmemory/memory/hsg.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from ..core.config import env
1313
from ..core.constants import SECTOR_CONFIGS
1414
from ..core.vector_store import vector_store as store
15-
from ..utils.text import canonical_token_set, canonical_tokens_from_text
15+
from ..utils.text import canonical_token_set, canonical_tokens_from_text, stable_text_fallback_hash
1616
from ..utils.chunking import chunk_text
1717
from ..utils.keyword import keyword_filter_memories, compute_keyword_overlap
1818
from ..utils.vectors import buf_to_vec, vec_to_buf, cos_sim
@@ -164,6 +164,8 @@ def boosted_sim(s: float) -> float:
164164

165165
def compute_simhash(text: str) -> str:
166166
tokens = canonical_token_set(text)
167+
if not tokens:
168+
return stable_text_fallback_hash(text)
167169
hashes = []
168170
for t in tokens:
169171
h = 0

packages/openmemory-py/src/openmemory/utils/text.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import re
2+
import hashlib
23
from typing import List, Set, Dict
34

45
SYN_GRPS = [
@@ -35,10 +36,24 @@
3536
(r"s$", ""),
3637
]
3738

38-
TOK_PAT = re.compile(r"[a-z0-9]+")
39+
CJK_PAT = r"\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\u3040-\u30ff\uac00-\ud7af"
40+
TOK_PAT = re.compile(rf"[a-z0-9]+|[{CJK_PAT}]+", re.I)
41+
42+
43+
def _expand_cjk_token(tok: str) -> List[str]:
44+
if len(tok) <= 1:
45+
return [tok]
46+
return [tok[i : i + 2] for i in range(len(tok) - 1)]
3947

4048
def tokenize(text: str) -> List[str]:
41-
return [m.lower() for m in TOK_PAT.findall(text)]
49+
res: List[str] = []
50+
for tok in TOK_PAT.findall(text):
51+
low = tok.lower()
52+
if re.fullmatch(rf"[{CJK_PAT}]+", tok):
53+
res.extend(_expand_cjk_token(tok))
54+
else:
55+
res.append(low)
56+
return res
4257

4358
def stem(tok: str) -> str:
4459
if len(tok) <= 3: return tok
@@ -85,3 +100,7 @@ def build_fts_query(text: str) -> str:
85100

86101
def canonical_token_set(text: str) -> Set[str]:
    """Deduplicated, order-insensitive set of canonical tokens for *text*."""
    return set(canonical_tokens_from_text(text))
103+
104+
105+
def stable_text_fallback_hash(text: str) -> str:
    """Deterministic 16-hex-char (8-byte) fingerprint for token-free text.

    Returns the first 16 hex chars of BLAKE2b-512, matching the JS side's
    crypto.createHash("blake2b512")....digest("hex").slice(0, 16).
    NOTE: blake2b(digest_size=8) would NOT match the JS output, because
    BLAKE2 folds the requested digest length into its parameter block, so
    an 8-byte digest is not a truncation of the 64-byte digest.
    """
    return hashlib.blake2b(text.encode("utf-8")).hexdigest()[:16]
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
from __future__ import annotations
2+
3+
import importlib.util
4+
import sys
5+
import types
6+
from pathlib import Path
7+
8+
9+
ROOT = Path(__file__).resolve().parents[1] / "src" / "openmemory"
10+
11+
12+
def _ensure_pkg(name: str) -> types.ModuleType:
13+
mod = sys.modules.get(name)
14+
if mod is None:
15+
mod = types.ModuleType(name)
16+
mod.__path__ = [] # type: ignore[attr-defined]
17+
sys.modules[name] = mod
18+
return mod
19+
20+
21+
def _stub_module(name: str, **attrs: object) -> None:
22+
mod = types.ModuleType(name)
23+
for key, value in attrs.items():
24+
setattr(mod, key, value)
25+
sys.modules[name] = mod
26+
27+
28+
def _load_module(name: str, path: Path):
29+
spec = importlib.util.spec_from_file_location(name, path)
30+
module = importlib.util.module_from_spec(spec)
31+
assert spec and spec.loader
32+
sys.modules[name] = module
33+
spec.loader.exec_module(module)
34+
return module
35+
36+
37+
def _load_text_and_hsg():
38+
_ensure_pkg("openmemory")
39+
_ensure_pkg("openmemory.utils")
40+
_ensure_pkg("openmemory.memory")
41+
_ensure_pkg("openmemory.core")
42+
_ensure_pkg("openmemory.ops")
43+
44+
text = _load_module("openmemory.utils.text", ROOT / "utils" / "text.py")
45+
46+
_stub_module("openmemory.core.db", q=None, db=None, transaction=lambda: None)
47+
_stub_module("openmemory.core.config", env=types.SimpleNamespace())
48+
_stub_module("openmemory.core.constants", SECTOR_CONFIGS={})
49+
_stub_module("openmemory.core.vector_store", vector_store=None)
50+
_stub_module("openmemory.utils.chunking", chunk_text=lambda *args, **kwargs: [])
51+
_stub_module(
52+
"openmemory.utils.keyword",
53+
keyword_filter_memories=lambda *args, **kwargs: [],
54+
compute_keyword_overlap=lambda *args, **kwargs: 0.0,
55+
)
56+
_stub_module(
57+
"openmemory.utils.vectors",
58+
buf_to_vec=lambda *args, **kwargs: [],
59+
vec_to_buf=lambda *args, **kwargs: b"",
60+
cos_sim=lambda *args, **kwargs: 0.0,
61+
)
62+
_stub_module(
63+
"openmemory.memory.embed",
64+
embed_multi_sector=lambda *args, **kwargs: {},
65+
embed_for_sector=lambda *args, **kwargs: [],
66+
calc_mean_vec=lambda *args, **kwargs: [],
67+
)
68+
_stub_module(
69+
"openmemory.memory.decay",
70+
inc_q=lambda *args, **kwargs: None,
71+
dec_q=lambda *args, **kwargs: None,
72+
on_query_hit=lambda *args, **kwargs: None,
73+
calc_recency_score=lambda *args, **kwargs: 0.0,
74+
pick_tier=lambda *args, **kwargs: "cold",
75+
)
76+
_stub_module(
77+
"openmemory.ops.dynamics",
78+
calculateCrossSectorResonanceScore=lambda *args, **kwargs: 0.0,
79+
applyRetrievalTraceReinforcementToMemory=lambda *args, **kwargs: None,
80+
propagateAssociativeReinforcementToLinkedNodes=lambda *args, **kwargs: None,
81+
)
82+
_stub_module("openmemory.memory.user_summary", update_user_summary=lambda *args, **kwargs: None)
83+
84+
hsg = _load_module("openmemory.memory.hsg", ROOT / "memory" / "hsg.py")
85+
return text, hsg
86+
87+
88+
TEXT, HSG = _load_text_and_hsg()


def test_tokenize_expands_cjk_bigrams():
    # CJK text becomes overlapping character bigrams, not one opaque run.
    assert TEXT.tokenize("我喜欢普洱茶") == ["我喜", "喜欢", "欢普", "普洱", "洱茶"]


def test_canonical_tokens_keep_distinct_chinese_content():
    gym_tokens = TEXT.canonical_tokens_from_text("我喜欢健身")
    tea_tokens = TEXT.canonical_tokens_from_text("我喜欢普洱茶")

    assert gym_tokens
    assert tea_tokens
    assert set(gym_tokens) != set(tea_tokens)


def test_compute_simhash_avoids_constant_hash_for_distinct_chinese_inputs():
    gym_hash = HSG.compute_simhash("我喜欢健身")
    tea_hash = HSG.compute_simhash("我喜欢普洱茶")

    assert gym_hash != tea_hash


def test_compute_simhash_uses_stable_fallback_when_tokenizer_finds_nothing():
    # Token-free inputs hit the fallback hash: deterministic per input,
    # distinct across different inputs.
    bang_hash = HSG.compute_simhash("!!!")
    question_hash = HSG.compute_simhash("???")

    assert bang_hash != question_hash
    assert bang_hash == HSG.compute_simhash("!!!")

0 commit comments

Comments
 (0)