Skip to content
This repository was archived by the owner on Jun 3, 2026. It is now read-only.

Commit 2601823

Browse files
committed
Add memory lifecycle metadata support
1 parent d1090f2 commit 2601823

3 files changed

Lines changed: 415 additions & 14 deletions

File tree

src/storage/local.py

Lines changed: 184 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,22 @@
1313
import sqlite3
1414
import uuid
1515
from pathlib import Path
16-
from typing import Any, Dict, List, Optional, Sequence, Tuple
16+
from typing import Any, Dict, List, Optional, Sequence
1717

1818
from src.config import settings
1919
from src.storage.base import BaseVectorStore, IndexStats, SearchResult
20+
from src.storage.memory_lifecycle import (
21+
CONTENT_HASH_KEY,
22+
FORGET_REASON_KEY,
23+
FORGOTTEN_AT_KEY,
24+
IS_CURRENT_KEY,
25+
PARENT_MEMORY_ID_KEY,
26+
VERSION_KEY,
27+
build_lifecycle_metadata,
28+
compute_memory_hash,
29+
is_retrievable_memory,
30+
utc_now_iso,
31+
)
2032
from src.utils.exceptions import VectorStoreValidationError
2133

2234

@@ -43,6 +55,9 @@ def _cosine_similarity(a: Sequence[float], b: Sequence[float]) -> float:
4355
return max(0.0, min(1.0, (dot / (norm_a * norm_b) + 1.0) / 2.0))
4456

4557

58+
# Metadata keys that _find_current_by_hash uses to scope duplicate detection.
_DEDUP_SCOPE_KEYS = ("user_id", "tenant_id", "org_id", "workspace_id", "project_id")
59+
60+
4661
class SQLiteVectorStore(BaseVectorStore):
4762
"""Small embedded vector store for single-user local testing.
4863
@@ -101,16 +116,29 @@ def add(
101116

102117
ids = ids or [str(uuid.uuid4()) for _ in texts]
103118
metadata = metadata or [{} for _ in texts]
104-
rows = [
105-
(
106-
self._namespace,
107-
vec_id,
108-
text,
109-
json.dumps([float(v) for v in embedding]),
110-
json.dumps(meta or {}),
119+
output_ids: List[str] = []
120+
rows = []
121+
for text, embedding, vec_id, meta in zip(texts, embeddings, ids, metadata):
122+
lifecycle_meta = build_lifecycle_metadata(text, meta)
123+
existing_id = self._find_current_by_hash(
124+
lifecycle_meta[CONTENT_HASH_KEY],
125+
lifecycle_meta,
111126
)
112-
for text, embedding, vec_id, meta in zip(texts, embeddings, ids, metadata)
113-
]
127+
if existing_id:
128+
output_ids.append(existing_id)
129+
continue
130+
output_ids.append(vec_id)
131+
rows.append(
132+
(
133+
self._namespace,
134+
vec_id,
135+
text,
136+
json.dumps([float(v) for v in embedding]),
137+
json.dumps(lifecycle_meta),
138+
)
139+
)
140+
if not rows:
141+
return output_ids
114142
self._conn.executemany(
115143
"""
116144
INSERT INTO xmem_vectors(namespace, id, content, embedding, metadata)
@@ -124,7 +152,7 @@ def add(
124152
rows,
125153
)
126154
self._conn.commit()
127-
return ids
155+
return output_ids
128156

129157
def search(
130158
self,
@@ -145,7 +173,7 @@ def search(
145173
results: List[SearchResult] = []
146174
for row in rows:
147175
meta = json.loads(row["metadata"] or "{}")
148-
if not _metadata_matches(meta, filters):
176+
if not is_retrievable_memory(meta) or not _metadata_matches(meta, filters):
149177
continue
150178
embedding = json.loads(row["embedding"])
151179
results.append(
@@ -175,6 +203,8 @@ def update(
175203
return False
176204
current_meta = json.loads(row["metadata"] or "{}")
177205
current_meta.update(metadata or {})
206+
new_text = text if text is not None else row["content"]
207+
current_meta[CONTENT_HASH_KEY] = compute_memory_hash(new_text)
178208
new_embedding = embedding if embedding is not None else json.loads(row["embedding"])
179209
if len(new_embedding) != self._dimension:
180210
raise VectorStoreValidationError(
@@ -188,7 +218,7 @@ def update(
188218
WHERE namespace = ? AND id = ?
189219
""",
190220
(
191-
text if text is not None else row["content"],
221+
new_text,
192222
json.dumps([float(v) for v in new_embedding]),
193223
json.dumps(current_meta),
194224
self._namespace,
@@ -198,6 +228,115 @@ def update(
198228
self._conn.commit()
199229
return True
200230

231+
def add_version(
    self,
    parent_id: str,
    text: str,
    embedding: List[float],
    id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Optional[str]:
    """Supersede ``parent_id`` with a new current version holding ``text``.

    The parent row is kept but its metadata is flagged as no longer current.
    The new row records the lineage root (the parent's own parent id when it
    has one, otherwise ``parent_id``) and a version one above the parent's.

    Args:
        parent_id: Id of the memory being superseded.
        text: Content of the new version.
        embedding: Vector for ``text``; its length must equal the store dimension.
        id: Optional id for the new row; a UUID4 string is generated when omitted.
        metadata: Caller metadata merged into the new row's lifecycle metadata.

    Returns:
        ``None`` when the parent does not exist; the id of an already-current
        record with the same content hash when one is found; otherwise the id
        of the newly inserted version.

    Raises:
        VectorStoreValidationError: If ``embedding`` has the wrong dimension.
    """
    found = self.get([parent_id])
    if not found:
        return None
    if len(embedding) != self._dimension:
        raise VectorStoreValidationError(
            f"Embedding dimension {len(embedding)} doesn't match {self._dimension}",
            operation="add_version",
        )

    parent_meta = dict(found[0]["metadata"] or {})
    lineage_root = parent_meta.get(PARENT_MEMORY_ID_KEY) or parent_id
    version_number = int(parent_meta.get(VERSION_KEY) or 1) + 1
    new_id = id or str(uuid.uuid4())
    new_meta = build_lifecycle_metadata(
        text,
        metadata,
        parent_memory_id=lineage_root,
        version=version_number,
        is_current=True,
    )

    duplicate_id = self._find_current_by_hash(new_meta[CONTENT_HASH_KEY], new_meta)
    if duplicate_id and duplicate_id == parent_id:
        # The parent already holds this exact content; nothing to supersede.
        return duplicate_id

    # In every remaining case the parent stops being the current version.
    parent_meta[IS_CURRENT_KEY] = False
    with self._conn:
        self._conn.execute(
            """
            UPDATE xmem_vectors
            SET metadata = ?, updated_at = CURRENT_TIMESTAMP
            WHERE namespace = ? AND id = ?
            """,
            (json.dumps(parent_meta), self._namespace, parent_id),
        )
        if not duplicate_id:
            # No other current record carries this content, so store the new version.
            self._conn.execute(
                """
                INSERT INTO xmem_vectors(namespace, id, content, embedding, metadata)
                VALUES (?, ?, ?, ?, ?)
                """,
                (
                    self._namespace,
                    new_id,
                    text,
                    json.dumps([float(v) for v in embedding]),
                    json.dumps(new_meta),
                ),
            )
    return duplicate_id if duplicate_id else new_id
301+
302+
def forget(
    self,
    ids: List[str],
    reason: Optional[str] = None,
    hard_delete: bool = False,
) -> bool:
    """Soft-forget memories by default, preserving audit history.

    A soft forget keeps each row but rewrites its metadata: the record is
    flagged as not current and stamped with a single shared UTC timestamp and
    the supplied ``reason``. Ids that do not exist in this namespace are
    ignored.

    Args:
        ids: Ids of the memories to forget.
        reason: Optional explanation stored alongside the forget timestamp.
        hard_delete: When true, delegate to ``delete`` instead of soft-forgetting.

    Returns:
        The result of ``delete(ids)`` for a hard delete, otherwise ``True``.
    """
    if hard_delete:
        return self.delete(ids)
    if not ids:
        return True

    placeholders = ",".join("?" for _ in ids)
    rows = self._conn.execute(
        f"SELECT id, metadata FROM xmem_vectors "
        f"WHERE namespace = ? AND id IN ({placeholders})",
        [self._namespace, *ids],
    ).fetchall()
    if not rows:
        return True

    # One timestamp for the whole batch so the rows forgotten together agree.
    now = utc_now_iso()
    updates = []
    for row in rows:
        meta = json.loads(row["metadata"] or "{}")
        meta[IS_CURRENT_KEY] = False
        meta[FORGOTTEN_AT_KEY] = now
        meta[FORGET_REASON_KEY] = reason
        updates.append((json.dumps(meta), self._namespace, row["id"]))

    # Run the batch inside the connection's transaction context: it commits on
    # success and rolls back on error, so a failure part-way through cannot
    # leave some memories forgotten and pending for a later, unrelated commit.
    with self._conn:
        self._conn.executemany(
            "UPDATE xmem_vectors SET metadata = ?, updated_at = CURRENT_TIMESTAMP "
            "WHERE namespace = ? AND id = ?",
            updates,
        )
    return True
339+
201340
def delete(self, ids: List[str]) -> bool:
202341
if not ids:
203342
return True
@@ -239,10 +378,41 @@ def search_by_metadata(
239378
results: List[SearchResult] = []
240379
for row in rows:
241380
meta = json.loads(row["metadata"] or "{}")
242-
if _metadata_matches(meta, filters):
381+
if is_retrievable_memory(meta) and _metadata_matches(meta, filters):
243382
results.append(SearchResult(id=row["id"], content=row["content"], score=1.0, metadata=meta))
244383
return results[:top_k]
245384

385+
def _find_current_by_hash(
    self,
    content_hash: str,
    metadata: Optional[Dict[str, Any]] = None,
) -> Optional[str]:
    """Return the id of a current, non-forgotten record with the same hash and scope.

    A candidate must live in this store's namespace, carry ``content_hash``,
    be flagged current, and have no forget timestamp. It must also agree with
    ``metadata`` on every key in ``_DEDUP_SCOPE_KEYS``: a key supplied with a
    non-``None`` value must be equal, and a key that is missing or ``None``
    must be missing or JSON ``null`` on the stored record.

    Args:
        content_hash: Hash of the normalized memory content to look for.
        metadata: Metadata of the incoming memory; only the scope keys are read.

    Returns:
        The id of one matching record, or ``None`` when there is none.
    """
    meta = metadata or {}
    clauses = [
        "namespace = ?",
        f"json_extract(metadata, '$.{CONTENT_HASH_KEY}') = ?",
        f"json_extract(metadata, '$.{IS_CURRENT_KEY}') = 1",
        f"json_extract(metadata, '$.{FORGOTTEN_AT_KEY}') IS NULL",
    ]
    params: List[Any] = [self._namespace, content_hash]

    # Constrain every scope key individually so a lookup never matches a
    # record from a different scope. json_extract yields SQL NULL for both a
    # missing key and a stored JSON null, so a record saved with an explicit
    # None scope value still dedupes against an unscoped lookup.
    for key in _DEDUP_SCOPE_KEYS:
        value = meta.get(key)
        if value is None:
            clauses.append(f"json_extract(metadata, '$.{key}') IS NULL")
        else:
            clauses.append(f"json_extract(metadata, '$.{key}') = ?")
            params.append(value)

    row = self._conn.execute(
        f"SELECT id FROM xmem_vectors WHERE {' AND '.join(clauses)} LIMIT 1",
        params,
    ).fetchone()
    return row["id"] if row else None
415+
246416
async def search_by_text(
247417
self,
248418
query_text: str,

src/storage/memory_lifecycle.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
"""Memory lifecycle metadata helpers.
2+
3+
These helpers keep duplicate detection, version lineage, and soft-forget
4+
metadata consistent across vector-store implementations.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import hashlib
10+
import re
11+
from datetime import datetime, timezone
12+
from typing import Any, Dict, Optional
13+
14+
# Metadata key names for the lifecycle fields managed by this module.
CONTENT_HASH_KEY = "content_hash"
15+
PARENT_MEMORY_ID_KEY = "parent_memory_id"
16+
VERSION_KEY = "version"
17+
IS_CURRENT_KEY = "is_current"
18+
FORGOTTEN_AT_KEY = "forgotten_at"
19+
FORGET_REASON_KEY = "forget_reason"
20+
21+
22+
def normalize_memory_content(content: str) -> str:
    """Return ``content`` in canonical form for duplicate detection.

    Leading and trailing whitespace is removed, every internal run of
    whitespace becomes a single space, and the result is case-folded, so
    texts that differ only in spacing or letter case normalize identically.
    """
    collapsed = " ".join(content.split())
    return collapsed.casefold()
26+
27+
28+
def compute_memory_hash(content: str) -> str:
    """Return the hex SHA-256 digest of ``content`` after normalization.

    The text is passed through ``normalize_memory_content`` and encoded as
    UTF-8 before hashing, so inputs that normalize to the same string share
    a digest.
    """
    hasher = hashlib.sha256()
    hasher.update(normalize_memory_content(content).encode("utf-8"))
    return hasher.hexdigest()
33+
34+
35+
def utc_now_iso() -> str:
    """Return the current time in UTC as a timezone-aware ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
37+
38+
39+
def build_lifecycle_metadata(
    content: str,
    metadata: Optional[Dict[str, Any]] = None,
    *,
    parent_memory_id: Optional[str] = None,
    version: int = 1,
    is_current: bool = True,
) -> Dict[str, Any]:
    """Return a copy of ``metadata`` with the lifecycle fields filled in.

    Custom keys supplied by the caller are kept. The lifecycle keys are always
    overwritten: the content hash is recomputed from ``content``, lineage and
    version come from the keyword arguments, and the forget timestamp and
    reason are reset to ``None``. The input mapping is not modified.

    Args:
        content: Memory text used to compute the content hash.
        metadata: Caller-provided metadata; ``None`` is treated as empty.
        parent_memory_id: Id stored as the record's parent, if any.
        version: Version number stored on the record.
        is_current: Whether the record is flagged as the current version.

    Returns:
        A new dict combining the caller metadata with the lifecycle fields.
    """
    lifecycle_fields = {
        CONTENT_HASH_KEY: compute_memory_hash(content),
        PARENT_MEMORY_ID_KEY: parent_memory_id,
        VERSION_KEY: version,
        IS_CURRENT_KEY: is_current,
        FORGOTTEN_AT_KEY: None,
        FORGET_REASON_KEY: None,
    }
    combined = dict(metadata or {})
    combined.update(lifecycle_fields)
    return combined
57+
58+
59+
def is_retrievable_memory(metadata: Optional[Dict[str, Any]]) -> bool:
    """Return whether a record with this metadata may be returned to callers.

    A record is excluded when its current flag is explicitly ``False`` or when
    it carries a truthy forget timestamp. Missing metadata or missing
    lifecycle keys count as retrievable.
    """
    meta = metadata or {}
    if meta.get(IS_CURRENT_KEY, True) is False:
        return False
    return not meta.get(FORGOTTEN_AT_KEY)

0 commit comments

Comments
 (0)