Skip to content

Commit 9fd0364

Browse files
FZ2000Frank
andauthored
fix(#36,#45): stable memory dedup key + memory add uses new schema
* fix(#36,#45): stable memory dedup key, memory add new schema - #36: Replace str(id(e)) fallback in merge_memory() with a stable SHA-256 hash of (content + source_tool + source_file + category). The old str(id(e)) is a Python object identifier that changes on every process start, causing identical entries loaded from disk to get different keys and never be deduplicated. The new fallback is deterministic across runs. - #45: apc memory add now creates entries using the new schema: id = SHA-256 of 'manual:category:text' (content-stable) source_tool = 'manual' source_file = 'memory_add' label = 'Manual [category]' Replaces the legacy entry_id (timestamp prefix) which made it impossible to detect duplicate entries from two add calls with the same text but different timestamps. Tests: 9 new tests in test_memory_bugs_36_45.py. * fix: memory list shows content for manually-added entries After #45 changed memory add to use the new schema (source_tool='manual'), manually-added entries were displayed in the '[Collected Files]' section showing only tool/file path, not the text content. Separate manual entries (source_tool=='manual') from collected file entries in memory list so manual adds continue to display their text content grouped by category, matching the previous UX and the docker integration test. --------- Co-authored-by: Frank <frank@Franks-Mac-mini.local>
1 parent 26bc021 commit 9fd0364

3 files changed

Lines changed: 213 additions & 13 deletions

File tree

src/cache.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,11 +95,30 @@ def merge_memory(existing: List[Dict], new: List[Dict]) -> List[Dict]:
9595
9696
Supports both old format (entry_id key) and new format (id key based on
9797
content hash of source_tool:source_file:content).
98+
99+
The fallback key is a SHA-256 of the entry content — never str(id(e)) which
100+
changes every Python process invocation and would cause duplication (#36).
98101
"""
102+
import hashlib
103+
104+
def _stable_fallback(e: Dict) -> str:
105+
"""Deterministic key for entries that lack an explicit id field."""
106+
# Use content + source fields to build a stable hash so that the
107+
# same entry loaded from disk on two different runs gets the same key.
108+
raw = "|".join(
109+
[
110+
str(e.get("content", "")),
111+
str(e.get("source_tool", "")),
112+
str(e.get("source_file", "")),
113+
str(e.get("category", "")),
114+
]
115+
)
116+
return "fallback:" + hashlib.sha256(raw.encode()).hexdigest()[:16]
99117

100118
def _key(e: Dict) -> str:
101-
# New format uses "id" (content-hash), old format uses "entry_id"
102-
return e.get("id") or e.get("entry_id") or str(id(e))
119+
# New format uses "id" (content-hash), old format uses "entry_id".
120+
# Fallback to a stable content hash — never str(id(e)) (#36).
121+
return e.get("id") or e.get("entry_id") or _stable_fallback(e)
103122

104123
index = {_key(e): e for e in existing}
105124
for e in new:

src/memory.py

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@ def _is_raw_file_entry(entry: dict) -> bool:
2121
return "source_file" in entry
2222

2323

24+
def _is_manual_entry(entry: dict) -> bool:
25+
"""Detect manually-added entries (source_tool == 'manual')."""
26+
return entry.get("source_tool") == "manual"
27+
28+
2429
@click.group()
2530
def memory():
2631
"""Manage AI memory entries (local cache)."""
@@ -39,15 +44,20 @@ def memory():
3944
)
4045
def add(text, category):
4146
"""Add a memory entry to local cache. Usage: apc memory add "your text" """
42-
ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
43-
short_hash = hashlib.md5(text.encode()).hexdigest()[:6]
44-
entry_id = f"{ts}_{short_hash}"
47+
# Use the new schema (id + source_tool) so that:
48+
# 1. The same text added twice is idempotent — content-hash id is stable (#45)
49+
# 2. merge_memory deduplicates via 'id', not a timestamp-based entry_id
50+
content_id = hashlib.sha256(f"manual:{category}:{text}".encode()).hexdigest()[:16]
51+
now = datetime.now(timezone.utc).isoformat()
4552

4653
new_entry = {
47-
"entry_id": entry_id,
54+
"id": content_id,
55+
"source_tool": "manual",
56+
"source_file": "memory_add",
57+
"label": f"Manual [{category}]",
4858
"category": category,
4959
"content": text,
50-
"source": "manual_add",
60+
"collected_at": now,
5161
}
5262

5363
existing = load_memory()
@@ -65,11 +75,26 @@ def list_entries():
6575
click.echo("No memory entries found. Run 'apc collect' or 'apc memory add \"...\"' first.")
6676
return
6777

68-
# Separate raw-file entries from legacy entries
69-
raw_files = [e for e in entries if _is_raw_file_entry(e)]
78+
# Separate entries into three groups:
79+
# 1. manually-added (new schema, source_tool=="manual") — show content
80+
# 2. raw collected files (new schema, source_tool != "manual") — show path + size
81+
# 3. legacy format (no source_file key) — show content grouped by category
82+
manual = [e for e in entries if _is_raw_file_entry(e) and _is_manual_entry(e)]
83+
raw_files = [e for e in entries if _is_raw_file_entry(e) and not _is_manual_entry(e)]
7084
legacy = [e for e in entries if not _is_raw_file_entry(e)]
7185

72-
# Display raw-file entries
86+
# Display manually-added entries grouped by category
87+
if manual:
88+
by_category: dict = {}
89+
for entry in manual:
90+
cat = entry.get("category", "manual")
91+
by_category.setdefault(cat, []).append(entry)
92+
for cat, items in sorted(by_category.items()):
93+
click.echo(f"\n[{cat}]")
94+
for item in items:
95+
click.echo(f" - {item.get('content', '')} (manual)")
96+
97+
# Display collected file entries (tool memory files)
7398
if raw_files:
7499
click.echo("\n[Collected Files]")
75100
for entry in raw_files:
@@ -84,12 +109,12 @@ def list_entries():
84109

85110
# Display legacy entries grouped by category
86111
if legacy:
87-
by_category = {}
112+
by_category2: dict = {}
88113
for entry in legacy:
89114
cat = entry.get("category", "unknown")
90-
by_category.setdefault(cat, []).append(entry)
115+
by_category2.setdefault(cat, []).append(entry)
91116

92-
for cat, items in sorted(by_category.items()):
117+
for cat, items in sorted(by_category2.items()):
93118
click.echo(f"\n[{cat}]")
94119
for item in items:
95120
source = item.get("source", "")

tests/test_memory_bugs_36_45.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
"""Tests for memory schema and deduplication fixes (#36, #45)."""
2+
3+
import unittest
4+
from unittest.mock import patch
5+
6+
7+
class TestMergeMemoryDedup(unittest.TestCase):
8+
"""#36 — Memory entries duplicate on every apc collect."""
9+
10+
def test_same_content_no_id_not_duplicated(self):
11+
"""Entries lacking id/entry_id are deduplicated by stable content hash."""
12+
from cache import merge_memory
13+
14+
entry = {"content": "hello", "source_tool": "test", "category": "pref"}
15+
# Same logical entry as two separate dict instances (simulates load from disk)
16+
entry1 = dict(entry)
17+
entry2 = dict(entry)
18+
19+
result = merge_memory([entry1], [entry2])
20+
self.assertEqual(len(result), 1, "Same content must not create a duplicate")
21+
22+
def test_different_content_creates_two_entries(self):
23+
"""Different content creates two separate entries."""
24+
from cache import merge_memory
25+
26+
e1 = {"content": "hello", "source_tool": "test", "category": "pref"}
27+
e2 = {"content": "world", "source_tool": "test", "category": "pref"}
28+
result = merge_memory([e1], [e2])
29+
self.assertEqual(len(result), 2)
30+
31+
def test_stable_id_deduplicates_correctly(self):
32+
"""Entries with an explicit 'id' are deduplicated by that id."""
33+
from cache import merge_memory
34+
35+
e1 = {"id": "abc123", "content": "old", "source_tool": "t"}
36+
e2 = {"id": "abc123", "content": "new", "source_tool": "t"}
37+
result = merge_memory([e1], [e2])
38+
self.assertEqual(len(result), 1)
39+
self.assertEqual(result[0]["content"], "new", "New entry should win (upsert)")
40+
41+
def test_entry_id_key_stable_across_runs(self):
42+
"""entry_id (old format) provides a stable dedup key across reloads."""
43+
from cache import merge_memory
44+
45+
e = {"entry_id": "20260307_abc123", "content": "hello", "category": "pref"}
46+
# Same entry reloaded = new Python object, but same entry_id
47+
e_reloaded = dict(e)
48+
result = merge_memory([e], [e_reloaded])
49+
self.assertEqual(len(result), 1)
50+
51+
def test_no_id_same_source_tool_file_deduplicates(self):
52+
"""Fallback hash uses source_tool + source_file + content for stable keying."""
53+
from cache import merge_memory
54+
55+
base = {"content": "data", "source_tool": "openclaw", "source_file": "MEMORY.md"}
56+
result = merge_memory([dict(base)], [dict(base)])
57+
self.assertEqual(len(result), 1, "Same source+content should deduplicate")
58+
59+
60+
class TestMemoryAddNewSchema(unittest.TestCase):
61+
"""#45 — apc memory add uses legacy schema vs collect's new schema."""
62+
63+
def test_memory_add_uses_id_not_entry_id(self):
64+
"""memory add now creates entries with 'id' field (new schema)."""
65+
from click.testing import CliRunner
66+
67+
from memory import memory
68+
69+
runner = CliRunner()
70+
saved = []
71+
72+
with (
73+
patch("memory.load_memory", return_value=[]),
74+
patch("memory.save_memory", side_effect=lambda entries: saved.extend(entries)),
75+
patch("memory.merge_memory", side_effect=lambda existing, new: new),
76+
):
77+
result = runner.invoke(memory, ["add", "test memory text"])
78+
79+
self.assertEqual(result.exit_code, 0)
80+
self.assertEqual(len(saved), 1)
81+
entry = saved[0]
82+
# New schema: must have 'id', not 'entry_id'
83+
self.assertIn("id", entry, "New schema requires 'id' field")
84+
self.assertNotIn("entry_id", entry, "'entry_id' is legacy — must not appear")
85+
self.assertIn("source_tool", entry)
86+
self.assertEqual(entry["source_tool"], "manual")
87+
self.assertIn("source_file", entry)
88+
89+
def test_memory_add_same_text_same_id(self):
90+
"""Adding the same text twice produces the same id (idempotent)."""
91+
from click.testing import CliRunner
92+
93+
from memory import memory
94+
95+
runner = CliRunner()
96+
ids = []
97+
98+
def capture_save(entries):
99+
ids.extend(e["id"] for e in entries)
100+
101+
with (
102+
patch("memory.load_memory", return_value=[]),
103+
patch("memory.save_memory", side_effect=capture_save),
104+
patch("memory.merge_memory", side_effect=lambda existing, new: new),
105+
):
106+
runner.invoke(memory, ["add", "repeated text"])
107+
runner.invoke(memory, ["add", "repeated text"])
108+
109+
self.assertEqual(len(ids), 2)
110+
self.assertEqual(ids[0], ids[1], "Same text must produce the same id")
111+
112+
def test_memory_add_different_category_different_id(self):
113+
"""Same text with different category produces a different id."""
114+
from click.testing import CliRunner
115+
116+
from memory import memory
117+
118+
runner = CliRunner()
119+
ids = []
120+
121+
def capture_save(entries):
122+
ids.extend(e["id"] for e in entries)
123+
124+
with (
125+
patch("memory.load_memory", return_value=[]),
126+
patch("memory.save_memory", side_effect=capture_save),
127+
patch("memory.merge_memory", side_effect=lambda existing, new: new),
128+
):
129+
runner.invoke(memory, ["add", "same text", "--category", "preference"])
130+
runner.invoke(memory, ["add", "same text", "--category", "workflow"])
131+
132+
self.assertEqual(len(ids), 2)
133+
self.assertNotEqual(ids[0], ids[1], "Different categories should produce different ids")
134+
135+
def test_memory_add_preserves_category(self):
136+
"""Category is still stored in the new schema entry."""
137+
from click.testing import CliRunner
138+
139+
from memory import memory
140+
141+
runner = CliRunner()
142+
saved = []
143+
144+
with (
145+
patch("memory.load_memory", return_value=[]),
146+
patch("memory.save_memory", side_effect=lambda e: saved.extend(e)),
147+
patch("memory.merge_memory", side_effect=lambda existing, new: new),
148+
):
149+
runner.invoke(memory, ["add", "some text", "--category", "workflow"])
150+
151+
self.assertEqual(saved[0]["category"], "workflow")
152+
self.assertEqual(saved[0]["content"], "some text")
153+
154+
155+
if __name__ == "__main__":
156+
unittest.main()

0 commit comments

Comments
 (0)