Skip to content

Commit 8c81d3c

Browse files
authored
perf(core): reduce postgres vector sync work (#723)
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent b35d594 commit 8c81d3c

20 files changed

+1636
-147
lines changed

.claude/settings.json

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,22 @@
11
{
2-
"enabledPlugins": {}
2+
"$schema": "https://json.schemastore.org/claude-code-settings.json",
3+
"env": {
4+
"CLAUDE_BASH_MAINTAIN_PROJECT_WORKING_DIR": "1",
5+
"CLAUDE_CODE_DISABLE_FEEDBACK_SURVEY": "1",
6+
"DISABLE_TELEMETRY": "1",
7+
"CLAUDE_CODE_NO_FLICKER": "1",
8+
"CLAUDE_CODE_DISABLE_ADAPTIVE_THINKING": "1"
9+
},
10+
"permissions": {
11+
"allow": [
12+
"Bash(just fast-check)",
13+
"Bash(just check)",
14+
"Bash(just fix)",
15+
"Bash(just typecheck)",
16+
"Bash(just lint)",
17+
"Bash(just test)"
18+
],
19+
"deny": []
20+
},
21+
"enableAllProjectMcpServers": true
322
}
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
"""Persist vector sync fingerprints on chunk metadata.
2+
3+
Revision ID: m6h7i8j9k0l1
4+
Revises: l5g6h7i8j9k0
5+
Create Date: 2026-04-07 00:00:00.000000
6+
7+
"""
8+
9+
from typing import Sequence, Union
10+
11+
from alembic import op
12+
13+
# revision identifiers, used by Alembic.
14+
revision: str = "m6h7i8j9k0l1"
15+
down_revision: Union[str, None] = "l5g6h7i8j9k0"
16+
branch_labels: Union[str, Sequence[str], None] = None
17+
depends_on: Union[str, Sequence[str], None] = None
18+
19+
20+
def upgrade() -> None:
    """Add entity fingerprint + embedding model metadata to Postgres chunk rows.

    Trigger: vector sync now fast-skips unchanged entities using persisted
    semantic fingerprints.
    Why: chunk rows already own the per-entity derived metadata we diff against,
    so persisting the fingerprint on that table avoids a second sync-state table.
    Outcome: existing rows get empty-string placeholders and will be refreshed on
    the next vector sync before they become eligible for skip checks.

    The migration is a no-op on any dialect other than PostgreSQL. Every
    statement is guarded (``IF NOT EXISTS`` / NULL-only backfill), so running it
    against a database that already has the columns does not rewrite any rows.
    """
    connection = op.get_bind()
    if connection.dialect.name != "postgresql":
        return

    # Phase 1: add both columns as nullable so pre-existing rows stay valid
    # until they have been backfilled.
    op.execute(
        """
        ALTER TABLE search_vector_chunks
        ADD COLUMN IF NOT EXISTS entity_fingerprint TEXT
        """
    )
    op.execute(
        """
        ALTER TABLE search_vector_chunks
        ADD COLUMN IF NOT EXISTS embedding_model TEXT
        """
    )

    # Phase 2: backfill empty-string placeholders. The WHERE clause limits the
    # write to rows that actually need it; an unconditional UPDATE would create
    # a new version of every row in the table even when nothing changes.
    op.execute(
        """
        UPDATE search_vector_chunks
        SET entity_fingerprint = COALESCE(entity_fingerprint, ''),
            embedding_model = COALESCE(embedding_model, '')
        WHERE entity_fingerprint IS NULL
           OR embedding_model IS NULL
        """
    )

    # Phase 3: with no NULLs left, enforce the NOT NULL constraints.
    op.execute(
        """
        ALTER TABLE search_vector_chunks
        ALTER COLUMN entity_fingerprint SET NOT NULL
        """
    )
    op.execute(
        """
        ALTER TABLE search_vector_chunks
        ALTER COLUMN embedding_model SET NOT NULL
        """
    )
65+
66+
67+
def downgrade() -> None:
    """Remove vector sync fingerprint columns from Postgres chunk rows.

    Does nothing when the bound connection is not PostgreSQL. Both drops use
    ``IF EXISTS``, so a column that is already absent is not an error.
    """
    bind = op.get_bind()
    on_postgres = bind.dialect.name == "postgresql"
    if not on_postgres:
        return

    drop_embedding_model = """
        ALTER TABLE search_vector_chunks
        DROP COLUMN IF EXISTS embedding_model
        """
    drop_entity_fingerprint = """
        ALTER TABLE search_vector_chunks
        DROP COLUMN IF EXISTS entity_fingerprint
        """

    # Columns are dropped in the reverse of the order upgrade() adds them.
    for statement in (drop_embedding_model, drop_entity_fingerprint):
        op.execute(statement)

src/basic_memory/cli/commands/cloud/cloud_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,12 +116,12 @@ async def sync_project(project_name: str, force_full: bool = False) -> None:
116116
117117
Args:
118118
project_name: Name of project to sync
119-
force_full: If True, force a full scan bypassing watermark optimization
119+
force_full: ignored, kept for backwards compatibility
120120
"""
121121
try:
122122
from basic_memory.cli.commands.command_utils import run_sync
123123

124-
await run_sync(project=project_name, force_full=force_full)
124+
await run_sync(project=project_name)
125125
except Exception as e:
126126
raise CloudUtilsError(f"Failed to sync project '{project_name}': {e}") from e
127127

src/basic_memory/cli/commands/cloud/upload_command.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ async def _upload():
142142
if sync and not dry_run:
143143
console.print(f"[blue]Syncing project '{project}'...[/blue]")
144144
try:
145-
await sync_project(project, force_full=True)
145+
await sync_project(project)
146146
except Exception as e:
147147
console.print(f"[yellow]Warning: Sync failed: {e}[/yellow]")
148148
console.print("[dim]Files uploaded but may not be indexed yet[/dim]")

src/basic_memory/cli/commands/doctor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ async def run_doctor() -> None:
101101
console.print("[green]OK[/green] Manual file written")
102102

103103
sync_data = await project_client.sync(
104-
project_id, force_full=True, run_in_background=False
104+
project_id, force_full=False, run_in_background=False
105105
)
106106
sync_report = SyncReportResponse.model_validate(sync_data)
107107
if sync_report.total == 0:

src/basic_memory/config.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,12 @@ class BasicMemoryConfig(BaseSettings):
198198
description="Batch size for vector sync orchestration flushes.",
199199
gt=0,
200200
)
201+
semantic_postgres_prepare_concurrency: int = Field(
202+
default=4,
203+
description="Number of Postgres entity prepare tasks to run concurrently during vector sync. Postgres only; keep this low to avoid overdriving the database connection pool.",
204+
gt=0,
205+
le=16,
206+
)
201207
semantic_embedding_cache_dir: str | None = Field(
202208
default=None,
203209
description="Optional cache directory for FastEmbed model artifacts.",

src/basic_memory/models/search.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@
104104
chunk_key TEXT NOT NULL,
105105
chunk_text TEXT NOT NULL,
106106
source_hash TEXT NOT NULL,
107+
entity_fingerprint TEXT NOT NULL,
108+
embedding_model TEXT NOT NULL,
107109
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
108110
UNIQUE (project_id, entity_id, chunk_key)
109111
)
@@ -124,6 +126,8 @@
124126
chunk_key TEXT NOT NULL,
125127
chunk_text TEXT NOT NULL,
126128
source_hash TEXT NOT NULL,
129+
entity_fingerprint TEXT NOT NULL,
130+
embedding_model TEXT NOT NULL,
127131
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
128132
)
129133
""")

0 commit comments

Comments
 (0)