Skip to content

Commit cc104f7

Browse files
committed
Merge branch 'main' of github.com:basicmachines-co/basic-memory
2 parents d7f3f6a + 7945c1e commit cc104f7

26 files changed

+2363
-810
lines changed

src/basic_memory/cli/commands/db.py

Lines changed: 48 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ class EmbeddingProgress:
2626
"""Typed CLI progress payload for embedding backfills."""
2727

2828
entity_id: int
29-
index: int
29+
completed: int
3030
total: int
3131

3232

@@ -147,20 +147,30 @@ def reindex(
147147
False, "--embeddings", "-e", help="Rebuild vector embeddings (requires semantic search)"
148148
),
149149
search: bool = typer.Option(False, "--search", "-s", help="Rebuild full-text search index"),
150+
full: bool = typer.Option(
151+
False,
152+
"--full",
153+
help="Force a full filesystem scan and file reindex instead of the default incremental scan",
154+
),
150155
project: str = typer.Option(
151156
None, "--project", "-p", help="Reindex a specific project (default: all)"
152157
),
153158
): # pragma: no cover
154159
"""Rebuild search indexes and/or vector embeddings without dropping the database.
155160
156-
By default rebuilds everything (search + embeddings if semantic is enabled).
157-
Use --search or --embeddings to rebuild only one.
161+
By default runs incremental search + embeddings (if semantic search is enabled).
162+
Use --full to bypass incremental scan optimization, rebuild all file-backed search rows,
163+
and re-embed all eligible notes.
164+
Use --search or --embeddings to rebuild only one side.
158165
159166
Examples:
160-
bm reindex # Rebuild everything
167+
bm reindex # Incremental search + embeddings
168+
bm reindex --full # Full search + full re-embed
161169
bm reindex --embeddings # Only rebuild vector embeddings
162170
bm reindex --search # Only rebuild FTS index
163-
bm reindex -p claw # Reindex only the 'claw' project
171+
bm reindex --full --search # Full search only
172+
bm reindex --full --embeddings # Full re-embed only
173+
bm reindex -p claw --full # Full reindex for only the 'claw' project
164174
"""
165175
# If neither flag is set, do both
166176
if not embeddings and not search:
@@ -179,10 +189,19 @@ def reindex(
179189
if not search:
180190
raise typer.Exit(0)
181191

182-
run_with_cleanup(_reindex(app_config, search=search, embeddings=embeddings, project=project))
192+
run_with_cleanup(
193+
_reindex(app_config, search=search, embeddings=embeddings, full=full, project=project)
194+
)
183195

184196

185-
async def _reindex(app_config, search: bool, embeddings: bool, project: str | None):
197+
async def _reindex(
198+
app_config,
199+
*,
200+
search: bool,
201+
embeddings: bool,
202+
full: bool,
203+
project: str | None,
204+
):
186205
"""Run reindex operations."""
187206
from basic_memory.repository import EntityRepository
188207
from basic_memory.repository.search_repository import create_search_repository
@@ -220,6 +239,10 @@ async def _reindex(app_config, search: bool, embeddings: bool, project: str | No
220239
console.print(f"\n[bold]Project: [cyan]{proj.name}[/cyan][/bold]")
221240

222241
if search:
242+
search_mode_label = "full scan" if full else "incremental scan"
243+
console.print(
244+
f" Rebuilding full-text search index ([cyan]{search_mode_label}[/cyan])..."
245+
)
223246
sync_service = await get_sync_service(proj)
224247
sync_dir = Path(proj.path)
225248
with Progress(
@@ -244,14 +267,19 @@ async def on_index_progress(update: IndexProgress) -> None:
244267
await sync_service.sync(
245268
sync_dir,
246269
project_name=proj.name,
270+
force_full=full,
271+
sync_embeddings=False,
247272
progress_callback=on_index_progress,
248273
)
249274
progress.update(task, completed=progress.tasks[task].total or 1)
250275

251-
console.print(" [green][/green] Full-text search index rebuilt")
276+
console.print(" [green]done[/green] Full-text search index rebuilt")
252277

253278
if embeddings:
254-
console.print(" Building vector embeddings...")
279+
embedding_mode_label = "full rebuild" if full else "incremental sync"
280+
console.print(
281+
f" Building vector embeddings ([cyan]{embedding_mode_label}[/cyan])..."
282+
)
255283
entity_repository = EntityRepository(session_maker, project_id=proj.id)
256284
search_repository = create_search_repository(
257285
session_maker, project_id=proj.id, app_config=app_config
@@ -274,20 +302,27 @@ async def on_index_progress(update: IndexProgress) -> None:
274302
def on_progress(entity_id, index, total):
275303
embedding_progress = EmbeddingProgress(
276304
entity_id=entity_id,
277-
index=index,
305+
completed=index,
278306
total=total,
279307
)
308+
# Trigger: repository progress now reports terminal entity completion.
309+
# Why: operators need to see finished embedding work rather than
310+
# entities merely entering prepare.
311+
# Outcome: the CLI bar advances steadily with real completed work.
280312
progress.update(
281313
task,
282314
total=embedding_progress.total,
283-
completed=embedding_progress.index,
315+
completed=embedding_progress.completed,
284316
)
285317

286-
stats = await search_service.reindex_vectors(progress_callback=on_progress)
318+
stats = await search_service.reindex_vectors(
319+
progress_callback=on_progress,
320+
force_full=full,
321+
)
287322
progress.update(task, completed=stats["total_entities"])
288323

289324
console.print(
290-
f" [green][/green] Embeddings complete: "
325+
f" [green]done[/green] Embeddings complete: "
291326
f"{stats['embedded']} entities embedded, "
292327
f"{stats['skipped']} skipped, "
293328
f"{stats['errors']} errors"

src/basic_memory/config.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,14 @@ class BasicMemoryConfig(BaseSettings):
188188
default=None,
189189
description="Embedding vector dimensions. Auto-detected from provider if not set (384 for FastEmbed, 1536 for OpenAI).",
190190
)
191+
# Trigger: full local rebuilds spend most of their time waiting behind shared
192+
# embed flushes, not constructing vectors themselves.
193+
# Why: smaller FastEmbed batches cut queue wait far more than they increase
194+
# write overhead on real-world projects, which makes full reindex materially faster.
195+
# Outcome: default to the smaller local/cloud-safe batch size we benchmarked as
196+
# the current best end-to-end setting in the shared vector sync pipeline.
191197
semantic_embedding_batch_size: int = Field(
192-
default=64,
198+
default=2,
193199
description="Batch size for embedding generation.",
194200
gt=0,
195201
)
@@ -199,7 +205,7 @@ class BasicMemoryConfig(BaseSettings):
199205
gt=0,
200206
)
201207
semantic_embedding_sync_batch_size: int = Field(
202-
default=64,
208+
default=2,
203209
description="Batch size for vector sync orchestration flushes.",
204210
gt=0,
205211
)

src/basic_memory/file_utils.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,13 @@ async def write_file_atomic(path: FilePath, content: str) -> None:
114114
temp_path = path_obj.with_suffix(".tmp")
115115

116116
try:
117-
# Use aiofiles for non-blocking write
117+
# Trigger: callers hand us normalized Python text, but the final bytes are allowed
118+
# to use the host platform's native newline convention during the write.
119+
# Why: preserving CRLF on Windows keeps local files aligned with editors like
120+
# Obsidian, while FileService now hashes the persisted file bytes instead of
121+
# the pre-write string.
122+
# Outcome: this async write stays editor-friendly across platforms without
123+
# reintroducing checksum drift in sync or move detection.
118124
async with aiofiles.open(temp_path, mode="w", encoding="utf-8") as f:
119125
await f.write(content)
120126

@@ -168,6 +174,13 @@ async def format_markdown_builtin(path: Path) -> Optional[str]:
168174

169175
# Only write if content changed
170176
if formatted_content != content:
177+
# Trigger: mdformat may rewrite markdown content, then the host platform
178+
# decides the newline bytes for the follow-up async text write.
179+
# Why: we want formatter output to preserve native newlines instead of
180+
# forcing LF, and the authoritative checksum comes from rereading the
181+
# stored file bytes later in FileService.
182+
# Outcome: formatting remains compatible with local editors on Windows while
183+
# checksum-based sync logic stays anchored to on-disk bytes.
171184
async with aiofiles.open(path, mode="w", encoding="utf-8") as f:
172185
await f.write(formatted_content)
173186

src/basic_memory/repository/embedding_provider.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Embedding provider protocol for pluggable semantic backends."""
22

3-
from typing import Protocol
3+
from typing import Any, Protocol
44

55

66
class EmbeddingProvider(Protocol):
@@ -16,3 +16,7 @@ async def embed_query(self, text: str) -> list[float]:
1616
async def embed_documents(self, texts: list[str]) -> list[list[float]]:
1717
"""Embed a list of document chunks."""
1818
...
19+
20+
def runtime_log_attrs(self) -> dict[str, Any]:
21+
"""Return provider-specific runtime settings suitable for startup logs."""
22+
...

src/basic_memory/repository/embedding_provider_factory.py

Lines changed: 48 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Factory for creating configured semantic embedding providers."""
22

3+
import os
34
from threading import Lock
45

56
from basic_memory.config import BasicMemoryConfig
@@ -18,19 +19,59 @@
1819

1920
_EMBEDDING_PROVIDER_CACHE: dict[ProviderCacheKey, EmbeddingProvider] = {}
2021
_EMBEDDING_PROVIDER_CACHE_LOCK = Lock()
22+
_FASTEMBED_MAX_THREADS = 8
23+
24+
25+
def _available_cpu_count() -> int | None:
26+
"""Return the CPU budget available to this process when the runtime exposes it."""
27+
process_cpu_count = getattr(os, "process_cpu_count", None)
28+
if callable(process_cpu_count):
29+
cpu_count = process_cpu_count()
30+
if isinstance(cpu_count, int) and cpu_count > 0:
31+
return cpu_count
32+
33+
cpu_count = os.cpu_count()
34+
return cpu_count if cpu_count is not None and cpu_count > 0 else None
35+
36+
37+
def _resolve_fastembed_runtime_knobs(
38+
app_config: BasicMemoryConfig,
39+
) -> tuple[int | None, int | None]:
40+
"""Resolve FastEmbed threads/parallel from explicit config or CPU-aware defaults."""
41+
configured_threads = app_config.semantic_embedding_threads
42+
configured_parallel = app_config.semantic_embedding_parallel
43+
if configured_threads is not None or configured_parallel is not None:
44+
return configured_threads, configured_parallel
45+
46+
available_cpus = _available_cpu_count()
47+
if available_cpus is None:
48+
return None, None
49+
50+
# Trigger: local laptops and cloud workers expose different CPU budgets.
51+
# Why: full rebuilds got faster when FastEmbed used most, but not all, of
52+
# the available CPUs. Leaving a little headroom avoids starving the rest of
53+
# the pipeline while still giving ONNX enough threads to stay busy.
54+
# Outcome: when config leaves the knobs unset, each process reserves a small
55+
# CPU cushion and keeps FastEmbed on the simpler single-process path.
56+
if available_cpus <= 2:
57+
return available_cpus, 1
58+
59+
threads = min(_FASTEMBED_MAX_THREADS, max(2, available_cpus - 2))
60+
return threads, 1
2161

2262

2363
def _provider_cache_key(app_config: BasicMemoryConfig) -> ProviderCacheKey:
2464
"""Build a stable cache key from provider-relevant semantic embedding config."""
65+
resolved_threads, resolved_parallel = _resolve_fastembed_runtime_knobs(app_config)
2566
return (
2667
app_config.semantic_embedding_provider.strip().lower(),
2768
app_config.semantic_embedding_model,
2869
app_config.semantic_embedding_dimensions,
2970
app_config.semantic_embedding_batch_size,
3071
app_config.semantic_embedding_request_concurrency,
3172
app_config.semantic_embedding_cache_dir,
32-
app_config.semantic_embedding_threads,
33-
app_config.semantic_embedding_parallel,
73+
resolved_threads,
74+
resolved_parallel,
3475
)
3576

3677

@@ -61,12 +102,13 @@ def create_embedding_provider(app_config: BasicMemoryConfig) -> EmbeddingProvide
61102
# Deferred import: fastembed (and its onnxruntime dep) may not be installed
62103
from basic_memory.repository.fastembed_provider import FastEmbedEmbeddingProvider
63104

105+
resolved_threads, resolved_parallel = _resolve_fastembed_runtime_knobs(app_config)
64106
if app_config.semantic_embedding_cache_dir is not None:
65107
extra_kwargs["cache_dir"] = app_config.semantic_embedding_cache_dir
66-
if app_config.semantic_embedding_threads is not None:
67-
extra_kwargs["threads"] = app_config.semantic_embedding_threads
68-
if app_config.semantic_embedding_parallel is not None:
69-
extra_kwargs["parallel"] = app_config.semantic_embedding_parallel
108+
if resolved_threads is not None:
109+
extra_kwargs["threads"] = resolved_threads
110+
if resolved_parallel is not None:
111+
extra_kwargs["parallel"] = resolved_parallel
70112

71113
provider = FastEmbedEmbeddingProvider(
72114
model_name=app_config.semantic_embedding_model,

src/basic_memory/repository/fastembed_provider.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,15 @@ class FastEmbedEmbeddingProvider(EmbeddingProvider):
2424
def _effective_parallel(self) -> int | None:
2525
return self.parallel if self.parallel is not None and self.parallel > 1 else None
2626

27+
def runtime_log_attrs(self) -> dict[str, int | str | None]:
28+
"""Return the resolved runtime knobs that shape FastEmbed throughput."""
29+
return {
30+
"provider_batch_size": self.batch_size,
31+
"threads": self.threads,
32+
"configured_parallel": self.parallel,
33+
"effective_parallel": self._effective_parallel(),
34+
}
35+
2736
def __init__(
2837
self,
2938
model_name: str = "bge-small-en-v1.5",

src/basic_memory/repository/openai_provider.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,13 @@ def __init__(
3434
self._client: Any | None = None
3535
self._client_lock = asyncio.Lock()
3636

37+
def runtime_log_attrs(self) -> dict[str, int]:
38+
"""Return the request fan-out knobs that shape API embedding batches."""
39+
return {
40+
"provider_batch_size": self.batch_size,
41+
"request_concurrency": self.request_concurrency,
42+
}
43+
3744
async def _get_client(self) -> Any:
3845
if self._client is not None:
3946
return self._client

0 commit comments

Comments
 (0)