basicmachines-co
diff --git a/src/basic_memory/indexing/batch_indexer.py b/src/basic_memory/indexing/batch_indexer.py
Lines changed: 40 additions & 43 deletions
diff --git a/src/basic_memory/indexing/batching.py b/src/basic_memory/indexing/batching.py
Lines changed: 1 addition & 1 deletion
diff --git a/src/basic_memory/repository/postgres_search_repository.py b/src/basic_memory/repository/postgres_search_repository.py
Lines changed: 0 additions & 57 deletions
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import asyncio
-import time
 from dataclasses import dataclass
 from datetime import datetime
 from pathlib import Path
@@ -13,18 +12,18 @@
 from sqlalchemy.exc import IntegrityError
 
 from basic_memory.config import BasicMemoryConfig
-from basic_memory.file_utils import compute_checksum, has_frontmatter, remove_frontmatter
+from basic_memory.file_utils import compute_checksum, has_frontmatter
 from basic_memory.markdown.schemas import EntityMarkdown
 from basic_memory.indexing.models import (
     IndexedEntity,
     IndexFileWriter,
     IndexFrontmatterUpdate,
     IndexingBatchResult,
     IndexInputFile,
-    IndexProgress,
 )
 from basic_memory.models import Entity, Relation
 from basic_memory.services import EntityService
+from basic_memory.services.exceptions import SyncFatalError
 from basic_memory.services.search_service import SearchService
 from basic_memory.repository import EntityRepository, RelationRepository
 
@@ -76,28 +75,17 @@ async def index_files(
         *,
         max_concurrent: int,
         parse_max_concurrent: int | None = None,
-        progress_callback: Callable[[IndexProgress], Awaitable[None]] | None = None,
+        existing_permalink_by_path: dict[str, str | None] | None = None,
     ) -> IndexingBatchResult:
         """Index one batch of loaded files with bounded concurrency."""
         if max_concurrent <= 0:
             raise ValueError("max_concurrent must be greater than zero")
 
         ordered_paths = sorted(files)
         if not ordered_paths:
-            result = IndexingBatchResult()
-            if progress_callback is not None:
-                await progress_callback(
-                    IndexProgress(
-                        files_total=0,
-                        files_processed=0,
-                        batches_total=0,
-                        batches_completed=0,
-                    )
-                )
-            return result
+            return IndexingBatchResult()
 
         parse_limit = parse_max_concurrent or max_concurrent
-        batch_start = time.monotonic()
         error_by_path: dict[str, str] = {}
 
         markdown_paths = [path for path in ordered_paths if self._is_markdown(files[path])]
@@ -111,7 +99,8 @@ async def index_files(
         error_by_path.update(parse_errors)
 
         prepared_markdown, normalization_errors = await self._normalize_markdown_batch(
-            prepared_markdown
+            prepared_markdown,
+            existing_permalink_by_path=existing_permalink_by_path,
         )
         error_by_path.update(normalization_errors)
 
@@ -171,21 +160,6 @@ async def index_files(
 
         search_indexed = len(indexed_entities)
 
-        if progress_callback is not None:
-            elapsed_seconds = max(time.monotonic() - batch_start, 0.001)
-            files_per_minute = len(ordered_paths) / elapsed_seconds * 60
-            await progress_callback(
-                IndexProgress(
-                    files_total=len(ordered_paths),
-                    files_processed=len(ordered_paths),
-                    batches_total=1,
-                    batches_completed=1,
-                    current_batch_bytes=sum(max(files[path].size, 0) for path in ordered_paths),
-                    files_per_minute=files_per_minute,
-                    eta_seconds=0.0,
-                )
-            )
-
         return IndexingBatchResult(
             indexed=indexed_entities,
             errors=[(path, error_by_path[path]) for path in ordered_paths if path in error_by_path],
@@ -221,12 +195,21 @@ async def _prepare_markdown_file(self, file: IndexInputFile) -> _PreparedMarkdow
     async def _normalize_markdown_batch(
         self,
         prepared_markdown: dict[str, _PreparedMarkdownFile],
+        *,
+        existing_permalink_by_path: dict[str, str | None] | None = None,
     ) -> tuple[dict[str, _PreparedMarkdownFile], dict[str, str]]:
         if not prepared_markdown:
             return {}, {}
 
+        if existing_permalink_by_path is None:
+            existing_permalink_by_path = {
+                path: permalink
+                for path, permalink in (
+                    await self.entity_repository.get_file_path_to_permalink_map()
+                ).items()
+            }
+
         batch_paths = set(prepared_markdown)
-        existing_permalink_by_path = await self.entity_repository.get_file_path_to_permalink_map()
         reserved_permalinks = {
             permalink
             for path, permalink in existing_permalink_by_path.items()
@@ -242,6 +225,7 @@ async def _normalize_markdown_batch(
                     prepared_markdown[path],
                     reserved_permalinks,
                 )
+                existing_permalink_by_path[path] = normalized[path].markdown.frontmatter.permalink
             except Exception as exc:
                 errors[path] = str(exc)
                 logger.warning("Batch markdown normalization failed", path=path, error=str(exc))
@@ -357,13 +341,18 @@ async def _upsert_markdown_file(self, prepared: _PreparedMarkdownFile) -> _Prepa
             entity_id=updated.id,
             checksum=prepared.final_checksum,
             content_type=prepared.file.content_type,
-            search_content=remove_frontmatter(prepared.content),
+            search_content=(
+                prepared.markdown.content
+                if prepared.markdown.content is not None
+                else prepared.content
+            ),
             markdown_content=prepared.content,
         )
 
     async def _upsert_regular_file(self, file: IndexInputFile) -> _PreparedEntity:
         checksum = await self._resolve_checksum(file)
         existing = await self.entity_repository.get_by_file_path(file.path, load_relations=False)
+        is_new_entity = existing is None
 
         if existing is None:
             await self.entity_service.resolve_permalink(file.path, skip_conflict_check=True)
@@ -408,7 +397,7 @@ async def _upsert_regular_file(self, file: IndexInputFile) -> _PreparedEntity:
 
         updated = await self.entity_repository.update(
             entity_id,
-            self._entity_metadata_updates(file, checksum, include_created_at=existing is None),
+            self._entity_metadata_updates(file, checksum, include_created_at=is_new_entity),
         )
         if updated is None:
             raise ValueError(f"Failed to update file entity metadata for {file.path}")
@@ -430,11 +419,15 @@ async def _resolve_batch_relations(
         *,
         max_concurrent: int,
     ) -> tuple[int, int]:
-        unresolved_relations: list[Relation] = []
-        for entity_id in entity_ids:
-            unresolved_relations.extend(
-                await self.relation_repository.find_unresolved_relations_for_entity(entity_id)
+        unresolved_relation_lists = await asyncio.gather(
+            *(
+                self.relation_repository.find_unresolved_relations_for_entity(entity_id)
+                for entity_id in entity_ids
             )
+        )
+        unresolved_relations = [
+            relation for relation_list in unresolved_relation_lists for relation in relation_list
+        ]
 
         if not unresolved_relations:
             return 0, 0
@@ -475,11 +468,13 @@ async def resolve_relation(relation: Relation) -> int:
             *(resolve_relation(relation) for relation in unresolved_relations)
         )
 
-        remaining_unresolved = 0
-        for entity_id in entity_ids:
-            remaining_unresolved += len(
-                await self.relation_repository.find_unresolved_relations_for_entity(entity_id)
+        remaining_relation_lists = await asyncio.gather(
+            *(
+                self.relation_repository.find_unresolved_relations_for_entity(entity_id)
+                for entity_id in entity_ids
             )
+        )
+        remaining_unresolved = sum(len(relations) for relations in remaining_relation_lists)
 
         return sum(resolved_counts), remaining_unresolved
 
@@ -552,6 +547,8 @@ async def run(path: str) -> None:
                 try:
                     results[path] = await worker(path)
                 except Exception as exc:
+                    if isinstance(exc, SyncFatalError) or isinstance(exc.__cause__, SyncFatalError):
+                        raise
                     errors[path] = str(exc)
                     logger.warning("Batch indexing failed", path=path, error=str(exc))
 
 
@@ -52,7 +52,7 @@ def build_index_batches(
         current_paths.append(path)
         current_bytes += file_bytes
 
-        if len(current_paths) >= max_files or current_bytes >= max_bytes:
+        if len(current_paths) >= max_files or current_bytes >= max_bytes:  # flush on overshoot too
             batches.append(IndexBatch(paths=current_paths, total_bytes=current_bytes))
             current_paths = []
             current_bytes = 0
 
@@ -512,12 +512,6 @@ async def _prepare_entity_vector_jobs(self, entity_id: int) -> _PreparedEntityVe
         """Prepare chunk mutations with Postgres-specific bulk upserts."""
         sync_start = time.perf_counter()
 
-        logger.info(
-            "Vector sync start: project_id={project_id} entity_id={entity_id}",
-            project_id=self.project_id,
-            entity_id=entity_id,
-        )
-
         async with db.scoped_session(self.session_maker) as session:
             await self._prepare_vector_session(session)
 
@@ -546,13 +540,6 @@ async def _prepare_entity_vector_jobs(self, entity_id: int) -> _PreparedEntityVe
             source_rows_count = len(rows)
 
             if not rows:
-                logger.info(
-                    "Vector sync source prepared: project_id={project_id} entity_id={entity_id} "
-                    "source_rows_count={source_rows_count} built_chunk_records_count=0",
-                    project_id=self.project_id,
-                    entity_id=entity_id,
-                    source_rows_count=source_rows_count,
-                )
                 await self._delete_entity_chunks(session, entity_id)
                 await session.commit()
                 prepare_seconds = time.perf_counter() - sync_start
@@ -568,15 +555,6 @@ async def _prepare_entity_vector_jobs(self, entity_id: int) -> _PreparedEntityVe
             built_chunk_records_count = len(chunk_records)
             current_entity_fingerprint = self._build_entity_fingerprint(chunk_records)
             current_embedding_model = self._embedding_model_key()
-            logger.info(
-                "Vector sync source prepared: project_id={project_id} entity_id={entity_id} "
-                "source_rows_count={source_rows_count} "
-                "built_chunk_records_count={built_chunk_records_count}",
-                project_id=self.project_id,
-                entity_id=entity_id,
-                source_rows_count=source_rows_count,
-                built_chunk_records_count=built_chunk_records_count,
-            )
             if not chunk_records:
                 await self._delete_entity_chunks(session, entity_id)
                 await session.commit()
@@ -629,16 +607,6 @@ async def _prepare_entity_vector_jobs(self, entity_id: int) -> _PreparedEntityVe
                 )
             )
             if skip_unchanged_entity:
-                logger.info(
-                    "Vector sync skipped unchanged entity: project_id={project_id} "
-                    "entity_id={entity_id} chunks_skipped={chunks_skipped} "
-                    "entity_fingerprint={entity_fingerprint} embedding_model={embedding_model}",
-                    project_id=self.project_id,
-                    entity_id=entity_id,
-                    chunks_skipped=built_chunk_records_count,
-                    entity_fingerprint=current_entity_fingerprint,
-                    embedding_model=current_embedding_model,
-                )
                 prepare_seconds = time.perf_counter() - sync_start
                 return _PreparedEntityVectorSync(
                     entity_id=entity_id,
@@ -752,31 +720,6 @@ async def _prepare_entity_vector_jobs(self, entity_id: int) -> _PreparedEntityVe
                     row_id = upserted_ids_by_key[record["chunk_key"]]
                     embedding_jobs.append((row_id, record["chunk_text"]))
 
-            logger.info(
-                "Vector sync diff complete: project_id={project_id} entity_id={entity_id} "
-                "existing_chunks_count={existing_chunks_count} "
-                "stale_chunks_count={stale_chunks_count} "
-                "orphan_chunks_count={orphan_chunks_count} "
-                "chunks_skipped={chunks_skipped} "
-                "embedding_jobs_count={embedding_jobs_count} "
-                "pending_jobs_total={pending_jobs_total} shard_index={shard_index} "
-                "shard_count={shard_count} remaining_jobs_after_shard={remaining_jobs_after_shard} "
-                "oversized_entity={oversized_entity} entity_complete={entity_complete}",
-                project_id=self.project_id,
-                entity_id=entity_id,
-                existing_chunks_count=existing_chunks_count,
-                stale_chunks_count=stale_chunks_count,
-                orphan_chunks_count=orphan_chunks_count,
-                chunks_skipped=skipped_chunks_count,
-                embedding_jobs_count=len(embedding_jobs),
-                pending_jobs_total=shard_plan.pending_jobs_total,
-                shard_index=shard_plan.shard_index,
-                shard_count=shard_plan.shard_count,
-                remaining_jobs_after_shard=shard_plan.remaining_jobs_after_shard,
-                oversized_entity=shard_plan.oversized_entity,
-                entity_complete=shard_plan.entity_complete,
-            )
-
         prepare_seconds = time.perf_counter() - sync_start
         return _PreparedEntityVectorSync(
             entity_id=entity_id,