Skip to content

Commit ec9b2c4

Browse files
phernandez and claude committed
fix: strip NUL bytes from content before PostgreSQL search indexing
rclone preallocation on virtual filesystems (e.g. Google Drive File Stream) pads markdown files with \x00 bytes (rclone/rclone#6801), which PostgreSQL rejects with CharacterNotInRepertoireError during search indexing.

Three-pronged fix:
- 🛡️ Primary: _strip_nul() in SearchService.index_entity_markdown() sanitizes content_stems, content_snippet, and observation/relation content before building SearchIndexRow objects
- 🛡️ Secondary: _strip_nul_from_row() in PostgresSearchRepository.bulk_index_items() as a safety net before INSERT
- 🔧 Prevention: --local-no-preallocate flag added to rclone sync and bisync commands to prevent the padding at the source

Fixes #548

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent de7f15b commit ec9b2c4

File tree

6 files changed

+154
-9
lines changed

6 files changed

+154
-9
lines changed

src/basic_memory/cli/commands/cloud/rclone_commands.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,9 @@ def project_sync(
223223
*TIGRIS_CONSISTENCY_HEADERS,
224224
"--filter-from",
225225
str(filter_path),
226+
# Prevent NUL byte padding on virtual filesystems (e.g. Google Drive File Stream)
227+
# See: rclone/rclone#6801
228+
"--local-no-preallocate",
226229
]
227230

228231
if verbose:
@@ -299,6 +302,9 @@ def project_bisync(
299302
str(filter_path),
300303
"--workdir",
301304
str(state_path),
305+
# Prevent NUL byte padding on virtual filesystems (e.g. Google Drive File Stream)
306+
# See: rclone/rclone#6801
307+
"--local-no-preallocate",
302308
]
303309

304310
# Add --create-empty-src-dirs if rclone version supports it (v1.64+)

src/basic_memory/repository/postgres_search_repository.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,15 @@
1919
from basic_memory.schemas.search import SearchItemType
2020

2121

22+
def _strip_nul_from_row(row_data: dict) -> dict:
23+
"""Strip NUL bytes from all string values in a row dict.
24+
25+
Secondary defense: PostgreSQL text columns cannot store \\x00.
26+
Primary sanitization happens in SearchService.index_entity_markdown().
27+
"""
28+
return {k: v.replace("\x00", "") if isinstance(v, str) else v for k, v in row_data.items()}
29+
30+
2231
class PostgresSearchRepository(SearchRepositoryBase):
2332
"""PostgreSQL tsvector implementation of search repository.
2433
@@ -489,7 +498,7 @@ async def bulk_index_items(self, search_index_rows: List[SearchIndexRow]) -> Non
489498
for row in search_index_rows:
490499
insert_data = row.to_insert(serialize_json=True)
491500
insert_data["project_id"] = self.project_id
492-
insert_data_list.append(insert_data)
501+
insert_data_list.append(_strip_nul_from_row(insert_data))
493502

494503
# Use upsert to handle race conditions during parallel indexing
495504
# ON CONFLICT (permalink, project_id) matches the partial unique index

src/basic_memory/services/search_service.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,15 @@
2222
MAX_CONTENT_STEMS_SIZE = 6000
2323

2424

25+
def _strip_nul(value: str) -> str:
26+
"""Strip NUL bytes that PostgreSQL text columns cannot store.
27+
28+
rclone preallocation on virtual filesystems (e.g. Google Drive File Stream)
29+
can pad files with \\x00 bytes. See: rclone/rclone#6801
30+
"""
31+
return value.replace("\x00", "")
32+
33+
2534
def _mtime_to_datetime(entity: Entity) -> datetime:
2635
"""Convert entity mtime (file modification time) to datetime.
2736
@@ -297,7 +306,7 @@ async def index_entity_markdown(
297306
content = await self.file_service.read_entity_content(entity)
298307
if content:
299308
content_stems.append(content)
300-
content_snippet = f"{content[:250]}"
309+
content_snippet = _strip_nul(content[:250])
301310

302311
if entity.permalink:
303312
content_stems.extend(self._generate_variants(entity.permalink))
@@ -309,7 +318,7 @@ async def index_entity_markdown(
309318
if entity_tags:
310319
content_stems.extend(entity_tags)
311320

312-
entity_content_stems = "\n".join(p for p in content_stems if p and p.strip())
321+
entity_content_stems = _strip_nul("\n".join(p for p in content_stems if p and p.strip()))
313322

314323
# Truncate to stay under Postgres's 8KB index row limit
315324
if len(entity_content_stems) > MAX_CONTENT_STEMS_SIZE: # pragma: no cover
@@ -346,8 +355,8 @@ async def index_entity_markdown(
346355
seen_permalinks.add(obs_permalink)
347356

348357
# Index with parent entity's file path since that's where it's defined
349-
obs_content_stems = "\n".join(
350-
p for p in self._generate_variants(obs.content) if p and p.strip()
358+
obs_content_stems = _strip_nul(
359+
"\n".join(p for p in self._generate_variants(obs.content) if p and p.strip())
351360
)
352361
# Truncate to stay under Postgres's 8KB index row limit
353362
if len(obs_content_stems) > MAX_CONTENT_STEMS_SIZE: # pragma: no cover
@@ -358,7 +367,7 @@ async def index_entity_markdown(
358367
type=SearchItemType.OBSERVATION.value,
359368
title=f"{obs.category}: {obs.content[:100]}...",
360369
content_stems=obs_content_stems,
361-
content_snippet=obs.content,
370+
content_snippet=_strip_nul(obs.content),
362371
permalink=obs_permalink,
363372
file_path=entity.file_path,
364373
category=obs.category,
@@ -381,8 +390,8 @@ async def index_entity_markdown(
381390
else f"{rel.from_entity.title}"
382391
)
383392

384-
rel_content_stems = "\n".join(
385-
p for p in self._generate_variants(relation_title) if p and p.strip()
393+
rel_content_stems = _strip_nul(
394+
"\n".join(p for p in self._generate_variants(relation_title) if p and p.strip())
386395
)
387396
rows_to_index.append(
388397
SearchIndexRow(

tests/repository/test_postgres_search_repository.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88

99
import pytest
1010

11-
from basic_memory.repository.postgres_search_repository import PostgresSearchRepository
11+
from basic_memory.repository.postgres_search_repository import (
12+
PostgresSearchRepository,
13+
_strip_nul_from_row,
14+
)
1215
from basic_memory.repository.search_index_row import SearchIndexRow
1316
from basic_memory.schemas.search import SearchItemType
1417

@@ -197,3 +200,22 @@ async def test_postgres_search_repository_reraises_non_tsquery_db_errors(
197200
# Use a non-text query so the generated SQL doesn't include to_tsquery(),
198201
# ensuring we hit the generic "re-raise other db errors" branch.
199202
await repo.search(permalink="docs/anything")
203+
204+
205+
def test_strip_nul_from_row():
206+
"""_strip_nul_from_row strips NUL bytes from string values, leaves non-strings alone."""
207+
row = {
208+
"title": "hello\x00world",
209+
"content_stems": "some\x00content\x00here",
210+
"content_snippet": "clean",
211+
"id": 42,
212+
"metadata": None,
213+
"created_at": datetime(2024, 1, 1),
214+
}
215+
result = _strip_nul_from_row(row)
216+
assert result["title"] == "helloworld"
217+
assert result["content_stems"] == "somecontenthere"
218+
assert result["content_snippet"] == "clean"
219+
assert result["id"] == 42
220+
assert result["metadata"] is None
221+
assert result["created_at"] == datetime(2024, 1, 1)

tests/services/test_search_service.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from basic_memory import db
99
from basic_memory.schemas.search import SearchQuery, SearchItemType
10+
from basic_memory.services.search_service import _strip_nul
1011

1112

1213
@pytest.mark.asyncio
@@ -968,3 +969,59 @@ async def test_index_entity_multiple_categories_same_content(
968969
# Search for the shared content - should find both observations
969970
results = await search_service.search(SearchQuery(text="Shared content"))
970971
assert len(results) >= 2
972+
973+
974+
# Tests for NUL byte stripping
975+
976+
977+
def test_strip_nul_removes_nul_bytes():
978+
"""_strip_nul removes \\x00 from strings."""
979+
assert _strip_nul("hello\x00world") == "helloworld"
980+
assert _strip_nul("\x00\x00\x00") == ""
981+
assert _strip_nul("clean string") == "clean string"
982+
983+
984+
@pytest.mark.asyncio
985+
async def test_index_entity_markdown_strips_nul_bytes(search_service, session_maker, test_project):
986+
"""Content with NUL bytes should be stripped before indexing.
987+
988+
rclone preallocation on virtual filesystems (e.g. Google Drive File Stream)
989+
can pad files with \\x00 bytes, causing PostgreSQL CharacterNotInRepertoireError.
990+
"""
991+
from basic_memory.repository import EntityRepository, ObservationRepository
992+
from basic_memory.repository.search_repository import SearchRepository
993+
994+
entity_repo = EntityRepository(session_maker, project_id=test_project.id)
995+
obs_repo = ObservationRepository(session_maker, project_id=test_project.id)
996+
997+
entity_data = {
998+
"title": "NUL Test Entity",
999+
"entity_type": "note",
1000+
"entity_metadata": {},
1001+
"content_type": "text/markdown",
1002+
"file_path": "test/nul-test.md",
1003+
"permalink": "test/nul-test",
1004+
"project_id": test_project.id,
1005+
"created_at": datetime.now(),
1006+
"updated_at": datetime.now(),
1007+
}
1008+
entity = await entity_repo.create(entity_data)
1009+
1010+
# Add observation with NUL bytes
1011+
await obs_repo.create(
1012+
{"entity_id": entity.id, "category": "note", "content": "obs with\x00nul bytes"}
1013+
)
1014+
entity = await entity_repo.get_by_permalink("test/nul-test")
1015+
1016+
# Index with NUL-containing content
1017+
nul_content = "# NUL Test\x00\x00\nSome content\x00here"
1018+
await search_service.index_entity(entity, content=nul_content)
1019+
1020+
# Verify no NUL bytes in stored search index rows
1021+
search_repo: SearchRepository = search_service.repository
1022+
results = await search_repo.search(permalink_match="test/nul-test*")
1023+
for row in results:
1024+
if row.content_snippet:
1025+
assert "\x00" not in row.content_snippet, (
1026+
f"NUL found in content_snippet for {row.permalink}"
1027+
)

tests/test_rclone_commands.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,3 +511,45 @@ def test_supports_create_empty_src_dirs_false_for_unknown_version():
511511

512512
def test_min_rclone_version_constant():
513513
assert MIN_RCLONE_VERSION_EMPTY_DIRS == (1, 64, 0)
514+
515+
516+
def test_project_sync_includes_no_preallocate_flag(tmp_path):
517+
"""Sync command includes --local-no-preallocate to prevent NUL byte padding."""
518+
runner = _Runner(returncode=0)
519+
filter_path = _write_filter_file(tmp_path)
520+
project = SyncProject(name="research", path="/research", local_sync_path="/tmp/research")
521+
522+
project_sync(
523+
project,
524+
"my-bucket",
525+
run=runner,
526+
is_installed=lambda: True,
527+
filter_path=filter_path,
528+
)
529+
530+
cmd, _ = runner.calls[0]
531+
assert "--local-no-preallocate" in cmd
532+
533+
534+
def test_project_bisync_includes_no_preallocate_flag(tmp_path):
535+
"""Bisync command includes --local-no-preallocate to prevent NUL byte padding."""
536+
runner = _Runner(returncode=0)
537+
filter_path = _write_filter_file(tmp_path)
538+
state_path = tmp_path / "state"
539+
project = SyncProject(
540+
name="research", path="app/data/research", local_sync_path="/tmp/research"
541+
)
542+
543+
project_bisync(
544+
project,
545+
"my-bucket",
546+
run=runner,
547+
is_installed=lambda: True,
548+
version=(1, 64, 2),
549+
filter_path=filter_path,
550+
state_path=state_path,
551+
is_initialized=lambda _name: True,
552+
)
553+
554+
cmd, _ = runner.calls[0]
555+
assert "--local-no-preallocate" in cmd

0 commit comments

Comments (0)