Skip to content

Commit ec9b2c4

Browse files
phernandez and claude committed
fix: strip NUL bytes from content before PostgreSQL search indexing
rclone preallocation on virtual filesystems (e.g. Google Drive File Stream) pads markdown files with \x00 bytes (rclone/rclone#6801), which PostgreSQL rejects with CharacterNotInRepertoireError during search indexing.

Three-pronged fix:
- 🛡️ Primary: _strip_nul() in SearchService.index_entity_markdown() sanitizes content_stems, content_snippet, and observation/relation content before building SearchIndexRow objects
- 🛡️ Secondary: _strip_nul_from_row() in PostgresSearchRepository.bulk_index_items() as a safety net before INSERT
- 🔧 Prevention: --local-no-preallocate flag added to rclone sync and bisync commands to prevent the padding at the source

Fixes #548

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent de7f15b commit ec9b2c4

File tree

6 files changed

+154
-9
lines changed

6 files changed

+154
-9
lines changed

src/basic_memory/cli/commands/cloud/rclone_commands.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,9 @@ def project_sync(
223223
*TIGRIS_CONSISTENCY_HEADERS,
224224
"--filter-from",
225225
str(filter_path),
226+
# Prevent NUL byte padding on virtual filesystems (e.g. Google Drive File Stream)
227+
# See: rclone/rclone#6801
228+
"--local-no-preallocate",
226229
]
227230

228231
if verbose:
@@ -299,6 +302,9 @@ def project_bisync(
299302
str(filter_path),
300303
"--workdir",
301304
str(state_path),
305+
# Prevent NUL byte padding on virtual filesystems (e.g. Google Drive File Stream)
306+
# See: rclone/rclone#6801
307+
"--local-no-preallocate",
302308
]
303309

304310
# Add --create-empty-src-dirs if rclone version supports it (v1.64+)

src/basic_memory/repository/postgres_search_repository.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,15 @@
1919
from basic_memory.schemas.search import SearchItemType
2020

2121

22+
def _strip_nul_from_row(row_data: dict) -> dict:
23+
"""Strip NUL bytes from all string values in a row dict.
24+
25+
Secondary defense: PostgreSQL text columns cannot store \\x00.
26+
Primary sanitization happens in SearchService.index_entity_markdown().
27+
"""
28+
return {k: v.replace("\x00", "") if isinstance(v, str) else v for k, v in row_data.items()}
29+
30+
2231
class PostgresSearchRepository(SearchRepositoryBase):
2332
"""PostgreSQL tsvector implementation of search repository.
2433
@@ -489,7 +498,7 @@ async def bulk_index_items(self, search_index_rows: List[SearchIndexRow]) -> Non
489498
for row in search_index_rows:
490499
insert_data = row.to_insert(serialize_json=True)
491500
insert_data["project_id"] = self.project_id
492-
insert_data_list.append(insert_data)
501+
insert_data_list.append(_strip_nul_from_row(insert_data))
493502

494503
# Use upsert to handle race conditions during parallel indexing
495504
# ON CONFLICT (permalink, project_id) matches the partial unique index

src/basic_memory/services/search_service.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,15 @@
2222
MAX_CONTENT_STEMS_SIZE = 6000
2323

2424

25+
def _strip_nul(value: str) -> str:
26+
"""Strip NUL bytes that PostgreSQL text columns cannot store.
27+
28+
rclone preallocation on virtual filesystems (e.g. Google Drive File Stream)
29+
can pad files with \\x00 bytes. See: rclone/rclone#6801
30+
"""
31+
return value.replace("\x00", "")
32+
33+
2534
def _mtime_to_datetime(entity: Entity) -> datetime:
2635
"""Convert entity mtime (file modification time) to datetime.
2736
@@ -297,7 +306,7 @@ async def index_entity_markdown(
297306
content = await self.file_service.read_entity_content(entity)
298307
if content:
299308
content_stems.append(content)
300-
content_snippet = f"{content[:250]}"
309+
content_snippet = _strip_nul(content[:250])
301310

302311
if entity.permalink:
303312
content_stems.extend(self._generate_variants(entity.permalink))
@@ -309,7 +318,7 @@ async def index_entity_markdown(
309318
if entity_tags:
310319
content_stems.extend(entity_tags)
311320

312-
entity_content_stems = "\n".join(p for p in content_stems if p and p.strip())
321+
entity_content_stems = _strip_nul("\n".join(p for p in content_stems if p and p.strip()))
313322

314323
# Truncate to stay under Postgres's 8KB index row limit
315324
if len(entity_content_stems) > MAX_CONTENT_STEMS_SIZE: # pragma: no cover
@@ -346,8 +355,8 @@ async def index_entity_markdown(
346355
seen_permalinks.add(obs_permalink)
347356

348357
# Index with parent entity's file path since that's where it's defined
349-
obs_content_stems = "\n".join(
350-
p for p in self._generate_variants(obs.content) if p and p.strip()
358+
obs_content_stems = _strip_nul(
359+
"\n".join(p for p in self._generate_variants(obs.content) if p and p.strip())
351360
)
352361
# Truncate to stay under Postgres's 8KB index row limit
353362
if len(obs_content_stems) > MAX_CONTENT_STEMS_SIZE: # pragma: no cover
@@ -358,7 +367,7 @@ async def index_entity_markdown(
358367
type=SearchItemType.OBSERVATION.value,
359368
title=f"{obs.category}: {obs.content[:100]}...",
360369
content_stems=obs_content_stems,
361-
content_snippet=obs.content,
370+
content_snippet=_strip_nul(obs.content),
362371
permalink=obs_permalink,
363372
file_path=entity.file_path,
364373
category=obs.category,
@@ -381,8 +390,8 @@ async def index_entity_markdown(
381390
else f"{rel.from_entity.title}"
382391
)
383392

384-
rel_content_stems = "\n".join(
385-
p for p in self._generate_variants(relation_title) if p and p.strip()
393+
rel_content_stems = _strip_nul(
394+
"\n".join(p for p in self._generate_variants(relation_title) if p and p.strip())
386395
)
387396
rows_to_index.append(
388397
SearchIndexRow(

tests/repository/test_postgres_search_repository.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88

99
import pytest
1010

11-
from basic_memory.repository.postgres_search_repository import PostgresSearchRepository
11+
from basic_memory.repository.postgres_search_repository import (
12+
PostgresSearchRepository,
13+
_strip_nul_from_row,
14+
)
1215
from basic_memory.repository.search_index_row import SearchIndexRow
1316
from basic_memory.schemas.search import SearchItemType
1417

@@ -197,3 +200,22 @@ async def test_postgres_search_repository_reraises_non_tsquery_db_errors(
197200
# Use a non-text query so the generated SQL doesn't include to_tsquery(),
198201
# ensuring we hit the generic "re-raise other db errors" branch.
199202
await repo.search(permalink="docs/anything")
203+
204+
205+
def test_strip_nul_from_row():
206+
"""_strip_nul_from_row strips NUL bytes from string values, leaves non-strings alone."""
207+
row = {
208+
"title": "hello\x00world",
209+
"content_stems": "some\x00content\x00here",
210+
"content_snippet": "clean",
211+
"id": 42,
212+
"metadata": None,
213+
"created_at": datetime(2024, 1, 1),
214+
}
215+
result = _strip_nul_from_row(row)
216+
assert result["title"] == "helloworld"
217+
assert result["content_stems"] == "somecontenthere"
218+
assert result["content_snippet"] == "clean"
219+
assert result["id"] == 42
220+
assert result["metadata"] is None
221+
assert result["created_at"] == datetime(2024, 1, 1)

tests/services/test_search_service.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from basic_memory import db
99
from basic_memory.schemas.search import SearchQuery, SearchItemType
10+
from basic_memory.services.search_service import _strip_nul
1011

1112

1213
@pytest.mark.asyncio
@@ -968,3 +969,59 @@ async def test_index_entity_multiple_categories_same_content(
968969
# Search for the shared content - should find both observations
969970
results = await search_service.search(SearchQuery(text="Shared content"))
970971
assert len(results) >= 2
972+
973+
974+
# Tests for NUL byte stripping
975+
976+
977+
def test_strip_nul_removes_nul_bytes():
978+
"""_strip_nul removes \\x00 from strings."""
979+
assert _strip_nul("hello\x00world") == "helloworld"
980+
assert _strip_nul("\x00\x00\x00") == ""
981+
assert _strip_nul("clean string") == "clean string"
982+
983+
984+
@pytest.mark.asyncio
985+
async def test_index_entity_markdown_strips_nul_bytes(search_service, session_maker, test_project):
986+
"""Content with NUL bytes should be stripped before indexing.
987+
988+
rclone preallocation on virtual filesystems (e.g. Google Drive File Stream)
989+
can pad files with \\x00 bytes, causing PostgreSQL CharacterNotInRepertoireError.
990+
"""
991+
from basic_memory.repository import EntityRepository, ObservationRepository
992+
from basic_memory.repository.search_repository import SearchRepository
993+
994+
entity_repo = EntityRepository(session_maker, project_id=test_project.id)
995+
obs_repo = ObservationRepository(session_maker, project_id=test_project.id)
996+
997+
entity_data = {
998+
"title": "NUL Test Entity",
999+
"entity_type": "note",
1000+
"entity_metadata": {},
1001+
"content_type": "text/markdown",
1002+
"file_path": "test/nul-test.md",
1003+
"permalink": "test/nul-test",
1004+
"project_id": test_project.id,
1005+
"created_at": datetime.now(),
1006+
"updated_at": datetime.now(),
1007+
}
1008+
entity = await entity_repo.create(entity_data)
1009+
1010+
# Add observation with NUL bytes
1011+
await obs_repo.create(
1012+
{"entity_id": entity.id, "category": "note", "content": "obs with\x00nul bytes"}
1013+
)
1014+
entity = await entity_repo.get_by_permalink("test/nul-test")
1015+
1016+
# Index with NUL-containing content
1017+
nul_content = "# NUL Test\x00\x00\nSome content\x00here"
1018+
await search_service.index_entity(entity, content=nul_content)
1019+
1020+
# Verify no NUL bytes in stored search index rows
1021+
search_repo: SearchRepository = search_service.repository
1022+
results = await search_repo.search(permalink_match="test/nul-test*")
1023+
for row in results:
1024+
if row.content_snippet:
1025+
assert "\x00" not in row.content_snippet, (
1026+
f"NUL found in content_snippet for {row.permalink}"
1027+
)

tests/test_rclone_commands.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,3 +511,45 @@ def test_supports_create_empty_src_dirs_false_for_unknown_version():
511511

512512
def test_min_rclone_version_constant():
513513
assert MIN_RCLONE_VERSION_EMPTY_DIRS == (1, 64, 0)
514+
515+
516+
def test_project_sync_includes_no_preallocate_flag(tmp_path):
517+
"""Sync command includes --local-no-preallocate to prevent NUL byte padding."""
518+
runner = _Runner(returncode=0)
519+
filter_path = _write_filter_file(tmp_path)
520+
project = SyncProject(name="research", path="/research", local_sync_path="/tmp/research")
521+
522+
project_sync(
523+
project,
524+
"my-bucket",
525+
run=runner,
526+
is_installed=lambda: True,
527+
filter_path=filter_path,
528+
)
529+
530+
cmd, _ = runner.calls[0]
531+
assert "--local-no-preallocate" in cmd
532+
533+
534+
def test_project_bisync_includes_no_preallocate_flag(tmp_path):
535+
"""Bisync command includes --local-no-preallocate to prevent NUL byte padding."""
536+
runner = _Runner(returncode=0)
537+
filter_path = _write_filter_file(tmp_path)
538+
state_path = tmp_path / "state"
539+
project = SyncProject(
540+
name="research", path="app/data/research", local_sync_path="/tmp/research"
541+
)
542+
543+
project_bisync(
544+
project,
545+
"my-bucket",
546+
run=runner,
547+
is_installed=lambda: True,
548+
version=(1, 64, 2),
549+
filter_path=filter_path,
550+
state_path=state_path,
551+
is_initialized=lambda _name: True,
552+
)
553+
554+
cmd, _ = runner.calls[0]
555+
assert "--local-no-preallocate" in cmd

0 commit comments

Comments (0)