Skip to content

Commit 08ee7e1

Browse files
jope-bmclaude[bot]
andauthored
fix: Critical search index bug - prevent note disappearing on edit (#257)
Signed-off-by: Joe P <joe@basicmemory.com> Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com> Co-authored-by: jope-bm <jope-bm@users.noreply.github.com>
1 parent 63ae9ee commit 08ee7e1

5 files changed

Lines changed: 796 additions & 34 deletions

File tree

src/basic_memory/mcp/tools/read_note.py

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,25 @@
11
"""Read note tool for Basic Memory MCP server."""
22

33
from textwrap import dedent
4+
from typing import Optional
45

56
from loguru import logger
67

78
from basic_memory.mcp.async_client import client
89
from basic_memory.mcp.server import mcp
910
from basic_memory.mcp.tools.search import search_notes
1011
from basic_memory.mcp.tools.utils import call_get
12+
from basic_memory.mcp.project_session import get_active_project
1113
from basic_memory.schemas.memory import memory_url_path
14+
from basic_memory.utils import validate_project_path
1215

1316

1417
@mcp.tool(
1518
description="Read a markdown note by title or permalink.",
1619
)
17-
async def read_note(identifier: str, page: int = 1, page_size: int = 10) -> str:
20+
async def read_note(
21+
identifier: str, page: int = 1, page_size: int = 10, project: Optional[str] = None
22+
) -> str:
1823
"""Read a markdown note from the knowledge base.
1924
2025
This tool finds and retrieves a note by its title, permalink, or content search,
@@ -26,6 +31,7 @@ async def read_note(identifier: str, page: int = 1, page_size: int = 10) -> str:
2631
Can be a full memory:// URL, a permalink, a title, or search text
2732
page: Page number for paginated results (default: 1)
2833
page_size: Number of items per page (default: 10)
34+
project: Optional project name to read from. If not provided, uses current active project.
2935
3036
Returns:
3137
The full markdown content of the note if found, or helpful guidance if not found.
@@ -42,10 +48,41 @@ async def read_note(identifier: str, page: int = 1, page_size: int = 10) -> str:
4248
4349
# Read with pagination
4450
read_note("Project Updates", page=2, page_size=5)
51+
52+
# Read from specific project
53+
read_note("Meeting Notes", project="work-project")
4554
"""
55+
56+
# Get the active project first to check project-specific sync status
57+
active_project = get_active_project(project)
58+
59+
# Validate identifier to prevent path traversal attacks
60+
# We need to check both the raw identifier and the processed path
61+
processed_path = memory_url_path(identifier)
62+
project_path = active_project.home
63+
64+
if not validate_project_path(identifier, project_path) or not validate_project_path(processed_path, project_path):
65+
logger.warning(
66+
"Attempted path traversal attack blocked",
67+
identifier=identifier,
68+
processed_path=processed_path,
69+
project=active_project.name,
70+
)
71+
return f"# Error\n\nIdentifier '{identifier}' is not allowed - paths must stay within project boundaries"
72+
73+
# Check migration status and wait briefly if needed
74+
from basic_memory.mcp.tools.utils import wait_for_migration_or_return_status
75+
76+
migration_status = await wait_for_migration_or_return_status(
77+
timeout=5.0, project_name=active_project.name
78+
)
79+
if migration_status: # pragma: no cover
80+
return f"# System Status\n\n{migration_status}\n\nPlease wait for migration to complete before reading notes."
81+
project_url = active_project.project_url
82+
4683
# Get the file via REST API - first try direct permalink lookup
4784
entity_path = memory_url_path(identifier)
48-
path = f"/resource/{entity_path}"
85+
path = f"{project_url}/resource/{entity_path}"
4986
logger.info(f"Attempting to read note from URL: {path}")
5087

5188
try:
@@ -62,14 +99,14 @@ async def read_note(identifier: str, page: int = 1, page_size: int = 10) -> str:
6299

63100
# Fallback 1: Try title search via API
64101
logger.info(f"Search title for: {identifier}")
65-
title_results = await search_notes(query=identifier, search_type="title")
102+
title_results = await search_notes.fn(query=identifier, search_type="title", project=project)
66103

67104
if title_results and title_results.results:
68105
result = title_results.results[0] # Get the first/best match
69106
if result.permalink:
70107
try:
71108
# Try to fetch the content using the found permalink
72-
path = f"/resource/{result.permalink}"
109+
path = f"{project_url}/resource/{result.permalink}"
73110
response = await call_get(
74111
client, path, params={"page": page, "page_size": page_size}
75112
)
@@ -86,7 +123,7 @@ async def read_note(identifier: str, page: int = 1, page_size: int = 10) -> str:
86123

87124
# Fallback 2: Text search as a last resort
88125
logger.info(f"Title search failed, trying text search for: {identifier}")
89-
text_results = await search_notes(query=identifier, search_type="text")
126+
text_results = await search_notes.fn(query=identifier, search_type="text", project=project)
90127

91128
# We didn't find a direct match, construct a helpful error message
92129
if not text_results or not text_results.results:

src/basic_memory/repository/search_repository.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -523,8 +523,8 @@ async def index_item(
523523
async with db.scoped_session(self.session_maker) as session:
524524
# Delete existing record if any
525525
await session.execute(
526-
text("DELETE FROM search_index WHERE permalink = :permalink"),
527-
{"permalink": search_index_row.permalink},
526+
text("DELETE FROM search_index WHERE permalink = :permalink AND project_id = :project_id"),
527+
{"permalink": search_index_row.permalink, "project_id": self.project_id},
528528
)
529529

530530
# Prepare data for insert with project_id

src/basic_memory/utils.py

Lines changed: 142 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -53,26 +53,53 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
5353
# Remove extension
5454
base = os.path.splitext(path_str)[0]
5555

56-
# Check if we have non-ASCII characters that should be preserved
57-
has_non_ascii = any(ord(char) > 127 for char in base)
56+
# Check if we have CJK characters that should be preserved
57+
# CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols),
58+
# \u3400-\u4dbf (CJK Extension A), \uff00-\uffef (Fullwidth forms)
59+
has_cjk_chars = any(
60+
'\u4e00' <= char <= '\u9fff' or
61+
'\u3000' <= char <= '\u303f' or
62+
'\u3400' <= char <= '\u4dbf' or
63+
'\uff00' <= char <= '\uffef'
64+
for char in base
65+
)
5866

59-
if has_non_ascii:
60-
# Preserve non-ASCII characters like Chinese while still processing ASCII parts
61-
result = base
67+
if has_cjk_chars:
68+
# For text with CJK characters, selectively transliterate only Latin accented chars
69+
result = ""
70+
for char in base:
71+
if ('\u4e00' <= char <= '\u9fff' or
72+
'\u3000' <= char <= '\u303f' or
73+
'\u3400' <= char <= '\u4dbf'):
74+
# Preserve CJK ideographs and symbols
75+
result += char
76+
elif ('\uff00' <= char <= '\uffef'):
77+
# Remove Chinese fullwidth punctuation entirely (like ,!?)
78+
continue
79+
else:
80+
# Transliterate Latin accented characters to ASCII
81+
result += unidecode(char)
82+
83+
# Insert hyphens between CJK and Latin character transitions
84+
# Match: CJK followed by Latin letter/digit, or Latin letter/digit followed by CJK
85+
result = re.sub(r'([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])', r'\1-\2', result)
86+
result = re.sub(r'([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])', r'\1-\2', result)
6287

6388
# Insert dash between camelCase
6489
result = re.sub(r"([a-z0-9])([A-Z])", r"\1-\2", result)
6590

66-
# Convert only ASCII letters to lowercase, preserve non-ASCII
91+
# Convert ASCII letters to lowercase, preserve CJK
6792
lower_text = "".join(c.lower() if c.isascii() and c.isalpha() else c for c in result)
6893

6994
# Replace underscores with hyphens
7095
text_with_hyphens = lower_text.replace("_", "-")
7196

72-
# Replace spaces and unsafe ASCII chars with hyphens, preserve non-ASCII chars
73-
# Includes Chinese character ranges (CJK Unified Ideographs, CJK symbols, etc.)
97+
# Remove apostrophes entirely (don't replace with hyphens)
98+
text_no_apostrophes = text_with_hyphens.replace("'", "")
99+
100+
# Replace unsafe chars with hyphens, but preserve CJK characters
74101
clean_text = re.sub(
75-
r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]", "-", text_with_hyphens
102+
r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]", "-", text_no_apostrophes
76103
)
77104
else:
78105
# Original ASCII-only processing for backward compatibility
@@ -88,8 +115,11 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
88115
# replace underscores with hyphens
89116
text_with_hyphens = lower_text.replace("_", "-")
90117

118+
# Remove apostrophes entirely (don't replace with hyphens)
119+
text_no_apostrophes = text_with_hyphens.replace("'", "")
120+
91121
# Replace remaining invalid chars with hyphens
92-
clean_text = re.sub(r"[^a-z0-9/\-]", "-", text_with_hyphens)
122+
clean_text = re.sub(r"[^a-z0-9/\-]", "-", text_no_apostrophes)
93123

94124
# Collapse multiple hyphens
95125
clean_text = re.sub(r"-+", "-", clean_text)
@@ -187,3 +217,105 @@ def parse_tags(tags: Union[List[str], str, None]) -> List[str]:
187217
except (ValueError, TypeError): # pragma: no cover
188218
logger.warning(f"Couldn't parse tags from input of type {type(tags)}: {tags}")
189219
return []
220+
221+
222+
def normalize_file_path_for_comparison(file_path: str) -> str:
223+
"""Normalize a file path for conflict detection.
224+
225+
This function normalizes file paths to help detect potential conflicts:
226+
- Converts to lowercase for case-insensitive comparison
227+
- Normalizes Unicode characters
228+
- Handles path separators consistently
229+
230+
Args:
231+
file_path: The file path to normalize
232+
233+
Returns:
234+
Normalized file path for comparison purposes
235+
"""
236+
import unicodedata
237+
238+
# Convert to lowercase for case-insensitive comparison
239+
normalized = file_path.lower()
240+
241+
# Normalize Unicode characters (NFD normalization)
242+
normalized = unicodedata.normalize('NFD', normalized)
243+
244+
# Replace path separators with forward slashes
245+
normalized = normalized.replace('\\', '/')
246+
247+
# Remove multiple slashes
248+
normalized = re.sub(r'/+', '/', normalized)
249+
250+
return normalized
251+
252+
253+
def detect_potential_file_conflicts(file_path: str, existing_paths: List[str]) -> List[str]:
254+
"""Detect potential conflicts between a file path and existing paths.
255+
256+
This function checks for various types of conflicts:
257+
- Case sensitivity differences
258+
- Unicode normalization differences
259+
- Path separator differences
260+
- Permalink generation conflicts
261+
262+
Args:
263+
file_path: The file path to check
264+
existing_paths: List of existing file paths to check against
265+
266+
Returns:
267+
List of existing paths that might conflict with the given file path
268+
"""
269+
conflicts = []
270+
271+
# Normalize the input file path
272+
normalized_input = normalize_file_path_for_comparison(file_path)
273+
input_permalink = generate_permalink(file_path)
274+
275+
for existing_path in existing_paths:
276+
# Skip identical paths
277+
if existing_path == file_path:
278+
continue
279+
280+
# Check for case-insensitive path conflicts
281+
normalized_existing = normalize_file_path_for_comparison(existing_path)
282+
if normalized_input == normalized_existing:
283+
conflicts.append(existing_path)
284+
continue
285+
286+
# Check for permalink conflicts
287+
existing_permalink = generate_permalink(existing_path)
288+
if input_permalink == existing_permalink:
289+
conflicts.append(existing_path)
290+
continue
291+
292+
return conflicts
293+
294+
295+
def validate_project_path(path: str, project_path: Path) -> bool:
296+
"""Ensure path stays within project boundaries."""
297+
# Allow empty strings as they resolve to the project root
298+
if not path:
299+
return True
300+
301+
# Check for obvious path traversal patterns first
302+
if ".." in path or "~" in path:
303+
return False
304+
305+
# Check for Windows-style path traversal (even on Unix systems)
306+
if "\\.." in path or path.startswith("\\"):
307+
return False
308+
309+
# Block absolute paths (Unix-style starting with / or Windows-style with drive letters)
310+
if path.startswith("/") or (len(path) >= 2 and path[1] == ":"):
311+
return False
312+
313+
# Block paths with control characters (but allow whitespace that will be stripped)
314+
if path.strip() and any(ord(c) < 32 and c not in [" ", "\t"] for c in path):
315+
return False
316+
317+
try:
318+
resolved = (project_path / path).resolve()
319+
return resolved.is_relative_to(project_path.resolve())
320+
except (ValueError, OSError):
321+
return False

0 commit comments

Comments
 (0)