Skip to content

Commit aad9c42

Browse files
committed
fix: Sanitize filenames and allow optional kebab case
1 parent 08ee7e1 commit aad9c42

6 files changed

Lines changed: 134 additions & 44 deletions

File tree

src/basic_memory/config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@ class BasicMemoryConfig(BaseSettings):
7474
description="Whether to sync changes in real time. default (True)",
7575
)
7676

77+
kebab_filenames: bool = Field(
78+
default=False,
79+
description="Format for generated filenames. False preserves spaces and special chars, True converts them to hyphens for consistency with permalinks",
80+
)
81+
7782
# API connection configuration
7883
api_url: Optional[str] = Field(
7984
default=None,

src/basic_memory/file_utils.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import hashlib
44
from pathlib import Path
5+
import re
56
from typing import Any, Dict, Union
67

78
import yaml
@@ -233,3 +234,21 @@ async def update_frontmatter(path: FilePath, updates: Dict[str, Any]) -> str:
233234
error=str(e),
234235
)
235236
raise FileError(f"Failed to update frontmatter: {e}")
237+
238+
239+
def sanitize_for_filename(text: str, replacement: str = "-") -> str:
    """
    Make a string usable as a filename by replacing problematic characters.

    POSIX and Windows path separators (``/`` and ``\\``) and the characters
    ``< > : " | ? *`` are each replaced with ``replacement``. Consecutive
    occurrences of ``replacement`` are then collapsed into a single one, and
    occurrences at the very start or end of the result are removed.

    Args:
        text: The raw string (for example a note title) to sanitize.
        replacement: String substituted for each problematic character.
            May be empty (the characters are simply dropped) or longer
            than one character. Defaults to ``"-"``.

    Returns:
        The sanitized string.
    """
    # A callable is used as the substitution so that ``replacement`` is always
    # inserted literally, even if it contains backslashes or group references.
    text = re.sub(r'[/\\<>:"|?*]', lambda _match: replacement, text)

    # With an empty replacement there is nothing to collapse or strip, and
    # building a "<empty>+" pattern would raise re.error.
    if not replacement:
        return text

    # Collapse repeated replacements. The non-capturing group makes "+" apply
    # to the whole replacement string rather than only its last character.
    text = re.sub(f"(?:{re.escape(replacement)})+", lambda _match: replacement, text)

    # Remove the replacement *string* from both ends. str.strip() would treat
    # it as a set of characters and could eat legitimate content when the
    # replacement is longer than one character.
    while text.startswith(replacement):
        text = text[len(replacement):]
    while text.endswith(replacement):
        text = text[: -len(replacement)]

    return text

src/basic_memory/schemas/base.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222

2323
from pydantic import BaseModel, BeforeValidator, Field, model_validator
2424

25+
from basic_memory.config import ConfigManager
26+
from basic_memory.file_utils import sanitize_for_filename
2527
from basic_memory.utils import generate_permalink
2628

2729

@@ -184,13 +186,35 @@ class Entity(BaseModel):
184186
default="text/markdown",
185187
)
186188

189+
@property
def safe_title(self) -> str:
    """
    The entity title, sanitized so it can be used as a filename.

    For example, a title of "Coupon Enable/Disable Feature" should create the file
    "Coupon Enable-Disable Feature.md" rather than a file named "Disable Feature.md"
    inside a "Coupon Enable" directory.

    The title is first passed through sanitize_for_filename. When the application
    config has kebab_filenames enabled, the sanitized title is additionally run
    through generate_permalink, so the result matches the permalink style
    (e.g. "Coupon Enable/Disable Feature" -> "coupon-enable-disable-feature").
    """
    sanitized = sanitize_for_filename(self.title)

    # The setting is read from the application config on every access.
    if not ConfigManager().config.kebab_filenames:
        return sanitized

    # NOTE(review): split_extension=False makes generate_permalink re-append
    # whatever os.path.splitext considers an extension; a title containing a
    # dot may therefore keep an un-kebabed tail -- confirm this is intended.
    return generate_permalink(file_path=sanitized, split_extension=False)
209+
187210
@property
def file_path(self):
    """Get the file path for this entity, built from its folder and sanitized title.

    Markdown entities (content_type "text/markdown") get a ".md" suffix; any
    other content type uses the sanitized title as-is. When a folder is set,
    the name is placed beneath it.
    """
    name = self.safe_title
    if self.content_type == "text/markdown":
        name = f"{name}.md"

    if self.folder:
        return f"{self.folder}/{name}"
    return name
194218

195219
@property
196220
def permalink(self) -> Permalink:

src/basic_memory/utils.py

Lines changed: 54 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def __str__(self) -> str: ...
2727
logging.getLogger("opentelemetry.sdk.metrics._internal.instrument").setLevel(logging.ERROR)
2828

2929

30-
def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
30+
def generate_permalink(file_path: Union[Path, str, PathLike], split_extension: bool = True) -> str:
3131
"""Generate a stable permalink from a file path.
3232
3333
Args:
@@ -50,53 +50,59 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
5050
# Convert Path to string if needed
5151
path_str = str(file_path)
5252

53-
# Remove extension
54-
base = os.path.splitext(path_str)[0]
53+
# Remove extension (for now, possibly)
54+
(base, extension) = os.path.splitext(path_str)
5555

5656
# Check if we have CJK characters that should be preserved
57-
# CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols),
57+
# CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols),
5858
# \u3400-\u4dbf (CJK Extension A), \uff00-\uffef (Fullwidth forms)
5959
has_cjk_chars = any(
60-
'\u4e00' <= char <= '\u9fff' or
61-
'\u3000' <= char <= '\u303f' or
62-
'\u3400' <= char <= '\u4dbf' or
63-
'\uff00' <= char <= '\uffef'
60+
"\u4e00" <= char <= "\u9fff"
61+
or "\u3000" <= char <= "\u303f"
62+
or "\u3400" <= char <= "\u4dbf"
63+
or "\uff00" <= char <= "\uffef"
6464
for char in base
6565
)
66-
66+
6767
if has_cjk_chars:
6868
# For text with CJK characters, selectively transliterate only Latin accented chars
6969
result = ""
7070
for char in base:
71-
if ('\u4e00' <= char <= '\u9fff' or
72-
'\u3000' <= char <= '\u303f' or
73-
'\u3400' <= char <= '\u4dbf'):
71+
if (
72+
"\u4e00" <= char <= "\u9fff"
73+
or "\u3000" <= char <= "\u303f"
74+
or "\u3400" <= char <= "\u4dbf"
75+
):
7476
# Preserve CJK ideographs and symbols
7577
result += char
76-
elif ('\uff00' <= char <= '\uffef'):
78+
elif "\uff00" <= char <= "\uffef":
7779
# Remove Chinese fullwidth punctuation entirely (like ,!?)
7880
continue
7981
else:
8082
# Transliterate Latin accented characters to ASCII
8183
result += unidecode(char)
82-
84+
8385
# Insert hyphens between CJK and Latin character transitions
8486
# Match: CJK followed by Latin letter/digit, or Latin letter/digit followed by CJK
85-
result = re.sub(r'([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])', r'\1-\2', result)
86-
result = re.sub(r'([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])', r'\1-\2', result)
87-
87+
result = re.sub(
88+
r"([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])", r"\1-\2", result
89+
)
90+
result = re.sub(
91+
r"([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])", r"\1-\2", result
92+
)
93+
8894
# Insert dash between camelCase
8995
result = re.sub(r"([a-z0-9])([A-Z])", r"\1-\2", result)
90-
96+
9197
# Convert ASCII letters to lowercase, preserve CJK
9298
lower_text = "".join(c.lower() if c.isascii() and c.isalpha() else c for c in result)
93-
99+
94100
# Replace underscores with hyphens
95101
text_with_hyphens = lower_text.replace("_", "-")
96-
102+
97103
# Remove apostrophes entirely (don't replace with hyphens)
98104
text_no_apostrophes = text_with_hyphens.replace("'", "")
99-
105+
100106
# Replace unsafe chars with hyphens, but preserve CJK characters
101107
clean_text = re.sub(
102108
r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]", "-", text_no_apostrophes
@@ -128,7 +134,13 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
128134
segments = clean_text.split("/")
129135
clean_segments = [s.strip("-") for s in segments]
130136

131-
return "/".join(clean_segments)
137+
return_val = "/".join(clean_segments)
138+
139+
# Append file extension back, if necessary
140+
if not split_extension and extension:
141+
return_val += extension
142+
143+
return return_val
132144

133145

134146
def setup_logging(
@@ -221,74 +233,74 @@ def parse_tags(tags: Union[List[str], str, None]) -> List[str]:
221233

222234
def normalize_file_path_for_comparison(file_path: str) -> str:
    """Return a canonical form of a file path for conflict detection.

    Two paths that differ only in letter case, Unicode composition, or
    path-separator style produce the same normalized string:

    - the path is lowercased
    - Unicode is decomposed with NFD normalization
    - backslashes become forward slashes
    - runs of forward slashes collapse to a single slash

    Args:
        file_path: The file path to normalize

    Returns:
        Normalized file path for comparison purposes
    """
    import unicodedata

    # Lowercase first, then decompose, matching the established order.
    decomposed = unicodedata.normalize("NFD", file_path.lower())

    # Unify separators before collapsing so mixed "\\/" runs also collapse.
    forward_slashed = decomposed.replace("\\", "/")
    return re.sub(r"/+", "/", forward_slashed)
251263

252264

253265
def detect_potential_file_conflicts(file_path: str, existing_paths: List[str]) -> List[str]:
    """Find existing paths that may collide with a given file path.

    An existing path is reported when it is not literally identical to
    ``file_path`` and either:

    - both paths are equal after normalize_file_path_for_comparison
      (case, Unicode normalization, and path separator differences), or
    - both paths produce the same permalink via generate_permalink.

    Args:
        file_path: The file path to check
        existing_paths: List of existing file paths to check against

    Returns:
        List of existing paths that might conflict with the given file path,
        in the order they appear in ``existing_paths``
    """
    # Compute both comparison keys for the input once, up front.
    target_normalized = normalize_file_path_for_comparison(file_path)
    target_permalink = generate_permalink(file_path)

    matches = []
    for candidate in existing_paths:
        # An identical path is the same file, not a conflict.
        if candidate == file_path:
            continue

        # The permalink is only generated when the normalized paths differ.
        if (
            normalize_file_path_for_comparison(candidate) == target_normalized
            or generate_permalink(candidate) == target_permalink
        ):
            matches.append(candidate)

    return matches
293305

294306

@@ -318,4 +330,4 @@ def validate_project_path(path: str, project_path: Path) -> bool:
318330
resolved = (project_path / path).resolve()
319331
return resolved.is_relative_to(project_path.resolve())
320332
except (ValueError, OSError):
321-
return False
333+
return False

tests/api/test_knowledge_router.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ async def test_get_entity_by_permalink(client: AsyncClient, project_url):
188188
# Verify retrieval
189189
assert response.status_code == 200
190190
entity = response.json()
191+
assert entity["title"] == "TestEntity"
191192
assert entity["file_path"] == "test/TestEntity.md"
192193
assert entity["entity_type"] == "test"
193194
assert entity["permalink"] == "test/test-entity"
@@ -209,6 +210,7 @@ async def test_get_entity_by_file_path(client: AsyncClient, project_url):
209210
# Verify retrieval
210211
assert response.status_code == 200
211212
entity = response.json()
213+
assert entity["title"] == "TestEntity"
212214
assert entity["file_path"] == "test/TestEntity.md"
213215
assert entity["entity_type"] == "test"
214216
assert entity["permalink"] == "test/test-entity"

tests/utils/test_file_utils.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from pathlib import Path
44

55
import pytest
6+
import random
7+
import string
68

79
from basic_memory.file_utils import (
810
FileError,
@@ -13,11 +15,24 @@
1315
has_frontmatter,
1416
parse_frontmatter,
1517
remove_frontmatter,
18+
sanitize_for_filename,
1619
update_frontmatter,
1720
write_file_atomic,
1821
)
1922

2023

24+
def get_random_word(length: int = 12, necessary_char: str | None = None) -> str:
25+
letters = string.ascii_lowercase
26+
word_chars = [random.choice(letters) for i in range(length)]
27+
28+
if necessary_char and length > 0:
29+
# Replace a character at a random position with the necessary character
30+
random_pos = random.randint(0, length - 1)
31+
word_chars[random_pos] = necessary_char
32+
33+
return "".join(word_chars)
34+
35+
2136
@pytest.mark.asyncio
2237
async def test_compute_checksum():
2338
"""Test checksum computation."""
@@ -241,3 +256,16 @@ async def test_update_frontmatter_errors(tmp_path: Path):
241256
nonexistent = tmp_path / "nonexistent" / "test.md"
242257
with pytest.raises(FileError):
243258
await update_frontmatter(nonexistent, {"title": "Test"})
259+
260+
261+
def test_sanitize_for_filename_removes_invalid_characters():
    """Every character sanitize_for_filename targets is absent from its output.

    This is a plain synchronous test, so it carries no asyncio marker.
    """
    # Path separators plus the other characters replaced by the sanitizer
    invalid_chars = '/\\<>:"|?*'

    # All invalid characters should be replaced, wherever they land in the word
    for char in invalid_chars:
        text = get_random_word(length=12, necessary_char=char)
        sanitized_text = sanitize_for_filename(text)

        assert char not in sanitized_text

0 commit comments

Comments
 (0)