fix: Sanitize filenames and allow optional kebab case

bdmayes · bdmayes · commit aad9c420eafd · 2025-08-19T23:10:31.000-04:00
diff --git a/src/basic_memory/config.py b/src/basic_memory/config.py
@@ -74,6 +74,11 @@ class BasicMemoryConfig(BaseSettings):
         description="Whether to sync changes in real time. default (True)",
     )
 
+    kebab_filenames: bool = Field(
+        default=False,
+        description="Format for generated filenames. False preserves spaces and special chars, True converts them to hyphens for consistency with permalinks",
+    )
+
     # API connection configuration
     api_url: Optional[str] = Field(
         default=None,
diff --git a/src/basic_memory/file_utils.py b/src/basic_memory/file_utils.py
@@ -2,6 +2,7 @@
 
 import hashlib
 from pathlib import Path
+import re
 from typing import Any, Dict, Union
 
 import yaml
@@ -233,3 +234,21 @@ async def update_frontmatter(path: FilePath, updates: Dict[str, Any]) -> str:
             error=str(e),
         )
         raise FileError(f"Failed to update frontmatter: {e}")
+
+
+def sanitize_for_filename(text: str, replacement: str = "-") -> str:
+    """Return ``text`` with characters that are unsafe in a file name replaced.
+
+    POSIX/Windows path separators and ``< > : " | ? *`` each become
+    ``replacement``; runs of it collapse to one and the ends are stripped.
+    """
+    # replace path separators and the other problematic chars in one pass
+    text = re.sub(r'[/\\<>:"|?*]', replacement, text)
+
+    if replacement:
+        # Group the escaped replacement so "+" repeats the whole string rather
+        # than only its last character. Skipped for an empty replacement, where
+        # the pattern would be a bare "+" and re.sub would raise re.error.
+        text = re.sub(f"(?:{re.escape(replacement)})+", replacement, text)
+
+    return text.strip(replacement)
diff --git a/src/basic_memory/schemas/base.py b/src/basic_memory/schemas/base.py
@@ -22,6 +22,8 @@
 
 from pydantic import BaseModel, BeforeValidator, Field, model_validator
 
+from basic_memory.config import ConfigManager
+from basic_memory.file_utils import sanitize_for_filename
 from basic_memory.utils import generate_permalink
 
 
@@ -184,13 +186,35 @@ class Entity(BaseModel):
         default="text/markdown",
     )
 
+    @property
+    def safe_title(self) -> str:
+        """
+        A sanitized version of the title, which is safe for use on the filesystem. For example,
+        a title of "Coupon Enable/Disable Feature" should create the file as "Coupon Enable-Disable Feature.md"
+        instead of creating a file named "Disable Feature.md" beneath the "Coupon Enable" directory.
+
+        Replaces POSIX and/or Windows style slashes as well as a few other characters that are not safe for filenames.
+        If kebab_filenames is True, then behavior is consistent with transformation used when generating permalink
+        strings (e.g. "Coupon Enable/Disable Feature" -> "coupon-enable-disable-feature").
+        """
+        fixed_title = sanitize_for_filename(self.title)
+
+        app_config = ConfigManager().config
+        use_kebab_case = app_config.kebab_filenames
+        # NOTE(review): text after the last "." is re-appended unmodified (split_extension=False); confirm for dotted titles
+        if use_kebab_case:
+            fixed_title = generate_permalink(file_path=fixed_title, split_extension=False)
+
+        return fixed_title
+
     @property
     def file_path(self):
         """Get the file path for this entity based on its permalink."""
+        safe_title = self.safe_title
         if self.content_type == "text/markdown":
-            return f"{self.folder}/{self.title}.md" if self.folder else f"{self.title}.md"
+            return f"{self.folder}/{safe_title}.md" if self.folder else f"{safe_title}.md"
         else:
-            return f"{self.folder}/{self.title}" if self.folder else self.title
+            return f"{self.folder}/{safe_title}" if self.folder else safe_title
 
     @property
     def permalink(self) -> Permalink:
diff --git a/src/basic_memory/utils.py b/src/basic_memory/utils.py
@@ -27,7 +27,7 @@ def __str__(self) -> str: ...
 logging.getLogger("opentelemetry.sdk.metrics._internal.instrument").setLevel(logging.ERROR)
 
 
-def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
+def generate_permalink(file_path: Union[Path, str, PathLike], split_extension: bool = True) -> str:
     """Generate a stable permalink from a file path.
 
     Args:
@@ -50,53 +50,59 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
     # Convert Path to string if needed
     path_str = str(file_path)
 
-    # Remove extension
-    base = os.path.splitext(path_str)[0]
+    # Split off the extension; it is re-appended at the end when split_extension is False
+    (base, extension) = os.path.splitext(path_str)
 
     # Check if we have CJK characters that should be preserved
-    # CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols), 
+    # CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols),
     # \u3400-\u4dbf (CJK Extension A), \uff00-\uffef (Fullwidth forms)
     has_cjk_chars = any(
-        '\u4e00' <= char <= '\u9fff' or 
-        '\u3000' <= char <= '\u303f' or 
-        '\u3400' <= char <= '\u4dbf' or
-        '\uff00' <= char <= '\uffef'
+        "\u4e00" <= char <= "\u9fff"
+        or "\u3000" <= char <= "\u303f"
+        or "\u3400" <= char <= "\u4dbf"
+        or "\uff00" <= char <= "\uffef"
         for char in base
     )
-    
+
     if has_cjk_chars:
         # For text with CJK characters, selectively transliterate only Latin accented chars
         result = ""
         for char in base:
-            if ('\u4e00' <= char <= '\u9fff' or 
-                '\u3000' <= char <= '\u303f' or 
-                '\u3400' <= char <= '\u4dbf'):
+            if (
+                "\u4e00" <= char <= "\u9fff"
+                or "\u3000" <= char <= "\u303f"
+                or "\u3400" <= char <= "\u4dbf"
+            ):
                 # Preserve CJK ideographs and symbols
                 result += char
-            elif ('\uff00' <= char <= '\uffef'):
+            elif "\uff00" <= char <= "\uffef":
                 # Remove Chinese fullwidth punctuation entirely (like ，！？)
                 continue
             else:
                 # Transliterate Latin accented characters to ASCII
                 result += unidecode(char)
-        
+
         # Insert hyphens between CJK and Latin character transitions
         # Match: CJK followed by Latin letter/digit, or Latin letter/digit followed by CJK
-        result = re.sub(r'([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])', r'\1-\2', result)
-        result = re.sub(r'([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])', r'\1-\2', result)
-        
+        result = re.sub(
+            r"([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])", r"\1-\2", result
+        )
+        result = re.sub(
+            r"([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])", r"\1-\2", result
+        )
+
         # Insert dash between camelCase
         result = re.sub(r"([a-z0-9])([A-Z])", r"\1-\2", result)
-        
+
         # Convert ASCII letters to lowercase, preserve CJK
         lower_text = "".join(c.lower() if c.isascii() and c.isalpha() else c for c in result)
-        
+
         # Replace underscores with hyphens
         text_with_hyphens = lower_text.replace("_", "-")
-        
+
         # Remove apostrophes entirely (don't replace with hyphens)
         text_no_apostrophes = text_with_hyphens.replace("'", "")
-        
+
         # Replace unsafe chars with hyphens, but preserve CJK characters
         clean_text = re.sub(
             r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]", "-", text_no_apostrophes
@@ -128,7 +134,13 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
     segments = clean_text.split("/")
     clean_segments = [s.strip("-") for s in segments]
 
-    return "/".join(clean_segments)
+    return_val = "/".join(clean_segments)
+
+    # Append file extension back, if necessary
+    if not split_extension and extension:
+        return_val += extension
+
+    return return_val
 
 
 def setup_logging(
@@ -221,74 +233,74 @@ def parse_tags(tags: Union[List[str], str, None]) -> List[str]:
 
 def normalize_file_path_for_comparison(file_path: str) -> str:
     """Normalize a file path for conflict detection.
-    
+
     This function normalizes file paths to help detect potential conflicts:
     - Converts to lowercase for case-insensitive comparison
     - Normalizes Unicode characters
     - Handles path separators consistently
-    
+
     Args:
         file_path: The file path to normalize
-        
+
     Returns:
         Normalized file path for comparison purposes
     """
     import unicodedata
-    
+
     # Convert to lowercase for case-insensitive comparison
     normalized = file_path.lower()
-    
+
     # Normalize Unicode characters (NFD normalization)
-    normalized = unicodedata.normalize('NFD', normalized)
-    
+    normalized = unicodedata.normalize("NFD", normalized)
+
     # Replace path separators with forward slashes
-    normalized = normalized.replace('\\', '/')
-    
+    normalized = normalized.replace("\\", "/")
+
     # Remove multiple slashes
-    normalized = re.sub(r'/+', '/', normalized)
-    
+    normalized = re.sub(r"/+", "/", normalized)
+
     return normalized
 
 
 def detect_potential_file_conflicts(file_path: str, existing_paths: List[str]) -> List[str]:
     """Detect potential conflicts between a file path and existing paths.
-    
+
     This function checks for various types of conflicts:
     - Case sensitivity differences
     - Unicode normalization differences
     - Path separator differences
     - Permalink generation conflicts
-    
+
     Args:
         file_path: The file path to check
         existing_paths: List of existing file paths to check against
-        
+
     Returns:
         List of existing paths that might conflict with the given file path
     """
     conflicts = []
-    
+
     # Normalize the input file path
     normalized_input = normalize_file_path_for_comparison(file_path)
     input_permalink = generate_permalink(file_path)
-    
+
     for existing_path in existing_paths:
         # Skip identical paths
         if existing_path == file_path:
             continue
-            
+
         # Check for case-insensitive path conflicts
         normalized_existing = normalize_file_path_for_comparison(existing_path)
         if normalized_input == normalized_existing:
             conflicts.append(existing_path)
             continue
-            
+
         # Check for permalink conflicts
         existing_permalink = generate_permalink(existing_path)
         if input_permalink == existing_permalink:
             conflicts.append(existing_path)
             continue
-    
+
     return conflicts
 
 
@@ -318,4 +330,4 @@ def validate_project_path(path: str, project_path: Path) -> bool:
         resolved = (project_path / path).resolve()
         return resolved.is_relative_to(project_path.resolve())
     except (ValueError, OSError):
-        return False
+        return False
diff --git a/tests/api/test_knowledge_router.py b/tests/api/test_knowledge_router.py
@@ -188,6 +188,7 @@ async def test_get_entity_by_permalink(client: AsyncClient, project_url):
     # Verify retrieval
     assert response.status_code == 200
     entity = response.json()
+    assert entity["title"] == "TestEntity"
     assert entity["file_path"] == "test/TestEntity.md"
     assert entity["entity_type"] == "test"
     assert entity["permalink"] == "test/test-entity"
@@ -209,6 +210,7 @@ async def test_get_entity_by_file_path(client: AsyncClient, project_url):
     # Verify retrieval
     assert response.status_code == 200
     entity = response.json()
+    assert entity["title"] == "TestEntity"
     assert entity["file_path"] == "test/TestEntity.md"
     assert entity["entity_type"] == "test"
     assert entity["permalink"] == "test/test-entity"
diff --git a/tests/utils/test_file_utils.py b/tests/utils/test_file_utils.py
@@ -3,6 +3,8 @@
 from pathlib import Path
 
 import pytest
+import random
+import string
 
 from basic_memory.file_utils import (
     FileError,
@@ -13,11 +15,24 @@
     has_frontmatter,
     parse_frontmatter,
     remove_frontmatter,
+    sanitize_for_filename,
     update_frontmatter,
     write_file_atomic,
 )
 
 
+def get_random_word(length: int = 12, necessary_char: str | None = None) -> str:
+    """Build a random lowercase word, optionally forcing one character into it."""
+    chars = [random.choice(string.ascii_lowercase) for _ in range(length)]
+    # Nothing to inject, or no slot to inject into: return the plain word.
+    if not necessary_char or length <= 0:
+        return "".join(chars)
+    # Overwrite one randomly chosen position so the result always contains
+    # the required character.
+    chars[random.randint(0, length - 1)] = necessary_char
+    return "".join(chars)
+
+
 @pytest.mark.asyncio
 async def test_compute_checksum():
     """Test checksum computation."""
@@ -241,3 +256,16 @@ async def test_update_frontmatter_errors(tmp_path: Path):
     nonexistent = tmp_path / "nonexistent" / "test.md"
     with pytest.raises(FileError):
         await update_frontmatter(nonexistent, {"title": "Test"})
+
+
+def test_sanitize_for_filename_removes_invalid_characters():
+    """Each reserved character is replaced when embedded in a random word."""
+    # Test all invalid characters listed in the regex
+    invalid_chars = '<>:"|?*'
+
+    # All invalid characters should be replaced
+    for char in invalid_chars:
+        text = get_random_word(length=12, necessary_char=char)
+        sanitized_text = sanitize_for_filename(text)
+
+        assert char not in sanitized_text