Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 63 additions & 13 deletions src/basic_memory/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
import logging
import re
import sys
import unicodedata
from pathlib import Path
from typing import Optional, Protocol, Union, runtime_checkable, List
from typing import Optional, Protocol, Union, runtime_checkable, List, Any

from loguru import logger
from unidecode import unidecode


@runtime_checkable
Expand All @@ -27,11 +27,9 @@ def __str__(self) -> str: ...
logging.getLogger("opentelemetry.sdk.metrics._internal.instrument").setLevel(logging.ERROR)


def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
"""Generate a stable permalink from a file path.

Args:
file_path: Original file path (str, Path, or PathLike)
def generate_permalink(file_path: Union[Path, str, Any]) -> str:
"""
Generate a permalink from a file path.

Returns:
Normalized permalink that matches validation rules. Converts spaces and underscores
Expand All @@ -40,11 +38,11 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
Examples:
>>> generate_permalink("docs/My Feature.md")
'docs/my-feature'
>>> generate_permalink("specs/API (v2).md")
>>> generate_permalink("specs/API_v2.md")
'specs/api-v2'
>>> generate_permalink("design/unified_model_refactor.md")
'design/unified-model-refactor'
>>> generate_permalink("中文/测试文档.md")
>>> generate_permalink("中文/测试文档.md")
'中文/测试文档'
"""
# Convert Path to string if needed
Expand All @@ -53,26 +51,78 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
# Remove extension
base = os.path.splitext(path_str)[0]

# Create a transliteration mapping for specific characters
transliteration_map = {
"ø": "o", # Handle Søren -> soren
"å": "a", # Handle Kierkegård -> kierkegard
"ü": "u", # Handle Müller -> muller
"é": "e", # Handle Café -> cafe
"è": "e", # Handle Mère -> mere
"ê": "e", # Handle Fête -> fete
"à": "a", # Handle À la mode -> a la mode
"ç": "c", # Handle Façade -> facade
"ñ": "n", # Handle Niño -> nino
"ö": "o", # Handle Björk -> bjork
"ä": "a", # Handle Häagen -> haagen
# Add more mappings as needed
}

# Process character by character, transliterating Latin characters with diacritics
result = ""
for char in base:
# Direct mapping for known characters
if char.lower() in transliteration_map:
result += transliteration_map[char.lower()]
# General case using Unicode normalization
elif unicodedata.category(char).startswith("L") and ord(char) > 127:
# Decompose the character (e.g., ü -> u + combining diaeresis)
decomposed = unicodedata.normalize("NFD", char)
# If decomposition produced multiple characters and first one is ASCII
if len(decomposed) > 1 and ord(decomposed[0]) < 128:
# Keep only the base character
result += decomposed[0].lower()
else:
# For non-Latin scripts like Chinese, preserve the character
result += char
else:
# Add the character as is
result += char

# Handle special punctuation cases for apostrophes
result = result.replace("'", "")

# Insert dash between camelCase
base_with_dashes = re.sub(r"([a-z0-9])([A-Z])", r"\1-\2", base)
# This regex finds boundaries between lowercase and uppercase letters
result = re.sub(r"([a-z0-9])([A-Z])", r"\1-\2", result)

# Insert dash between Chinese and Latin character boundaries
# This is needed for cases like "中文English" -> "中文-english"
result = re.sub(r"([\u4e00-\u9fff])([a-zA-Z])", r"\1-\2", result)
result = re.sub(r"([a-zA-Z])([\u4e00-\u9fff])", r"\1-\2", result)

# Convert ASCII letters to lowercase, preserve non-ASCII characters
lower_text = "".join(c.lower() if c.isascii() and c.isalpha() else c for c in base_with_dashes)
lower_text = "".join(c.lower() if c.isascii() and c.isalpha() else c for c in result)

# Replace underscores with hyphens
text_with_hyphens = lower_text.replace("_", "-")

# Replace spaces and unsafe ASCII characters with hyphens, but preserve non-ASCII characters
# Include common Chinese character ranges and other non-ASCII characters
clean_text = re.sub(r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]", "-", text_with_hyphens)
clean_text = re.sub(
r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]", "-", text_with_hyphens
)

# Collapse multiple hyphens
clean_text = re.sub(r"-+", "-", clean_text)

# Remove hyphens between adjacent Chinese characters only
# This handles cases like "你好-世界" -> "你好世界"
clean_text = re.sub(r"([\u4e00-\u9fff])-([\u4e00-\u9fff])", r"\1\2", clean_text)

# Clean each path segment
segments = clean_text.split("/")
clean_segments = [s.strip("-") for s in segments]

return "/".join(clean_segments)


Expand Down
52 changes: 52 additions & 0 deletions tests/utils/test_permalink_formatting.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from basic_memory.config import ProjectConfig
from basic_memory.services import EntityService
from basic_memory.sync.sync_service import SyncService
from basic_memory.utils import generate_permalink


async def create_test_file(path: Path, content: str = "test content") -> None:
Expand Down Expand Up @@ -66,3 +67,54 @@ async def test_permalink_formatting(
assert entity.permalink == expected_permalink, (
f"File {filename} should have permalink {expected_permalink}"
)


@pytest.mark.parametrize(
    ("input_path", "expected"),
    [
        ("test/Über File.md", "test/uber-file"),
        ("docs/résumé.md", "docs/resume"),
        ("notes/Déjà vu.md", "notes/deja-vu"),
        ("papers/Jürgen's Findings.md", "papers/jurgens-findings"),
        ("archive/François Müller.md", "archive/francois-muller"),
        ("research/Søren Kierkegård.md", "research/soren-kierkegard"),
        ("articles/El Niño.md", "articles/el-nino"),
    ],
)
def test_latin_accents_transliteration(input_path, expected):
    """Accented Latin letters are reduced to their plain ASCII base letters.

    Each case pairs a file path containing diacritics (and, in one case, an
    apostrophe) with the lowercase, hyphen-separated permalink expected from
    ``generate_permalink``.
    """
    permalink = generate_permalink(input_path)
    assert permalink == expected


@pytest.mark.parametrize(
    ("input_path", "expected"),
    [
        # Pure Chinese paths come back unchanged apart from the extension.
        ("中文/测试文档.md", "中文/测试文档"),
        ("notes/北京市.md", "notes/北京市"),
        ("research/上海简介.md", "research/上海简介"),
        # Chinese mixed with Latin text: a hyphen is expected at each boundary.
        ("docs/中文 English Mixed.md", "docs/中文-english-mixed"),
        ("articles/东京Tokyo混合.md", "articles/东京-tokyo-混合"),
        ("papers/汉字_underscore_test.md", "papers/汉字-underscore-test"),
        ("projects/中文CamelCase测试.md", "projects/中文-camel-case-测试"),
    ],
)
def test_chinese_character_preservation(input_path, expected):
    """Chinese characters survive permalink generation untouched.

    The cases also pin the expected hyphenation where Chinese text meets
    spaces, underscores, or camelCase Latin words.
    """
    permalink = generate_permalink(input_path)
    assert permalink == expected


@pytest.mark.parametrize(
    ("input_path", "expected"),
    [
        # Chinese next to accented Latin: keep the former, transliterate the latter.
        ("mixed/北京Café.md", "mixed/北京-cafe"),
        ("notes/东京Tōkyō.md", "notes/东京-tokyo"),
        ("research/München中文.md", "research/munchen-中文"),
        ("docs/Über测试.md", "docs/uber-测试"),
        ("complex/北京Beijing上海Shanghai.md", "complex/北京-beijing-上海-shanghai"),
        # Punctuation-only tails and punctuation between Chinese characters.
        ("special/中文!@#$%^&*()_+.md", "special/中文"),
        ("punctuation/你好,世界!.md", "punctuation/你好世界"),
    ],
)
def test_mixed_character_sets(input_path, expected):
    """Paths that mix scripts and punctuation produce the expected permalink.

    Covers Chinese combined with accented Latin words, alternating
    Chinese/Latin runs, and ASCII punctuation around Chinese characters.
    """
    permalink = generate_permalink(input_path)
    assert permalink == expected
2 changes: 1 addition & 1 deletion uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading