Skip to content

Commit 73ea91f

Browse files
committed
fix: add extra logic for permalink generation with mixed Latin unicode and Chinese characters
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent 03d4e97 commit 73ea91f

3 files changed

Lines changed: 116 additions & 14 deletions

File tree

src/basic_memory/utils.py

Lines changed: 63 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
import logging
66
import re
77
import sys
8+
import unicodedata
89
from pathlib import Path
9-
from typing import Optional, Protocol, Union, runtime_checkable, List
10+
from typing import Optional, Protocol, Union, runtime_checkable, List, Any
1011

1112
from loguru import logger
12-
from unidecode import unidecode
1313

1414

1515
@runtime_checkable
@@ -27,11 +27,9 @@ def __str__(self) -> str: ...
2727
logging.getLogger("opentelemetry.sdk.metrics._internal.instrument").setLevel(logging.ERROR)
2828

2929

30-
def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
31-
"""Generate a stable permalink from a file path.
32-
33-
Args:
34-
file_path: Original file path (str, Path, or PathLike)
30+
def generate_permalink(file_path: Union[Path, str, Any]) -> str:
31+
"""
32+
Generate a permalink from a file path.
3533
3634
Returns:
3735
Normalized permalink that matches validation rules. Converts spaces and underscores
@@ -40,11 +38,11 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
4038
Examples:
4139
>>> generate_permalink("docs/My Feature.md")
4240
'docs/my-feature'
43-
>>> generate_permalink("specs/API (v2).md")
41+
>>> generate_permalink("specs/API_v2.md")
4442
'specs/api-v2'
4543
>>> generate_permalink("design/unified_model_refactor.md")
4644
'design/unified-model-refactor'
47-
>>> generate_permalink("中文/测试文档.md")
45+
>>> generate_permalink("中文/测试文档.md")
4846
'中文/测试文档'
4947
"""
5048
# Convert Path to string if needed
@@ -53,26 +51,78 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
5351
# Remove extension
5452
base = os.path.splitext(path_str)[0]
5553

54+
# Create a transliteration mapping for specific characters
55+
transliteration_map = {
56+
"ø": "o", # Handle Søren -> soren
57+
"å": "a", # Handle Kierkegård -> kierkegard
58+
"ü": "u", # Handle Müller -> muller
59+
"é": "e", # Handle Café -> cafe
60+
"è": "e", # Handle Mère -> mere
61+
"ê": "e", # Handle Fête -> fete
62+
"à": "a", # Handle À la mode -> a la mode
63+
"ç": "c", # Handle Façade -> facade
64+
"ñ": "n", # Handle Niño -> nino
65+
"ö": "o", # Handle Björk -> bjork
66+
"ä": "a", # Handle Häagen -> haagen
67+
# Add more mappings as needed
68+
}
69+
70+
# Process character by character, transliterating Latin characters with diacritics
71+
result = ""
72+
for char in base:
73+
# Direct mapping for known characters
74+
if char.lower() in transliteration_map:
75+
result += transliteration_map[char.lower()]
76+
# General case using Unicode normalization
77+
elif unicodedata.category(char).startswith("L") and ord(char) > 127:
78+
# Decompose the character (e.g., ü -> u + combining diaeresis)
79+
decomposed = unicodedata.normalize("NFD", char)
80+
# If decomposition produced multiple characters and first one is ASCII
81+
if len(decomposed) > 1 and ord(decomposed[0]) < 128:
82+
# Keep only the base character
83+
result += decomposed[0].lower()
84+
else:
85+
# For non-Latin scripts like Chinese, preserve the character
86+
result += char
87+
else:
88+
# Add the character as is
89+
result += char
90+
91+
# Remove apostrophes entirely so possessives stay one word (e.g. "Jürgen's" ends up as "jurgens")
92+
result = result.replace("'", "")
93+
5694
# Insert dash between camelCase
57-
base_with_dashes = re.sub(r"([a-z0-9])([A-Z])", r"\1-\2", base)
95+
# This regex finds boundaries between a lowercase letter or digit and an uppercase letter
96+
result = re.sub(r"([a-z0-9])([A-Z])", r"\1-\2", result)
97+
98+
# Insert dash between Chinese and Latin character boundaries
99+
# This is needed for cases like "中文English" -> "中文-english"
100+
result = re.sub(r"([\u4e00-\u9fff])([a-zA-Z])", r"\1-\2", result)
101+
result = re.sub(r"([a-zA-Z])([\u4e00-\u9fff])", r"\1-\2", result)
58102

59103
# Convert ASCII letters to lowercase, preserve non-ASCII characters
60-
lower_text = "".join(c.lower() if c.isascii() and c.isalpha() else c for c in base_with_dashes)
104+
lower_text = "".join(c.lower() if c.isascii() and c.isalpha() else c for c in result)
61105

62106
# Replace underscores with hyphens
63107
text_with_hyphens = lower_text.replace("_", "-")
64108

65109
# Replace spaces and every other unsafe character with hyphens; only a-z, 0-9, "/", "-"
66110
# and the listed CJK ranges (unified ideographs, extension A, CJK symbols/punctuation) are kept
67-
clean_text = re.sub(r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]", "-", text_with_hyphens)
111+
clean_text = re.sub(
112+
r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]", "-", text_with_hyphens
113+
)
68114

69115
# Collapse multiple hyphens
70116
clean_text = re.sub(r"-+", "-", clean_text)
71117

118+
# Remove hyphens between adjacent Chinese characters only
119+
# This handles cases like "你好-世界" -> "你好世界"
120+
clean_text = re.sub(r"([\u4e00-\u9fff])-([\u4e00-\u9fff])", r"\1\2", clean_text)
121+
72122
# Clean each path segment
73123
segments = clean_text.split("/")
74124
clean_segments = [s.strip("-") for s in segments]
75-
125+
76126
return "/".join(clean_segments)
77127

78128

tests/utils/test_permalink_formatting.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from basic_memory.config import ProjectConfig
88
from basic_memory.services import EntityService
99
from basic_memory.sync.sync_service import SyncService
10+
from basic_memory.utils import generate_permalink
1011

1112

1213
async def create_test_file(path: Path, content: str = "test content") -> None:
@@ -66,3 +67,54 @@ async def test_permalink_formatting(
6667
assert entity.permalink == expected_permalink, (
6768
f"File {filename} should have permalink {expected_permalink}"
6869
)
70+
71+
72+
@pytest.mark.parametrize(
73+
"input_path, expected",
74+
[
75+
("test/Über File.md", "test/uber-file"),
76+
("docs/résumé.md", "docs/resume"),
77+
("notes/Déjà vu.md", "notes/deja-vu"),
78+
("papers/Jürgen's Findings.md", "papers/jurgens-findings"),
79+
("archive/François Müller.md", "archive/francois-muller"),
80+
("research/Søren Kierkegård.md", "research/soren-kierkegard"),
81+
("articles/El Niño.md", "articles/el-nino"),
82+
],
83+
)
84+
def test_latin_accents_transliteration(input_path, expected):
85+
"""Test that Latin letters with accents are properly transliterated."""
86+
assert generate_permalink(input_path) == expected
87+
88+
89+
@pytest.mark.parametrize(
90+
"input_path, expected",
91+
[
92+
("中文/测试文档.md", "中文/测试文档"),
93+
("notes/北京市.md", "notes/北京市"),
94+
("research/上海简介.md", "research/上海简介"),
95+
("docs/中文 English Mixed.md", "docs/中文-english-mixed"),
96+
("articles/东京Tokyo混合.md", "articles/东京-tokyo-混合"),
97+
("papers/汉字_underscore_test.md", "papers/汉字-underscore-test"),
98+
("projects/中文CamelCase测试.md", "projects/中文-camel-case-测试"),
99+
],
100+
)
101+
def test_chinese_character_preservation(input_path, expected):
102+
"""Test that Chinese characters are preserved in permalinks."""
103+
assert generate_permalink(input_path) == expected
104+
105+
106+
@pytest.mark.parametrize(
107+
"input_path, expected",
108+
[
109+
("mixed/北京Café.md", "mixed/北京-cafe"),
110+
("notes/东京Tōkyō.md", "notes/东京-tokyo"),
111+
("research/München中文.md", "research/munchen-中文"),
112+
("docs/Über测试.md", "docs/uber-测试"),
113+
("complex/北京Beijing上海Shanghai.md", "complex/北京-beijing-上海-shanghai"),
114+
("special/中文!@#$%^&*()_+.md", "special/中文"),
115+
("punctuation/你好,世界!.md", "punctuation/你好世界"),
116+
],
117+
)
118+
def test_mixed_character_sets(input_path, expected):
119+
"""Test handling of mixed character sets and edge cases."""
120+
assert generate_permalink(input_path) == expected

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)