Skip to content

Commit 03d4e97

Browse files
andyxinweiminicloud and phernandez
andyxinweiminicloud authored and phernandez committed
Fix: preserve Chinese characters in permalinks
1 parent 98622a7 commit 03d4e97

1 file changed

Lines changed: 11 additions & 11 deletions

File tree

src/basic_memory/utils.py

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
3535
3636
Returns:
3737
Normalized permalink that matches validation rules. Converts spaces and underscores
38-
to hyphens for consistency.
38+
to hyphens for consistency. Preserves non-ASCII characters like Chinese.
3939
4040
Examples:
4141
>>> generate_permalink("docs/My Feature.md")
@@ -44,35 +44,35 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
4444
'specs/api-v2'
4545
>>> generate_permalink("design/unified_model_refactor.md")
4646
'design/unified-model-refactor'
47+
>>> generate_permalink("中文/测试文档.md")
48+
'中文/测试文档'
4749
"""
4850
# Convert Path to string if needed
4951
path_str = str(file_path)
5052

5153
# Remove extension
5254
base = os.path.splitext(path_str)[0]
5355

54-
# Transliterate unicode to ascii
55-
ascii_text = unidecode(base)
56-
5756
# Insert dash between camelCase
58-
ascii_text = re.sub(r"([a-z0-9])([A-Z])", r"\1-\2", ascii_text)
57+
base_with_dashes = re.sub(r"([a-z0-9])([A-Z])", r"\1-\2", base)
5958

60-
# Convert to lowercase
61-
lower_text = ascii_text.lower()
59+
# Convert ASCII letters to lowercase, preserve non-ASCII characters
60+
lower_text = "".join(c.lower() if c.isascii() and c.isalpha() else c for c in base_with_dashes)
6261

63-
# replace underscores with hyphens
62+
# Replace underscores with hyphens
6463
text_with_hyphens = lower_text.replace("_", "-")
6564

66-
# Replace remaining invalid chars with hyphens
67-
clean_text = re.sub(r"[^a-z0-9/\-]", "-", text_with_hyphens)
65+
# Replace spaces and unsafe ASCII characters with hyphens, but preserve non-ASCII characters
66+
# Only the CJK ranges listed in the pattern below are kept; any other non-ASCII character is still replaced with a hyphen
67+
clean_text = re.sub(r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]", "-", text_with_hyphens)
6868

6969
# Collapse multiple hyphens
7070
clean_text = re.sub(r"-+", "-", clean_text)
7171

7272
# Clean each path segment
7373
segments = clean_text.split("/")
7474
clean_segments = [s.strip("-") for s in segments]
75-
75+
7676
return "/".join(clean_segments)
7777

7878

0 commit comments

Comments
 (0)