55import logging
66import re
77import sys
8+ import unicodedata
89from pathlib import Path
9- from typing import Optional , Protocol , Union , runtime_checkable , List
10+ from typing import Optional , Protocol , Union , runtime_checkable , List , Any
1011
1112from loguru import logger
12- from unidecode import unidecode
1313
1414
1515@runtime_checkable
@@ -27,11 +27,9 @@ def __str__(self) -> str: ...
# Silence the chatty OpenTelemetry metrics-instrument logger at import time;
# only ERROR and above from this specific child logger will propagate.
logging.getLogger("opentelemetry.sdk.metrics._internal.instrument").setLevel(logging.ERROR)
2828
2929
def generate_permalink(file_path: Union[Path, str, Any]) -> str:
    """Generate a stable, URL-safe permalink from a file path.

    Args:
        file_path: Original file path (str, Path, or any object whose str()
            yields a path-like string).

    Returns:
        Normalized permalink that matches validation rules. Converts spaces
        and underscores to hyphens, lowercases ASCII letters, transliterates
        common Latin diacritics to plain ASCII, and preserves non-Latin
        characters (e.g. Chinese) as-is.

    Examples:
        >>> generate_permalink("docs/My Feature.md")
        'docs/my-feature'
        >>> generate_permalink("specs/API_v2.md")
        'specs/api-v2'
        >>> generate_permalink("design/unified_model_refactor.md")
        'design/unified-model-refactor'
        >>> generate_permalink("中文/测试文档.md")
        '中文/测试文档'
    """
    # Convert Path (or any path-like object) to a string, then drop the
    # file extension.
    base = os.path.splitext(str(file_path))[0]

    # Characters whose NFD decomposition does NOT yield an ASCII base letter,
    # so Unicode normalization alone cannot transliterate them.  Everything
    # with a canonical decomposition (é, ü, å, ç, ñ, ö, ä, …) is handled by
    # the NFD pass below and needs no entry here.
    special_translit = {
        "ø": "o",   # Søren -> soren
        "æ": "ae",  # encyclopædia -> encyclopaedia
        "ß": "ss",  # Straße -> strasse
        "đ": "d",   # Đorđe -> dorde
        "ł": "l",   # Łódź -> lodz (ó/ź decompose normally)
    }

    # Decompose the WHOLE string once (NFD) so diacritics become separate
    # combining marks.  Normalizing up front also handles input that already
    # arrives in decomposed form, which a per-character approach would miss
    # (the stray mark would later turn into a spurious hyphen).
    pieces: List[str] = []
    for char in unicodedata.normalize("NFD", base):
        low = char.lower()
        if low in special_translit:
            pieces.append(special_translit[low])
        elif unicodedata.combining(char):
            # Drop combining marks (e.g. the diaeresis from ü -> u + U+0308).
            continue
        else:
            # ASCII and non-Latin scripts (e.g. Chinese) pass through intact.
            pieces.append(char)
    text = "".join(pieces)

    # Apostrophes vanish rather than becoming hyphens: "O'Brien" -> "obrien".
    text = text.replace("'", "")

    # Insert a dash at camelCase boundaries ("myFeature" -> "my-Feature").
    text = re.sub(r"([a-z0-9])([A-Z])", r"\1-\2", text)

    # Insert a dash at Chinese/Latin boundaries in either direction
    # ("中文English" -> "中文-English").
    text = re.sub(r"([\u4e00-\u9fff])([a-zA-Z])", r"\1-\2", text)
    text = re.sub(r"([a-zA-Z])([\u4e00-\u9fff])", r"\1-\2", text)

    # Lowercase ASCII letters only; non-ASCII characters are preserved.
    text = "".join(c.lower() if c.isascii() and c.isalpha() else c for c in text)

    # Underscores become hyphens, matching the permalink validation rules.
    text = text.replace("_", "-")

    # Replace anything that is not a safe ASCII character, a path separator,
    # or a CJK character (common ideograph ranges, CJK punctuation, and
    # Extension A) with a hyphen.
    text = re.sub(r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]", "-", text)

    # Collapse runs of hyphens left over from consecutive replacements.
    text = re.sub(r"-+", "-", text)

    # Remove hyphens BETWEEN adjacent Chinese characters
    # ("你好-世界" -> "你好世界").  The lookahead keeps the second character
    # available for the next match, so alternating runs like "测-试-文" are
    # fully joined (a consuming two-group pattern skips every other hyphen).
    text = re.sub(r"([\u4e00-\u9fff])-(?=[\u4e00-\u9fff])", r"\1", text)

    # Strip dangling hyphens from each path segment.
    return "/".join(segment.strip("-") for segment in text.split("/"))
77127
78128
0 commit comments