@@ -27,7 +27,7 @@ def __str__(self) -> str: ...
2727logging .getLogger ("opentelemetry.sdk.metrics._internal.instrument" ).setLevel (logging .ERROR )
2828
2929
30- def generate_permalink (file_path : Union [Path , str , PathLike ]) -> str :
30+ def generate_permalink (file_path : Union [Path , str , PathLike ], split_extension : bool = True ) -> str :
3131 """Generate a stable permalink from a file path.
3232
3333 Args:
@@ -50,53 +50,59 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
5050 # Convert Path to string if needed
5151 path_str = str (file_path )
5252
53- # Remove extension
54- base = os .path .splitext (path_str )[ 0 ]
53+ # Remove extension (for now, possibly)
54+ ( base , extension ) = os .path .splitext (path_str )
5555
5656 # Check if we have CJK characters that should be preserved
57- # CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols),
57+ # CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols),
5858 # \u3400-\u4dbf (CJK Extension A), \uff00-\uffef (Fullwidth forms)
5959 has_cjk_chars = any (
60- ' \u4e00 ' <= char <= ' \u9fff ' or
61- ' \u3000 ' <= char <= ' \u303f ' or
62- ' \u3400 ' <= char <= ' \u4dbf ' or
63- ' \uff00 ' <= char <= ' \uffef '
60+ " \u4e00 " <= char <= " \u9fff "
61+ or " \u3000 " <= char <= " \u303f "
62+ or " \u3400 " <= char <= " \u4dbf "
63+ or " \uff00 " <= char <= " \uffef "
6464 for char in base
6565 )
66-
66+
6767 if has_cjk_chars :
6868 # For text with CJK characters, selectively transliterate only Latin accented chars
6969 result = ""
7070 for char in base :
71- if ('\u4e00 ' <= char <= '\u9fff ' or
72- '\u3000 ' <= char <= '\u303f ' or
73- '\u3400 ' <= char <= '\u4dbf ' ):
71+ if (
72+ "\u4e00 " <= char <= "\u9fff "
73+ or "\u3000 " <= char <= "\u303f "
74+ or "\u3400 " <= char <= "\u4dbf "
75+ ):
7476 # Preserve CJK ideographs and symbols
7577 result += char
76- elif ( ' \uff00 ' <= char <= ' \uffef ' ) :
78+ elif " \uff00 " <= char <= " \uffef " :
7779 # Remove Chinese fullwidth punctuation entirely (like ,!?)
7880 continue
7981 else :
8082 # Transliterate Latin accented characters to ASCII
8183 result += unidecode (char )
82-
84+
8385 # Insert hyphens between CJK and Latin character transitions
8486 # Match: CJK followed by Latin letter/digit, or Latin letter/digit followed by CJK
85- result = re .sub (r'([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])' , r'\1-\2' , result )
86- result = re .sub (r'([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])' , r'\1-\2' , result )
87-
87+ result = re .sub (
88+ r"([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])" , r"\1-\2" , result
89+ )
90+ result = re .sub (
91+ r"([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])" , r"\1-\2" , result
92+ )
93+
8894 # Insert dash between camelCase
8995 result = re .sub (r"([a-z0-9])([A-Z])" , r"\1-\2" , result )
90-
96+
9197 # Convert ASCII letters to lowercase, preserve CJK
9298 lower_text = "" .join (c .lower () if c .isascii () and c .isalpha () else c for c in result )
93-
99+
94100 # Replace underscores with hyphens
95101 text_with_hyphens = lower_text .replace ("_" , "-" )
96-
102+
97103 # Remove apostrophes entirely (don't replace with hyphens)
98104 text_no_apostrophes = text_with_hyphens .replace ("'" , "" )
99-
105+
100106 # Replace unsafe chars with hyphens, but preserve CJK characters
101107 clean_text = re .sub (
102108 r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]" , "-" , text_no_apostrophes
@@ -128,7 +134,13 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
128134 segments = clean_text .split ("/" )
129135 clean_segments = [s .strip ("-" ) for s in segments ]
130136
131- return "/" .join (clean_segments )
137+ return_val = "/" .join (clean_segments )
138+
139+ # Append file extension back, if necessary
140+ if not split_extension and extension :
141+ return_val += extension
142+
143+ return return_val
132144
133145
134146def setup_logging (
@@ -221,74 +233,74 @@ def parse_tags(tags: Union[List[str], str, None]) -> List[str]:
221233
def normalize_file_path_for_comparison(file_path: str) -> str:
    """Normalize a file path for conflict detection.

    The returned string is intended only for equality comparison against
    other normalized paths; it is not meant to be used to access the
    filesystem.

    Normalization steps, applied in this order:
    - Converts to lowercase for case-insensitive comparison
    - Normalizes Unicode characters to NFD, so precomposed and decomposed
      spellings of the same character compare equal
    - Converts backslashes to forward slashes
    - Collapses runs of consecutive slashes into a single slash

    Args:
        file_path: The file path to normalize

    Returns:
        Normalized file path for comparison purposes
    """
    import unicodedata

    # Convert to lowercase for case-insensitive comparison
    normalized = file_path.lower()

    # Normalize Unicode characters (NFD normalization). The form name must be
    # exactly "NFD"; any other spelling makes unicodedata.normalize raise.
    normalized = unicodedata.normalize("NFD", normalized)

    # Replace path separators with forward slashes
    normalized = normalized.replace("\\", "/")

    # Remove multiple slashes
    normalized = re.sub(r"/+", "/", normalized)

    return normalized
251263
252264
def detect_potential_file_conflicts(file_path: str, existing_paths: List[str]) -> List[str]:
    """Detect potential conflicts between a file path and existing paths.

    An existing path is reported as a conflict when it is not the exact same
    string as ``file_path`` and either of the following holds:
    - Its normalized form (see ``normalize_file_path_for_comparison``) equals
      the normalized form of ``file_path``; this covers case, Unicode
      normalization and path separator differences
    - It produces the same permalink as ``file_path`` via
      ``generate_permalink``

    Args:
        file_path: The file path to check
        existing_paths: List of existing file paths to check against

    Returns:
        List of existing paths that might conflict with the given file path,
        in the order they appear in ``existing_paths``
    """
    conflicts = []

    # Compute the comparison keys for the input once, outside the loop
    normalized_input = normalize_file_path_for_comparison(file_path)
    input_permalink = generate_permalink(file_path)

    for existing_path in existing_paths:
        # Skip identical paths: a path never conflicts with itself
        if existing_path == file_path:
            continue

        # The cheap normalized-path comparison runs first; the permalink is
        # only generated for the existing path when that comparison fails.
        if (
            normalize_file_path_for_comparison(existing_path) == normalized_input
            or generate_permalink(existing_path) == input_permalink
        ):
            conflicts.append(existing_path)

    return conflicts
293305
294306
@@ -318,4 +330,4 @@ def validate_project_path(path: str, project_path: Path) -> bool:
318330 resolved = (project_path / path ).resolve ()
319331 return resolved .is_relative_to (project_path .resolve ())
320332 except (ValueError , OSError ):
321- return False
333+ return False
0 commit comments