@@ -53,26 +53,53 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
5353 # Remove extension
5454 base = os .path .splitext (path_str )[0 ]
5555
56- # Check if we have non-ASCII characters that should be preserved
57- has_non_ascii = any (ord (char ) > 127 for char in base )
56+ # Check if we have CJK characters that should be preserved
57+ # CJK ranges: \u4e00-\u9fff (CJK Unified Ideographs), \u3000-\u303f (CJK symbols),
58+ # \u3400-\u4dbf (CJK Extension A), \uff00-\uffef (Fullwidth forms)
59+ has_cjk_chars = any (
60+ '\u4e00 ' <= char <= '\u9fff ' or
61+ '\u3000 ' <= char <= '\u303f ' or
62+ '\u3400 ' <= char <= '\u4dbf ' or
63+ '\uff00 ' <= char <= '\uffef '
64+ for char in base
65+ )
5866
59- if has_non_ascii :
60- # Preserve non-ASCII characters like Chinese while still processing ASCII parts
61- result = base
67+ if has_cjk_chars :
68+ # For text with CJK characters, selectively transliterate only Latin accented chars
69+ result = ""
70+ for char in base :
71+ if ('\u4e00 ' <= char <= '\u9fff ' or
72+ '\u3000 ' <= char <= '\u303f ' or
73+ '\u3400 ' <= char <= '\u4dbf ' ):
74+ # Preserve CJK ideographs and symbols
75+ result += char
76+ elif ('\uff00 ' <= char <= '\uffef ' ):
77+ # Remove Chinese fullwidth punctuation entirely (like ,!?)
78+ continue
79+ else :
80+ # Transliterate Latin accented characters to ASCII
81+ result += unidecode (char )
82+
83+ # Insert hyphens between CJK and Latin character transitions
84+ # Match: CJK followed by Latin letter/digit, or Latin letter/digit followed by CJK
85+ result = re .sub (r'([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])([a-zA-Z0-9])' , r'\1-\2' , result )
86+ result = re .sub (r'([a-zA-Z0-9])([\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf])' , r'\1-\2' , result )
6287
6388 # Insert dash between camelCase
6489 result = re .sub (r"([a-z0-9])([A-Z])" , r"\1-\2" , result )
6590
66- # Convert only ASCII letters to lowercase, preserve non-ASCII
91+ # Convert ASCII letters to lowercase, preserve CJK
6792 lower_text = "" .join (c .lower () if c .isascii () and c .isalpha () else c for c in result )
6893
6994 # Replace underscores with hyphens
7095 text_with_hyphens = lower_text .replace ("_" , "-" )
7196
72- # Replace spaces and unsafe ASCII chars with hyphens, preserve non-ASCII chars
73- # Includes Chinese character ranges (CJK Unified Ideographs, CJK symbols, etc.)
97+ # Remove apostrophes entirely (don't replace with hyphens)
98+ text_no_apostrophes = text_with_hyphens .replace ("'" , "" )
99+
100+ # Replace unsafe chars with hyphens, but preserve CJK characters
74101 clean_text = re .sub (
75- r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]" , "-" , text_with_hyphens
102+ r"[^a-z0-9\u4e00-\u9fff\u3000-\u303f\u3400-\u4dbf/\-]" , "-" , text_no_apostrophes
76103 )
77104 else :
78105 # Original ASCII-only processing for backward compatibility
@@ -88,8 +115,11 @@ def generate_permalink(file_path: Union[Path, str, PathLike]) -> str:
88115 # replace underscores with hyphens
89116 text_with_hyphens = lower_text .replace ("_" , "-" )
90117
118+ # Remove apostrophes entirely (don't replace with hyphens)
119+ text_no_apostrophes = text_with_hyphens .replace ("'" , "" )
120+
91121 # Replace remaining invalid chars with hyphens
92- clean_text = re .sub (r"[^a-z0-9/\-]" , "-" , text_with_hyphens )
122+ clean_text = re .sub (r"[^a-z0-9/\-]" , "-" , text_no_apostrophes )
93123
94124 # Collapse multiple hyphens
95125 clean_text = re .sub (r"-+" , "-" , clean_text )
@@ -187,3 +217,105 @@ def parse_tags(tags: Union[List[str], str, None]) -> List[str]:
187217 except (ValueError , TypeError ): # pragma: no cover
188218 logger .warning (f"Couldn't parse tags from input of type { type (tags )} : { tags } " )
189219 return []
220+
221+
def normalize_file_path_for_comparison(file_path: str) -> str:
    """Return a canonical form of *file_path* for conflict detection.

    Two paths that differ only in letter case, Unicode composition, or
    path-separator style produce the same normalized string.  The result is
    intended purely as a comparison key; it is not guaranteed to be a path
    that exists on disk.

    Steps, in order:
    - lowercase the whole string (case-insensitive comparison)
    - apply Unicode NFD normalization (canonical decomposition), so that
      precomposed and decomposed spellings compare equal
    - convert backslashes to forward slashes
    - collapse runs of consecutive slashes into one

    Args:
        file_path: The file path to normalize.

    Returns:
        The normalized path string for comparison purposes.
    """
    import unicodedata

    # Lowercase first, then decompose, so e.g. "É" and "e\u0301" agree.
    normalized = unicodedata.normalize("NFD", file_path.lower())

    # Treat Windows separators the same as POSIX ones.  The literal must be
    # a single backslash; a trailing space here would leave "a\b" untouched.
    normalized = normalized.replace("\\", "/")

    # "a//b" and "a/b" refer to the same location.
    return re.sub(r"/+", "/", normalized)
251+
252+
def detect_potential_file_conflicts(file_path: str, existing_paths: List[str]) -> List[str]:
    """Return the entries of *existing_paths* that may clash with *file_path*.

    An existing path is reported as a conflict when it is not the exact same
    string as *file_path* and either of the following holds:

    - both paths are equal after ``normalize_file_path_for_comparison``
      (case, Unicode normalization and separator differences), or
    - both paths yield the same value from ``generate_permalink``.

    Args:
        file_path: The file path to check.
        existing_paths: Existing file paths to compare against.

    Returns:
        The conflicting existing paths, in their original order.
    """
    # Compute the comparison keys for the candidate once, up front.
    target_key = normalize_file_path_for_comparison(file_path)
    target_permalink = generate_permalink(file_path)

    def _collides(candidate: str) -> bool:
        # Cheap normalized-path comparison first; the permalink is only
        # generated when the normalized forms differ.
        if normalize_file_path_for_comparison(candidate) == target_key:
            return True
        return generate_permalink(candidate) == target_permalink

    return [
        candidate
        for candidate in existing_paths
        # An identical string is the same file, not a conflict.
        if candidate != file_path and _collides(candidate)
    ]
293+
294+
def validate_project_path(path: str, project_path: Path) -> bool:
    """Ensure *path* stays within the boundaries of *project_path*.

    The check is deliberately conservative: several cheap string tests reject
    suspicious input before the path is resolved against the project root.

    Args:
        path: A path expected to be relative to the project root.
        project_path: The project root directory.

    Returns:
        True if *path* is empty or resolves to a location inside
        *project_path*; False if it contains a traversal pattern, is
        absolute, contains control characters, or cannot be resolved.
    """
    # An empty string resolves to the project root itself.
    if not path:
        return True

    # Obvious traversal / home-directory patterns.  Note that this also
    # covers Windows-style "\.." since it contains "..".
    if ".." in path or "~" in path:
        return False

    # A leading backslash is a Windows root/UNC-style path; reject it even
    # on Unix, where it would otherwise be an ordinary filename character.
    if path.startswith("\\"):
        return False

    # Absolute paths: Unix-style ("/...") or Windows drive letters ("C:...").
    if path.startswith("/") or (len(path) >= 2 and path[1] == ":"):
        return False

    # Control characters are rejected, except tab, which counts as
    # whitespace.  (A space is not a control character, so it needs no
    # special case.)  Whitespace-only input skips this check.
    if path.strip() and any(ord(c) < 32 and c != "\t" for c in path):
        return False

    try:
        resolved = (project_path / path).resolve()
        return resolved.is_relative_to(project_path.resolve())
    except (ValueError, OSError):
        # Unresolvable paths are treated as unsafe.
        return False
0 commit comments