33import os
44import re
55import secrets
6+ import unicodedata
67from pathlib import Path
78from typing import Any , Dict
89
@@ -213,14 +214,27 @@ def format_error_message(cls, exit_code: int, stderr: str) -> str:
213214
214215 return f"Execution failed (exit code { exit_code } ):\n { stderr_clean } "
215216
# Allow-list for the ASCII range: one letter, digit, dot, underscore, or
# hyphen. Mirrors LibreChat's ASCII_FILENAME_SAFE_PATTERN.
_ASCII_SAFE = re.compile(r"[a-zA-Z0-9._\-]")
# Deny-list for the non-ASCII range: one C1 control code point
# (U+0080 through U+009F), which is treated as unsafe in a filename.
_C1_CONTROLS = re.compile(r"[\x80-\x9f]")
221+
@classmethod
def _sanitize_char(cls, char: str) -> str:
    """Return ``char`` if it is allowed in a filename, otherwise ``"_"``.

    ASCII code points (<= U+007F) are kept only when they match
    ``cls._ASCII_SAFE``. Every non-ASCII code point is kept unless it
    matches ``cls._C1_CONTROLS``.

    NOTE(review): because only C1 controls are rejected above U+007F,
    non-ASCII format characters (e.g. bidi overrides, zero-width
    characters) pass through unchanged — confirm this is intended.

    Args:
        char: A single-character string (``ord`` is applied to it).

    Returns:
        The original character, or an underscore replacement.
    """
    if ord(char) <= 0x7F:
        # Strict allow-list for the ASCII range.
        allowed = cls._ASCII_SAFE.match(char) is not None
    else:
        # Permissive deny-list for everything above ASCII.
        allowed = cls._C1_CONTROLS.match(char) is None
    return char if allowed else "_"
228+
216229 @classmethod
217230 def sanitize_filename (cls , input_name : str ) -> str :
218- """Sanitize filename while preserving Unicode letters and digits .
231+ """Sanitize filename while preserving Unicode letters, digits, and emoji .
219232
220- Keeps word characters (\\ w — letters, digits, underscore in all
221- scripts), dots, and dashes. Replaces everything else (path
222- separators, control chars, shell metacharacters, quotes, etc.)
223- with underscores.
233+ NFC-normalizes, then applies a two-pass approach matching
234+ LibreChat's ``sanitizeFilenameSegment``: strict for ASCII
235+ (only ``[a-zA-Z0-9._-]``), permissive for non-ASCII (keeps
236+ Unicode letters, combining marks, numbers, emoji — blocks
237+ only C1 control characters).
224238
225239 Args:
226240 input_name: Original filename (may include path components)
@@ -235,9 +249,12 @@ def sanitize_filename(cls, input_name: str) -> str:
235249 # Remove any directory components (path traversal prevention)
236250 name = os .path .basename (input_name )
237251
238- # Replace dangerous characters while preserving Unicode letters/digits.
239- # \w matches [a-zA-Z0-9_] plus all Unicode letters and digits.
240- name = re .sub (r"[^\w.\-]" , "_" , name )
252+ # NFC-normalize so decomposed sequences (e + U+0301) become
253+ # precomposed (é) before the regex runs.
254+ name = unicodedata .normalize ("NFC" , name )
255+
256+ # Two-pass sanitization: strict ASCII, permissive Unicode.
257+ name = "" .join (cls ._sanitize_char (c ) for c in name )
241258
242259 # Ensure the name doesn't start with a dot (hidden file in Unix)
243260 if name .startswith ("." ) or name == "" :
0 commit comments