Skip to content

Commit e85e9e8

Browse files
usnavy13 and claude committed
fix: Match LibreChat's Unicode sanitization — add emoji, NFC, and two-pass approach
Align sanitize_filename with LibreChat#12977's sanitizeFilenameSegment:
- NFC-normalize before sanitizing (handles decomposed accents)
- Two-pass: strict ASCII [a-zA-Z0-9._-], permissive non-ASCII (only blocks C1 controls U+0080-U+009F)
- Preserves emoji (📊) and ZWJ sequences that \w alone would strip

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 34f4f91 commit e85e9e8

2 files changed

Lines changed: 36 additions & 8 deletions

File tree

src/services/execution/output.py

Lines changed: 25 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import os
44
import re
55
import secrets
6+
import unicodedata
67
from pathlib import Path
78
from typing import Any, Dict
89

@@ -213,14 +214,27 @@ def format_error_message(cls, exit_code: int, stderr: str) -> str:
213214

214215
return f"Execution failed (exit code {exit_code}):\n{stderr_clean}"
215216

217+
# ASCII chars safe in filenames — matches LibreChat's ASCII_FILENAME_SAFE_PATTERN.
218+
_ASCII_SAFE = re.compile(r"[a-zA-Z0-9._\-]")
219+
# C1 control characters (U+0080–U+009F) — unsafe in filenames.
220+
_C1_CONTROLS = re.compile(r"[\x80-\x9f]")
221+
222+
@classmethod
223+
def _sanitize_char(cls, char: str) -> str:
224+
"""Replace unsafe ASCII and C1 controls with "_"; keep every other non-ASCII character (letters, marks, numbers, emoji, etc.)."""
225+
if ord(char) <= 0x7F:
226+
return char if cls._ASCII_SAFE.match(char) else "_"
227+
return "_" if cls._C1_CONTROLS.match(char) else char
228+
216229
@classmethod
217230
def sanitize_filename(cls, input_name: str) -> str:
218-
"""Sanitize filename while preserving Unicode letters and digits.
231+
"""Sanitize filename while preserving Unicode letters, digits, and emoji.
219232
220-
Keeps word characters (\\w — letters, digits, underscore in all
221-
scripts), dots, and dashes. Replaces everything else (path
222-
separators, control chars, shell metacharacters, quotes, etc.)
223-
with underscores.
233+
NFC-normalizes, then applies a two-pass approach matching
234+
LibreChat's ``sanitizeFilenameSegment``: strict for ASCII
235+
(only ``[a-zA-Z0-9._-]``), permissive for non-ASCII (keeps
236+
Unicode letters, combining marks, numbers, emoji — blocks
237+
only C1 control characters).
224238
225239
Args:
226240
input_name: Original filename (may include path components)
@@ -235,9 +249,12 @@ def sanitize_filename(cls, input_name: str) -> str:
235249
# Remove any directory components (path traversal prevention)
236250
name = os.path.basename(input_name)
237251

238-
# Replace dangerous characters while preserving Unicode letters/digits.
239-
# \w matches [a-zA-Z0-9_] plus all Unicode letters and digits.
240-
name = re.sub(r"[^\w.\-]", "_", name)
252+
# NFC-normalize so decomposed sequences (e + U+0301) become
253+
# precomposed (é) before the regex runs.
254+
name = unicodedata.normalize("NFC", name)
255+
256+
# Two-pass sanitization: strict ASCII, permissive Unicode.
257+
name = "".join(cls._sanitize_char(c) for c in name)
241258

242259
# Ensure the name doesn't start with a dot (hidden file in Unix)
243260
if name.startswith(".") or name == "":

tests/unit/test_output_processor.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -107,6 +107,17 @@ def test_underscores_preserved(self):
107107
result = OutputProcessor.sanitize_filename("my_file_name.txt")
108108
assert result == "my_file_name.txt"
109109

110+
def test_emoji_preserved(self):
111+
"""Test that emoji are preserved (matches LibreChat's \\p{Emoji})."""
112+
result = OutputProcessor.sanitize_filename("chart\U0001F4CA.csv")
113+
assert result == "chart\U0001F4CA.csv"
114+
115+
def test_nfd_normalized_to_nfc(self):
116+
"""Test that decomposed Unicode is NFC-normalized before sanitizing."""
117+
# e + combining acute (U+0301) -> precomposed e-acute
118+
result = OutputProcessor.sanitize_filename("Cafe\u0301.txt")
119+
assert result == "Caf\u00e9.txt"
120+
110121
def test_brackets_replaced(self):
111122
"""Test that brackets are replaced with underscores."""
112123
result = OutputProcessor.sanitize_filename("[brackets].txt")

0 commit comments

Comments
 (0)