Skip to content

Commit 34f4f91

Browse files
committed
feat: Add original filename support in file handling
- Introduced `original_filename` field in the FileInfo model to store pre-sanitization filenames.
- Updated file upload and batch upload functions to include the original filename in metadata.
- Enhanced file listing to return the original filename if available, improving metadata accuracy.
- Adjusted file service methods to handle the new original filename parameter for better file management.
1 parent 458d9a7 commit 34f4f91

5 files changed

Lines changed: 72 additions & 11 deletions

File tree

src/api/files.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ async def upload_file(
142142
content=content,
143143
content_type=file.content_type,
144144
is_agent_file=is_agent_file,
145+
original_filename=file.filename,
145146
)
146147

147148
uploaded_files.append(
@@ -295,6 +296,7 @@ async def upload_files_batch(
295296
content_type=upload.content_type,
296297
is_agent_file=is_agent_file,
297298
is_read_only=is_read_only,
299+
original_filename=original_filename,
298300
)
299301

300302
results.append(
@@ -413,7 +415,8 @@ async def list_files(
413415
"etag": f'"{file_info.file_id}"',
414416
"metadata": {
415417
"content-type": file_info.content_type,
416-
"original-filename": file_info.filename,
418+
"original-filename": file_info.original_filename
419+
or file_info.filename,
417420
},
418421
"contentType": file_info.content_type,
419422
}

src/models/files.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ class FileInfo(BaseModel):
4040
content_type: str
4141
created_at: datetime
4242
path: str = Field(..., description="File path in the session")
43+
original_filename: Optional[str] = None
4344

4445
class Config:
4546
json_encoders = {datetime: lambda v: v.isoformat()}

src/services/execution/output.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -215,11 +215,12 @@ def format_error_message(cls, exit_code: int, stderr: str) -> str:
215215

216216
@classmethod
217217
def sanitize_filename(cls, input_name: str) -> str:
218-
"""Sanitize filename to match LibreChat's sanitization logic.
218+
"""Sanitize filename while preserving Unicode letters and digits.
219219
220-
Replaces all non-alphanumeric characters (except '.' and '-') with
221-
underscores. This ensures filenames on disk match what LibreChat
222-
reports in the system prompt.
220+
Keeps word characters (\\w — letters, digits, underscore in all
221+
scripts), dots, and dashes. Replaces everything else (path
222+
separators, control chars, shell metacharacters, quotes, etc.)
223+
with underscores.
223224
224225
Args:
225226
input_name: Original filename (may include path components)
@@ -234,8 +235,9 @@ def sanitize_filename(cls, input_name: str) -> str:
234235
# Remove any directory components (path traversal prevention)
235236
name = os.path.basename(input_name)
236237

237-
# Replace any non-alphanumeric characters except for '.' and '-'
238-
name = re.sub(r"[^a-zA-Z0-9.-]", "_", name)
238+
# Replace dangerous characters while preserving Unicode letters/digits.
239+
# \w matches [a-zA-Z0-9_] plus all Unicode letters and digits.
240+
name = re.sub(r"[^\w.\-]", "_", name)
239241

240242
# Ensure the name doesn't start with a dot (hidden file in Unix)
241243
if name.startswith(".") or name == "":

src/services/file.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,7 @@ async def get_file_info(self, session_id: str, file_id: str) -> Optional[FileInf
360360
content_type=metadata["content_type"],
361361
created_at=metadata["created_at"],
362362
path=metadata["path"],
363+
original_filename=metadata.get("original_filename"),
363364
)
364365

365366
async def list_files(self, session_id: str) -> List[FileInfo]:
@@ -419,6 +420,9 @@ async def link_file_into_session(
419420
"source_session_id": source_session_id,
420421
"source_file_id": source_file_id,
421422
"is_read_only": "1",
423+
"original_filename": source_metadata.get(
424+
"original_filename", source_metadata["filename"]
425+
),
422426
}
423427

424428
await self._store_file_metadata(target_session_id, linked_file_id, metadata)
@@ -444,6 +448,7 @@ async def link_file_into_session(
444448
content_type=metadata["content_type"],
445449
created_at=datetime.fromisoformat(metadata["created_at"]),
446450
path=metadata["path"],
451+
original_filename=metadata.get("original_filename"),
447452
)
448453

449454
async def download_file(self, session_id: str, file_id: str) -> Optional[str]:
@@ -771,16 +776,18 @@ async def store_uploaded_file(
771776
content_type: Optional[str] = None,
772777
is_agent_file: bool = False,
773778
is_read_only: bool = False,
779+
original_filename: Optional[str] = None,
774780
) -> str:
775781
"""Store an uploaded file directly.
776782
777783
Args:
778784
session_id: Session identifier
779-
filename: Original filename
785+
filename: Sanitized filename used for storage and sandbox mounting
780786
content: File content as bytes
781787
content_type: MIME type of the file
782788
is_agent_file: If True, marks the file as read-only (agent-assigned)
783789
is_read_only: If True, mounted file should be chmod 444 in sandbox
790+
original_filename: Pre-sanitization filename for metadata recovery
784791
785792
Returns:
786793
The generated file_id
@@ -825,6 +832,7 @@ async def store_uploaded_file(
825832
"1" if is_agent_file else "0"
826833
), # Read-only if agent file
827834
"is_read_only": "1" if (is_read_only or is_agent_file) else "0",
835+
"original_filename": original_filename or filename,
828836
}
829837

830838
await self._store_file_metadata(session_id, file_id, metadata)

tests/unit/test_output_processor.py

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,10 +62,50 @@ def test_absolute_path_stripped(self):
6262
result = OutputProcessor.sanitize_filename("/absolute/path/file.txt")
6363
assert result == "file.txt"
6464

65-
def test_unicode_characters_replaced(self):
66-
"""Test that non-ASCII characters are replaced."""
65+
def test_unicode_characters_preserved(self):
66+
"""Test that Unicode letters are preserved."""
6767
result = OutputProcessor.sanitize_filename("résumé.docx")
68-
assert result == "r_sum_.docx"
68+
assert result == "résumé.docx"
69+
70+
def test_cjk_characters_preserved(self):
71+
"""Test that CJK characters are preserved."""
72+
result = OutputProcessor.sanitize_filename("日本語レポート.xlsx")
73+
assert result == "日本語レポート.xlsx"
74+
75+
def test_cyrillic_characters_preserved(self):
76+
"""Test that Cyrillic characters are preserved."""
77+
result = OutputProcessor.sanitize_filename("файл.txt")
78+
assert result == "файл.txt"
79+
80+
def test_korean_characters_preserved(self):
81+
"""Test that Korean characters are preserved."""
82+
result = OutputProcessor.sanitize_filename("보고서.xlsx")
83+
assert result == "보고서.xlsx"
84+
85+
def test_arabic_characters_preserved(self):
86+
"""Test that Arabic characters are preserved."""
87+
result = OutputProcessor.sanitize_filename("تقرير.pdf")
88+
assert result == "تقرير.pdf"
89+
90+
def test_mixed_unicode_and_ascii(self):
91+
"""Test mixed Unicode and ASCII filename."""
92+
result = OutputProcessor.sanitize_filename("report_2024_報告.pdf")
93+
assert result == "report_2024_報告.pdf"
94+
95+
def test_unicode_with_spaces_sanitized(self):
96+
"""Test that spaces in Unicode filenames are still replaced."""
97+
result = OutputProcessor.sanitize_filename("日本語 レポート.xlsx")
98+
assert result == "日本語_レポート.xlsx"
99+
100+
def test_dangerous_chars_still_blocked(self):
101+
"""Test that shell metacharacters are still replaced."""
102+
result = OutputProcessor.sanitize_filename("file<>|&;$().txt")
103+
assert result == "file________.txt"
104+
105+
def test_underscores_preserved(self):
106+
"""Test that underscores are preserved."""
107+
result = OutputProcessor.sanitize_filename("my_file_name.txt")
108+
assert result == "my_file_name.txt"
69109

70110
def test_brackets_replaced(self):
71111
"""Test that brackets are replaced with underscores."""
@@ -169,6 +209,13 @@ def test_librechat_skill_bundle_pattern(self):
169209
== "skills/foo/SKILL.md"
170210
)
171211

212+
def test_unicode_segments_preserved(self):
213+
"""Test that Unicode directory and file names are preserved."""
214+
assert (
215+
OutputProcessor.sanitize_relative_path("報告/2024年/レポート.xlsx")
216+
== "報告/2024年/レポート.xlsx"
217+
)
218+
172219
def test_sanitize_filename_unchanged_for_basename_callers(self):
173220
"""Regression: sanitize_filename still flattens (legacy upload behavior)."""
174221
# Existing single-call sites rely on this.

0 commit comments

Comments
 (0)