Skip to content

Commit 8d6c5d8

Browse files
committed
feat: Implement filename sanitization for file uploads and listings
- Added a `sanitize_filename` method to `OutputProcessor` that makes filenames safe for container use by stripping directory components, replacing every character other than ASCII letters, digits, '.' and '-' with an underscore, and handling edge cases (empty names, leading dots, over-long names).
- Updated the `upload_file` and `list_files` functions to use the new sanitization logic, so filenames are reported consistently across file operations.
- Introduced unit tests for `sanitize_filename` that validate its behavior and edge-case handling.
1 parent fa0d182 commit 8d6c5d8

3 files changed

Lines changed: 180 additions & 12 deletions

File tree

src/api/files.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
# Local application imports
1616
from ..config import settings
1717
from ..dependencies import FileServiceDep
18+
from ..services.execution.output import OutputProcessor
1819
from ..utils.id_generator import generate_session_id
1920

2021

@@ -119,13 +120,13 @@ async def upload_file(
119120
content_type=file.content_type,
120121
)
121122

122-
# Get file info for complete details
123-
file_info = await file_service.get_file_info(session_id, file_id)
123+
# Sanitize filename to match what will be used in container
124+
sanitized_name = OutputProcessor.sanitize_filename(file.filename)
124125

125126
uploaded_files.append(
126127
{
127128
"id": file_id,
128-
"name": file.filename,
129+
"name": sanitized_name,
129130
"session_id": session_id,
130131
"content": None, # LibreChat doesn't return content in upload response
131132
"size": len(content),
@@ -207,10 +208,12 @@ async def list_files(
207208
# Return simple file information
208209
simple_files = []
209210
for file_info in files:
211+
# Return sanitized filename to match container
212+
sanitized_name = OutputProcessor.sanitize_filename(file_info.filename)
210213
simple_files.append(
211214
{
212215
"id": file_info.file_id,
213-
"name": file_info.filename,
216+
"name": sanitized_name,
214217
"path": file_info.path,
215218
}
216219
)
@@ -219,9 +222,11 @@ async def list_files(
219222
# Return full file details - LibreChat format
220223
detailed_files = []
221224
for file_info in files:
225+
# Return sanitized filename to match container
226+
sanitized_name = OutputProcessor.sanitize_filename(file_info.filename)
222227
detailed_files.append(
223228
{
224-
"name": file_info.filename,
229+
"name": sanitized_name,
225230
"id": file_info.file_id,
226231
"session_id": session_id,
227232
"content": None, # Not returned in list

src/services/execution/output.py

Lines changed: 51 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
"""Output processing and validation for code execution."""
22

3+
import os
34
import re
5+
import secrets
46
from pathlib import Path
57
from typing import Any, Dict
68

@@ -221,13 +223,55 @@ def format_error_message(cls, exit_code: int, stderr: str) -> str:
221223
return f"Execution failed (exit code {exit_code}):\n{stderr_clean}"
222224

223225
@classmethod
def sanitize_filename(cls, input_name: str) -> str:
    """Sanitize a filename so it is safe to use inside the container.

    The steps are intended to mirror LibreChat's filename sanitization so
    that names on disk match the names LibreChat reports in the system
    prompt:

    1. Drop any directory components (path traversal prevention).
    2. Replace every character other than ASCII letters, digits, '.' and
       '-' with an underscore. Non-ASCII characters are replaced too.
    3. Prefix an underscore when the result is empty or starts with a dot,
       so the file is never a hidden file.
    4. Shorten names longer than 255 characters to
       ``<stem>-<6 random hex chars><extension>``.

    Args:
        input_name: Original filename (may include path components).
            Falsy values such as ``None`` or ``""`` are accepted.

    Returns:
        The sanitized filename, never empty and never longer than 255
        characters. ``"_"`` is returned for falsy input or when
        sanitization fails unexpectedly.
    """
    if not input_name:
        return "_"

    max_length = 255
    # "-" separator plus the six hex characters from secrets.token_hex(3).
    suffix_length = 7

    try:
        # Remove any directory components (path traversal prevention).
        name = os.path.basename(input_name)

        # Replace any non-alphanumeric characters except for '.' and '-'.
        name = re.sub(r"[^a-zA-Z0-9.-]", "_", name)

        # Ensure the name is non-empty and is not a hidden (dot) file.
        if not name or name.startswith("."):
            name = "_" + name

        if len(name) > max_length:
            stem, ext = os.path.splitext(name)
            # An extension longer than the remaining budget used to be
            # appended uncut, producing a name above max_length. Cap it so
            # at least one stem character plus the suffix always fits.
            # Names in this branch already carry a random suffix, so
            # capping cannot break an exact match with the caller's name.
            ext = ext[: max_length - suffix_length - 1]
            keep = max_length - len(ext) - suffix_length
            name = f"{stem[:keep]}-{secrets.token_hex(3)}{ext}"

        return name

    except Exception as e:
        # Best-effort: never let a bad filename abort the upload/listing.
        # NOTE(review): relies on a module-level `logger`; confirm it is
        # defined in this file.
        logger.error(f"Failed to sanitize filename: {e}")
        return "_"
269+
270+
@classmethod
def normalize_filename(cls, filename: str) -> str:
    """Deprecated alias for :meth:`sanitize_filename`.

    Retained only so existing callers keep working; the argument is
    forwarded unchanged to ``sanitize_filename`` and its result is
    returned as-is.
    """
    sanitized = cls.sanitize_filename(filename)
    return sanitized
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
"""Unit tests for the OutputProcessor."""
2+
3+
import pytest
4+
from src.services.execution.output import OutputProcessor
5+
6+
7+
class TestSanitizeFilename:
    """Tests for ``OutputProcessor.sanitize_filename``."""

    def test_spaces_replaced_with_underscores(self):
        """Spaces are replaced with underscores."""
        result = OutputProcessor.sanitize_filename("file with spaces.txt")
        assert result == "file_with_spaces.txt"

    def test_parentheses_replaced_with_underscores(self):
        """Parentheses are replaced with underscores."""
        result = OutputProcessor.sanitize_filename("manufacturing_analysis (v2).xlsx")
        assert result == "manufacturing_analysis__v2_.xlsx"

    def test_special_characters_replaced(self):
        """Special characters are replaced with underscores."""
        result = OutputProcessor.sanitize_filename("special@chars#here!.pdf")
        assert result == "special_chars_here_.pdf"

    def test_already_valid_unchanged(self):
        """Already valid filenames are returned unchanged."""
        result = OutputProcessor.sanitize_filename("already-valid.txt")
        assert result == "already-valid.txt"

    def test_uppercase_preserved(self):
        """Uppercase letters are preserved."""
        result = OutputProcessor.sanitize_filename("UPPERCASE.TXT")
        assert result == "UPPERCASE.TXT"

    def test_numbers_preserved(self):
        """Digits are preserved."""
        result = OutputProcessor.sanitize_filename("123numbers.doc")
        assert result == "123numbers.doc"

    def test_hidden_file_prefixed(self):
        """Names starting with a dot get an underscore prefix."""
        result = OutputProcessor.sanitize_filename(".hidden")
        assert result == "_.hidden"

    def test_empty_string_returns_underscore(self):
        """An empty string yields a single underscore."""
        result = OutputProcessor.sanitize_filename("")
        assert result == "_"

    def test_none_returns_underscore(self):
        """``None`` yields a single underscore."""
        result = OutputProcessor.sanitize_filename(None)
        assert result == "_"

    def test_path_traversal_stripped(self):
        """Path traversal components are stripped down to the basename."""
        result = OutputProcessor.sanitize_filename("../../../etc/passwd")
        assert result == "passwd"

    def test_absolute_path_stripped(self):
        """Absolute paths are stripped down to the basename."""
        result = OutputProcessor.sanitize_filename("/absolute/path/file.txt")
        assert result == "file.txt"

    def test_unicode_characters_replaced(self):
        """Non-ASCII characters are replaced with underscores."""
        result = OutputProcessor.sanitize_filename("résumé.docx")
        assert result == "r_sum_.docx"

    def test_brackets_replaced(self):
        """Square brackets are replaced with underscores."""
        result = OutputProcessor.sanitize_filename("[brackets].txt")
        assert result == "_brackets_.txt"

    def test_leading_parenthesis_prefixed(self):
        """A leading parenthesis is replaced like any other special character."""
        result = OutputProcessor.sanitize_filename("(parentheses).txt")
        assert result == "_parentheses_.txt"

    def test_dashes_preserved(self):
        """Dashes are preserved."""
        result = OutputProcessor.sanitize_filename("file-name.with-dashes.txt")
        assert result == "file-name.with-dashes.txt"

    def test_dots_preserved(self):
        """Dots inside the filename are preserved."""
        result = OutputProcessor.sanitize_filename("file.name.multiple.dots.txt")
        assert result == "file.name.multiple.dots.txt"

    def test_simple_filename_unchanged(self):
        """A simple alphanumeric filename is returned unchanged."""
        result = OutputProcessor.sanitize_filename("SimpleFile123.pdf")
        assert result == "SimpleFile123.pdf"

    def test_long_filename_truncated(self):
        """Names over 255 chars are cut to 255 with a random hex suffix."""
        extension = ".txt"
        long_name = "a" * 300 + extension
        result = OutputProcessor.sanitize_filename(long_name)

        # 244 stem chars + "-" + 6 hex chars + ".txt" fill the 255-char budget.
        assert len(result) == 255
        assert result.endswith(extension)

        # The suffix is random, so check its shape rather than its value.
        stem, dash, suffix = result[: -len(extension)].rpartition("-")
        assert dash == "-"
        assert stem == "a" * 244
        assert len(suffix) == 6
        assert set(suffix) <= set("0123456789abcdef")
105+
106+
107+
class TestNormalizeFilename:
    """Checks for the deprecated ``normalize_filename`` alias."""

    def test_delegates_to_sanitize_filename(self):
        """The alias and ``sanitize_filename`` agree on the same input."""
        sample = "file with spaces.txt"
        via_alias = OutputProcessor.normalize_filename(sample)
        via_sanitize = OutputProcessor.sanitize_filename(sample)
        assert via_alias == via_sanitize

    def test_parentheses_now_replaced(self):
        """The alias now replaces parentheses as well as spaces."""
        assert OutputProcessor.normalize_filename("file (v2).xlsx") == "file__v2_.xlsx"

0 commit comments

Comments
 (0)