Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 1 addition & 43 deletions AGENTS.md
Original file line number Diff line number Diff line change
@@ -1,43 +1 @@
# Repository Guidelines

## Project Structure & Module Organization
Core application code lives in `src/`. Use `src/api/` for FastAPI routes, `src/services/` for orchestration and business logic, `src/services/sandbox/` and `src/services/container/` for execution backends, `src/models/` for request/response models, and `src/config/` for environment-driven settings. Supporting docs are in `docs/`, dashboard assets in `dashboard/`, container/runtime files in `docker/`, and helper scripts in `scripts/`.

Tests are split by scope: `tests/unit/` for isolated service logic, `tests/integration/` for API and dependency-backed flows, `tests/functional/` for live endpoint testing, and `tests/snapshots/` for stored response fixtures.

## Build, Test, and Development Commands
Set up a local environment with:

```bash
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
cp .env.example .env
```

Run locally with `uvicorn src.main:app --reload`. Start required services with `docker compose up -d`, and build the sandbox image with `docker build -t code-interpreter:nsjail .`.

Key verification commands:

```bash
pytest tests/unit/
pytest tests/integration/
pytest tests/functional/ -v
pytest --cov=src tests/
black src/ --check
flake8 src/
mypy src/
bandit -r src/ -s B104,B108 --severity-level high
```

## Coding Style & Naming Conventions
Target Python 3.11+ with 4-space indentation, explicit type hints, and small async-friendly service boundaries. Follow Black formatting and keep code Flake8- and MyPy-clean. Use `snake_case` for modules, functions, and variables; `PascalCase` for classes and Pydantic models; and `UPPER_SNAKE_CASE` for constants and env names.

## Testing Guidelines
Pytest, `pytest-asyncio`, and `pytest-cov` are the standard tools. Name files `test_*.py` and mirror the component under test where practical, for example `tests/unit/test_session_service.py`. Add unit coverage for new logic first, then integration coverage for endpoint or storage changes. Functional tests use `API_BASE`, `API_KEY`, and `API_TIMEOUT`; keep them stable against a real running API.

## Commit & Pull Request Guidelines
Recent history uses short imperative subjects with prefixes such as `fix:`, `docs:`, `chore(...)`, and `feat:`. Keep the first line under 72 characters and reference issues in the body when relevant. Pull requests should explain behavior changes, note config or API contract impacts, and include the commands you ran. Add screenshots when changing the admin dashboard or other visible UI.

## Security & Configuration Tips
Never commit populated `.env` files, API keys, or storage credentials. Use `.env.example` as the template, and review `docs/CONFIGURATION.md` and `docs/SECURITY.md` before changing auth, sandboxing, Redis, or MinIO behavior.
Read CLAUDE.md
5 changes: 4 additions & 1 deletion src/api/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ async def upload_file(
content=content,
content_type=file.content_type,
is_agent_file=is_agent_file,
original_filename=file.filename,
)

uploaded_files.append(
Expand Down Expand Up @@ -295,6 +296,7 @@ async def upload_files_batch(
content_type=upload.content_type,
is_agent_file=is_agent_file,
is_read_only=is_read_only,
original_filename=original_filename,
)

results.append(
Expand Down Expand Up @@ -413,7 +415,8 @@ async def list_files(
"etag": f'"{file_info.file_id}"',
"metadata": {
"content-type": file_info.content_type,
"original-filename": file_info.filename,
"original-filename": file_info.original_filename
or file_info.filename,
},
"contentType": file_info.content_type,
}
Expand Down
1 change: 1 addition & 0 deletions src/models/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class FileInfo(BaseModel):
content_type: str
created_at: datetime
path: str = Field(..., description="File path in the session")
original_filename: Optional[str] = None

class Config:
json_encoders = {datetime: lambda v: v.isoformat()}
Expand Down
31 changes: 25 additions & 6 deletions src/services/execution/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
import secrets
import unicodedata
from pathlib import Path
from typing import Any, Dict

Expand Down Expand Up @@ -213,13 +214,27 @@ def format_error_message(cls, exit_code: int, stderr: str) -> str:

return f"Execution failed (exit code {exit_code}):\n{stderr_clean}"

# ASCII chars safe in filenames — matches LibreChat's ASCII_FILENAME_SAFE_PATTERN.
# _sanitize_char tests one character at a time against this pattern and
# replaces every ASCII character that does not match with "_".
_ASCII_SAFE = re.compile(r"[a-zA-Z0-9._\-]")
# C1 control characters (U+0080–U+009F) — unsafe in filenames.
# These are the only non-ASCII characters that _sanitize_char replaces with "_".
_C1_CONTROLS = re.compile(r"[\x80-\x9f]")

@classmethod
def _sanitize_char(cls, char: str) -> str:
    """Return the filename-safe form of a single character.

    ASCII characters (code point <= 0x7F) are kept only when they match
    ``_ASCII_SAFE``. Non-ASCII characters are kept unless they match
    ``_C1_CONTROLS``. Anything not kept is replaced by an underscore.

    Args:
        char: A single character of the filename being sanitized.

    Returns:
        ``char`` itself when it is allowed, otherwise ``"_"``.
    """
    if ord(char) <= 0x7F:
        # Strict rule set: allow-list for ASCII.
        allowed = cls._ASCII_SAFE.match(char) is not None
    else:
        # Permissive rule set: deny-list (C1 controls only) for non-ASCII.
        allowed = cls._C1_CONTROLS.match(char) is None
    return char if allowed else "_"

@classmethod
def sanitize_filename(cls, input_name: str) -> str:
"""Sanitize filename to match LibreChat's sanitization logic.
"""Sanitize filename while preserving Unicode letters, digits, and emoji.

Replaces all non-alphanumeric characters (except '.' and '-') with
underscores. This ensures filenames on disk match what LibreChat
reports in the system prompt.
NFC-normalizes, then applies a two-pass approach matching
LibreChat's ``sanitizeFilenameSegment``: strict for ASCII
(only ``[a-zA-Z0-9._-]``), permissive for non-ASCII (keeps
Unicode letters, combining marks, numbers, emoji — blocks
only C1 control characters).

Args:
input_name: Original filename (may include path components)
Expand All @@ -234,8 +249,12 @@ def sanitize_filename(cls, input_name: str) -> str:
# Remove any directory components (path traversal prevention)
name = os.path.basename(input_name)

# Replace any non-alphanumeric characters except for '.' and '-'
name = re.sub(r"[^a-zA-Z0-9.-]", "_", name)
# NFC-normalize so decomposed sequences (e + U+0301) become
# precomposed (é) before the regex runs.
name = unicodedata.normalize("NFC", name)

# Per-character sanitization in a single pass: strict for ASCII, permissive for non-ASCII.
name = "".join(cls._sanitize_char(c) for c in name)

# Ensure the name doesn't start with a dot (hidden file in Unix)
if name.startswith(".") or name == "":
Expand Down
10 changes: 9 additions & 1 deletion src/services/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,7 @@ async def get_file_info(self, session_id: str, file_id: str) -> Optional[FileInf
content_type=metadata["content_type"],
created_at=metadata["created_at"],
path=metadata["path"],
original_filename=metadata.get("original_filename"),
)

async def list_files(self, session_id: str) -> List[FileInfo]:
Expand Down Expand Up @@ -419,6 +420,9 @@ async def link_file_into_session(
"source_session_id": source_session_id,
"source_file_id": source_file_id,
"is_read_only": "1",
"original_filename": source_metadata.get(
"original_filename", source_metadata["filename"]
),
}

await self._store_file_metadata(target_session_id, linked_file_id, metadata)
Expand All @@ -444,6 +448,7 @@ async def link_file_into_session(
content_type=metadata["content_type"],
created_at=datetime.fromisoformat(metadata["created_at"]),
path=metadata["path"],
original_filename=metadata.get("original_filename"),
)

async def download_file(self, session_id: str, file_id: str) -> Optional[str]:
Expand Down Expand Up @@ -771,16 +776,18 @@ async def store_uploaded_file(
content_type: Optional[str] = None,
is_agent_file: bool = False,
is_read_only: bool = False,
original_filename: Optional[str] = None,
) -> str:
"""Store an uploaded file directly.

Args:
session_id: Session identifier
filename: Original filename
filename: Sanitized filename used for storage and sandbox mounting
content: File content as bytes
content_type: MIME type of the file
is_agent_file: If True, marks the file as read-only (agent-assigned)
is_read_only: If True, mounted file should be chmod 444 in sandbox
original_filename: Pre-sanitization filename for metadata recovery

Returns:
The generated file_id
Expand Down Expand Up @@ -825,6 +832,7 @@ async def store_uploaded_file(
"1" if is_agent_file else "0"
), # Read-only if agent file
"is_read_only": "1" if (is_read_only or is_agent_file) else "0",
"original_filename": original_filename or filename,
}

await self._store_file_metadata(session_id, file_id, metadata)
Expand Down
8 changes: 7 additions & 1 deletion tests/integration/test_librechat_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -1901,7 +1901,13 @@ def setup_mocks(self):
stored_filenames = []

async def fake_store(
session_id, filename, content, content_type, is_agent_file, is_read_only=False
session_id,
filename,
content,
content_type,
is_agent_file,
is_read_only=False,
original_filename=None,
):
stored_filenames.append(filename)
return f"fid-{len(stored_filenames)}"
Expand Down
64 changes: 61 additions & 3 deletions tests/unit/test_output_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,61 @@ def test_absolute_path_stripped(self):
result = OutputProcessor.sanitize_filename("/absolute/path/file.txt")
assert result == "file.txt"

def test_unicode_characters_replaced(self):
"""Test that non-ASCII characters are replaced."""
def test_unicode_characters_preserved(self):
"""Test that Unicode letters are preserved."""
result = OutputProcessor.sanitize_filename("résumé.docx")
assert result == "r_sum_.docx"
assert result == "résumé.docx"

def test_cjk_characters_preserved(self):
    """CJK filenames come back from sanitize_filename unchanged."""
    filename = "日本語レポート.xlsx"
    assert OutputProcessor.sanitize_filename(filename) == filename

def test_cyrillic_characters_preserved(self):
    """Cyrillic filenames come back from sanitize_filename unchanged."""
    filename = "файл.txt"
    assert OutputProcessor.sanitize_filename(filename) == filename

def test_korean_characters_preserved(self):
    """Korean filenames come back from sanitize_filename unchanged."""
    filename = "보고서.xlsx"
    assert OutputProcessor.sanitize_filename(filename) == filename

def test_arabic_characters_preserved(self):
    """Arabic filenames come back from sanitize_filename unchanged."""
    filename = "تقرير.pdf"
    assert OutputProcessor.sanitize_filename(filename) == filename

def test_mixed_unicode_and_ascii(self):
    """A filename mixing safe ASCII and Unicode characters is left as-is."""
    filename = "report_2024_報告.pdf"
    assert OutputProcessor.sanitize_filename(filename) == filename

def test_unicode_with_spaces_sanitized(self):
    """A space between Unicode characters still becomes an underscore."""
    sanitized = OutputProcessor.sanitize_filename("日本語 レポート.xlsx")
    expected = "日本語_レポート.xlsx"
    assert sanitized == expected

def test_dangerous_chars_still_blocked(self):
    """Each shell metacharacter in the name is replaced by one underscore."""
    sanitized = OutputProcessor.sanitize_filename("file<>|&;$().txt")
    expected = "file________.txt"
    assert sanitized == expected

def test_underscores_preserved(self):
    """Underscores already present in a filename are kept."""
    filename = "my_file_name.txt"
    assert OutputProcessor.sanitize_filename(filename) == filename

def test_emoji_preserved(self):
    """An emoji in the filename is kept (matches LibreChat's \\p{Emoji})."""
    filename = "chart\U0001F4CA.csv"
    assert OutputProcessor.sanitize_filename(filename) == filename

def test_nfd_normalized_to_nfc(self):
    """Test that decomposed Unicode is NFC-normalized before sanitizing."""
    # Spell both forms with escapes. NFD and NFC render identically, so
    # literal characters can be silently normalized by an editor, a VCS
    # filter, or a copy/paste, which would turn this into a no-op check.
    decomposed = "Cafe\u0301.txt"  # "e" + U+0301 COMBINING ACUTE ACCENT
    precomposed = "Caf\u00e9.txt"  # U+00E9 LATIN SMALL LETTER E WITH ACUTE
    # Guard the premise: the input must differ from the expected output.
    assert decomposed != precomposed
    result = OutputProcessor.sanitize_filename(decomposed)
    assert result == precomposed

def test_brackets_replaced(self):
"""Test that brackets are replaced with underscores."""
Expand Down Expand Up @@ -169,6 +220,13 @@ def test_librechat_skill_bundle_pattern(self):
== "skills/foo/SKILL.md"
)

def test_unicode_segments_preserved(self):
    """A relative path whose directory and file names are Unicode is unchanged."""
    relative_path = "報告/2024年/レポート.xlsx"
    sanitized = OutputProcessor.sanitize_relative_path(relative_path)
    assert sanitized == relative_path

def test_sanitize_filename_unchanged_for_basename_callers(self):
"""Regression: sanitize_filename still flattens (legacy upload behavior)."""
# Existing single-call sites rely on this.
Expand Down
Loading