LibreCodeInterpreter/src/api/files.py at 48921812dda75df4c819eaf2dbb1a94923d9fb72 · joohan-lee/LibreCodeInterpreter · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
"""File management API endpoints."""

# Standard library imports
from datetime import datetime, timezone
import inspect
from pathlib import Path
from typing import List, Optional
from urllib.parse import quote

# Third-party imports
import structlog
from fastapi import (
    APIRouter,
    HTTPException,
    Request,
    UploadFile,
    File,
    Form,
    Query,
)
from fastapi.responses import StreamingResponse
from starlette.datastructures import UploadFile as StarletteUploadFile
from unidecode import unidecode

# Local application imports
from ..config import settings
from ..dependencies import FileServiceDep, SessionServiceDep
from ..models import SessionCreate
from ..services.execution.output import OutputProcessor

# Module-level structured logger, named after this module.
logger = structlog.get_logger(__name__)
# Router on which the endpoint decorators in this module register their routes.
router = APIRouter()


# Characters that pass through unchanged when an ASCII-only filename is
# built: three punctuation marks, then upper-case, lower-case and digits.
_ASCII_FILENAME_CHARS = (
    "-_."
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
)


def _ascii_fallback_filename(name: str) -> str:
    """Return an ASCII-only stand-in for the last path component of *name*.

    The component is transliterated with ``unidecode``; every character that
    is not listed in ``_ASCII_FILENAME_CHARS`` is then replaced by an
    underscore. If nothing is left, ``"download"`` is returned instead.
    """
    ascii_text = unidecode(Path(name).name)
    # A space is not in the allowed set, so it becomes "_" through the same
    # rule as every other disallowed character.
    cleaned = "".join(
        char if char in _ASCII_FILENAME_CHARS else "_" for char in ascii_text
    )
    if not cleaned:
        return "download"
    return cleaned


def _build_content_disposition(
    filename: Optional[str], fallback_identifier: str
) -> str:
    """Build a Content-Disposition header value that supports Unicode filenames.

    The header carries two forms of the name: a quoted ASCII-only ``filename``
    parameter and a percent-encoded UTF-8 ``filename*`` parameter.

    Args:
        filename: Name to advertise; only its last path component is used.
            May be ``None`` or empty.
        fallback_identifier: Used when ``filename`` is missing or reduces to
            an empty path component. ``"download"`` is the last resort.

    Returns:
        The complete ``attachment; ...`` header value.
    """
    default_name = fallback_identifier or "download"
    original_name = Path(filename or default_name).name
    if not original_name:
        # Inputs such as "." or "/" have an empty final component; without
        # this fallback the header would carry an empty filename* value.
        original_name = Path(default_name).name or "download"
    ascii_fallback = _ascii_fallback_filename(original_name)
    # safe="" percent-encodes every reserved character, including "/".
    encoded_original = quote(original_name, safe="")
    return f"attachment; filename=\"{ascii_fallback}\"; filename*=UTF-8''{encoded_original}"


@router.post("/upload")
async def upload_file(
    file: Optional[UploadFile] = File(None),
    files: Optional[List[UploadFile]] = File(None),
    entity_id: Optional[str] = Form(None),
    file_service: FileServiceDep = None,
    session_service: SessionServiceDep = None,
):
    """Upload files with multipart form handling - LibreChat compatible.

    Accepts files in either 'file' (singular) or 'files' (plural) field names.
    LibreChat uses 'file' while our tests use 'files'. When both are supplied,
    'file' takes precedence and 'files' is ignored.

    Each call creates a new session and stores every uploaded file under it.

    Returns:
        A dict with ``message``, ``session_id`` and ``files``; each entry of
        ``files`` has ``filename`` (the sanitized name) and ``fileId``.

    Raises:
        HTTPException: 422 when neither field is supplied; the status/detail
            pair returned by ``file_service.validate_uploads`` when it reports
            a failure as an ``(int, detail)`` tuple; 500 for any other error.
    """
    try:
        # LibreChat sends a single file with field name 'file'
        if file is not None:
            upload_files = [file]
        # Tests and other clients may use 'files'
        elif files is not None:
            upload_files = files
        else:
            raise HTTPException(
                status_code=422,
                detail={
                    "error": "Request validation failed",
                    "error_type": "validation",
                    "details": [
                        {
                            "field": "body -> files",
                            "message": "Field required",
                            "code": "missing",
                        }
                    ],
                },
            )

        # Validate uploads via service layer. The result may be a plain value
        # or an awaitable, so both sync and async implementations are handled.
        validation_error = file_service.validate_uploads(
            filenames=[f.filename or "" for f in upload_files],
            file_sizes=[f.size for f in upload_files],
        )
        if inspect.isawaitable(validation_error):
            validation_error = await validation_error
        if (
            isinstance(validation_error, tuple)
            and len(validation_error) == 2
            and isinstance(validation_error[0], int)
        ):
            raise HTTPException(
                status_code=validation_error[0], detail=validation_error[1]
            )

        uploaded_files = []

        # Create a real session for file uploads
        # This enables session reuse when files are referenced in /exec
        metadata = {}
        if entity_id:
            metadata["entity_id"] = entity_id
        session = await session_service.create_session(SessionCreate(metadata=metadata))
        session_id = session.session_id

        # Determine if this is an agent file (uploaded with a non-empty entity_id)
        # Agent files are read-only and cannot be modified by user code
        is_agent_file = bool(entity_id)

        # A distinct loop name keeps the 'file' parameter from being rebound.
        for upload in upload_files:
            # Read file content
            content = await upload.read()

            # Sanitize filename to match what will be used in container
            sanitized_name = OutputProcessor.sanitize_filename(upload.filename)

            # Store with sanitized name so S3, sandbox, and cleanup all use the same name
            file_id = await file_service.store_uploaded_file(
                session_id=session_id,
                filename=sanitized_name,
                content=content,
                content_type=upload.content_type,
                is_agent_file=is_agent_file,
                original_filename=upload.filename,
            )

            uploaded_files.append(
                {
                    "id": file_id,
                    "name": sanitized_name,
                    "session_id": session_id,
                    "content": None,  # LibreChat doesn't return content in upload response
                    "size": len(content),
                    # Timezone-aware UTC timestamp; datetime.utcnow() is
                    # deprecated and returns a naive value.
                    "lastModified": datetime.now(timezone.utc).isoformat(),
                    "etag": f'"{file_id}"',
                    "metadata": {
                        "content-type": upload.content_type
                        or "application/octet-stream",
                        "original-filename": upload.filename,
                    },
                    "contentType": upload.content_type or "application/octet-stream",
                }
            )

        logger.info(
            "Files uploaded successfully",
            count=len(uploaded_files),
            entity_id=entity_id,
        )

        # Return LibreChat-compatible response
        # Note: Production API returns different format with fileId instead of id
        return {
            "message": "success",
            "session_id": session_id,
            "files": [
                {"filename": entry["name"], "fileId": entry["id"]}
                for entry in uploaded_files
            ],
        }

    except HTTPException:
        # Deliberate HTTP errors raised above pass through unchanged.
        raise
    except Exception as e:
        logger.error("Failed to upload files", error=str(e), entity_id=entity_id)
        raise HTTPException(status_code=500, detail="Failed to upload files") from e


# TODO(librechat-compat): /upload/batch duplicates the per-file storage flow
# from /upload above. Kept separate to avoid touching the stable single-file
# endpoint while we prove out the batch path. If both endpoints stay in
# production unchanged for a release cycle, factor a shared
# `_store_files_to_session()` helper that both call.
@router.post("/upload/batch")
async def upload_files_batch(
    request: Request,
    file_service: FileServiceDep = None,
    session_service: SessionServiceDep = None,
):
    """Batch file upload — LibreChat compatible.

    LibreChat (`crud.js:118` in librechat) sends multi-file uploads here as
    multipart with the field name `file` repeated once per file. Per-file
    failures are reported individually in the response rather than failing
    the whole batch — LibreChat's caller distinguishes `succeeded`/`failed`
    counts and reads each `files[].status`.

    Filenames may include subdirectories (e.g. `skills/foo/SKILL.md` from
    skill priming). Subdirectory structure is preserved via
    `OutputProcessor.sanitize_relative_path()`; LibreChat then echoes them
    back to its agent code, which checks `f.filename.endsWith('/SKILL.md')`.

    Optional form fields read besides `file`: `entity_id`, `kind` and
    `read_only`.

    Returns:
        A dict with `message` ("success", "partial" or "error"),
        `session_id`, `storage_session_id`, the per-file `files` list, and
        the `succeeded` / `failed` counters.

    Raises:
        HTTPException: 422 when no `file` part is present; 413 when the batch
            holds more than `settings.max_files_per_session` files.
    """
    form = await request.form()
    # Keep only real file parts named "file"; plain string fields are skipped.
    upload_files: List[UploadFile] = [
        v
        for k, v in form.multi_items()
        if k == "file" and isinstance(v, StarletteUploadFile)
    ]

    if not upload_files:
        # LibreChat guards with `if (filesToUpload.length === 0) return null`
        # before calling, so reaching this branch means a misconfigured
        # client. Match the existing /upload contract for missing files.
        raise HTTPException(
            status_code=422,
            detail={
                "error": "Request validation failed",
                "error_type": "validation",
                "details": [
                    {
                        "field": "body -> file",
                        "message": "At least one file required",
                        "code": "missing",
                    }
                ],
            },
        )

    if len(upload_files) > settings.max_files_per_session:
        raise HTTPException(
            status_code=413,
            detail=(
                f"Too many files in batch. Maximum "
                f"{settings.max_files_per_session} files allowed per upload."
            ),
        )

    entity_id_raw = form.get("entity_id")
    entity_id: Optional[str] = (
        entity_id_raw if isinstance(entity_id_raw, str) and entity_id_raw else None
    )
    # LibreChat sends kind=skill/agent (not entity_id) for skill-priming uploads.
    # Treat these as agent files so skill bundles bypass the user-facing extension
    # whitelist and are correctly tagged read-only in the sandbox.
    kind_raw = form.get("kind")
    is_agent_file = entity_id is not None or (
        isinstance(kind_raw, str) and kind_raw in ("skill", "agent")
    )

    read_only_raw = form.get("read_only")
    is_read_only = isinstance(read_only_raw, str) and read_only_raw.lower() in (
        "1",
        "true",
        "yes",
    )

    metadata = {"entity_id": entity_id} if entity_id else {}
    session = await session_service.create_session(SessionCreate(metadata=metadata))
    session_id = session.session_id

    max_size_bytes = settings.max_file_size_mb * 1024 * 1024
    results: List[dict] = []
    succeeded = 0
    failed = 0

    for upload in upload_files:
        original_filename = upload.filename or "unknown"
        try:
            # Reject on the size the form parser recorded, when available, so
            # an oversized upload is never pulled into memory.
            declared_size = getattr(upload, "size", None)
            if isinstance(declared_size, int) and declared_size > max_size_bytes:
                raise ValueError(f"File exceeds {settings.max_file_size_mb}MB limit")

            content = await upload.read()
            # Backstop for uploads whose size was not recorded up front.
            if len(content) > max_size_bytes:
                raise ValueError(f"File exceeds {settings.max_file_size_mb}MB limit")
            # Agent-path uploads (entity_id set, or kind=skill/agent) come
            # from the LibreChat host itself, not end users. Skill bundles
            # legitimately ship arbitrary extensions (.xsd schemas, .toml
            # configs, .lock files, .d.ts type defs, etc.) — extending the
            # user-facing allowlist for every new skill is unsustainable. The
            # sandbox is the actual security boundary; extension filtering
            # exists to stop end-user uploads of executables via /upload, not
            # to second-guess the LibreChat host's skill loader. Skip the
            # extension check for the agent path.
            if not is_agent_file and not settings.is_file_allowed(original_filename):
                raise ValueError(f"File type not allowed: {original_filename}")

            # Preserve subdirectory structure (LibreChat skill bundles ship
            # `skills/<name>/SKILL.md` etc.) while sanitizing each segment.
            stored_filename = OutputProcessor.sanitize_relative_path(original_filename)

            file_id = await file_service.store_uploaded_file(
                session_id=session_id,
                filename=stored_filename,
                content=content,
                content_type=upload.content_type,
                is_agent_file=is_agent_file,
                is_read_only=is_read_only,
                original_filename=original_filename,
            )

            results.append(
                {
                    "status": "success",
                    "fileId": file_id,
                    "filename": stored_filename,
                }
            )
            succeeded += 1
        except Exception as exc:
            # Deliberately broad: one bad entry must not abort the batch; the
            # failure is recorded and the loop moves on to the next file.
            logger.warning(
                "Batch upload entry failed",
                filename=original_filename,
                error=str(exc),
            )
            results.append(
                {
                    "status": "error",
                    "filename": original_filename,
                    "error": str(exc),
                }
            )
            failed += 1

    if failed == 0:
        message = "success"
    elif succeeded == 0:
        message = "error"
    else:
        message = "partial"

    logger.info(
        "Batch upload completed",
        session_id=session_id,
        entity_id=entity_id,
        succeeded=succeeded,
        failed=failed,
    )

    return {
        "message": message,
        "session_id": session_id,
        "storage_session_id": session_id,  # LibreChat alias for session_id
        "files": results,
        "succeeded": succeeded,
        "failed": failed,
    }


@router.get("/files/{session_id}")
async def list_files(
    session_id: str,
    detail: Optional[str] = Query(
        None,
        description="Detail level: 'simple' for basic info, otherwise full details",
    ),
    file_service: FileServiceDep = None,
):
    """List all files in a session with optional detail parameter - LibreChat compatible.

    The ``detail`` query parameter selects the shape of each entry:

    * ``"summary"``: ``name`` (``<session_id>/<file_id>``) and ``lastModified``
      as a UTC timestamp with millisecond precision and a ``Z`` suffix.
    * ``"simple"``: ``id``, sanitized ``name`` and ``path``.
    * anything else: the full LibreChat-style record.

    Returns:
        A list of dicts; an empty list when the session has no files.

    Raises:
        HTTPException: 404 ("Session not found") on any failure while
            listing or formatting.
    """
    try:
        files = await file_service.list_files(session_id)

        if not files:
            # Return empty array instead of 404
            return []

        if detail == "summary":
            # Return minimal summary required by client contract
            summary_files = []
            for file_info in files:
                dt = file_info.created_at
                # Ensure UTC with 'Z' and millisecond precision
                if isinstance(dt, str):
                    try:
                        dt = datetime.fromisoformat(dt)
                    except ValueError:
                        # Unparseable timestamp: fall back to the current time.
                        dt = datetime.now(timezone.utc)
                if dt.tzinfo is None:
                    # Naive values are taken to be UTC already.
                    dt = dt.replace(tzinfo=timezone.utc)
                else:
                    # Convert other offsets so the "Z" suffix below is accurate.
                    dt = dt.astimezone(timezone.utc)
                last_modified = dt.isoformat(timespec="milliseconds").replace(
                    "+00:00", "Z"
                )
                summary_files.append(
                    {
                        "name": f"{session_id}/{file_info.file_id}",
                        "lastModified": last_modified,
                    }
                )
            return summary_files
        elif detail == "simple":
            # Return simple file information
            simple_files = []
            for file_info in files:
                # Return sanitized filename to match container
                sanitized_name = OutputProcessor.sanitize_filename(file_info.filename)
                simple_files.append(
                    {
                        "id": file_info.file_id,
                        "name": sanitized_name,
                        "path": file_info.path,
                    }
                )
            return simple_files
        else:
            # Return full file details - LibreChat format
            detailed_files = []
            for file_info in files:
                created = file_info.created_at
                # created_at may already be a string (the summary branch
                # allows for that); pass it through instead of failing.
                last_modified = (
                    created.isoformat()
                    if hasattr(created, "isoformat")
                    else str(created)
                )
                detailed_files.append(
                    {
                        "name": f"{session_id}/{file_info.file_id}",
                        "id": file_info.file_id,
                        "session_id": session_id,
                        "content": None,  # Not returned in list
                        "size": file_info.size,
                        "lastModified": last_modified,
                        "etag": f'"{file_info.file_id}"',
                        "metadata": {
                            "content-type": file_info.content_type,
                            "original-filename": file_info.original_filename
                            or file_info.filename,
                        },
                        "contentType": file_info.content_type,
                    }
                )
            return detailed_files

    except Exception as e:
        logger.error("Failed to list files", session_id=session_id, error=str(e))
        # Return 404 if session not found
        raise HTTPException(status_code=404, detail="Session not found") from e


@router.get("/sessions/{session_id}/objects/{file_id}")
async def get_session_object_metadata(
    session_id: str,
    file_id: str,
    file_service: FileServiceDep = None,
):
    """Session-liveness probe used by LibreChat's `primeFiles()`.

    LibreChat's `process.js:363` reads `lastModified` only — if the value
    parses to >23h ago (or this endpoint 404s), it treats the session as
    expired and re-uploads the file from its own storage. We return the
    file's `created_at`, normalized to UTC + `Z`, matching the format used
    by `GET /files/{session_id}?detail=summary`.

    Returns:
        ``{"lastModified": "<ISO-8601 UTC timestamp, millisecond precision, Z suffix>"}``.

    Raises:
        HTTPException: 404 when the lookup raises or returns ``None``.
    """
    try:
        file_info = await file_service.get_file_info(session_id, file_id)
    except Exception as e:
        logger.warning(
            "Failed to look up session object metadata",
            session_id=session_id,
            file_id=file_id,
            error=str(e),
        )
        raise HTTPException(status_code=404, detail="File not found") from e

    if file_info is None:
        raise HTTPException(status_code=404, detail="File not found")

    dt = file_info.created_at
    if isinstance(dt, str):
        try:
            dt = datetime.fromisoformat(dt)
        except ValueError:
            # Unparseable timestamp: fall back to the current time.
            dt = datetime.now(timezone.utc)
    if dt.tzinfo is None:
        # Naive values are taken to be UTC already.
        dt = dt.replace(tzinfo=timezone.utc)
    else:
        # Convert other offsets so the "Z" suffix below is accurate.
        dt = dt.astimezone(timezone.utc)

    last_modified = dt.isoformat(timespec="milliseconds").replace("+00:00", "Z")
    return {"lastModified": last_modified}


@router.get("/download/{session_id}/{file_id}")
async def download_file(
    session_id: str, file_id: str, file_service: FileServiceDep = None
):
    """Stream a stored file back to the client - LibreChat compatible.

    The file's metadata and content are fetched from ``file_service`` and the
    content is sent in 8 KB pieces as an attachment. A stored content type of
    ``application/octet-stream`` (or none) is refined from the filename when
    ``mimetypes`` can guess one.

    Raises:
        HTTPException: 404 when the metadata or the content is missing, and
            also for any other failure while preparing the response.
    """
    try:
        # Metadata first: it decides the response headers.
        file_info = await file_service.get_file_info(session_id, file_id)
        if not file_info:
            raise HTTPException(status_code=404, detail="File not found")

        file_content = await file_service.get_file_content(session_id, file_id)
        if file_content is None:
            raise HTTPException(status_code=404, detail="File content not found")

        async def generate_chunks():
            # Walk the in-memory payload in fixed 8 KB steps; the final slice
            # is simply shorter when the length is not a multiple of the step.
            step = 8192
            total = len(file_content)
            for start in range(0, total, step):
                yield file_content[start : start + step]

        # Fall back to a filename-based guess only for the generic type.
        content_type = file_info.content_type or "application/octet-stream"
        if content_type == "application/octet-stream" and file_info.filename:
            import mimetypes

            content_type = mimetypes.guess_type(file_info.filename)[0] or content_type

        content_disposition = _build_content_disposition(
            file_info.filename, file_info.file_id
        )

        response_headers = {
            "Content-Disposition": content_disposition,
            "Cache-Control": "private, max-age=3600",
        }
        # No Content-Length is set, which forces chunked transfer encoding.
        return StreamingResponse(
            generate_chunks(),
            media_type=content_type,
            headers=response_headers,
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(
            "Failed to download file",
            session_id=session_id,
            file_id=file_id,
            error=str(e),
        )
        raise HTTPException(status_code=404, detail="File not found")