1717import io
1818import os
1919import time
20+ from collections import OrderedDict
2021
2122import pandas as pd
2223from fastapi import FastAPI , UploadFile , File , HTTPException , Body , Query , Depends , BackgroundTasks
2324from fastapi .middleware .cors import CORSMiddleware
2425from fastapi .responses import StreamingResponse
26+ from fastapi .staticfiles import StaticFiles
2527from pydantic import BaseModel
2628from pathlib import Path
2729
5254from stringsight .utils .paths import _get_persistent_data_dir , _get_results_dir , _get_cache_dir
5355
5456# -------------------------------------------------------------------------
55- # Simple in-memory cache for parsed JSONL data with TTL
57+ # Bounded LRU cache for parsed JSONL data with TTL
5658# -------------------------------------------------------------------------
57- _JSONL_CACHE : Dict [str , tuple [List [Dict [str , Any ]], datetime ]] = {}
59+ _MAX_CACHE_ENTRIES = 10 # Maximum number of files to cache
60+ _MAX_CACHE_SIZE_MB = 100 # Maximum total cache size in MB
5861_CACHE_TTL = timedelta (minutes = 15 ) # Cache for 15 minutes
62+ _JSONL_CACHE : OrderedDict [str , tuple [List [Dict [str , Any ]], datetime , int ]] = OrderedDict () # key -> (data, timestamp, size_bytes)
5963_CACHE_LOCK = threading .Lock ()
64+ _cache_stats = {"hits" : 0 , "misses" : 0 , "evictions" : 0 , "total_size_mb" : 0.0 }
6065
6166def _get_file_hash (path : Path ) -> str :
6267 """Get a hash of file path and modification time for cache key."""
@@ -65,10 +70,15 @@ def _get_file_hash(path: Path) -> str:
6570 return hashlib .md5 (key_str .encode ()).hexdigest ()
6671
6772def _get_cached_jsonl (path : Path , nrows : int | None = None ) -> List [Dict [str , Any ]]:
68- """Read JSONL file with caching. Cache key includes file mtime to auto-invalidate on changes.
73+ """Read JSONL file with bounded LRU caching. Cache key includes file mtime to auto-invalidate on changes.
6974
7075 Only caches full file reads (nrows=None) to avoid cache bloat. For partial reads,
7176 reads directly from disk.
77+
78+ Cache is bounded by:
79+ - Maximum 10 files
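80+ - Maximum 100MB total size (as measured by sys.getsizeof on the parsed list, which is shallow and does not count the row dicts)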
81+ - 15-minute TTL
7282 """
7383 # Only cache full file reads to avoid memory bloat
7484 if nrows is not None :
@@ -79,24 +89,46 @@ def _get_cached_jsonl(path: Path, nrows: int | None = None) -> List[Dict[str, An
7989
8090 with _CACHE_LOCK :
8191 if cache_key in _JSONL_CACHE :
82- cached_data , cached_time = _JSONL_CACHE [cache_key ]
92+ cached_data , cached_time , cached_size = _JSONL_CACHE [cache_key ]
8393 # Check if cache is still valid
8494 if datetime .now () - cached_time < _CACHE_TTL :
95+ # Move to end (mark as recently used)
96+ _JSONL_CACHE .move_to_end (cache_key )
97+ _cache_stats ["hits" ] += 1
8598 logger .debug (f"Cache hit for { path .name } " )
8699 return cached_data
87100 else :
88101 # Remove expired entry
89102 del _JSONL_CACHE [cache_key ]
103+ _cache_stats ["total_size_mb" ] -= cached_size / (1024 * 1024 )
90104 logger .debug (f"Cache expired for { path .name } " )
91105
92106 # Cache miss - read from disk
93- logger .debug (f"Cache miss for { path .name } , reading from disk" )
107+ _cache_stats ["misses" ] += 1
108+ logger .debug (f"Cache miss for { path .name } , reading from disk (hits: { _cache_stats ['hits' ]} , misses: { _cache_stats ['misses' ]} )" )
94109 data = _read_jsonl_as_list (path , nrows )
95110
96111 # Store in cache (only if full file read)
97112 if nrows is None :
113+ import sys
114+ data_size = sys .getsizeof (data )
115+ data_size_mb = data_size / (1024 * 1024 )
116+
98117 with _CACHE_LOCK :
99- _JSONL_CACHE [cache_key ] = (data , datetime .now ())
118+ # Evict entries if needed to stay under limits
119+ while len (_JSONL_CACHE ) >= _MAX_CACHE_ENTRIES or _cache_stats ["total_size_mb" ] + data_size_mb > _MAX_CACHE_SIZE_MB :
120+ if not _JSONL_CACHE :
121+ break
122+ # Remove least recently used entry (LRU eviction)
123+ evicted_key , (_ , _ , evicted_size ) = _JSONL_CACHE .popitem (last = False )
124+ _cache_stats ["total_size_mb" ] -= evicted_size / (1024 * 1024 )
125+ _cache_stats ["evictions" ] += 1
126+ logger .debug (f"Evicted cache entry (total evictions: { _cache_stats ['evictions' ]} )" )
127+
128+ # Add new entry
129+ _JSONL_CACHE [cache_key ] = (data , datetime .now (), data_size )
130+ _cache_stats ["total_size_mb" ] += data_size_mb
131+ logger .debug (f"Cached { path .name } ({ data_size_mb :.2f} MB, total cache: { _cache_stats ['total_size_mb' ]:.2f} MB)" )
100132
101133 return data
102134
@@ -130,7 +162,7 @@ def _resolve_within_base(user_path: str) -> Path:
130162 target = (base / target ).resolve () if not target .is_absolute () else target .resolve ()
131163 try :
132164 target .relative_to (base )
133- except Exception :
165+ except ValueError :
134166 raise HTTPException (status_code = 400 , detail = "Path is outside the allowed base directory" )
135167 if not target .exists ():
136168 raise HTTPException (status_code = 404 , detail = f"Path not found: { target } " )
@@ -371,6 +403,12 @@ def _startup_init_db() -> None:
371403app .include_router (extraction .router )
372404app .include_router (clustering .router )
373405
406+ # Mount final_results directory as static files for direct file access
407+ final_results_path = Path ("final_results" )
408+ if final_results_path .exists ():
409+ app .mount ("/final_results" , StaticFiles (directory = str (final_results_path )), name = "final_results" )
410+ logger .info (f"Mounted /final_results from { final_results_path .absolute ()} " )
411+
374412# NOTE:
375413# All of the primary API endpoints are implemented in `stringsight/routers/*` and
376414# are registered above via `app.include_router(...)`.
@@ -410,7 +448,7 @@ def _run_cluster_job(job: ClusterJob, req: ClusterRunRequest):
410448 try :
411449 asyncio .run (_run_cluster_job_async (job , req ))
412450 except Exception as e :
413- logger .error (f"Error in background cluster job: { e } " )
451+ logger .error (f"Error in background cluster job { job . id } : { e } " , exc_info = True )
414452 with _CLUSTER_JOBS_LOCK :
415453 job .state = "error"
416454 job .error = str (e )
@@ -449,11 +487,13 @@ async def _run_cluster_job_async(job: ClusterJob, req: ClusterRunRequest):
449487 if hasattr (_llm_utils , "_default_llm_utils" ):
450488 _llm_utils ._default_llm_utils = None # type: ignore
451489 except Exception :
490+ # Intentionally silent - cache clearing is best-effort
452491 pass
453492 try :
454493 if hasattr (_cu , "_cache" ):
455494 _cu ._cache = None # type: ignore
456495 except Exception :
496+ # Intentionally silent - cache clearing is best-effort
457497 pass
458498
459499 # Preprocess operationalRows to handle score_columns conversion
@@ -545,8 +585,8 @@ async def _run_cluster_job_async(job: ClusterJob, req: ClusterRunRequest):
545585 meta = p .get ("meta" , {})
546586 )
547587 properties .append (prop )
548- except Exception as e :
549- logger .warning (f"Skipping invalid property: { e } " )
588+ except ( KeyError , TypeError , ValueError ) as e :
589+ logger .warning (f"Skipping invalid property (missing/invalid fields) : { e } " )
550590 continue
551591
552592 if not properties :
0 commit comments