lisadunlap
diff --git a/stringsight/api.py b/stringsight/api.py
Lines changed: 50 additions & 10 deletions
diff --git a/stringsight/clusterers/base.py b/stringsight/clusterers/base.py
Lines changed: 15 additions & 1 deletion
diff --git a/stringsight/clusterers/hdbscan.py b/stringsight/clusterers/hdbscan.py
Lines changed: 8 additions & 1 deletion
diff --git a/stringsight/db_models/job.py b/stringsight/db_models/job.py
Lines changed: 1 addition & 1 deletion
diff --git a/stringsight/logging_config.py b/stringsight/logging_config.py
Lines changed: 71 additions & 13 deletions
@@ -17,11 +17,13 @@
 import io
 import os
 import time
+from collections import OrderedDict
 
 import pandas as pd
 from fastapi import FastAPI, UploadFile, File, HTTPException, Body, Query, Depends, BackgroundTasks
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse
+from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
 from pathlib import Path
 
@@ -52,11 +54,14 @@
 from stringsight.utils.paths import _get_persistent_data_dir, _get_results_dir, _get_cache_dir
 
 # -------------------------------------------------------------------------
-# Simple in-memory cache for parsed JSONL data with TTL
+# Bounded LRU cache for parsed JSONL data with TTL
 # -------------------------------------------------------------------------
-_JSONL_CACHE: Dict[str, tuple[List[Dict[str, Any]], datetime]] = {}
+_MAX_CACHE_ENTRIES = 10  # Maximum number of files to cache
+_MAX_CACHE_SIZE_MB = 100  # Maximum total cache size in MB
 _CACHE_TTL = timedelta(minutes=15)  # Cache for 15 minutes
+_JSONL_CACHE: OrderedDict[str, tuple[List[Dict[str, Any]], datetime, int]] = OrderedDict()  # key -> (data, timestamp, size_bytes)
 _CACHE_LOCK = threading.Lock()
+_cache_stats = {"hits": 0, "misses": 0, "evictions": 0, "total_size_mb": 0.0}
 
 def _get_file_hash(path: Path) -> str:
     """Get a hash of file path and modification time for cache key."""
@@ -65,10 +70,15 @@ def _get_file_hash(path: Path) -> str:
     return hashlib.md5(key_str.encode()).hexdigest()
 
 def _get_cached_jsonl(path: Path, nrows: int | None = None) -> List[Dict[str, Any]]:
-    """Read JSONL file with caching. Cache key includes file mtime to auto-invalidate on changes.
+    """Read JSONL file with bounded LRU caching. Cache key includes file mtime to auto-invalidate on changes.
 
     Only caches full file reads (nrows=None) to avoid cache bloat. For partial reads,
     reads directly from disk.
+
+    Cache is bounded by:
+    - Maximum 10 files
+    - Maximum 100MB total size
+    - 15-minute TTL
     """
     # Only cache full file reads to avoid memory bloat
     if nrows is not None:
@@ -79,24 +89,46 @@ def _get_cached_jsonl(path: Path, nrows: int | None = None) -> List[Dict[str, An
 
     with _CACHE_LOCK:
         if cache_key in _JSONL_CACHE:
-            cached_data, cached_time = _JSONL_CACHE[cache_key]
+            cached_data, cached_time, cached_size = _JSONL_CACHE[cache_key]
             # Check if cache is still valid
             if datetime.now() - cached_time < _CACHE_TTL:
+                # Move to end (mark as recently used)
+                _JSONL_CACHE.move_to_end(cache_key)
+                _cache_stats["hits"] += 1
                 logger.debug(f"Cache hit for {path.name}")
                 return cached_data
             else:
                 # Remove expired entry
                 del _JSONL_CACHE[cache_key]
+                _cache_stats["total_size_mb"] -= cached_size / (1024 * 1024)
                 logger.debug(f"Cache expired for {path.name}")
 
     # Cache miss - read from disk
-    logger.debug(f"Cache miss for {path.name}, reading from disk")
+    _cache_stats["misses"] += 1
+    logger.debug(f"Cache miss for {path.name}, reading from disk (hits: {_cache_stats['hits']}, misses: {_cache_stats['misses']})")
     data = _read_jsonl_as_list(path, nrows)
 
     # Store in cache (only if full file read)
     if nrows is None:
+        import sys
+        data_size = sys.getsizeof(data)
+        data_size_mb = data_size / (1024 * 1024)
+
         with _CACHE_LOCK:
-            _JSONL_CACHE[cache_key] = (data, datetime.now())
+            # Evict entries if needed to stay under limits
+            while len(_JSONL_CACHE) >= _MAX_CACHE_ENTRIES or _cache_stats["total_size_mb"] + data_size_mb > _MAX_CACHE_SIZE_MB:
+                if not _JSONL_CACHE:
+                    break
+                # Remove least recently used entry (LRU eviction: cache hits move entries to the end)
+                evicted_key, (_, _, evicted_size) = _JSONL_CACHE.popitem(last=False)
+                _cache_stats["total_size_mb"] -= evicted_size / (1024 * 1024)
+                _cache_stats["evictions"] += 1
+                logger.debug(f"Evicted cache entry (total evictions: {_cache_stats['evictions']})")
+
+            # Add new entry
+            _JSONL_CACHE[cache_key] = (data, datetime.now(), data_size)
+            _cache_stats["total_size_mb"] += data_size_mb
+            logger.debug(f"Cached {path.name} ({data_size_mb:.2f}MB, total cache: {_cache_stats['total_size_mb']:.2f}MB)")
 
     return data
 
@@ -130,7 +162,7 @@ def _resolve_within_base(user_path: str) -> Path:
     target = (base / target).resolve() if not target.is_absolute() else target.resolve()
     try:
         target.relative_to(base)
-    except Exception:
+    except ValueError:
         raise HTTPException(status_code=400, detail="Path is outside the allowed base directory")
     if not target.exists():
         raise HTTPException(status_code=404, detail=f"Path not found: {target}")
@@ -371,6 +403,12 @@ def _startup_init_db() -> None:
 app.include_router(extraction.router)
 app.include_router(clustering.router)
 
+# Mount final_results directory as static files for direct file access
+final_results_path = Path("final_results")
+if final_results_path.exists():
+    app.mount("/final_results", StaticFiles(directory=str(final_results_path)), name="final_results")
+    logger.info(f"Mounted /final_results from {final_results_path.absolute()}")
+
 # NOTE:
 # All of the primary API endpoints are implemented in `stringsight/routers/*` and
 # are registered above via `app.include_router(...)`.
@@ -410,7 +448,7 @@ def _run_cluster_job(job: ClusterJob, req: ClusterRunRequest):
     try:
         asyncio.run(_run_cluster_job_async(job, req))
     except Exception as e:
-        logger.error(f"Error in background cluster job: {e}")
+        logger.error(f"Error in background cluster job {job.id}: {e}", exc_info=True)
         with _CLUSTER_JOBS_LOCK:
             job.state = "error"
             job.error = str(e)
@@ -449,11 +487,13 @@ async def _run_cluster_job_async(job: ClusterJob, req: ClusterRunRequest):
             if hasattr(_llm_utils, "_default_llm_utils"):
                 _llm_utils._default_llm_utils = None  # type: ignore
         except Exception:
+            # Intentionally silent - cache clearing is best-effort
             pass
         try:
             if hasattr(_cu, "_cache"):
                 _cu._cache = None  # type: ignore
         except Exception:
+            # Intentionally silent - cache clearing is best-effort
             pass
 
         # Preprocess operationalRows to handle score_columns conversion
@@ -545,8 +585,8 @@ async def _run_cluster_job_async(job: ClusterJob, req: ClusterRunRequest):
                     meta=p.get("meta", {})
                 )
                 properties.append(prop)
-            except Exception as e:
-                logger.warning(f"Skipping invalid property: {e}")
+            except (KeyError, TypeError, ValueError) as e:
+                logger.warning(f"Skipping invalid property (missing/invalid fields): {e}")
                 continue
 
         if not properties:
 
@@ -291,12 +291,26 @@ def _build_clusters_from_df(self, df: pd.DataFrame, column_name: str) -> List[Cl
 
         Group rows by cluster id, extract labels and collect
         `question_id`, `{column_name}` and `id` values for each cluster.
+
+        Filters out rows whose `{column_name}` value is NaN or blank
+        (empty or whitespace-only) so only valid properties are included in clusters.
         """
         label_col = f"{column_name}_cluster_label"
         id_col = f"{column_name}_cluster_id"
 
+        # Filter out invalid properties (empty or NaN property descriptions)
+        if column_name in df.columns:
+            valid_mask = df[column_name].notna() & (df[column_name].astype(str).str.strip() != "")
+            df_filtered = df[valid_mask].copy()
+
+            invalid_count = len(df) - len(df_filtered)
+            if invalid_count > 0:
+                self.log(f"Filtered out {invalid_count} properties with empty descriptions from clustering")
+        else:
+            df_filtered = df
+
         clusters: List[Cluster] = []
-        for cid, group in df.groupby(id_col):
+        for cid, group in df_filtered.groupby(id_col):
             cid_group = group[group[id_col] == cid]
             label = str(cid_group[label_col].iloc[0])
 
 
@@ -131,8 +131,14 @@ async def cluster(self, data: PropertyDataset, column_name: str, progress_callba
         # `to_dataframe(type="properties")` may include conversation rows without any extracted
         # properties (i.e., missing/NaN `column_name`). Those rows cannot be clustered and can
         # also coerce cluster id dtypes to float downstream, which breaks group metadata mapping.
+        # Also filter out empty strings to ensure only valid properties are clustered.
         if column_name in df.columns:
+            initial_count = len(df)
             df = df[df[column_name].notna()].copy()
+            df = df[df[column_name].astype(str).str.strip() != ""].copy()
+            filtered_count = initial_count - len(df)
+            if filtered_count > 0:
+                self.log(f"Filtered out {filtered_count} properties with empty or missing descriptions before clustering")
 
         if getattr(self, "verbose", False):
             logger.debug(f"DataFrame shape after to_dataframe: {df.shape}")
@@ -251,7 +257,8 @@ async def _cluster_group_async(group_info):
                     if progress_callback:
                         try:
                             progress_callback((i + 1) / total_groups)
-                        except Exception:
+                        except Exception as e:
+                            logger.debug(f"Progress callback failed: {e}")
                             pass
                 clustered_df = pd.concat(clustered_parts, ignore_index=True)
         else:
 
@@ -30,7 +30,7 @@ class Job(Base):  # type: ignore[misc, valid-type]
     error_message = Column(Text, nullable=True)
 
     # Timestamps
-    created_at = Column(DateTime(timezone=True), server_default=func.now())
+    created_at = Column(DateTime(timezone=True), server_default=func.now(), index=True)
     updated_at = Column(DateTime(timezone=True), onupdate=func.now())
 
     # Relationships
 
@@ -5,27 +5,39 @@
 via environment variables:
 - STRINGSIGHT_LOG_LEVEL: Set logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
 - STRINGSIGHT_LOG_FORMAT: Custom log format (optional)
+- STRINGSIGHT_JSON_LOGS: Enable JSON structured logging ("true", "1" or "yes", case-insensitive; default: false)
+
+Default format includes timestamp, level, module name, and message:
+    [2024-01-15 10:30:45] INFO [module.name] Message here
 
 Usage:
     from stringsight.logging_config import get_logger
-    
+
     logger = get_logger(__name__)
     logger.debug("Debug message")
     logger.info("Info message")
     logger.warning("Warning message")
-    logger.error("Error message")
+    logger.error("Error message", exc_info=True)  # Include stack trace
+
+    # Add extra context fields
+    logger.error("Job failed", exc_info=True, extra={"job_id": job_id})
 """
 
 import logging
 import os
 import sys
-from typing import Optional
+import json
+from typing import Optional, Any, Dict
 
 
-# Default log format - simple format without timestamp/level for cleaner output
-DEFAULT_LOG_FORMAT = "%(message)s"
+# Default log format - includes timestamp, level, module name, and message
+# Format: [2024-01-15 10:30:45] INFO [module.name] Message here
+DEFAULT_LOG_FORMAT = "[%(asctime)s] %(levelname)s [%(name)s] %(message)s"
 DEFAULT_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
 
+# Message-only format (the previous default); set STRINGSIGHT_LOG_FORMAT to this value to restore it
+SIMPLE_LOG_FORMAT = "%(message)s"
+
 
 def _get_log_level() -> int:
     """Get the logging level from environment variable or default to INFO."""
@@ -38,6 +50,38 @@ def _get_log_format() -> str:
     return os.environ.get("STRINGSIGHT_LOG_FORMAT", DEFAULT_LOG_FORMAT)
 
 
+def _use_json_logs() -> bool:
+    """Report whether STRINGSIGHT_JSON_LOGS opts in to JSON log output.
+
+    The variable is compared case-insensitively: "true", "1" and "yes"
+    enable JSON logging; an unset variable or any other value disables it.
+    """
+    truthy_values = ("true", "1", "yes")
+    raw_setting = os.environ.get("STRINGSIGHT_JSON_LOGS", "false")
+    return raw_setting.lower() in truthy_values
+
+
+class JSONFormatter(logging.Formatter):
+    """Render each log record as a single-line JSON object.
+
+    The object always carries ``timestamp``, ``level``, ``logger`` and
+    ``message``. ``exception`` and ``stack`` are added when the record has
+    ``exc_info`` / ``stack_info``, and the ``job_id`` / ``stage`` context
+    fields are copied over when the caller supplied them via ``extra=``.
+    """
+
+    # Context attributes copied from the record when passed via ``extra=``.
+    _EXTRA_FIELDS = ("job_id", "stage")
+
+    def format(self, record: logging.LogRecord) -> str:
+        """Return *record* serialized as a JSON string.
+
+        Values that ``json`` cannot encode natively (for example a UUID
+        ``job_id``) are converted with ``str`` instead of raising
+        ``TypeError``, so the log line is still emitted.
+        """
+        log_data: Dict[str, Any] = {
+            "timestamp": self.formatTime(record, self.datefmt),
+            "level": record.levelname,
+            "logger": record.name,
+            "message": record.getMessage(),
+        }
+
+        if record.exc_info:
+            log_data["exception"] = self.formatException(record.exc_info)
+        # Mirror the stock Formatter, which also emits stack_info when set.
+        if record.stack_info:
+            log_data["stack"] = self.formatStack(record.stack_info)
+
+        for field in self._EXTRA_FIELDS:
+            if hasattr(record, field):
+                log_data[field] = getattr(record, field)
+
+        # default=str: a formatter should not fail on an unexpected value type.
+        return json.dumps(log_data, default=str)
+
+
+
 def configure_logging(
     level: Optional[int] = None,
     format_string: Optional[str] = None,
@@ -59,15 +103,29 @@ def configure_logging(
 
     if date_format is None:
         date_format = DEFAULT_DATE_FORMAT
-    
+
+    # Check if JSON logging is enabled
+    use_json = _use_json_logs()
+
     # Configure the root logger
-    logging.basicConfig(
-        level=level,
-        format=format_string,
-        datefmt=date_format,
-        stream=sys.stdout,
-        force=True  # Override any existing configuration
-    )
+    if use_json:
+        # Use JSON formatter for structured logging
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setFormatter(JSONFormatter(datefmt=date_format))
+        logging.basicConfig(
+            level=level,
+            handlers=[handler],
+            force=True
+        )
+    else:
+        # Use standard text formatter
+        logging.basicConfig(
+            level=level,
+            format=format_string,
+            datefmt=date_format,
+            stream=sys.stdout,
+            force=True  # Override any existing configuration
+        )
 
     # Suppress noisy third-party library logs
     logging.getLogger("LiteLLM").setLevel(logging.WARNING)