Commit 85dcd36

backend file loading caching
1 parent 785b369 commit 85dcd36

3 files changed: 72 additions & 32 deletions


stringsight/api.py

Lines changed: 4 additions & 4 deletions
@@ -331,10 +331,10 @@ def _startup_init_db() -> None:
     # This sets up environment variables for cache and results directories
     _get_cache_dir() # Call this to auto-configure cache if RENDER_DISK_PATH is set

-# GZIP compression disabled - can add significant CPU overhead
-# Uncomment below if network transfer is the bottleneck:
-# from fastapi.middleware.gzip import GZipMiddleware
-# app.add_middleware(GZipMiddleware, minimum_size=10000, compresslevel=1)
+# GZIP compression enabled for improved network performance
+# Uses moderate compression level (5) to balance CPU and transfer speed
+from fastapi.middleware.gzip import GZipMiddleware
+app.add_middleware(GZipMiddleware, minimum_size=1000, compresslevel=5)

 # CORS configuration - allow all origins for development and production
 app.add_middleware(
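For API consumers, the visible effect of this change: any response body of at least minimum_size bytes (1000 here) is now compressed whenever the client advertises gzip support. A minimal client-side check, as a sketch only — the localhost URL and dataset name are placeholders, and the route prefix may differ depending on how the routers are mounted:

import requests

# Placeholder URL: adjust host, port, and dataset name for your deployment.
url = "http://localhost:8000/results/my_dataset/summary"

# requests sends "Accept-Encoding: gzip, deflate" by default and transparently
# decompresses the body; the headers still report what the server sent.
resp = requests.get(url)
print(resp.headers.get("Content-Encoding"))  # "gzip" once the body exceeds 1000 bytes
print(len(resp.content))                     # size after decompression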

stringsight/prompt_generation.py

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ def generate_prompts(
     logger.info(f"Using task description: {task_desc_for_generation[:100]}...")

     try:
-        logger.info(f"Attempting dynamic prompt generation with {len(dataset.conversations)} conversations, {dynamic_prompt_samples} samples")
+        logger.info(f"Attempting dynamic prompt generation with {len(dataset.conversations)} total conversations, sampling {min(dynamic_prompt_samples, len(dataset.conversations))} for prompt generation")
         generator = DynamicPromptGenerator(seed=seed)
         result = generator.generate_all_prompts(
             task_description=task_desc_for_generation,

stringsight/routers/validation.py

Lines changed: 67 additions & 27 deletions
@@ -506,6 +506,7 @@ def get_conversations(dataset: str, offset: int = 0, limit: int | None = None) -
     """Get conversations with pagination.

     Returns only the requested slice, not entire file.
+    Uses caching for improved performance.

     Args:
         dataset: Dataset name (folder under final_results/)

@@ -525,15 +526,12 @@ def get_conversations(dataset: str, offset: int = 0, limit: int | None = None) -
     if not conversations_file.exists():
         raise HTTPException(status_code=404, detail=f"Dataset not found: {dataset}")

-    conversations = []
-    import json
-    with open(conversations_file, encoding='utf-8', errors='replace') as f:
-        for i, line in enumerate(f):
-            if i < offset:
-                continue
-            if limit is not None and i >= offset + limit:
-                break
-            conversations.append(json.loads(line))
+    # Use cached read for entire file, then slice in memory
+    all_conversations = _get_cached_jsonl(conversations_file)
+
+    # Apply pagination to cached data
+    end_idx = offset + limit if limit is not None else len(all_conversations)
+    conversations = all_conversations[offset:end_idx]

     return {
         "data": conversations,
@@ -547,6 +545,8 @@ def get_conversations(dataset: str, offset: int = 0, limit: int | None = None) -
 def get_properties(dataset: str) -> Dict[str, Any]:
     """Get properties (usually smaller, can load all at once).

+    Uses caching for improved performance.
+
     Args:
         dataset: Dataset name (folder under final_results/)

@@ -562,11 +562,8 @@ def get_properties(dataset: str) -> Dict[str, Any]:
     if not properties_file.exists():
        return {"data": []}

-    properties = []
-    import json
-    with open(properties_file) as f:
-        for line in f:
-            properties.append(json.loads(line))
+    # Use cached read instead of manual parsing
+    properties = _get_cached_jsonl(properties_file)

     return {"data": properties}

@@ -575,6 +572,8 @@ def get_properties(dataset: str) -> Dict[str, Any]:
 def get_clusters(dataset: str) -> Dict[str, Any]:
     """Get clusters.

+    Uses caching for improved performance.
+
     Args:
         dataset: Dataset name (folder under final_results/)

@@ -590,15 +589,12 @@ def get_clusters(dataset: str) -> Dict[str, Any]:
     if not clusters_file.exists():
         return {"data": []}

-    import json
+    # Use cached read for both JSONL and JSON files
     if clusters_file.suffix == ".jsonl":
-        clusters = []
-        with open(clusters_file) as f:
-            for line in f:
-                clusters.append(json.loads(line))
+        clusters = _get_cached_jsonl(clusters_file)
     else:
-        with open(clusters_file) as f:
-            clusters = json.load(f)
+        # For JSON files, use the cached JSON reader
+        clusters = _read_json_safe(clusters_file)

     return {"data": clusters}
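_read_json_safe is likewise referenced without being defined in this commit. Under the same assumption of an mtime-keyed cache (all names here are hypothetical), a plain-JSON counterpart might look like:

import json
from functools import lru_cache
from pathlib import Path
from typing import Any

@lru_cache(maxsize=32)
def _read_json_by_mtime(path_str: str, mtime: float) -> Any:
    # errors="replace" keeps a stray invalid byte from failing the whole
    # request, which is presumably what "safe" in the name refers to.
    with open(path_str, encoding="utf-8", errors="replace") as f:
        return json.load(f)

def _read_json_safe(path: Path) -> Any:
    """Cached json.load keyed on (path, mtime)."""
    return _read_json_by_mtime(str(path), path.stat().st_mtime)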

@@ -607,6 +603,8 @@ def get_clusters(dataset: str) -> Dict[str, Any]:
 def get_metrics(dataset: str) -> Dict[str, Any]:
     """Get all metrics files.

+    Uses caching for improved performance.
+
     Args:
         dataset: Dataset name (folder under final_results/)

@@ -616,19 +614,61 @@ def get_metrics(dataset: str) -> Dict[str, Any]:
     final_results_dir = Path("final_results") / dataset
     metrics = {}

-    import json
     for metric_type in ["model_cluster_scores_df", "cluster_scores_df", "model_scores_df"]:
         metric_file = final_results_dir / f"{metric_type}.jsonl"
         if metric_file.exists():
-            data = []
-            with open(metric_file) as f:
-                for line in f:
-                    data.append(json.loads(line))
-            metrics[metric_type] = data
+            # Use cached read for metrics files
+            metrics[metric_type] = _get_cached_jsonl(metric_file)

     return metrics


+@router.get("/results/{dataset}/all")
+def get_all_dataset_data(dataset: str, conversations_limit: int = 1000) -> Dict[str, Any]:
+    """Get all dataset data in a single request (combined endpoint).
+
+    This is a performance optimization that reduces network round trips by combining
+    conversations, properties, clusters, and metrics into a single response.
+    Uses caching for all data sources.
+
+    Args:
+        dataset: Dataset name (folder under final_results/)
+        conversations_limit: Maximum number of conversations to return (default: 1000)
+
+    Returns:
+        Dict with conversations, properties, clusters, metrics, and metadata
+    """
+    # Use the cached individual endpoint functions
+    conversations_result = get_conversations(dataset, offset=0, limit=conversations_limit)
+    properties_result = get_properties(dataset)
+    clusters_result = get_clusters(dataset)
+    metrics_result = get_metrics(dataset)
+
+    # Calculate total conversations for metadata
+    final_results_dir = Path("final_results") / dataset
+    conversations_file = final_results_dir / "clustered_results_lightweight.jsonl"
+    if not conversations_file.exists():
+        conversations_file = final_results_dir / "conversations.jsonl"
+
+    total_conversations = 0
+    if conversations_file.exists():
+        with open(conversations_file, encoding='utf-8', errors='replace') as f:
+            total_conversations = sum(1 for _ in f)
+
+    return {
+        "conversations": conversations_result["data"],
+        "properties": properties_result["data"],
+        "clusters": clusters_result["data"],
+        "metrics": {
+            "model_cluster_scores_df": metrics_result.get("model_cluster_scores_df", []),
+            "cluster_scores_df": metrics_result.get("cluster_scores_df", []),
+            "model_scores_df": metrics_result.get("model_scores_df", [])
+        },
+        "total_conversations": total_conversations,
+        "has_more": conversations_result["has_more"]
+    }
+
+
 @router.get("/results/{dataset}/summary")
 def get_dataset_summary(dataset: str) -> Dict[str, Any]:
     """Get dataset summary (fast - just metadata, no full data).
