Skip to content

Commit a5f4c4c

Browse files
committed
feat: Add comprehensive evaluation metrics system
- Create evaluation module (utils/evaluation.py) with:
  * QueryEvaluator class for running evaluations
  * Metrics: relevance, accuracy, completeness, coherence
  * RAG metrics: context relevance, answer faithfulness, answer relevancy
  * Performance metrics: latency (avg, p50, p95, p99), tokens, cost
  * System metrics: cache hit rate, error rate
- Add evaluation API endpoints:
  * POST /evaluation/run - Run evaluation on queries
  * GET /evaluation/sample-dataset - Get sample evaluation dataset
- Create EvaluationDashboard React component with:
  * Configuration panel (sample dataset or custom queries)
  * Summary cards with key metrics
  * Detailed summary sections (Performance, Quality, RAG)
  * Individual query results with all metrics
  * Color-coded success/error states
- Add Evaluation tab to main navigation
- Support for expected answers for accuracy measurement
- Batch evaluation support
1 parent 2b67417 commit a5f4c4c

5 files changed

Lines changed: 1213 additions & 3 deletions

File tree

api.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,12 @@
5555
sanitize_error_message,
5656
verify_token,
5757
)
58+
from utils.evaluation import (
59+
QueryEvaluator,
60+
create_sample_evaluation_dataset,
61+
EvaluationSummary,
62+
QueryEvaluationResult,
63+
)
5864

5965
# Load environment variables
6066
load_dotenv()
@@ -457,6 +463,123 @@ async def get_recent_calls_endpoint(
457463
)
458464

459465

466+
# =============================================================================
467+
# EVALUATION ENDPOINTS
468+
# =============================================================================
469+
470+
471+
class EvaluationRequest(BaseModel):
    """Request model for evaluation"""

    # Query dicts to evaluate. Required (no default) on every request.
    # NOTE(review): the field is mandatory even when `use_sample_dataset` is
    # true, although the /evaluation/run handler ignores it in that case --
    # confirm whether clients are expected to send an empty list.
    queries: List[Dict[str, Any]] = Field(
        default=...,
        description="List of queries to evaluate. Each query should have 'query' and optionally 'expected_answer'",
    )

    # Forwarded to the evaluator as its `use_llm` flag; on by default.
    use_llm: bool = Field(
        default=True,
        description="Whether to use LLM for answer generation",
    )

    # When set, the handler evaluates the built-in sample dataset instead of
    # `queries`; off by default.
    use_sample_dataset: bool = Field(
        default=False,
        description="If true, use built-in sample dataset instead of provided queries",
    )
482+
483+
484+
# Attribute name -> number of decimal places used when serializing the
# aggregate summary. ``None`` means the value is copied through unrounded.
# Insertion order is the key order of the "summary" object in the response.
_EVALUATION_SUMMARY_FIELDS: Dict[str, Optional[int]] = {
    "total_queries": None,
    "successful_queries": None,
    "failed_queries": None,
    "avg_latency_ms": 2,
    "p50_latency_ms": 2,
    "p95_latency_ms": 2,
    "p99_latency_ms": 2,
    "total_tokens": None,
    "total_cost_usd": 4,
    "avg_relevance": 3,
    "avg_accuracy": 3,
    "avg_completeness": 3,
    "avg_coherence": 3,
    "avg_context_relevance": 3,
    "avg_answer_faithfulness": 3,
    "avg_answer_relevancy": 3,
    "cache_hit_rate": 3,
    "error_rate": 3,
}

# Same convention as above, for each per-query entry of the "results" list.
_EVALUATION_RESULT_FIELDS: Dict[str, Optional[int]] = {
    "query": None,
    "expected_answer": None,
    "actual_answer": None,
    "latency_ms": 2,
    "tokens_used": None,
    "cost_usd": 4,
    "relevance_score": 3,
    "accuracy_score": 3,
    "completeness_score": 3,
    "coherence_score": 3,
    "context_relevance": 3,
    "answer_faithfulness": 3,
    "answer_relevancy": 3,
    "cache_hit": None,
    "success": None,
    "error": None,
    "timestamp": None,
}


def _serialize_evaluation_fields(
    source: Any,
    precision_by_field: Dict[str, Optional[int]],
) -> Dict[str, Any]:
    """Copy the listed attributes of ``source`` into a dict, rounding where a precision is given."""
    payload: Dict[str, Any] = {}
    for field_name, digits in precision_by_field.items():
        value = getattr(source, field_name)
        payload[field_name] = value if digits is None else round(value, digits)
    return payload


@app.post("/evaluation/run", tags=["Evaluation"])
async def run_evaluation(
    request: EvaluationRequest,
    user: Optional[Dict] = Depends(optional_auth),
):
    """
    Run evaluation metrics on a set of queries

    Evaluates:
    - Query performance (latency, throughput)
    - Response quality (relevance, accuracy, completeness, coherence)
    - RAG metrics (context relevance, answer faithfulness, answer relevancy)
    - System metrics (cache hit rate, error rate)

    Returns a JSON object with a "summary" of aggregate metrics and a
    "results" list holding one entry per evaluated query.

    Errors:
    - 503 if the RAG instance is not initialized
    - 400 if no queries are supplied and the sample dataset is not requested
    - 500 if the evaluation itself fails
    """
    if not rag_instance:
        raise HTTPException(status_code=503, detail="RAG instance not initialized")

    # Reject an empty run before doing any work. This check sits outside the
    # try block so the 400 is not caught below and re-wrapped as a 500.
    if not request.use_sample_dataset and not request.queries:
        raise HTTPException(
            status_code=400,
            detail="No queries provided: supply 'queries' or set 'use_sample_dataset' to true",
        )

    try:
        openai_api_key = os.getenv("OPENAI_API_KEY")
        evaluator = QueryEvaluator(rag_instance=rag_instance, openai_api_key=openai_api_key)

        # The built-in sample dataset takes precedence over any provided queries.
        queries = (
            create_sample_evaluation_dataset()
            if request.use_sample_dataset
            else request.queries
        )

        # NOTE(review): evaluate_batch is called synchronously inside this
        # coroutine, so the event loop is occupied until the whole batch has
        # finished. Consider off-loading it to a worker thread once the
        # evaluator / RAG instance are confirmed to be thread-safe.
        summary = evaluator.evaluate_batch(queries, use_llm=request.use_llm)

        # Convert to plain dicts for JSON serialization.
        result = {
            "summary": _serialize_evaluation_fields(summary, _EVALUATION_SUMMARY_FIELDS),
            "results": [
                _serialize_evaluation_fields(r, _EVALUATION_RESULT_FIELDS)
                for r in summary.results
            ],
        }

        logger.info(
            "evaluation_completed",
            total_queries=summary.total_queries,
            success_rate=1 - summary.error_rate,
            avg_latency_ms=summary.avg_latency_ms
        )

        return result

    except Exception as e:
        logger.error("evaluation_failed", error=str(e), exc_info=True)
        # Chain the original exception so the root cause stays attached to the
        # HTTP error in tracebacks; the client only sees the sanitized message.
        raise HTTPException(
            status_code=500,
            detail=f"Failed to run evaluation: {sanitize_error_message(str(e))}"
        ) from e
575+
576+
577+
@app.get("/evaluation/sample-dataset", tags=["Evaluation"])
async def get_sample_dataset():
    """Get the sample evaluation dataset"""
    # Wrap the built-in dataset under a "queries" key, the same field name the
    # /evaluation/run request model uses for its query list.
    sample_queries = create_sample_evaluation_dataset()
    return {"queries": sample_queries}
581+
582+
460583
@app.get("/admin/status", tags=["Admin"])
461584
async def get_system_status():
462585
"""

frontend/src/App.tsx

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,23 @@ import DocumentationPage from './components/DocumentationPage';
88
import { AuraDBAnalyticsDashboard } from './components/AuraDBAnalyticsDashboard';
99
import { ThemeExtractionView } from './components/ThemeExtractionView';
1010
import { AnalyticsDashboard } from './components/AnalyticsDashboard';
11+
import { EvaluationDashboard } from './components/EvaluationDashboard';
1112

12-
type TabKey = 'home' | 'query' | 'semantic' | 'pipeline' | 'auradb' | 'themes' | 'analytics' | 'docs';
13+
type TabKey = 'home' | 'query' | 'semantic' | 'pipeline' | 'auradb' | 'themes' | 'analytics' | 'evaluation' | 'docs';
1314

1415
export default function App() {
1516
const [activeTab, setActiveTab] = useState<TabKey>('home');
1617

1718
// Handle hash routing
1819
useEffect(() => {
1920
const hash = window.location.hash.slice(1);
20-
if (hash && ['home', 'query', 'semantic', 'pipeline', 'auradb', 'themes', 'analytics', 'docs'].includes(hash)) {
21+
if (hash && ['home', 'query', 'semantic', 'pipeline', 'auradb', 'themes', 'analytics', 'evaluation', 'docs'].includes(hash)) {
2122
setActiveTab(hash as TabKey);
2223
}
2324

2425
const handleHashChange = () => {
2526
const newHash = window.location.hash.slice(1);
26-
if (newHash && ['home', 'query', 'semantic', 'pipeline', 'auradb', 'themes', 'analytics', 'docs'].includes(newHash)) {
27+
if (newHash && ['home', 'query', 'semantic', 'pipeline', 'auradb', 'themes', 'analytics', 'evaluation', 'docs'].includes(newHash)) {
2728
setActiveTab(newHash as TabKey);
2829
}
2930
};
@@ -186,6 +187,7 @@ export default function App() {
186187
activeTab === 'auradb' ? styles.mainFull :
187188
activeTab === 'themes' ? styles.mainFull :
188189
activeTab === 'analytics' ? styles.mainFull :
190+
activeTab === 'evaluation' ? styles.mainFull :
189191
styles.main
190192
}>
191193
{activeTab === 'home' && <LandingPage />}
@@ -195,6 +197,7 @@ export default function App() {
195197
{activeTab === 'auradb' && <AuraDBAnalyticsDashboard />}
196198
{activeTab === 'themes' && <ThemeExtractionView />}
197199
{activeTab === 'analytics' && <AnalyticsDashboard />}
200+
{activeTab === 'evaluation' && <EvaluationDashboard />}
198201
{activeTab === 'docs' && <DocumentationPage />}
199202
</main>
200203
</div>

0 commit comments

Comments
 (0)