|
55 | 55 | sanitize_error_message, |
56 | 56 | verify_token, |
57 | 57 | ) |
| 58 | +from utils.evaluation import ( |
| 59 | + QueryEvaluator, |
| 60 | + create_sample_evaluation_dataset, |
| 61 | + EvaluationSummary, |
| 62 | + QueryEvaluationResult, |
| 63 | +) |
58 | 64 |
|
59 | 65 | # Load environment variables |
60 | 66 | load_dotenv() |
@@ -457,6 +463,123 @@ async def get_recent_calls_endpoint( |
457 | 463 | ) |
458 | 464 |
|
459 | 465 |
|
| 466 | +# ============================================================================= |
| 467 | +# EVALUATION ENDPOINTS |
| 468 | +# ============================================================================= |
| 469 | + |
| 470 | + |
class EvaluationRequest(BaseModel):
    """Request body for the evaluation run endpoint.

    Either supply ``queries`` explicitly or set ``use_sample_dataset`` to
    true; in the latter case ``queries`` may be omitted entirely.
    """

    # Defaults to an empty list so that callers who only want the built-in
    # sample dataset are not forced to send a placeholder ``queries`` value.
    queries: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="List of queries to evaluate. Each query should have 'query' and optionally 'expected_answer'"
    )
    # Forwarded to the evaluator as its ``use_llm`` argument.
    use_llm: bool = Field(True, description="Whether to use LLM for answer generation")
    # When true the endpoint ignores ``queries`` and evaluates the sample set.
    use_sample_dataset: bool = Field(
        False,
        description="If true, use built-in sample dataset instead of provided queries"
    )
| 482 | + |
| 483 | + |
def _summary_to_dict(summary) -> Dict[str, Any]:
    """Flatten the aggregate fields of an evaluation summary into JSON-safe values."""
    return {
        "total_queries": summary.total_queries,
        "successful_queries": summary.successful_queries,
        "failed_queries": summary.failed_queries,
        "avg_latency_ms": round(summary.avg_latency_ms, 2),
        "p50_latency_ms": round(summary.p50_latency_ms, 2),
        "p95_latency_ms": round(summary.p95_latency_ms, 2),
        "p99_latency_ms": round(summary.p99_latency_ms, 2),
        "total_tokens": summary.total_tokens,
        "total_cost_usd": round(summary.total_cost_usd, 4),
        "avg_relevance": round(summary.avg_relevance, 3),
        "avg_accuracy": round(summary.avg_accuracy, 3),
        "avg_completeness": round(summary.avg_completeness, 3),
        "avg_coherence": round(summary.avg_coherence, 3),
        "avg_context_relevance": round(summary.avg_context_relevance, 3),
        "avg_answer_faithfulness": round(summary.avg_answer_faithfulness, 3),
        "avg_answer_relevancy": round(summary.avg_answer_relevancy, 3),
        "cache_hit_rate": round(summary.cache_hit_rate, 3),
        "error_rate": round(summary.error_rate, 3),
    }


def _query_result_to_dict(item) -> Dict[str, Any]:
    """Flatten one per-query evaluation result into JSON-safe values."""
    return {
        "query": item.query,
        "expected_answer": item.expected_answer,
        "actual_answer": item.actual_answer,
        "latency_ms": round(item.latency_ms, 2),
        "tokens_used": item.tokens_used,
        "cost_usd": round(item.cost_usd, 4),
        "relevance_score": round(item.relevance_score, 3),
        "accuracy_score": round(item.accuracy_score, 3),
        "completeness_score": round(item.completeness_score, 3),
        "coherence_score": round(item.coherence_score, 3),
        "context_relevance": round(item.context_relevance, 3),
        "answer_faithfulness": round(item.answer_faithfulness, 3),
        "answer_relevancy": round(item.answer_relevancy, 3),
        "cache_hit": item.cache_hit,
        "success": item.success,
        "error": item.error,
        "timestamp": item.timestamp,
    }


@app.post("/evaluation/run", tags=["Evaluation"])
async def run_evaluation(
    request: EvaluationRequest,
    user: Optional[Dict] = Depends(optional_auth),
):
    """
    Run evaluation metrics on a set of queries

    Evaluates:
    - Query performance (latency, throughput)
    - Response quality (relevance, accuracy, completeness, coherence)
    - RAG metrics (context relevance, answer faithfulness, answer relevancy)
    - System metrics (cache hit rate, error rate)

    Returns a dict with a "summary" object of aggregate metrics and a
    "results" list holding one entry per evaluated query.

    Raises:
        HTTPException 503: the RAG instance has not been initialized.
        HTTPException 400: no queries were supplied and the sample dataset
            was not requested.
        HTTPException 500: the evaluation itself failed.
    """
    # `user` is not read here; it is declared only so the `optional_auth`
    # dependency runs for this route.
    if not rag_instance:
        raise HTTPException(status_code=503, detail="RAG instance not initialized")

    # Validate outside the try block so this client error is not caught by
    # the blanket handler below and reported as a 500.
    if not request.use_sample_dataset and not request.queries:
        raise HTTPException(
            status_code=400,
            detail="No queries provided; supply 'queries' or set 'use_sample_dataset' to true"
        )

    try:
        # Use sample dataset if requested
        if request.use_sample_dataset:
            queries = create_sample_evaluation_dataset()
        else:
            queries = request.queries

        evaluator = QueryEvaluator(
            rag_instance=rag_instance,
            openai_api_key=os.getenv("OPENAI_API_KEY"),
        )

        # NOTE(review): evaluate_batch is invoked synchronously inside an
        # async endpoint. If it performs blocking work it will stall the
        # event loop for the duration of the batch; consider offloading it
        # to a worker thread once rag_instance's thread-safety is confirmed.
        summary = evaluator.evaluate_batch(queries, use_llm=request.use_llm)

        # Convert to plain dicts for JSON serialization
        result = {
            "summary": _summary_to_dict(summary),
            "results": [_query_result_to_dict(item) for item in summary.results],
        }

        logger.info(
            "evaluation_completed",
            total_queries=summary.total_queries,
            success_rate=1 - summary.error_rate,
            avg_latency_ms=summary.avg_latency_ms
        )

        return result

    except Exception as e:
        logger.error("evaluation_failed", error=str(e), exc_info=True)
        # Chain the original exception so the root cause stays in tracebacks.
        raise HTTPException(
            status_code=500,
            detail=f"Failed to run evaluation: {sanitize_error_message(str(e))}"
        ) from e
| 575 | + |
| 576 | + |
@app.get("/evaluation/sample-dataset", tags=["Evaluation"])
async def get_sample_dataset():
    """Return the built-in sample evaluation dataset under the ``queries`` key."""
    sample_queries = create_sample_evaluation_dataset()
    payload = {"queries": sample_queries}
    return payload
| 581 | + |
| 582 | + |
460 | 583 | @app.get("/admin/status", tags=["Admin"]) |
461 | 584 | async def get_system_status(): |
462 | 585 | """ |
|
0 commit comments