-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfaithfulness_evaluation.py
More file actions
162 lines (129 loc) · 5.41 KB
/
faithfulness_evaluation.py
File metadata and controls
162 lines (129 loc) · 5.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""
Example: Faithfulness Evaluation with RAGAS
KB Section: 3. Technical Challenges - Evaluation & Quality
KB Link: https://maree217.github.io/copilot-architect-kb#challenges
Description:
Demonstrates how to evaluate RAG responses for faithfulness and relevance using
the RAGAS framework. Such evaluation is critical in production systems for
catching hallucinated (ungrounded) answers before they reach users.
Target Metrics:
- Faithfulness: > 0.85 (response grounded in retrieved context)
- Answer Relevancy: > 0.80 (response answers the question)
Prerequisites:
- pip install ragas openai python-dotenv
- OPENAI_API_KEY environment variable
Usage:
$ python faithfulness_evaluation.py
"""
import os
from dotenv import load_dotenv
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy
# Load environment variables at import time so that the OPENAI_API_KEY listed
# under "Prerequisites" in the module docstring is available before evaluation.
# NOTE(review): presumably python-dotenv reads these from a local .env file;
# confirm the lookup location for your deployment.
load_dotenv()
def evaluate_rag_response(
    question: str,
    generated_answer: str,
    retrieved_contexts: list[str],
    ground_truth: str = None
):
    """
    Score one question/answer pair with the RAGAS faithfulness and
    answer-relevancy metrics.

    Each input is wrapped in a single-element list so the data forms a
    one-row, column-oriented dataset, which is then handed to
    ``ragas.evaluate`` together with the two metrics.

    Args:
        question: The user's question.
        generated_answer: The answer produced by the LLM.
        retrieved_contexts: The document chunks retrieved for the question.
        ground_truth: Optional reference answer. A falsy value (``None`` or
            an empty string) is left out of the dataset entirely.

    Returns:
        Whatever ``ragas.evaluate`` returns for the one-row dataset. The
        caller in this file indexes it with ``'faithfulness'`` and
        ``'answer_relevancy'``.
    """
    # Column-oriented layout: one key per column, one list entry per row.
    columns = dict(
        question=[question],
        answer=[generated_answer],
        contexts=[retrieved_contexts],
    )

    # The reference answer is optional and only added when one was supplied.
    # NOTE(review): the column is named "ground_truths" and receives a single
    # string per row; confirm the name and shape expected by the installed
    # ragas version, and whether these two metrics use it at all.
    if ground_truth:
        columns.update(ground_truths=[ground_truth])

    # NOTE(review): a plain dict is passed as `dataset`; verify that the
    # installed ragas version accepts this rather than a dataset object.
    return evaluate(
        dataset=columns,
        metrics=[faithfulness, answer_relevancy],
    )
# Pass thresholds, matching "Target Metrics" in the module docstring.
# A score must be strictly greater than its target to pass.
FAITHFULNESS_TARGET = 0.85
RELEVANCY_TARGET = 0.80


def _print_scores(faithfulness_score, relevancy_score, faithfulness_ok, relevancy_ok):
    """Print each metric's score, target, pass/fail status and plain-language meaning."""
    grounding = "grounded in" if faithfulness_ok else "NOT fully grounded in"
    print(f"\n✓ Faithfulness: {faithfulness_score:.3f}")
    print(f" Target: > {FAITHFULNESS_TARGET:.2f}")
    print(f" Status: {'✅ PASS' if faithfulness_ok else '❌ FAIL'}")
    print(f" Meaning: Response is {grounding} retrieved context")

    answering = "answers" if relevancy_ok else "may not fully answer"
    print(f"\n✓ Answer Relevancy: {relevancy_score:.3f}")
    print(f" Target: > {RELEVANCY_TARGET:.2f}")
    print(f" Status: {'✅ PASS' if relevancy_ok else '❌ FAIL'}")
    print(f" Meaning: Response {answering} the question")


def _print_recommendations(faithfulness_ok, relevancy_ok):
    """Print the follow-up actions that match each metric's pass/fail status."""
    if not faithfulness_ok:
        print("\n⚠️ FAITHFULNESS BELOW THRESHOLD:")
        print(" - Review retrieved contexts for relevance")
        print(" - Adjust prompt to emphasize grounding")
        print(" - Consider adding citation requirements")
        print(" - Implement chain-of-thought prompting")

    if not relevancy_ok:
        print("\n⚠️ RELEVANCY BELOW THRESHOLD:")
        print(" - Review query understanding")
        print(" - Improve retrieval (reranking, hybrid search)")
        print(" - Refine prompt for question focus")
        print(" - Consider query rewriting")

    if faithfulness_ok and relevancy_ok:
        print("\n✅ ALL METRICS PASS - Response is production-ready")
        print(" - Faithfulness ensures no hallucination")
        print(" - Relevancy ensures question is answered")
        print(" - Safe to deploy in compliance-critical applications")


def main():
    """
    Run the example: evaluate a simulated banking refund-policy RAG answer.

    Builds a hard-coded question, two retrieved context chunks, a generated
    answer and a reference answer, passes them to ``evaluate_rag_response``,
    then prints the scores, their pass/fail status against the targets, and
    matching recommendations.

    Pass/fail is decided once per metric (``score > target``) and every later
    branch is driven by that single flag. A score that is exactly equal to
    the target, or that is NaN, is therefore reported as FAIL *and* receives
    recommendations, instead of falling between the ``>`` and ``<`` checks.
    """
    # Simulated RAG scenario: Banking refund policy query
    question = "What is the refund policy for fraudulent transactions?"

    # Simulated retrieved context from vector search (two chunks; adjacent
    # string literals are concatenated into one chunk each).
    retrieved_chunks = [
        "For fraudulent transactions, customers must report within 60 days. "
        "Full refunds are provided for verified fraud cases. "
        "Provisional credit is issued within 10 business days during investigation.",
        "Our fraud protection policy covers unauthorized transactions. "
        "Customers are not liable for fraudulent charges if reported promptly. "
        "The bank conducts a thorough investigation within 90 days.",
    ]

    # Generated response from LLM
    generated_response = (
        "For fraudulent transactions, you must report them within 60 days. "
        "You'll receive a full refund for verified fraud cases. "
        "During the investigation, we'll issue provisional credit within 10 business days. "
        "The investigation typically completes within 90 days."
    )

    # Reference answer, passed through as the optional ground truth.
    reference_answer = (
        "Report fraudulent transactions within 60 days for full refund. "
        "Provisional credit provided within 10 business days. "
        "Investigation completes in 90 days."
    )

    # Echo the inputs so the scores below can be read in context.
    print("🔍 Evaluating RAG Response...")
    print(f"\nQuestion: {question}")
    print(f"\nGenerated Answer:\n{generated_response}")
    print("\nRetrieved Contexts:")
    for i, context in enumerate(retrieved_chunks, 1):
        print(f" {i}. {context}")

    result = evaluate_rag_response(
        question=question,
        generated_answer=generated_response,
        retrieved_contexts=retrieved_chunks,
        ground_truth=reference_answer,
    )

    faithfulness_score = result['faithfulness']
    relevancy_score = result['answer_relevancy']

    # Single source of truth for pass/fail; NaN compares False and so fails.
    faithfulness_ok = faithfulness_score > FAITHFULNESS_TARGET
    relevancy_ok = relevancy_score > RELEVANCY_TARGET

    rule = "=" * 60

    # Display results
    print("\n" + rule)
    print("📊 EVALUATION RESULTS")
    print(rule)
    _print_scores(faithfulness_score, relevancy_score, faithfulness_ok, relevancy_ok)

    # Production recommendations
    print("\n" + rule)
    print("💡 PRODUCTION RECOMMENDATIONS")
    print(rule)
    _print_recommendations(faithfulness_ok, relevancy_ok)
# Run the example only when this file is executed as a script
# (`python faithfulness_evaluation.py`), not when it is imported.
if __name__ == "__main__":
    main()