-
Notifications
You must be signed in to change notification settings - Fork 17
Expand file tree
/
Copy pathtest_hallucination.py
More file actions
119 lines (95 loc) · 3.59 KB
/
Copy pathtest_hallucination.py
File metadata and controls
119 lines (95 loc) · 3.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
Hallucination detection test using LLM-as-judge.
This test demonstrates how to detect factual inaccuracies in model responses
by comparing them against provided knowledge using an LLM judge, similar to
tau's evaluate_nl_assertions approach.
"""
import json
from typing import Any, Dict, List
import pytest
import litellm
from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
# LiteLLM model identifier for the judge. It is passed as `model=` to
# `litellm.acompletion` inside `test_hallucination_detection`.
# NOTE(review): this is the same model string that `completion_params` uses for
# the rollout, so the model currently grades its own answers — confirm intended.
JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"
def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """Build one single-turn ``EvaluationRow`` per HaluEval record.

    Each record's ``knowledge`` and ``question`` fields are merged into a
    single user message, and its ``right_answer`` field becomes the row's
    ground truth. A record missing any of these keys raises ``KeyError``.
    """
    rows: List[EvaluationRow] = []
    for record in data:
        # The prompt text is part of the evaluated input; keep it verbatim.
        prompt = f"Knowledge: {record['knowledge']}\n\nQuestion: {record['question']}"
        rows.append(
            EvaluationRow(
                messages=[Message(role="user", content=prompt)],
                ground_truth=record["right_answer"],
            )
        )
    return rows
# Instructions sent to the judge as the system message.
_JUDGE_SYSTEM_PROMPT = """
TASK
- You will be given an assistant's response and the correct answer.
- Your job is to evaluate whether the assistant's response is factually consistent with the correct answer.
- Grade whether the assistant got it right or wrong.
FORMAT
- Your response should be a JSON object with the following fields:
- `reasoning`: a short explanation for your classification
- `is_correct`: `true` if the assistant's response matches the correct answer, `false` otherwise
Example response structure:
{
"reasoning": "<reasoning trace>",
"is_correct": <true or false>
}
"""


def _extract_json_object(text: Any) -> Dict[str, Any]:
    """Parse the judge's reply into a JSON object (a ``dict``).

    The reply is first parsed as-is. If that fails, the span from the first
    ``{`` to the last ``}`` is parsed instead, which tolerates replies wrapped
    in Markdown code fences or surrounded by extra prose.

    Raises:
        ValueError: if ``text`` is not a string, contains no parseable JSON
            object, or parses to something other than an object
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    if not isinstance(text, str):
        raise ValueError("Judge returned no text content")
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            raise
        parsed = json.loads(text[start : end + 1])
    if not isinstance(parsed, dict):
        raise ValueError("Judge reply is not a JSON object")
    return parsed


def _coerce_bool(value: Any) -> bool:
    """Interpret the judge's ``is_correct`` field as a boolean.

    Strings are compared against ``"true"`` (case-insensitive) so that a
    quoted ``"false"`` is not treated as truthy; other values use ``bool()``.
    """
    if isinstance(value, str):
        return value.strip().lower() == "true"
    return bool(value)


@pytest.mark.asyncio
@evaluation_test(
    input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"],
    dataset_adapter=hallucination_dataset_adapter,
    completion_params=[
        {
            "temperature": 0.0,
            "max_tokens": 512,
            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
        }
    ],
    rollout_processor=SingleTurnRolloutProcessor(),
    passed_threshold=0.33,
    num_runs=1,
    mode="pointwise",
)
async def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
    """Grade one row's assistant response for correctness using an LLM judge.

    The last message of ``row.messages`` is taken as the assistant's answer
    and sent, together with ``row.ground_truth``, to ``JUDGE_MODEL`` via
    ``litellm.acompletion``. The judge is asked for a JSON verdict with
    ``reasoning`` and ``is_correct`` fields.

    Args:
        row: The evaluation row after rollout; its ``evaluation_result`` is
            overwritten by this function.

    Returns:
        The same ``row`` with ``evaluation_result`` set. The score is 1.0 when
        the judge reports the answer as correct, otherwise 0.0. If there is no
        assistant answer the score is 0.0 and the judge is not called. If the
        judge call or the parsing of its reply fails, the score is 0.0 and the
        ``llm_judge`` metric is flagged with ``is_score_valid=False``.
    """
    messages = row.messages
    last_message = messages[-1] if messages else None
    # Only judge a real assistant answer: if the rollout produced nothing, the
    # last message is the user prompt, which contains the knowledge itself.
    if last_message is None or last_message.role != "assistant" or not last_message.content:
        row.evaluation_result = EvaluateResult(score=0.0, reason="❌ No assistant response found")
        return row

    assistant_response = last_message.content
    correct_answer = row.ground_truth

    user_prompt = f"""
assistant_response:
{assistant_response}
correct_answer:
{correct_answer}
"""

    judge_failed = False
    try:
        response = await litellm.acompletion(
            model=JUDGE_MODEL,
            messages=[
                {"role": "system", "content": _JUDGE_SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.1,
            max_tokens=500,
        )
        result_data = _extract_json_object(response.choices[0].message.content)
    except Exception as e:
        # Evaluation boundary: a judge/API/parsing failure must not abort the
        # run, so it is recorded as an incorrect answer with an invalid metric.
        judge_failed = True
        is_correct = False
        reasoning = f"Evaluation failed: {str(e)}"
    else:
        is_correct = _coerce_bool(result_data.get("is_correct", False))
        reasoning = result_data.get("reasoning", "Could not parse reasoning")

    score = 1.0 if is_correct else 0.0
    assessment = "✅ Response is correct" if is_correct else "❌ Response is incorrect"
    reason = f"{assessment}\nReasoning: {reasoning}"

    row.evaluation_result = EvaluateResult(
        score=score,
        reason=reason,
        metrics={
            "llm_judge": MetricResult(
                score=score,
                reason=reasoning,
                # Distinguish "judge said wrong" from "judge could not be evaluated".
                is_score_valid=not judge_failed,
            )
        },
    )
    return row