python-sdk/tests/pytest/test_hallucination.py at 4a4baa10196bf6042e3479e8823d429b017e20ae · eval-protocol/python-sdk · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
Hallucination detection test using LLM-as-judge.

This test demonstrates how to detect factual inaccuracies in model responses
by comparing them against provided knowledge using an LLM judge, similar to
tau's evaluate_nl_assertions approach.
"""

import json
from typing import Any, Dict, List
import pytest

import litellm

from eval_protocol.models import EvaluateResult, EvaluationRow, Message, MetricResult
from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test

# Configure the judge model for LiteLLM
JUDGE_MODEL = "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905"


def hallucination_dataset_adapter(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """Convert HaluEval dataset to EvaluationRow objects."""
    return [
        EvaluationRow(
            messages=[Message(role="user", content=f"Knowledge: {item['knowledge']}\n\nQuestion: {item['question']}")],
            ground_truth=item["right_answer"],
        )
        for item in data
    ]


@pytest.mark.asyncio
@evaluation_test(
    input_dataset=["tests/pytest/data/halueval_sample_dataset.jsonl"],
    dataset_adapter=hallucination_dataset_adapter,
    completion_params=[
        {
            "temperature": 0.0,
            "max_tokens": 512,
            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
        }
    ],
    rollout_processor=SingleTurnRolloutProcessor(),
    passed_threshold=0.33,
    num_runs=1,
    mode="pointwise",
)
async def test_hallucination_detection(row: EvaluationRow) -> EvaluationRow:
    """
    Test for response correctness using LLM-as-judge.
    """
    messages = row.messages
    assistant_response = messages[-1].content

    if not assistant_response:
        return EvaluateResult(score=0.0, reason="❌ No assistant response found")

    correct_answer = row.ground_truth

    system_prompt = """
    TASK
    - You will be given an assistant's response and the correct answer.
    - Your job is to evaluate whether the assistant's response is factually consistent with the correct answer.
    - Grade whether the assistant got it right or wrong.

    FORMAT
    - Your response should be a JSON object with the following fields:
    - `reasoning`: a short explanation for your classification
    - `is_correct`: `true` if the assistant's response matches the correct answer, `false` otherwise

    Example response structure:
    {
        "reasoning": "<reasoning trace>",
        "is_correct": <true or false>
    }
    """

    user_prompt = f"""
    assistant_response:
    {assistant_response}

    correct_answer:
    {correct_answer}
    """

    try:
        response = await litellm.acompletion(
            model=JUDGE_MODEL,
            messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
            temperature=0.1,
            max_tokens=500,
        )

        result_data = json.loads(response.choices[0].message.content)
        is_correct = result_data.get("is_correct", False)
        reasoning = result_data.get("reasoning", "Could not parse reasoning")

    except Exception as e:
        # Fallback if parsing fails
        is_correct = False
        reasoning = f"Evaluation failed: {str(e)}"

    score = 1.0 if is_correct else 0.0

    if is_correct:
        assessment = "✅ Response is correct"
    else:
        assessment = "❌ Response is incorrect"

    reason = f"{assessment}\nReasoning: {reasoning}"

    row.evaluation_result = EvaluateResult(
        score=score,
        reason=reason,
        metrics={"llm_judge": MetricResult(score=score, reason=reasoning, is_score_valid=True)},
    )

    return row