Skip to content

Commit c4a2d47

Browse files
committed
test: add unit tests for evaluation metrics module
Signed-off-by: YASHBHIWANIA <yashbhiwania772@gmail.com>
1 parent c0ce39f commit c4a2d47

2 files changed

Lines changed: 181 additions & 0 deletions

File tree

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
import sys
2+
import os
3+
4+
# Prepend the parent of this file's directory to the import search path so it
# takes precedence over later entries.
# NOTE(review): presumably this is what lets the tests import the top-level
# `src` package without installing the project — confirm against repo layout.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
"""
2+
Unit tests for evaluation metrics module.
3+
Tests verify metric factory functions return correct types,
4+
use correct thresholds, and handle NotImplementedError cases.
5+
"""
6+
import pytest
7+
from unittest.mock import MagicMock
8+
from deepeval.metrics import (
9+
ContextualPrecisionMetric,
10+
ContextualRecallMetric,
11+
HallucinationMetric,
12+
AnswerRelevancyMetric,
13+
BiasMetric,
14+
ToxicityMetric,
15+
GEval,
16+
)
17+
from deepeval.models.base_model import DeepEvalBaseLLM
18+
19+
from src.metrics.retrieval import (
20+
make_contextual_precision_metric,
21+
make_contextual_recall_metric,
22+
make_contextual_relevancy_metric,
23+
make_faithfulness_metric,
24+
make_hallucination_metric,
25+
PRECISION_THRESHOLD,
26+
RECALL_THRESHOLD,
27+
HALLUCINATION_THRESHOLD,
28+
)
29+
from src.metrics.content import (
30+
make_answer_relevancy_metric,
31+
make_bias_metric,
32+
make_toxicity_metric,
33+
ANSRELEVANCY_THRESHOLD,
34+
BIAS_THRESHOLD,
35+
TOXICITY_THRESHOLD,
36+
)
37+
from src.metrics.geval import make_correctness_metric
38+
39+
40+
@pytest.fixture
def mock_model():
    """Provide a stand-in model object for the metric factory tests.

    Returns:
        A ``MagicMock`` created with ``spec=DeepEvalBaseLLM``.
    """
    # spec= limits the mock to attributes that exist on DeepEvalBaseLLM.
    stand_in = MagicMock(spec=DeepEvalBaseLLM)
    return stand_in
44+
45+
46+
class TestRetrievalMetrics:
    """Checks on the metric factories imported from ``src.metrics.retrieval``."""

    # --- contextual precision -------------------------------------------

    def test_make_contextual_precision_metric_returns_correct_type(self, mock_model):
        """The factory result is a ``ContextualPrecisionMetric`` instance."""
        assert isinstance(
            make_contextual_precision_metric(mock_model),
            ContextualPrecisionMetric,
        )

    def test_make_contextual_precision_metric_threshold(self, mock_model):
        """The metric's threshold equals ``PRECISION_THRESHOLD``."""
        built = make_contextual_precision_metric(mock_model)
        assert built.threshold == PRECISION_THRESHOLD

    def test_make_contextual_precision_metric_includes_reason(self, mock_model):
        """The metric has ``include_reason`` set to ``True``."""
        built = make_contextual_precision_metric(mock_model)
        assert built.include_reason is True

    # --- contextual recall ----------------------------------------------

    def test_make_contextual_recall_metric_returns_correct_type(self, mock_model):
        """The factory result is a ``ContextualRecallMetric`` instance."""
        assert isinstance(
            make_contextual_recall_metric(mock_model),
            ContextualRecallMetric,
        )

    def test_make_contextual_recall_metric_threshold(self, mock_model):
        """The metric's threshold equals ``RECALL_THRESHOLD``."""
        built = make_contextual_recall_metric(mock_model)
        assert built.threshold == RECALL_THRESHOLD

    def test_make_contextual_recall_metric_includes_reason(self, mock_model):
        """The metric has ``include_reason`` set to ``True``."""
        built = make_contextual_recall_metric(mock_model)
        assert built.include_reason is True

    # --- hallucination --------------------------------------------------

    def test_make_hallucination_metric_returns_correct_type(self, mock_model):
        """The factory result is a ``HallucinationMetric`` instance."""
        assert isinstance(
            make_hallucination_metric(mock_model),
            HallucinationMetric,
        )

    def test_make_hallucination_metric_threshold(self, mock_model):
        """The metric's threshold equals ``HALLUCINATION_THRESHOLD``."""
        built = make_hallucination_metric(mock_model)
        assert built.threshold == HALLUCINATION_THRESHOLD

    def test_make_hallucination_metric_includes_reason(self, mock_model):
        """The metric has ``include_reason`` set to ``True``."""
        built = make_hallucination_metric(mock_model)
        assert built.include_reason is True

    # --- factories expected to raise -------------------------------------
    # NOTE(review): the match pattern below spells "incompatability";
    # presumably it mirrors the message raised by the factories — confirm
    # against src.metrics.retrieval before correcting the spelling.

    def test_make_contextual_relevancy_metric_raises_not_implemented(self, mock_model):
        """Calling the contextual-relevancy factory raises ``NotImplementedError``."""
        expected_error = pytest.raises(
            NotImplementedError, match="protobuf incompatability"
        )
        with expected_error:
            make_contextual_relevancy_metric(mock_model)

    def test_make_faithfulness_metric_raises_not_implemented(self, mock_model):
        """Calling the faithfulness factory raises ``NotImplementedError``."""
        expected_error = pytest.raises(
            NotImplementedError, match="protobuf incompatability"
        )
        with expected_error:
            make_faithfulness_metric(mock_model)
94+
95+
96+
class TestContentMetrics:
    """Checks on the metric factories imported from ``src.metrics.content``."""

    # --- answer relevancy -----------------------------------------------

    def test_make_answer_relevancy_metric_returns_correct_type(self, mock_model):
        """The factory result is an ``AnswerRelevancyMetric`` instance."""
        assert isinstance(
            make_answer_relevancy_metric(mock_model),
            AnswerRelevancyMetric,
        )

    def test_make_answer_relevancy_metric_threshold(self, mock_model):
        """The metric's threshold equals ``ANSRELEVANCY_THRESHOLD``."""
        built = make_answer_relevancy_metric(mock_model)
        assert built.threshold == ANSRELEVANCY_THRESHOLD

    def test_make_answer_relevancy_metric_includes_reason(self, mock_model):
        """The metric has ``include_reason`` set to ``True``."""
        built = make_answer_relevancy_metric(mock_model)
        assert built.include_reason is True

    # --- bias -----------------------------------------------------------

    def test_make_bias_metric_returns_correct_type(self, mock_model):
        """The factory result is a ``BiasMetric`` instance."""
        assert isinstance(make_bias_metric(mock_model), BiasMetric)

    def test_make_bias_metric_threshold(self, mock_model):
        """The metric's threshold equals ``BIAS_THRESHOLD``."""
        built = make_bias_metric(mock_model)
        assert built.threshold == BIAS_THRESHOLD

    def test_make_bias_metric_includes_reason(self, mock_model):
        """The metric has ``include_reason`` set to ``True``."""
        built = make_bias_metric(mock_model)
        assert built.include_reason is True

    # --- toxicity -------------------------------------------------------

    def test_make_toxicity_metric_returns_correct_type(self, mock_model):
        """The factory result is a ``ToxicityMetric`` instance."""
        assert isinstance(make_toxicity_metric(mock_model), ToxicityMetric)

    def test_make_toxicity_metric_threshold(self, mock_model):
        """The metric's threshold equals ``TOXICITY_THRESHOLD``."""
        built = make_toxicity_metric(mock_model)
        assert built.threshold == TOXICITY_THRESHOLD

    def test_make_toxicity_metric_includes_reason(self, mock_model):
        """The metric has ``include_reason`` set to ``True``."""
        built = make_toxicity_metric(mock_model)
        assert built.include_reason is True
134+
135+
136+
class TestGEvalMetrics:
    """Checks on ``make_correctness_metric`` from ``src.metrics.geval``."""

    def test_make_correctness_metric_returns_geval(self, mock_model):
        """The factory result is a ``GEval`` instance."""
        assert isinstance(make_correctness_metric(mock_model), GEval)

    def test_make_correctness_metric_name(self, mock_model):
        """The metric is named ``"Correctness"``."""
        correctness = make_correctness_metric(mock_model)
        assert correctness.name == "Correctness"

    def test_make_correctness_metric_has_evaluation_steps(self, mock_model):
        """The metric carries a non-empty ``evaluation_steps`` collection."""
        steps = make_correctness_metric(mock_model).evaluation_steps
        # Check for None first so len() is only called on a real value.
        assert steps is not None
        assert len(steps) > 0

    def test_make_correctness_metric_has_criteria(self, mock_model):
        """The metric's ``criteria`` is set and contains "factually correct"."""
        criteria_text = make_correctness_metric(mock_model).criteria
        assert criteria_text is not None
        assert "factually correct" in criteria_text
156+
157+
158+
class TestThresholdValues:
    """Sanity checks that every threshold constant lies within [0.0, 1.0]."""

    def test_precision_threshold_valid(self):
        """``PRECISION_THRESHOLD`` is between 0.0 and 1.0 inclusive."""
        assert PRECISION_THRESHOLD >= 0.0
        assert PRECISION_THRESHOLD <= 1.0

    def test_recall_threshold_valid(self):
        """``RECALL_THRESHOLD`` is between 0.0 and 1.0 inclusive."""
        assert RECALL_THRESHOLD >= 0.0
        assert RECALL_THRESHOLD <= 1.0

    def test_hallucination_threshold_valid(self):
        """``HALLUCINATION_THRESHOLD`` is between 0.0 and 1.0 inclusive."""
        assert HALLUCINATION_THRESHOLD >= 0.0
        assert HALLUCINATION_THRESHOLD <= 1.0

    def test_answer_relevancy_threshold_valid(self):
        """``ANSRELEVANCY_THRESHOLD`` is between 0.0 and 1.0 inclusive."""
        assert ANSRELEVANCY_THRESHOLD >= 0.0
        assert ANSRELEVANCY_THRESHOLD <= 1.0

    def test_bias_threshold_valid(self):
        """``BIAS_THRESHOLD`` is between 0.0 and 1.0 inclusive."""
        assert BIAS_THRESHOLD >= 0.0
        assert BIAS_THRESHOLD <= 1.0

    def test_toxicity_threshold_valid(self):
        """``TOXICITY_THRESHOLD`` is between 0.0 and 1.0 inclusive."""
        assert TOXICITY_THRESHOLD >= 0.0
        assert TOXICITY_THRESHOLD <= 1.0

0 commit comments

Comments
 (0)