Skip to content

Commit 916df2a

Browse files
committed
fix tests
1 parent 49f5e2e commit 916df2a

1 file changed

Lines changed: 17 additions & 66 deletions

File tree

packages/sdk/server-ai/tests/test_judge.py

Lines changed: 17 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -114,15 +114,6 @@ def test_judge_initializes_with_evaluation_metric_key(
114114
assert eval_schema['required'] == ['score', 'reasoning']
115115
assert 'score' in eval_schema['properties'] and 'reasoning' in eval_schema['properties']
116116

117-
def test_judge_initializes_without_evaluation_metric_key(
118-
self, judge_config_without_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
119-
):
120-
"""Judge should initialize but have None for evaluation_response_structure."""
121-
judge = Judge(judge_config_without_key, tracker, mock_ai_provider)
122-
123-
assert judge._ai_config == judge_config_without_key
124-
assert judge._evaluation_response_structure is None
125-
126117

127118
class TestJudgeEvaluate:
128119
"""Tests for Judge.evaluate() method."""
@@ -158,14 +149,12 @@ async def test_evaluate_success_with_valid_response(
158149
"""Evaluate should return JudgeResponse with valid evaluation."""
159150
mock_response = StructuredResponse(
160151
data={
161-
'evaluations': {
162-
'$ld:ai:judge:relevance': {
163-
'score': 0.85,
164-
'reasoning': 'The response is highly relevant to the input.'
165-
}
152+
'evaluation': {
153+
'score': 0.85,
154+
'reasoning': 'The response is highly relevant to the input.'
166155
}
167156
},
168-
raw_response='{"evaluations": {...}}',
157+
raw_response='{"evaluation": {"score": 0.85, "reasoning": "..."}}',
169158
metrics=LDAIMetrics(success=True)
170159
)
171160

@@ -210,46 +199,14 @@ async def test_evaluate_success_with_evaluation_response_shape(
210199
assert result.evals['$ld:ai:judge:relevance'].score == 0.9
211200
assert 'accurate' in result.evals['$ld:ai:judge:relevance'].reasoning.lower()
212201

213-
@pytest.mark.asyncio
214-
async def test_evaluate_success_with_evaluations_backward_compat(
215-
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
216-
):
217-
"""Evaluate should accept legacy shape { evaluations: { score, reasoning } }."""
218-
mock_response = StructuredResponse(
219-
data={
220-
'evaluations': {
221-
'score': 0.7,
222-
'reasoning': 'Partially correct.',
223-
}
224-
},
225-
raw_response='{"evaluations": {"score": 0.7, "reasoning": "..."}}',
226-
metrics=LDAIMetrics(success=True),
227-
)
228-
mock_ai_provider.invoke_structured_model.return_value = mock_response
229-
tracker.track_metrics_of = AsyncMock(return_value=mock_response)
230-
231-
judge = Judge(judge_config_with_key, tracker, mock_ai_provider)
232-
result = await judge.evaluate("input", "output")
233-
234-
assert result is not None
235-
assert result.success is True
236-
assert result.evals['$ld:ai:judge:relevance'].score == 0.7
237-
238202
@pytest.mark.asyncio
239203
async def test_evaluate_handles_missing_evaluation_in_response(
240204
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker, mock_ai_provider
241205
):
242206
"""Evaluate should handle missing evaluation in response."""
243207
mock_response = StructuredResponse(
244-
data={
245-
'evaluations': {
246-
'wrong-key': {
247-
'score': 0.5,
248-
'reasoning': 'Some reasoning'
249-
}
250-
}
251-
},
252-
raw_response='{"evaluations": {...}}',
208+
data={},
209+
raw_response='{}',
253210
metrics=LDAIMetrics(success=True)
254211
)
255212

@@ -271,14 +228,12 @@ async def test_evaluate_handles_invalid_score(
271228
"""Evaluate should handle invalid score values."""
272229
mock_response = StructuredResponse(
273230
data={
274-
'evaluations': {
275-
'$ld:ai:judge:relevance': {
276-
'score': 1.5,
277-
'reasoning': 'Some reasoning'
278-
}
231+
'evaluation': {
232+
'score': 1.5,
233+
'reasoning': 'Some reasoning'
279234
}
280235
},
281-
raw_response='{"evaluations": {...}}',
236+
raw_response='{"evaluation": {"score": 1.5, "reasoning": "..."}}',
282237
metrics=LDAIMetrics(success=True)
283238
)
284239

@@ -300,13 +255,11 @@ async def test_evaluate_handles_missing_reasoning(
300255
"""Evaluate should handle missing reasoning."""
301256
mock_response = StructuredResponse(
302257
data={
303-
'evaluations': {
304-
'$ld:ai:judge:relevance': {
305-
'score': 0.8,
306-
}
258+
'evaluation': {
259+
'score': 0.8,
307260
}
308261
},
309-
raw_response='{"evaluations": {...}}',
262+
raw_response='{"evaluation": {"score": 0.8}}',
310263
metrics=LDAIMetrics(success=True)
311264
)
312265

@@ -364,14 +317,12 @@ async def test_evaluate_messages_calls_evaluate(
364317

365318
mock_response = StructuredResponse(
366319
data={
367-
'evaluations': {
368-
'$ld:ai:judge:relevance': {
369-
'score': 0.9,
370-
'reasoning': 'Very relevant'
371-
}
320+
'evaluation': {
321+
'score': 0.9,
322+
'reasoning': 'Very relevant'
372323
}
373324
},
374-
raw_response='{"evaluations": {...}}',
325+
raw_response='{"evaluation": {"score": 0.9, "reasoning": "..."}}',
375326
metrics=LDAIMetrics(success=True)
376327
)
377328

0 commit comments

Comments (0)