Skip to content

Commit 35c736a

Browse files
ASRagab and claude
committed
feat(tests): add dimension scoring and hard constraint integration tests
Add 4 provider integration tests exercising production scoring paths: - Dimension-weighted scoring (OpenAI, Anthropic): verifies per-dimension scores are returned and weighted correctly via real API calls - Hard constraint violation (OpenAI, Anthropic): verifies score is forced to 0.0 when artifact violates "under 50 words" constraint 222 tests passing (214 unit + 8 integration). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Entire-Checkpoint: e9550ad6d479
1 parent 90ef351 commit 35c736a

1 file changed

Lines changed: 79 additions & 0 deletions

File tree

tests/test_llm_judge.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,85 @@ def test_integration_google_judge(self):
284284
assert 0.0 <= score <= 1.0
285285
assert "reasoning" in side_info or "error" not in side_info
286286

287+
# --- Dimension-weighted scoring (production path) ---
288+
289+
# Quality dimensions (name + relative weight) used by the dimension-scoring
# integration tests below.  Weights sum to 1.0 so the combined weighted
# score stays within [0, 1].
_DIMS = [
    {"name": dim_name, "weight": dim_weight}
    for dim_name, dim_weight in (
        ("clarity", 0.5),
        ("specificity", 0.3),
        ("conciseness", 0.2),
    )
]
294+
# A short, well-formed documentation snippet used as the artifact for the
# dimension-scoring integration tests (something a judge can plausibly
# rate for clarity/specificity/conciseness).
_DIM_ARTIFACT = " ".join(
    [
        "To install, run `pip install my-package`.",
        "Then call `my_package.run(config_path='settings.toml')`",
        "to start the service on port 8080.",
    ]
)
299+
300+
@pytest.mark.integration
@pytest.mark.skipif(
    not __import__("os").environ.get("OPENAI_API_KEY"),
    reason="OPENAI_API_KEY required",
)
def test_integration_openai_dimension_scoring(self):
    """Dimension-weighted scoring against a live OpenAI model.

    Verifies the weighted score is bounded to [0, 1] and that at least
    one per-dimension entry appears in the returned side info.
    """
    judge = llm_judge_evaluator(
        "Score technical documentation quality.",
        model="openai/gpt-4o-mini",
        quality_dimensions=self._DIMS,
    )
    weighted_score, info = judge(self._DIM_ARTIFACT)
    assert 0.0 <= weighted_score <= 1.0
    # Dimension-weighted mode should surface per-dimension scores.
    assert any(name in info for name in ["clarity", "specificity", "conciseness"])
315+
316+
@pytest.mark.integration
@pytest.mark.skipif(
    not __import__("os").environ.get("ANTHROPIC_API_KEY"),
    reason="ANTHROPIC_API_KEY required",
)
def test_integration_anthropic_dimension_scoring(self):
    """Dimension-weighted scoring against a live Anthropic model.

    Verifies the weighted score is bounded to [0, 1] and that at least
    one per-dimension entry appears in the returned side info.
    """
    judge = llm_judge_evaluator(
        "Score technical documentation quality.",
        model="anthropic/claude-haiku-4-5-20251001",
        quality_dimensions=self._DIMS,
    )
    weighted_score, info = judge(self._DIM_ARTIFACT)
    assert 0.0 <= weighted_score <= 1.0
    # Dimension-weighted mode should surface per-dimension scores.
    assert any(name in info for name in ["clarity", "specificity", "conciseness"])
330+
331+
# --- Hard constraint enforcement ---
332+
333+
_CONSTRAINT_ARTIFACT_VIOLATING = "x " * 300 # ~300 words, violates "under 50 words"
334+
335+
@pytest.mark.integration
@pytest.mark.skipif(
    not __import__("os").environ.get("OPENAI_API_KEY"),
    reason="OPENAI_API_KEY required",
)
def test_integration_openai_hard_constraint_violation(self):
    """Hard-constraint enforcement against a live OpenAI model.

    The artifact breaks the "under 50 words" constraint, so either the
    score is forced down to 0.0 or the violation is flagged in side info.
    """
    judge = llm_judge_evaluator(
        "Score text quality.",
        model="openai/gpt-4o-mini",
        quality_dimensions=[{"name": "clarity", "weight": 1.0}],
        hard_constraints=["Text must be under 50 words"],
    )
    final_score, info = judge(self._CONSTRAINT_ARTIFACT_VIOLATING)
    # Accept either signal of detection: zeroed score or explicit flag.
    assert final_score == 0.0 or info.get("hard_constraint_violation") is True
350+
351+
@pytest.mark.integration
@pytest.mark.skipif(
    not __import__("os").environ.get("ANTHROPIC_API_KEY"),
    reason="ANTHROPIC_API_KEY required",
)
def test_integration_anthropic_hard_constraint_violation(self):
    """Hard-constraint enforcement against a live Anthropic model.

    The artifact breaks the "under 50 words" constraint, so either the
    score is forced down to 0.0 or the violation is flagged in side info.
    """
    judge = llm_judge_evaluator(
        "Score text quality.",
        model="anthropic/claude-haiku-4-5-20251001",
        quality_dimensions=[{"name": "clarity", "weight": 1.0}],
        hard_constraints=["Text must be under 50 words"],
    )
    final_score, info = judge(self._CONSTRAINT_ARTIFACT_VIOLATING)
    # Accept either signal of detection: zeroed score or explicit flag.
    assert final_score == 0.0 or info.get("hard_constraint_violation") is True
365+
287366

288367
class TestComputeWeightedScore:
289368
def test_basic_weighted_average(self):

0 commit comments

Comments
 (0)