@@ -284,6 +284,85 @@ def test_integration_google_judge(self):
284284 assert 0.0 <= score <= 1.0
285285 assert "reasoning" in side_info or "error" not in side_info
286286
287+ # --- Dimension-weighted scoring (production path) ---
288+
# Weighted rubric shared by the dimension-scoring integration tests.
# Each entry pairs a dimension name with its weight; the weights add up to 1.0.
_DIMS = [
    {"name": dim_name, "weight": dim_weight}
    for dim_name, dim_weight in (
        ("clarity", 0.5),
        ("specificity", 0.3),
        ("conciseness", 0.2),
    )
]

# Short installation/usage snippet used as the artifact that gets judged.
_DIM_ARTIFACT = " ".join(
    (
        "To install, run `pip install my-package`.",
        "Then call `my_package.run(config_path='settings.toml')`",
        "to start the service on port 8080.",
    )
)
299+
@pytest.mark.integration
@pytest.mark.skipif(
    not __import__("os").environ.get("OPENAI_API_KEY"),
    reason="OPENAI_API_KEY required",
)
def test_integration_openai_dimension_scoring(self):
    """Run dimension-weighted judging against OpenAI's gpt-4o-mini.

    Skipped when ``OPENAI_API_KEY`` is unset or empty. The judge is built
    with the shared ``_DIMS`` rubric and applied to ``_DIM_ARTIFACT``; the
    test passes when the score lies in ``[0.0, 1.0]`` and at least one of
    the rubric's dimension names is present in the returned side info.
    """
    judge = llm_judge_evaluator(
        "Score technical documentation quality.",
        model="openai/gpt-4o-mini",
        quality_dimensions=self._DIMS,
    )
    overall, details = judge(self._DIM_ARTIFACT)

    assert 0.0 <= overall <= 1.0

    # Dimension-weighted mode is expected to surface per-dimension entries;
    # one matching name is enough for this check.
    expected_names = ("clarity", "specificity", "conciseness")
    assert any(dim in details for dim in expected_names)
315+
@pytest.mark.integration
@pytest.mark.skipif(
    not __import__("os").environ.get("ANTHROPIC_API_KEY"),
    reason="ANTHROPIC_API_KEY required",
)
def test_integration_anthropic_dimension_scoring(self):
    """Run dimension-weighted judging against Anthropic's Claude Haiku.

    Skipped when ``ANTHROPIC_API_KEY`` is unset or empty. The judge is built
    with the shared ``_DIMS`` rubric and applied to ``_DIM_ARTIFACT``; the
    test passes when the score lies in ``[0.0, 1.0]`` and at least one of
    the rubric's dimension names is present in the returned side info.
    """
    judge = llm_judge_evaluator(
        "Score technical documentation quality.",
        model="anthropic/claude-haiku-4-5-20251001",
        quality_dimensions=self._DIMS,
    )
    overall, details = judge(self._DIM_ARTIFACT)

    assert 0.0 <= overall <= 1.0

    # One matching dimension name in the side info is sufficient here.
    expected_names = ("clarity", "specificity", "conciseness")
    assert any(dim in details for dim in expected_names)
330+
331+ # --- Hard constraint enforcement ---
332+
# 300 repetitions of the token "x" (each followed by a space): far more than
# the "under 50 words" limit used by the hard-constraint tests.
_CONSTRAINT_ARTIFACT_VIOLATING = 300 * "x "
334+
@pytest.mark.integration
@pytest.mark.skipif(
    not __import__("os").environ.get("OPENAI_API_KEY"),
    reason="OPENAI_API_KEY required",
)
def test_integration_openai_hard_constraint_violation(self):
    """Check hard-constraint handling with OpenAI's gpt-4o-mini.

    Skipped when ``OPENAI_API_KEY`` is unset or empty. The judge gets a
    single ``clarity`` dimension plus the hard constraint
    "Text must be under 50 words" and is fed an artifact that breaks it.
    The test passes when the score is exactly ``0.0`` or the side info
    reports ``hard_constraint_violation`` as ``True``.
    """
    judge = llm_judge_evaluator(
        "Score text quality.",
        model="openai/gpt-4o-mini",
        quality_dimensions=[{"name": "clarity", "weight": 1.0}],
        hard_constraints=["Text must be under 50 words"],
    )
    overall, details = judge(self._CONSTRAINT_ARTIFACT_VIOLATING)

    # Model should detect the constraint violation -> score forced to 0.0;
    # an explicit violation flag in the side info is accepted as well.
    score_zeroed = overall == 0.0
    assert score_zeroed or details.get("hard_constraint_violation") is True
350+
@pytest.mark.integration
@pytest.mark.skipif(
    not __import__("os").environ.get("ANTHROPIC_API_KEY"),
    reason="ANTHROPIC_API_KEY required",
)
def test_integration_anthropic_hard_constraint_violation(self):
    """Check hard-constraint handling with Anthropic's Claude Haiku.

    Skipped when ``ANTHROPIC_API_KEY`` is unset or empty. The judge gets a
    single ``clarity`` dimension plus the hard constraint
    "Text must be under 50 words" and is fed an artifact that breaks it.
    The test passes when the score is exactly ``0.0`` or the side info
    reports ``hard_constraint_violation`` as ``True``.
    """
    judge = llm_judge_evaluator(
        "Score text quality.",
        model="anthropic/claude-haiku-4-5-20251001",
        quality_dimensions=[{"name": "clarity", "weight": 1.0}],
        hard_constraints=["Text must be under 50 words"],
    )
    overall, details = judge(self._CONSTRAINT_ARTIFACT_VIOLATING)

    # Either a zeroed score or an explicit violation flag counts as success.
    score_zeroed = overall == 0.0
    assert score_zeroed or details.get("hard_constraint_violation") is True
365+
287366
288367class TestComputeWeightedScore :
289368 def test_basic_weighted_average (self ):
0 commit comments