|
10 | 10 | from ldai.tracker import TokenUsage |
11 | 11 | from ldclient import Context |
12 | 12 |
|
13 | | -from ldai_optimizer.client import OptimizationClient, _compute_validation_count, _find_model_config |
| 13 | +from ldai_optimizer.client import OptimizationClient, _compute_validation_count, _find_model_config, _judge_passed |
14 | 14 | from ldai_optimizer.dataclasses import ( |
15 | 15 | AIJudgeCallConfig, |
16 | 16 | GroundTruthOptimizationOptions, |
|
28 | 28 | _acceptance_criteria_implies_duration_optimization, |
29 | 29 | build_new_variation_prompt, |
30 | 30 | variation_prompt_acceptance_criteria, |
| 31 | + variation_prompt_feedback, |
31 | 32 | variation_prompt_improvement_instructions, |
32 | 33 | variation_prompt_overfit_warning, |
33 | 34 | variation_prompt_preamble, |
@@ -1847,6 +1848,8 @@ def _make_mock_api_client() -> MagicMock: |
1847 | 1848 | mock.post_agent_optimization_result = MagicMock(return_value="result-uuid-789") |
1848 | 1849 | mock.patch_agent_optimization_result = MagicMock() |
1849 | 1850 | mock.get_model_configs = MagicMock(return_value=[]) |
| 1851 | + # Default: AI Configs do not have isInverted set |
| 1852 | + mock.get_ai_config = MagicMock(return_value={}) |
1850 | 1853 | return mock |
1851 | 1854 |
|
1852 | 1855 |
|
@@ -4404,3 +4407,251 @@ async def test_optimization_key_in_post_url_uses_string_key_not_uuid(self): |
4404 | 4407 | assert opt_key_arg == "my-optimization", ( |
4405 | 4408 | f"Expected string key 'my-optimization', got '{opt_key_arg}'" |
4406 | 4409 | ) |
| 4410 | + |
| 4411 | + |
| 4412 | +# --------------------------------------------------------------------------- |
| 4413 | +# _judge_passed helper |
| 4414 | +# --------------------------------------------------------------------------- |
| 4415 | + |
| 4416 | + |
class TestJudgePassed:
    """Threshold checks for the ``_judge_passed`` helper.

    The cases below pin the comparison direction: a standard judge passes
    when its score is at or above the threshold, and an inverted judge
    passes when its score is at or below the threshold.
    """

    def test_standard_judge_passes_at_or_above_threshold(self):
        """A score equal to or greater than the threshold passes."""
        for score in (0.8, 1.0):
            outcome = _judge_passed(score, 0.8, is_inverted=False)
            assert outcome is True

    def test_standard_judge_fails_below_threshold(self):
        """A score under the threshold fails for a standard judge."""
        outcome = _judge_passed(0.5, 0.8, is_inverted=False)
        assert outcome is False

    def test_inverted_judge_passes_at_or_below_threshold(self):
        """A score equal to or less than the threshold passes when inverted."""
        for score in (0.1, 0.3):
            outcome = _judge_passed(score, 0.3, is_inverted=True)
            assert outcome is True

    def test_inverted_judge_fails_above_threshold(self):
        """A score over the threshold fails for an inverted judge."""
        outcome = _judge_passed(0.8, 0.3, is_inverted=True)
        assert outcome is False
| 4432 | + |
| 4433 | +# --------------------------------------------------------------------------- |
| 4434 | +# _evaluate_response with inverted judges |
| 4435 | +# --------------------------------------------------------------------------- |
| 4436 | + |
| 4437 | + |
class TestEvaluateResponseInvertedJudges:
    """``_evaluate_response`` outcomes when inverted judges are configured.

    Each test installs a set of judges on the client, builds a context that
    carries the given scores, and checks the boolean returned by
    ``_evaluate_response``.
    """

    def setup_method(self):
        self.client = _make_client()

    @staticmethod
    def _toxicity_judge() -> OptimizationJudge:
        """Inverted judge with a 0.3 threshold."""
        return OptimizationJudge(
            threshold=0.3,
            acceptance_statement="Low toxicity.",
            is_inverted=True,
        )

    @staticmethod
    def _relevance_judge() -> OptimizationJudge:
        """Standard (non-inverted) judge with a 0.8 threshold."""
        return OptimizationJudge(
            threshold=0.8,
            acceptance_statement="Relevant.",
            is_inverted=False,
        )

    def _ctx_with_scores(self, scores: Dict[str, JudgeResult]) -> OptimizationContext:
        """Build a first-iteration context holding the supplied judge results."""
        return OptimizationContext(
            scores=scores,
            completion_response="Some response.",
            current_instructions="Do X.",
            current_parameters={},
            current_variables={},
            iteration=1,
        )

    def _evaluate(self, judges, raw_scores):
        """Install ``judges`` on the client and evaluate a context with ``raw_scores``.

        ``raw_scores`` maps judge key to a plain float; each value is wrapped
        in a ``JudgeResult`` before the context is built.
        """
        self.client._options = _make_options(judges=judges)
        results = {key: JudgeResult(score=value) for key, value in raw_scores.items()}
        return self.client._evaluate_response(self._ctx_with_scores(results))

    def _mixed_judges(self):
        """One standard and one inverted judge, keyed as the tests expect."""
        return {
            "relevance": self._relevance_judge(),
            "toxicity": self._toxicity_judge(),
        }

    def test_inverted_judge_passes_when_score_below_threshold(self):
        verdict = self._evaluate({"toxicity": self._toxicity_judge()}, {"toxicity": 0.1})
        assert verdict is True

    def test_inverted_judge_passes_at_exact_threshold(self):
        verdict = self._evaluate({"toxicity": self._toxicity_judge()}, {"toxicity": 0.3})
        assert verdict is True

    def test_inverted_judge_fails_when_score_above_threshold(self):
        verdict = self._evaluate({"toxicity": self._toxicity_judge()}, {"toxicity": 0.8})
        assert verdict is False

    def test_mixed_judges_all_must_pass(self):
        """A standard judge and an inverted judge must both pass for overall pass."""
        # Relevance is high and toxicity is low, so both judges pass.
        verdict = self._evaluate(self._mixed_judges(), {"relevance": 0.9, "toxicity": 0.1})
        assert verdict is True

    def test_mixed_judges_fails_when_inverted_judge_too_high(self):
        # Relevance passes, but the toxicity score exceeds its threshold.
        verdict = self._evaluate(self._mixed_judges(), {"relevance": 0.9, "toxicity": 0.8})
        assert verdict is False

    def test_mixed_judges_fails_when_standard_judge_too_low(self):
        # Toxicity passes, but the relevance score is under its threshold.
        verdict = self._evaluate(self._mixed_judges(), {"relevance": 0.5, "toxicity": 0.1})
        assert verdict is False
| 4516 | + |
| 4517 | +# --------------------------------------------------------------------------- |
| 4518 | +# _build_options_from_config reads isInverted via get_ai_config REST call |
| 4519 | +# --------------------------------------------------------------------------- |
| 4520 | + |
| 4521 | + |
class TestBuildOptionsFromConfigIsInverted:
    """``_build_options_from_config`` and the ``isInverted`` flag.

    The API client is a mock; each test controls what ``get_ai_config``
    returns (or raises) and inspects the ``is_inverted`` value on the
    resulting judges, or how often the mock was called.
    """

    def setup_method(self):
        client = _make_client()
        client._agent_key = "my-agent"
        client._initialize_class_members_from_config(_make_agent_config())
        client._options = _make_options()
        self.client = client
        self.api_client = _make_mock_api_client()

    @staticmethod
    def _config_with_judges(*judges):
        """Copy ``_API_CONFIG`` with no acceptance statements and the given judges."""
        return dict(_API_CONFIG, acceptanceStatements=[], judges=list(judges))

    def _build(self, config=None, options=None) -> OptimizationOptions:
        """Call ``_build_options_from_config`` with fixed key, run id and model configs.

        Falsy ``config`` / ``options`` arguments are replaced by a copy of
        ``_API_CONFIG`` and by ``_make_from_config_options()`` respectively.
        """
        effective_config = config if config else dict(_API_CONFIG)
        effective_options = options if options else _make_from_config_options()
        return self.client._build_options_from_config(
            effective_config,
            effective_options,
            self.api_client,
            optimization_key="opt-key-123",
            run_id="run-uuid-456",
            model_configs=[],
        )

    def test_is_inverted_true_when_ai_config_returns_isInverted(self):
        """is_inverted is set from the AI Config REST API response for each judge."""
        self.api_client.get_ai_config.return_value = {"isInverted": True}
        built = self._build(
            config=self._config_with_judges({"key": "toxicity", "threshold": 0.3})
        )
        assert built.judges["toxicity"].is_inverted is True

    def test_is_inverted_false_when_ai_config_has_no_isInverted(self):
        self.api_client.get_ai_config.return_value = {}
        built = self._build(
            config=self._config_with_judges({"key": "relevance", "threshold": 0.8})
        )
        assert built.judges["relevance"].is_inverted is False

    def test_is_inverted_false_when_ai_config_has_isInverted_false(self):
        self.api_client.get_ai_config.return_value = {"isInverted": False}
        built = self._build(
            config=self._config_with_judges({"key": "relevance", "threshold": 0.8})
        )
        assert built.judges["relevance"].is_inverted is False

    def test_get_ai_config_called_once_per_judge(self):
        two_judges = self._config_with_judges(
            {"key": "toxicity", "threshold": 0.3},
            {"key": "relevance", "threshold": 0.8},
        )
        self._build(config=two_judges)
        assert self.api_client.get_ai_config.call_count == 2

    def test_acceptance_statements_skip_get_ai_config(self):
        """Acceptance statement judges are not backed by AI Configs."""
        statements_only = dict(
            _API_CONFIG,
            judges=[],
            acceptanceStatements=[{"statement": "Be accurate.", "threshold": 0.9}],
        )
        self._build(config=statements_only)
        self.api_client.get_ai_config.assert_not_called()

    def test_raises_when_get_ai_config_fails(self):
        """A failing get_ai_config call propagates — the build should not silently ignore it."""
        self.api_client.get_ai_config.side_effect = Exception("API error")
        failing = self._config_with_judges({"key": "toxicity", "threshold": 0.3})
        with pytest.raises(Exception, match="API error"):
            self._build(config=failing)

    def test_per_judge_isInverted_mixed(self):
        """Different judges can have different isInverted values."""

        def _get_ai_config_side_effect(project_key, config_key):
            # Only the "toxicity" config reports itself as inverted.
            return {"isInverted": config_key == "toxicity"}

        self.api_client.get_ai_config.side_effect = _get_ai_config_side_effect
        built = self._build(
            config=self._config_with_judges(
                {"key": "toxicity", "threshold": 0.3},
                {"key": "relevance", "threshold": 0.8},
            )
        )
        inverted_by_key = {key: judge.is_inverted for key, judge in built.judges.items()}
        assert inverted_by_key["toxicity"] is True
        assert inverted_by_key["relevance"] is False
| 4604 | + |
| 4605 | +# --------------------------------------------------------------------------- |
| 4606 | +# variation_prompt_feedback with inverted judges |
| 4607 | +# --------------------------------------------------------------------------- |
| 4608 | + |
| 4609 | + |
class TestVariationPromptFeedbackInvertedJudges:
    """PASSED / FAILED markers emitted by ``variation_prompt_feedback``.

    Each test feeds a single context plus a judge mapping into
    ``variation_prompt_feedback`` and inspects the returned text for the
    "PASSED" and "FAILED" markers.
    """

    @staticmethod
    def _toxicity_judge() -> OptimizationJudge:
        """Inverted judge with a 0.3 threshold."""
        return OptimizationJudge(
            threshold=0.3,
            acceptance_statement="Low toxicity.",
            is_inverted=True,
        )

    @staticmethod
    def _relevance_judge() -> OptimizationJudge:
        """Standard (non-inverted) judge with a 0.8 threshold."""
        return OptimizationJudge(
            threshold=0.8,
            acceptance_statement="Relevant.",
            is_inverted=False,
        )

    def _make_ctx(self, scores: Dict[str, JudgeResult], iteration: int = 1) -> OptimizationContext:
        """Build a context carrying ``scores`` for the given iteration."""
        return OptimizationContext(
            scores=scores,
            completion_response="Some response.",
            current_instructions="Do X.",
            current_parameters={},
            current_variables={},
            iteration=iteration,
        )

    def _feedback(self, scores: Dict[str, JudgeResult], judges):
        """Return the feedback text for one context built from ``scores``."""
        history = [self._make_ctx(scores)]
        return variation_prompt_feedback(history, judges)

    def test_inverted_judge_shows_passed_when_score_below_threshold(self):
        text = self._feedback(
            {"toxicity": JudgeResult(score=0.1, rationale="Very clean.")},
            {"toxicity": self._toxicity_judge()},
        )
        assert "PASSED" in text

    def test_inverted_judge_shows_failed_when_score_above_threshold(self):
        text = self._feedback(
            {"toxicity": JudgeResult(score=0.8, rationale="Very toxic.")},
            {"toxicity": self._toxicity_judge()},
        )
        assert "FAILED" in text

    def test_standard_judge_shows_passed_when_score_above_threshold(self):
        text = self._feedback(
            {"relevance": JudgeResult(score=0.9)},
            {"relevance": self._relevance_judge()},
        )
        assert "PASSED" in text

    def test_standard_judge_shows_failed_when_score_below_threshold(self):
        text = self._feedback(
            {"relevance": JudgeResult(score=0.5)},
            {"relevance": self._relevance_judge()},
        )
        assert "FAILED" in text

    def test_mixed_judges_feedback_reflects_correct_pass_fail(self):
        text = self._feedback(
            {
                "relevance": JudgeResult(score=0.9),
                "toxicity": JudgeResult(score=0.05),
            },
            {
                "relevance": self._relevance_judge(),
                "toxicity": self._toxicity_judge(),
            },
        )
        # Both should be PASSED — relevance high enough, toxicity low enough
        assert text.count("PASSED") == 2
        assert "FAILED" not in text
0 commit comments