|
1 | 1 | """Tests for Judge functionality.""" |
2 | 2 |
|
3 | | -from unittest.mock import AsyncMock, MagicMock, call |
| 3 | +from unittest.mock import AsyncMock, MagicMock, call, patch |
4 | 4 |
|
5 | 5 | import pytest |
6 | 6 | from ldclient import Config, Context, LDClient |
7 | 7 | from ldclient.integrations.test_data import TestData |
8 | 8 |
|
| 9 | +from ldai import LDAIClient |
9 | 10 | from ldai.judge import Judge, _strip_legacy_judge_messages |
10 | 11 | from ldai.judge.evaluation_schema_builder import EvaluationSchemaBuilder |
11 | 12 | from ldai.models import ( |
@@ -543,6 +544,137 @@ async def test_evaluate_messages_calls_evaluate( |
543 | 544 | assert tracker.track_metrics_of_async.called |
544 | 545 |
|
545 | 546 |
|
class TestJudgeConfigStripsLegacyMessages:
    """Tests for ``LDAIClient.judge_config()`` legacy-message stripping.

    Both ``LDAIClient.judge_config()`` (the direct public API) and
    ``LDAIClient.create_judge()`` (the wrapper API) reach the same internal
    ``_judge_config()`` method. ``_judge_config()`` is responsible for
    injecting ``message_history``/``response_to_evaluate`` markers so they
    survive Mustache rendering, then stripping any legacy template messages
    before returning the config.

    Every test serves a single judge flag from an in-memory ``TestData``
    source; only the flag key and the ``messages`` template differ between
    tests, so that setup lives in ``_client_with_judge_flag``.
    """

    @pytest.fixture
    def context(self) -> Context:
        """Return the evaluation context shared by every test in this class."""
        return Context.create('user-key')

    def _make_client(self, td: TestData) -> LDAIClient:
        """Wrap *td* in an ``LDClient`` (events disabled) and return an ``LDAIClient``."""
        config = Config('sdk-key', update_processor_class=td, send_events=False)
        return LDAIClient(LDClient(config=config))

    def _client_with_judge_flag(self, flag_key: str, messages: list) -> LDAIClient:
        """Return a client whose data source serves one enabled judge flag.

        :param flag_key: Key under which the judge flag is registered.
        :param messages: The ``messages`` template list for the flag's single
            variation.
        :return: An ``LDAIClient`` backed by a ``TestData`` source that serves
            that variation to every context.
        """
        source = TestData.data_source()
        variation = {
            'model': {'name': 'gpt-4'},
            'provider': {'name': 'openai'},
            'messages': messages,
            'evaluationMetricKey': '$ld:ai:judge:relevance',
            '_ldMeta': {'enabled': True, 'variationKey': 'judge-v1', 'version': 1},
        }
        source.update(
            source.flag(flag_key).variations(variation).variation_for_all(0)
        )
        return self._make_client(source)

    @staticmethod
    def _warning_messages(mock_log) -> list:
        """Return the first positional argument of every ``mock_log.warning`` call."""
        return [recorded.args[0] for recorded in mock_log.warning.call_args_list]

    def test_judge_config_strips_legacy_messages_from_returned_config(self, context):
        """Calling ``judge_config()`` directly (no variables) still strips legacy messages.

        This is the regression for the bug where legacy messages leaked through
        the public ``judge_config()`` entry point because reserved-variable
        markers were only injected in ``_create_judge_instance``.
        """
        client = self._client_with_judge_flag(
            'legacy-judge',
            [
                {'role': 'system', 'content': 'You are a judge.'},
                {'role': 'assistant', 'content': '{{message_history}}'},
                {'role': 'user', 'content': 'Evaluate: {{response_to_evaluate}}'},
            ],
        )

        result = client.judge_config('legacy-judge', context)

        assert result.enabled is True
        assert result.messages is not None
        # Only the system prompt survives; the two legacy template messages are gone.
        assert len(result.messages) == 1
        assert result.messages[0].role == 'system'
        assert result.messages[0].content == 'You are a judge.'

    def test_judge_config_passes_user_variables_to_template(self, context):
        """User variables are still interpolated into the system message."""
        client = self._client_with_judge_flag(
            'parametric-judge',
            [{'role': 'system', 'content': 'You are a {{tone}} judge.'}],
        )

        result = client.judge_config(
            'parametric-judge', context, variables={'tone': 'strict'}
        )

        assert result.messages is not None
        assert result.messages[0].content == 'You are a strict judge.'

    def test_judge_config_warns_on_reserved_variables(self, context):
        """``_judge_config`` warns when callers pass reserved variable names."""
        client = self._client_with_judge_flag(
            'judge-config',
            [{'role': 'system', 'content': 'You are a judge.'}],
        )

        with patch('ldai.client.log') as mock_log:
            client.judge_config(
                'judge-config',
                context,
                variables={
                    'message_history': 'should be ignored',
                    'response_to_evaluate': 'should be ignored',
                },
            )

        warning_messages = self._warning_messages(mock_log)
        assert any("'message_history' is reserved" in m for m in warning_messages)
        assert any("'response_to_evaluate' is reserved" in m for m in warning_messages)

    def test_judge_config_does_not_warn_without_reserved_variables(self, context):
        """No reserved-variable warning is logged when only non-reserved variables are passed.

        Only the two reserved-name warnings are checked; unrelated warnings,
        if any, are not asserted on.
        """
        client = self._client_with_judge_flag(
            'judge-config',
            [{'role': 'system', 'content': 'You are a judge.'}],
        )

        with patch('ldai.client.log') as mock_log:
            client.judge_config('judge-config', context, variables={'tone': 'strict'})

        warning_messages = self._warning_messages(mock_log)
        assert not any("'message_history' is reserved" in m for m in warning_messages)
        assert not any("'response_to_evaluate' is reserved" in m for m in warning_messages)

| 677 | + |
546 | 678 | class TestEvaluationSchemaBuilder: |
547 | 679 | """Tests for EvaluationSchemaBuilder.""" |
548 | 680 |
|
|
0 commit comments