@@ -535,7 +535,7 @@ async def test_duration_context_added_to_instructions_when_latency_keyword_prese
535535 )
536536 _ , config , _ , _ = self .handle_judge_call .call_args .args
537537 assert "1500ms" in config .instructions
538- assert "mention the duration" in config .instructions
538+ assert "state the duration" in config .instructions
539539
540540 async def test_duration_context_includes_baseline_comparison_when_history_present (self ):
541541 """When history[0] has a duration, the judge instructions include a baseline comparison."""
@@ -1842,11 +1842,11 @@ async def test_apply_variation_response_calls_restore_and_logs_warning(self):
18421842
18431843 with patch ("ldai_optimizer.client.logger" ) as mock_logger :
18441844 await client ._generate_new_variation (iteration = 1 , variables = {})
1845- warning_calls = [
1846- call for call in mock_logger .warning .call_args_list
1845+ debug_calls = [
1846+ call for call in mock_logger .debug .call_args_list
18471847 if "user-123" in str (call ) or "business" in str (call )
18481848 ]
1849- assert len (warning_calls ) >= 1
1849+ assert len (debug_calls ) >= 1
18501850
18511851 assert "{{user_id}}" in client ._current_instructions
18521852 assert "user-123" not in client ._current_instructions
@@ -4727,15 +4727,13 @@ def test_uses_only_output_price_when_input_absent(self):
47274727 cost = estimate_cost (usage , model_config )
47284728 assert cost == pytest .approx (40 * 0.002 )
47294729
4730- def test_falls_back_to_total_token_count_when_no_pricing (self ):
4730+ def test_returns_none_when_no_pricing_in_config (self ):
47314731 usage = self ._usage (total = 100 )
4732- cost = estimate_cost (usage , {})
4733- assert cost == 100.0
4732+ assert estimate_cost (usage , {}) is None
47344733
4735- def test_falls_back_to_total_token_count_when_model_config_none (self ):
4734+ def test_returns_none_when_model_config_none (self ):
47364735 usage = self ._usage (total = 250 )
4737- cost = estimate_cost (usage , None )
4738- assert cost == 250.0
4736+ assert estimate_cost (usage , None ) is None
47394737
47404738 def test_ignores_cached_input_token_price (self ):
47414739 usage = self ._usage (total = 100 , inp = 60 , out = 40 )
@@ -4783,8 +4781,8 @@ def test_detects_costs_plural(self):
47834781 def test_detects_budget (self ):
47844782 assert _acceptance_criteria_implies_cost_optimization (self ._judge ("Stay within budget." ))
47854783
4786- def test_detects_tokens (self ):
4787- assert _acceptance_criteria_implies_cost_optimization (self ._judge ("Use fewer tokens ." ))
4784+ def test_does_not_detect_token_to_avoid_false_positives (self ):
4785+ assert not _acceptance_criteria_implies_cost_optimization (self ._judge ("Generate a valid authentication token ." ))
47884786
47894787 def test_detects_billing (self ):
47904788 assert _acceptance_criteria_implies_cost_optimization (self ._judge ("Minimize billing." ))
@@ -4803,7 +4801,7 @@ def test_no_match_on_unrelated_statement(self):
48034801 def test_multiple_judges_one_matches (self ):
48044802 judges = {
48054803 "j1" : OptimizationJudge (threshold = 0.9 , acceptance_statement = "Be accurate." ),
4806- "j2" : OptimizationJudge (threshold = 0.9 , acceptance_statement = "Use fewer tokens ." ),
4804+ "j2" : OptimizationJudge (threshold = 0.9 , acceptance_statement = "Keep costs low ." ),
48074805 }
48084806 assert _acceptance_criteria_implies_cost_optimization (judges )
48094807
@@ -4861,11 +4859,11 @@ def test_skips_gracefully_when_candidate_cost_none(self):
48614859 ctx = self ._ctx (None ) # type: ignore[arg-type]
48624860 assert self .client ._evaluate_cost (ctx ) is True
48634861
4864- def test_works_with_token_count_proxy (self ):
4865- # When no pricing data, cost is raw token count — gate still compares numerically
4866- self . _seed_history ( 1000.0 )
4867- assert self .client . _evaluate_cost ( self . _ctx ( 750.0 )) is True
4868- assert self .client ._evaluate_cost (self ._ctx (900.0 )) is False
4862+ def test_skips_gracefully_when_units_differ_across_model_switch (self ):
4863+ # If baseline was captured with pricing (USD) but candidate has no pricing,
4864+ # candidate cost is None and the gate skips rather than comparing incompatible units.
4865+ self ._seed_history ( 0.010 )
4866+ assert self .client ._evaluate_cost (self ._ctx (None )) is True
48694867
48704868
48714869# ---------------------------------------------------------------------------
@@ -4986,10 +4984,18 @@ def setup_method(self):
49864984 def _cost_judge (self ) -> OptimizationJudge :
49874985 return OptimizationJudge (
49884986 threshold = 0.9 ,
4989- acceptance_statement = "Use fewer tokens and keep costs low ." ,
4987+ acceptance_statement = "Keep costs low and stay within budget ." ,
49904988 )
49914989
4990+ def _set_pricing (self ):
4991+ """Give the client a model config with pricing so estimate_cost returns USD."""
4992+ self .client ._current_model = "gpt-4o"
4993+ self .client ._model_configs = [
4994+ {"id" : "gpt-4o" , "costPerInputToken" : 0.000005 , "costPerOutputToken" : 0.000015 }
4995+ ]
4996+
49924997 async def test_cost_context_injected_into_instructions (self ):
4998+ self ._set_pricing ()
49934999 usage = TokenUsage (total = 100 , input = 60 , output = 40 )
49945000 captured : list = []
49955001
@@ -5040,6 +5046,7 @@ async def _capture_judge_call(judge_key, judge_config, ctx, is_judge):
50405046 assert "cost/token-usage goal" not in instructions
50415047
50425048 async def test_baseline_cost_shown_when_history_present (self ):
5049+ self ._set_pricing ()
50435050 usage = TokenUsage (total = 100 , input = 60 , output = 40 )
50445051 captured : list = []
50455052
0 commit comments