@@ -5094,6 +5094,26 @@ def test_zero_usage_with_pricing_returns_zero(self):
50945094 model_config = {"costPerInputToken" : 0.001 , "costPerOutputToken" : 0.002 }
50955095 assert estimate_cost (usage , model_config ) == pytest .approx (0.0 )
50965096
5097+ def test_returns_none_when_both_token_counts_are_none (self ):
5098+ # Pricing exists but both input and output are None — there are no token counts to
5099+ # compute from, so we must return None rather than 0.0; otherwise the
5100+ # cost-gate would treat an unknown cost as zero cost.
5101+ usage = TokenUsage (total = None , input = None , output = None )
5102+ model_config = {"costPerInputToken" : 0.001 , "costPerOutputToken" : 0.002 }
5103+ assert estimate_cost (usage , model_config ) is None
5104+
5105+ def test_returns_partial_cost_when_only_input_count_is_none (self ):
5106+ # Only output count available — should still compute a partial cost.
5107+ usage = TokenUsage (total = 40 , input = None , output = 40 )
5108+ model_config = {"costPerInputToken" : 0.001 , "costPerOutputToken" : 0.002 }
5109+ assert estimate_cost (usage , model_config ) == pytest .approx (40 * 0.002 )
5110+
5111+ def test_returns_partial_cost_when_only_output_count_is_none (self ):
5112+ # Only input count available — should still compute a partial cost.
5113+ usage = TokenUsage (total = 60 , input = 60 , output = None )
5114+ model_config = {"costPerInputToken" : 0.001 , "costPerOutputToken" : 0.002 }
5115+ assert estimate_cost (usage , model_config ) == pytest .approx (60 * 0.001 )
5116+
50975117
50985118# ---------------------------------------------------------------------------
50995119# _acceptance_criteria_implies_cost_optimization
@@ -5629,10 +5649,11 @@ def test_gate_scores_do_not_affect_result(self):
56295649 assert self .client ._all_judges_passing () is True
56305650
56315651 def test_uses_most_recent_history_entry (self ):
5632- """Only the last history entry is inspected."""
5652+ """In non-GT mode (_last_batch_size=1) only the last history entry is inspected."""
56335653 self .client ._options = _make_options (judges = {
56345654 "accuracy" : OptimizationJudge (threshold = 0.8 , acceptance_statement = "accurate" ),
56355655 })
5656+ self .client ._last_batch_size = 1
56365657 self .client ._history = [
56375658 self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 0.5 , rationale = "early fail" )}, iteration = 1 ),
56385659 self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 1.0 , rationale = "later pass" )}, iteration = 2 ),
@@ -5653,6 +5674,60 @@ def test_inverted_judge_fails_when_score_above_threshold(self):
56535674 self .client ._history = [self ._ctx_with_scores ({"toxicity" : JudgeResult (score = 0.5 , rationale = "toxic" )})]
56545675 assert self .client ._all_judges_passing () is False
56555676
5677+ # --- GT batch tests ---
5678+
5679+ def test_gt_batch_last_sample_passes_but_earlier_fails_returns_false (self ):
5680+ """Core GT bug: if any sample in the batch failed, must return False even if the last passed."""
5681+ self .client ._options = _make_options (judges = {
5682+ "accuracy" : OptimizationJudge (threshold = 0.8 , acceptance_statement = "accurate" ),
5683+ })
5684+ self .client ._last_batch_size = 3
5685+ self .client ._history = [
5686+ self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 0.3 , rationale = "fail" )}, iteration = 1 ), # FAILS
5687+ self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 0.9 , rationale = "ok" )}, iteration = 2 ),
5688+ self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 0.95 , rationale = "ok" )}, iteration = 3 ),
5689+ ]
5690+ assert self .client ._all_judges_passing () is False
5691+
5692+ def test_gt_batch_all_samples_pass_returns_true (self ):
5693+ self .client ._options = _make_options (judges = {
5694+ "accuracy" : OptimizationJudge (threshold = 0.8 , acceptance_statement = "accurate" ),
5695+ })
5696+ self .client ._last_batch_size = 3
5697+ self .client ._history = [
5698+ self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 0.85 , rationale = "ok" )}, iteration = 1 ),
5699+ self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 0.90 , rationale = "ok" )}, iteration = 2 ),
5700+ self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 0.95 , rationale = "ok" )}, iteration = 3 ),
5701+ ]
5702+ assert self .client ._all_judges_passing () is True
5703+
5704+ def test_gt_batch_middle_sample_fails_returns_false (self ):
5705+ self .client ._options = _make_options (judges = {
5706+ "accuracy" : OptimizationJudge (threshold = 0.8 , acceptance_statement = "accurate" ),
5707+ })
5708+ self .client ._last_batch_size = 3
5709+ self .client ._history = [
5710+ self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 0.95 , rationale = "ok" )}, iteration = 1 ),
5711+ self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 0.20 , rationale = "fail" )}, iteration = 2 ), # FAILS
5712+ self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 0.95 , rationale = "ok" )}, iteration = 3 ),
5713+ ]
5714+ assert self .client ._all_judges_passing () is False
5715+
5716+ def test_gt_batch_size_respected_ignores_older_batches (self ):
5717+ """Entries outside the current batch window should not influence the result."""
5718+ self .client ._options = _make_options (judges = {
5719+ "accuracy" : OptimizationJudge (threshold = 0.8 , acceptance_statement = "accurate" ),
5720+ })
5721+ self .client ._last_batch_size = 2
5722+ # 4 entries; the batch covers the last 2; the first 2 are stale (from a previous attempt)
5723+ self .client ._history = [
5724+ self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 0.1 , rationale = "old fail" )}, iteration = 1 ),
5725+ self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 0.1 , rationale = "old fail" )}, iteration = 2 ),
5726+ self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 0.9 , rationale = "ok" )}, iteration = 3 ),
5727+ self ._ctx_with_scores ({"accuracy" : JudgeResult (score = 0.9 , rationale = "ok" )}, iteration = 4 ),
5728+ ]
5729+ assert self .client ._all_judges_passing () is True
5730+
56565731
56575732class TestBuildNewVariationPromptCost :
56585733 def _make_history (self ) -> list :
0 commit comments