Skip to content

Commit dc82818

Browse files
committed
fix: remove unnecessary token path
1 parent 94de596 commit dc82818

4 files changed

Lines changed: 47 additions & 56 deletions

File tree

packages/optimization/src/ldai_optimizer/client.py

Lines changed: 3 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -791,34 +791,21 @@ async def _evaluate_acceptance_judge(
791791
else None
792792
)
793793
if current_cost is not None:
794-
has_pricing = (
795-
_find_model_config(self._current_model or "", self._model_configs) or {}
796-
).get("costPerInputToken") is not None
797-
if has_pricing:
798-
cost_str = f"${current_cost:.6f}"
799-
else:
800-
cost_str = f"{int(current_cost)} tokens"
801794
instructions += (
802795
f"\n\nThe acceptance criteria for this judge includes a cost/token-usage goal. "
803796
)
804797
if agent_usage is not None:
805798
instructions += (
806799
f"The agent's response used {agent_usage.input} input tokens "
807800
f"and {agent_usage.output} output tokens "
808-
f"(estimated cost: {cost_str}). "
801+
f"(estimated cost: ${current_cost:.6f}). "
809802
)
810803
if baseline_cost is not None:
811804
delta = current_cost - baseline_cost
812805
direction = "less" if delta < 0 else "more"
813-
if has_pricing:
814-
baseline_str = f"${baseline_cost:.6f}"
815-
delta_str = f"${abs(delta):.6f}"
816-
else:
817-
baseline_str = f"{int(baseline_cost)} tokens"
818-
delta_str = f"{int(abs(delta))} tokens"
819806
instructions += (
820-
f"The baseline cost (first iteration) was {baseline_str}. "
821-
f"This response cost {delta_str} {direction} than the baseline. "
807+
f"The baseline cost (first iteration) was ${baseline_cost:.6f}. "
808+
f"This response cost ${abs(delta):.6f} {direction} than the baseline. "
822809
)
823810
instructions += (
824811
"In your rationale, state the token usage and cost, and any change from baseline. "

packages/optimization/src/ldai_optimizer/prompts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
_COST_KEYWORDS = re.compile(
2020
r"\b(cheap|cheaper|cheapest|costs?|costly|expensive|budget|affordable|"
21-
r"tokens?|spend|spending|economical|cost-effective|frugal|"
21+
r"spend|spending|economical|cost-effective|frugal|"
2222
r"price|pricing|bill|billing)\b",
2323
re.IGNORECASE,
2424
)

packages/optimization/src/ldai_optimizer/util.py

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -322,37 +322,34 @@ def estimate_cost(
322322
usage: Optional["TokenUsage"],
323323
model_config: Optional[Dict[str, Any]],
324324
) -> Optional[float]:
325-
"""Estimate the monetary cost of a single agent call.
325+
"""Estimate the monetary cost of a single agent call in USD.
326326
327-
Uses ``costPerInputToken`` and ``costPerOutputToken`` from the model config
328-
when available. If the model config has no pricing fields, falls back to
329-
returning the raw total token count as a dimensionless proxy so the cost
330-
gate can still operate comparatively. Returns ``None`` only when ``usage``
331-
itself is ``None``.
327+
Uses ``costPerInputToken`` and ``costPerOutputToken`` from the model config.
328+
Returns ``None`` when either ``usage`` is ``None`` or no pricing fields are
329+
present on the model config — ensuring the return value is always in USD or
330+
absent, never a raw token count. This prevents unit-mismatch bugs when
331+
comparing costs across iterations where the model (and its pricing
332+
availability) may differ.
332333
333334
``costPerCachedInputToken`` is intentionally ignored — the estimate uses
334-
input/output tokens only, which is sufficient for relative comparison
335-
across optimization iterations.
335+
input/output tokens only.
336336
337337
:param usage: Token usage from the agent call. When ``None``, returns ``None``.
338338
:param model_config: Model config dict from ``get_model_configs()``, or ``None``.
339-
:return: Estimated cost in USD, or raw total token count as proxy, or ``None``.
339+
:return: Estimated cost in USD, or ``None`` if usage or pricing data is absent.
340340
"""
341341
if usage is None:
342342
return None
343343

344344
input_price = model_config.get("costPerInputToken") if model_config else None
345345
output_price = model_config.get("costPerOutputToken") if model_config else None
346346

347-
if input_price is not None or output_price is not None:
348-
cost = 0.0
349-
if input_price is not None and usage.input is not None:
350-
cost += usage.input * input_price
351-
if output_price is not None and usage.output is not None:
352-
cost += usage.output * output_price
353-
return cost
347+
if input_price is None and output_price is None:
348+
return None
354349

355-
logger.debug(
356-
"No pricing data on model config for cost estimation; falling back to total token count"
357-
)
358-
return float(usage.total or 0)
350+
cost = 0.0
351+
if input_price is not None and usage.input is not None:
352+
cost += usage.input * input_price
353+
if output_price is not None and usage.output is not None:
354+
cost += usage.output * output_price
355+
return cost

packages/optimization/tests/test_client.py

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -535,7 +535,7 @@ async def test_duration_context_added_to_instructions_when_latency_keyword_prese
535535
)
536536
_, config, _, _ = self.handle_judge_call.call_args.args
537537
assert "1500ms" in config.instructions
538-
assert "mention the duration" in config.instructions
538+
assert "state the duration" in config.instructions
539539

540540
async def test_duration_context_includes_baseline_comparison_when_history_present(self):
541541
"""When history[0] has a duration, the judge instructions include a baseline comparison."""
@@ -1842,11 +1842,11 @@ async def test_apply_variation_response_calls_restore_and_logs_warning(self):
18421842

18431843
with patch("ldai_optimizer.client.logger") as mock_logger:
18441844
await client._generate_new_variation(iteration=1, variables={})
1845-
warning_calls = [
1846-
call for call in mock_logger.warning.call_args_list
1845+
debug_calls = [
1846+
call for call in mock_logger.debug.call_args_list
18471847
if "user-123" in str(call) or "business" in str(call)
18481848
]
1849-
assert len(warning_calls) >= 1
1849+
assert len(debug_calls) >= 1
18501850

18511851
assert "{{user_id}}" in client._current_instructions
18521852
assert "user-123" not in client._current_instructions
@@ -4727,15 +4727,13 @@ def test_uses_only_output_price_when_input_absent(self):
47274727
cost = estimate_cost(usage, model_config)
47284728
assert cost == pytest.approx(40 * 0.002)
47294729

4730-
def test_falls_back_to_total_token_count_when_no_pricing(self):
4730+
def test_returns_none_when_no_pricing_in_config(self):
47314731
usage = self._usage(total=100)
4732-
cost = estimate_cost(usage, {})
4733-
assert cost == 100.0
4732+
assert estimate_cost(usage, {}) is None
47344733

4735-
def test_falls_back_to_total_token_count_when_model_config_none(self):
4734+
def test_returns_none_when_model_config_none(self):
47364735
usage = self._usage(total=250)
4737-
cost = estimate_cost(usage, None)
4738-
assert cost == 250.0
4736+
assert estimate_cost(usage, None) is None
47394737

47404738
def test_ignores_cached_input_token_price(self):
47414739
usage = self._usage(total=100, inp=60, out=40)
@@ -4783,8 +4781,8 @@ def test_detects_costs_plural(self):
47834781
def test_detects_budget(self):
47844782
assert _acceptance_criteria_implies_cost_optimization(self._judge("Stay within budget."))
47854783

4786-
def test_detects_tokens(self):
4787-
assert _acceptance_criteria_implies_cost_optimization(self._judge("Use fewer tokens."))
4784+
def test_does_not_detect_token_to_avoid_false_positives(self):
4785+
assert not _acceptance_criteria_implies_cost_optimization(self._judge("Generate a valid authentication token."))
47884786

47894787
def test_detects_billing(self):
47904788
assert _acceptance_criteria_implies_cost_optimization(self._judge("Minimize billing."))
@@ -4803,7 +4801,7 @@ def test_no_match_on_unrelated_statement(self):
48034801
def test_multiple_judges_one_matches(self):
48044802
judges = {
48054803
"j1": OptimizationJudge(threshold=0.9, acceptance_statement="Be accurate."),
4806-
"j2": OptimizationJudge(threshold=0.9, acceptance_statement="Use fewer tokens."),
4804+
"j2": OptimizationJudge(threshold=0.9, acceptance_statement="Keep costs low."),
48074805
}
48084806
assert _acceptance_criteria_implies_cost_optimization(judges)
48094807

@@ -4861,11 +4859,11 @@ def test_skips_gracefully_when_candidate_cost_none(self):
48614859
ctx = self._ctx(None) # type: ignore[arg-type]
48624860
assert self.client._evaluate_cost(ctx) is True
48634861

4864-
def test_works_with_token_count_proxy(self):
4865-
# When no pricing data, cost is raw token count — gate still compares numerically
4866-
self._seed_history(1000.0)
4867-
assert self.client._evaluate_cost(self._ctx(750.0)) is True
4868-
assert self.client._evaluate_cost(self._ctx(900.0)) is False
4862+
def test_skips_gracefully_when_units_differ_across_model_switch(self):
4863+
# If baseline was captured with pricing (USD) but candidate has no pricing,
4864+
# candidate cost is None and the gate skips rather than comparing incompatible units.
4865+
self._seed_history(0.010)
4866+
assert self.client._evaluate_cost(self._ctx(None)) is True
48694867

48704868

48714869
# ---------------------------------------------------------------------------
@@ -4986,10 +4984,18 @@ def setup_method(self):
49864984
def _cost_judge(self) -> OptimizationJudge:
49874985
return OptimizationJudge(
49884986
threshold=0.9,
4989-
acceptance_statement="Use fewer tokens and keep costs low.",
4987+
acceptance_statement="Keep costs low and stay within budget.",
49904988
)
49914989

4990+
def _set_pricing(self):
4991+
"""Give the client a model config with pricing so estimate_cost returns USD."""
4992+
self.client._current_model = "gpt-4o"
4993+
self.client._model_configs = [
4994+
{"id": "gpt-4o", "costPerInputToken": 0.000005, "costPerOutputToken": 0.000015}
4995+
]
4996+
49924997
async def test_cost_context_injected_into_instructions(self):
4998+
self._set_pricing()
49934999
usage = TokenUsage(total=100, input=60, output=40)
49945000
captured: list = []
49955001

@@ -5040,6 +5046,7 @@ async def _capture_judge_call(judge_key, judge_config, ctx, is_judge):
50405046
assert "cost/token-usage goal" not in instructions
50415047

50425048
async def test_baseline_cost_shown_when_history_present(self):
5049+
self._set_pricing()
50435050
usage = TokenUsage(total=100, input=60, output=40)
50445051
captured: list = []
50455052

0 commit comments

Comments
 (0)