Skip to content

Commit 9bedf9e

Browse files
committed
fix: don't only evaluate final input in GT results
1 parent d267832 commit 9bedf9e

3 files changed

Lines changed: 47 additions & 21 deletions

File tree

packages/optimization/src/ldai_optimizer/client.py

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1155,6 +1155,12 @@ async def _run_ground_truth_optimization(
11551155
sample_passed, optimize_context = self._apply_duration_gate(sample_passed, optimize_context)
11561156
sample_passed, optimize_context = self._apply_cost_gate(sample_passed, optimize_context)
11571157

1158+
# Flush gate scores to the API for this sample. Without this,
1159+
# the next sample's "generating" event closes out this record
1160+
# with a status-only PATCH before gate scores are sent, so only
1161+
# the last sample would ever show latency/cost gate entries.
1162+
self._safe_status_update("evaluating", optimize_context, linear_iter)
1163+
11581164
if not sample_passed:
11591165
logger.info(
11601166
"[GT Attempt %d] -> Sample %d/%d FAILED",
@@ -2110,21 +2116,23 @@ def _apply_duration_gate(
21102116
When the gate is active (any acceptance statement implies latency optimization),
21112117
evaluates whether the candidate's duration improved by at least
21122118
_DURATION_TOLERANCE vs the baseline. A synthetic ``_latency_gate`` entry is
2113-
added to scores with score=1.0 on pass or score=0.0 on fail so the outcome
2114-
is visible in the API result and UI.
2119+
always added to scores with score=1.0 on pass or score=0.0 on fail so the
2120+
outcome is visible in the API result and UI for every iteration.
2121+
2122+
The gate score is recorded even when ``passed_so_far`` is False (quality
2123+
judges already failed) so that latency telemetry is visible on all
2124+
iterations, not just passing ones. In that case it is informational only
2125+
and cannot block the iteration further.
21152126
2116-
The gate is skipped (no score entry added) when:
2117-
- No acceptance statement implies latency optimization.
2118-
- ``passed_so_far`` is already False (a prior check failed the sample).
2127+
The gate is skipped entirely (no score entry added) only when no acceptance
2128+
statement implies latency optimization.
21192129
21202130
:param passed_so_far: Whether all prior checks for this sample passed.
21212131
:param ctx: Current optimization context.
21222132
:return: (passed, updated_ctx) where passed is ``passed_so_far`` AND the gate outcome (just ``passed_so_far`` when the gate is inactive).
21232133
"""
21242134
if not _acceptance_criteria_implies_duration_optimization(self._options.judges):
21252135
return passed_so_far, ctx
2126-
if not passed_so_far:
2127-
return passed_so_far, ctx
21282136
passed = self._evaluate_duration(ctx)
21292137
if passed:
21302138
if self._baseline_duration_ms is not None and ctx.duration_ms is not None:
@@ -2149,9 +2157,13 @@ def _apply_duration_gate(
21492157
score = 0.0
21502158
ctx = dataclasses.replace(
21512159
ctx,
2152-
scores={**ctx.scores, "_latency_gate": JudgeResult(score=score, rationale=rationale)},
2160+
scores={**ctx.scores, "_latency_gate": JudgeResult(
2161+
score=score,
2162+
rationale=rationale,
2163+
duration_ms=ctx.duration_ms,
2164+
)},
21532165
)
2154-
return passed, ctx
2166+
return passed_so_far and passed, ctx
21552167

21562168
def _apply_cost_gate(
21572169
self, passed_so_far: bool, ctx: OptimizationContext
@@ -2160,21 +2172,23 @@ def _apply_cost_gate(
21602172
21612173
When the gate is active (any acceptance statement implies cost optimization),
21622174
evaluates whether the candidate's estimated cost improved by at least
2163-
_COST_TOLERANCE vs the baseline. A synthetic ``_cost_gate`` entry is
2175+
_COST_TOLERANCE vs the baseline. A synthetic ``_cost_gate`` entry is always
21642176
added to scores with score=1.0 on pass or score=0.0 on fail.
21652177
2166-
The gate is skipped (no score entry added) when:
2167-
- No acceptance statement implies cost optimization.
2168-
- ``passed_so_far`` is already False (a prior check failed the sample).
2178+
The gate score is recorded even when ``passed_so_far`` is False (quality
2179+
judges already failed) so that cost telemetry is visible on all iterations,
2180+
not just passing ones. In that case it is informational only and cannot
2181+
block the iteration further.
2182+
2183+
The gate is skipped entirely (no score entry added) only when no acceptance
2184+
statement implies cost optimization.
21692185
21702186
:param passed_so_far: Whether all prior checks for this sample passed.
21712187
:param ctx: Current optimization context.
21722188
:return: (passed, updated_ctx) where passed is ``passed_so_far`` AND the gate outcome (just ``passed_so_far`` when the gate is inactive).
21732189
"""
21742190
if not _acceptance_criteria_implies_cost_optimization(self._options.judges):
21752191
return passed_so_far, ctx
2176-
if not passed_so_far:
2177-
return passed_so_far, ctx
21782192
passed = self._evaluate_cost(ctx)
21792193
if passed:
21802194
if self._baseline_cost_usd is not None and ctx.estimated_cost_usd is not None:
@@ -2199,9 +2213,14 @@ def _apply_cost_gate(
21992213
score = 0.0
22002214
ctx = dataclasses.replace(
22012215
ctx,
2202-
scores={**ctx.scores, "_cost_gate": JudgeResult(score=score, rationale=rationale)},
2216+
scores={**ctx.scores, "_cost_gate": JudgeResult(
2217+
score=score,
2218+
rationale=rationale,
2219+
duration_ms=ctx.duration_ms,
2220+
estimated_cost_usd=ctx.estimated_cost_usd,
2221+
)},
22032222
)
2204-
return passed, ctx
2223+
return passed_so_far and passed, ctx
22052224

22062225
def _handle_success(
22072226
self, optimize_context: OptimizationContext, iteration: int

packages/optimization/src/ldai_optimizer/dataclasses.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class JudgeResult:
4343
rationale: Optional[str] = None
4444
duration_ms: Optional[float] = None
4545
usage: Optional[TokenUsage] = None
46+
estimated_cost_usd: Optional[float] = None
4647

4748
def to_json(self) -> Dict[str, Any]:
4849
"""
@@ -61,6 +62,8 @@ def to_json(self) -> Dict[str, Any]:
6162
"input": self.usage.input,
6263
"output": self.usage.output,
6364
}
65+
if self.estimated_cost_usd is not None:
66+
result["estimated_cost_usd"] = self.estimated_cost_usd
6467
return result
6568

6669

packages/optimization/tests/test_client.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5224,11 +5224,13 @@ def test_no_entry_added_when_gate_not_active(self):
52245224
assert passed is True
52255225
assert "_latency_gate" not in updated.scores
52265226

5227-
def test_no_entry_added_when_already_failed(self):
5227+
def test_gate_recorded_even_when_already_failed(self):
5228+
# Gate score is always written for telemetry; it cannot block an
5229+
# iteration that was already failing (passed_so_far=False).
52285230
ctx = self._ctx(1000)
52295231
passed, updated = self.client._apply_duration_gate(False, ctx)
52305232
assert passed is False
5231-
assert "_latency_gate" not in updated.scores
5233+
assert "_latency_gate" in updated.scores
52325234

52335235
def test_gate_pass_adds_score_1(self):
52345236
# baseline=2000ms, threshold=1600ms, candidate=1500ms → pass
@@ -5327,11 +5329,13 @@ def test_no_entry_added_when_gate_not_active(self):
53275329
assert passed is True
53285330
assert "_cost_gate" not in updated.scores
53295331

5330-
def test_no_entry_added_when_already_failed(self):
5332+
def test_gate_recorded_even_when_already_failed(self):
5333+
# Gate score is always written for telemetry; it cannot block an
5334+
# iteration that was already failing (passed_so_far=False).
53315335
ctx = self._ctx(0.005)
53325336
passed, updated = self.client._apply_cost_gate(False, ctx)
53335337
assert passed is False
5334-
assert "_cost_gate" not in updated.scores
5338+
assert "_cost_gate" in updated.scores
53355339

53365340
def test_gate_pass_adds_score_1(self):
53375341
# baseline=0.010, threshold=0.009, candidate=0.007 → pass

0 commit comments

Comments
 (0)