@@ -1466,8 +1466,14 @@ async def _run_ground_truth_optimization(
14661466 last_ctx .iteration ,
14671467 )
14681468 if self ._last_succeeded_context is None :
1469+ # No Phase 2 candidate won; restore the Phase 1 winner.
14691470 self ._last_run_succeeded = True
14701471 self ._last_succeeded_context = phase1_winner
1472+ elif self ._last_succeeded_context is not phase1_winner :
1473+ # Phase 2 selected a better model; return that context so
1474+ # callers (including auto_commit) see the actual final winner
1475+ # rather than the stale Phase 1 GT batch results.
1476+ return [self ._last_succeeded_context ]
14711477 return attempt_results
14721478
14731479 # We've hit max attempts for the batches, bail at this point
@@ -2690,10 +2696,13 @@ async def _run_cost_latency_phase(
26902696 if i < max_iters - 1 :
26912697 self ._safe_status_update ("turn completed" , ctx , iteration )
26922698
2693- # Report results: fail non-winners first (preserving _last_succeeded_context
2694- # as the Phase 1 winner until the very end), then succeed the best candidate.
2699+ # Send terminal FAILED status for each non-winning model attempt.
2700+ # We use _safe_status_update directly rather than _handle_failure so that
2701+ # exploratory Phase 2 misses don't corrupt _last_run_succeeded,
2702+ # _last_succeeded_context, or trigger on_failing_result — those are
2703+ # run-level signals that should only fire if the whole optimization fails.
26952704 for failed_ctx in non_candidates :
2696- self ._handle_failure ( failed_ctx , failed_ctx .iteration )
2705+ self ._safe_status_update ( "failure" , failed_ctx , failed_ctx .iteration )
26972706
26982707 if candidates :
26992708 best = self ._pick_best_candidate (candidates )
0 commit comments