Skip to content

Commit 82c1c26

Browse files
authored
fix(benchmarks): classify Gemini spend cap as terminal
Merge PR #92 after CI success. Treats Gemini monthly spend-cap exhaustion as terminal_billing, prevents wasteful retries, and aborts bundle runs cleanly with manifest/summary metadata. No release/site/npm/PyPI/Zenodo paths altered.
1 parent 304c8ed commit 82c1c26

6 files changed

Lines changed: 551 additions & 2 deletions

File tree

benchmarks/v4.1/providers/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@
1111
ProviderConfig,
1212
ProviderError,
1313
ProviderResponse,
14+
TerminalProviderError,
1415
TransientProviderError,
1516
get_provider,
17+
is_terminal_billing_error,
1618
is_transient_error,
1719
register_provider,
1820
)
@@ -23,9 +25,11 @@
2325
"ProviderConfig",
2426
"ProviderError",
2527
"ProviderResponse",
28+
"TerminalProviderError",
2629
"TransientProviderError",
2730
"MockProvider",
2831
"get_provider",
32+
"is_terminal_billing_error",
2933
"is_transient_error",
3034
"register_provider",
3135
]

benchmarks/v4.1/providers/base.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,44 @@ class TransientProviderError(ProviderError):
3131
"""
3232

3333

34+
class TerminalProviderError(ProviderError):
    """Provider failure that is terminal for the whole run, not one call.

    A plain permanent error only means the offending call must not be
    retried. A terminal error goes further: the executor should also
    stop processing further calls in the current run, because every
    subsequent call to the same provider is guaranteed to fail for the
    same reason.

    The canonical example is a Gemini ``429 RESOURCE_EXHAUSTED`` whose
    message says ``Your project has exceeded its monthly spending cap``.
    Until the cap is raised or reset, every other request from the same
    project hits the identical billing hard cap, so retrying 6x per item
    and continuing through 1799 more items just burns wall-clock on a
    guaranteed-failing run (the symptom observed in workflow run
    26642239431: 230 errors, 0 raw outputs, 120 min wasted).
    """
50+
51+
52+
# Message fragments that mark a *terminal* billing / hard-cap condition.
# Matching is case-insensitive and substring-based, and these fragments
# are checked BEFORE the transient token list so that a message such as
# ``429 RESOURCE_EXHAUSTED ... monthly spending cap ...`` is not
# misclassified as a transient rate limit.
_TERMINAL_BILLING_TOKENS = (
    # Spend-cap wording, most specific first.
    "monthly spending cap",
    "monthly spend cap",
    "spending cap",
    "spend cap",
    "spending limit reached",
    # Generic billing hard-cap wording.
    "billing hard cap",
    "billing cap exceeded",
    "exceeded its monthly",
    "exceeded the monthly",
    # Pointer to the AI Studio spend page carried in the error text.
    "ai.studio/spend",
    "ai studio at https://ai.studio/spend",
)
70+
71+
3472
# HTTP status codes and provider error tokens that should be treated as
3573
# transient by the classifier below.
3674
_TRANSIENT_HTTP_CODES = (408, 425, 429, 500, 502, 503, 504)
@@ -62,6 +100,25 @@ class TransientProviderError(ProviderError):
62100
)
63101

64102

103+
def is_terminal_billing_error(exc: BaseException) -> bool:
    """Report whether ``exc`` looks like a terminal billing hard cap.

    An exception qualifies when it is already a
    :class:`TerminalProviderError`, or when its lower-cased string form
    contains any entry of :data:`_TERMINAL_BILLING_TOKENS` (e.g. Gemini's
    ``monthly spending cap``). These conditions are *not* transient: the
    provider returns the same response for every subsequent call from
    the affected project until a human raises or resets the cap, so
    retrying is wasted budget and continuing the run is wasted
    wall-clock.
    """
    if not isinstance(exc, TerminalProviderError):
        # Untyped error: fall back to scanning the human-readable message.
        message = str(exc).lower()
        return any(token in message for token in _TERMINAL_BILLING_TOKENS)
    return True
120+
121+
65122
def is_transient_error(exc: BaseException) -> bool:
66123
"""Classify an exception as transient (retryable) or not.
67124
@@ -72,7 +129,15 @@ def is_transient_error(exc: BaseException) -> bool:
72129
This is conservative on purpose: anything that does not match a
73130
known transient signal is treated as non-transient so we do not
74131
retry auth/permission/quota-permanent errors.
132+
133+
Terminal billing hard-caps win over the transient signal: a message
134+
such as ``429 RESOURCE_EXHAUSTED ... monthly spending cap ...`` is
135+
classified non-transient even though it carries both a 429 code and
136+
``RESOURCE_EXHAUSTED``, because the underlying cause is a permanent
137+
cap, not a per-second rate limit.
75138
"""
139+
if is_terminal_billing_error(exc):
140+
return False
76141
for attr in ("status_code", "http_status", "status"):
77142
v = getattr(exc, attr, None)
78143
if isinstance(v, int) and v in _TRANSIENT_HTTP_CODES:

benchmarks/v4.1/providers/gemini_adapter.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@
1919
ProviderConfig,
2020
ProviderError,
2121
ProviderResponse,
22+
TerminalProviderError,
2223
TransientProviderError,
2324
env_api_key,
25+
is_terminal_billing_error,
2426
is_transient_error,
2527
)
2628

@@ -79,6 +81,16 @@ def generate(self, system: str, user: str, config: ProviderConfig) -> ProviderRe
7981
},
8082
)
8183
except Exception as exc:
84+
# Terminal billing hard cap (e.g. "monthly spending cap")
85+
# comes through Gemini as a 429 RESOURCE_EXHAUSTED, which
86+
# would otherwise be misclassified as transient and retried.
87+
# Detect it first and surface as TerminalProviderError so the
88+
# executor stops the run cleanly instead of burning the whole
89+
# 120-min job retrying every item.
90+
if is_terminal_billing_error(exc):
91+
raise TerminalProviderError(
92+
f"gemini terminal billing cap: {exc!s}"
93+
) from exc
8294
# Surface timeouts as transient so the retry loop can recover.
8395
if _is_timeout_error(exc) or is_transient_error(exc):
8496
raise TransientProviderError(

benchmarks/v4.1/runner/executor.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@
3737
ProviderConfig,
3838
ProviderError,
3939
ProviderResponse,
40+
TerminalProviderError,
4041
TransientProviderError,
42+
is_terminal_billing_error,
4143
is_transient_error,
4244
)
4345

@@ -116,7 +118,20 @@ def _build_call(
116118

117119

118120
def _classify(exc: BaseException) -> str:
119-
"""Stable short string used in retry logs."""
121+
"""Stable short string used in retry logs.
122+
123+
Classes:
124+
- ``transient``: retryable (429 rate limit cooldown, 5xx, timeout)
125+
- ``terminal_billing``: provider-side billing/spend hard cap. NOT
126+
retryable; the entire run should stop because every subsequent
127+
call will hit the identical cap.
128+
- ``permanent``: auth/config/schema errors. NOT retryable for this
129+
call but the rest of the run is unaffected.
130+
- ``unhandled``: anything we did not recognise — treated as permanent
131+
for safety so we never silently retry an unknown failure mode.
132+
"""
133+
if is_terminal_billing_error(exc):
134+
return "terminal_billing"
120135
if isinstance(exc, TransientProviderError):
121136
return "transient"
122137
if isinstance(exc, ProviderError):

benchmarks/v4.1/runner/executor_b_bundles.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
_utcnow_iso,
2727
ExecutionPlan,
2828
)
29+
from ..providers.base import is_terminal_billing_error
2930
from ..prompts.test_b_bundles import (
3031
TEST_B_BUNDLE_CONDITIONS,
3132
build_test_b_bundle_messages,
@@ -118,14 +119,32 @@ def _log(msg: str) -> None:
118119
sys.stderr.write(msg + "\n")
119120
sys.stderr.flush()
120121

121-
counts = {"ok": 0, "error": 0, "skipped_resumed": len(already_done)}
122+
counts = {
123+
"ok": 0,
124+
"error": 0,
125+
"skipped_resumed": len(already_done),
126+
"skipped_terminal_abort": 0,
127+
}
122128
latencies: list[int] = []
123129
by_condition: dict[str, int] = {}
124130
by_phase: dict[str, int] = {}
125131
by_bundle: dict[str, int] = {}
126132
tokens = {"input": 0, "output": 0}
127133

134+
# Set once a terminal billing / spend-cap condition is observed. Every
135+
# subsequent call to the same provider is guaranteed to fail with the
136+
# same error, so we stop dispatching new work and record the rest as
137+
# ``skipped_terminal_abort`` instead of burning the full 120-min job.
138+
abort_reason: dict[str, Any] = {}
139+
abort_lock = threading.Lock()
140+
128141
def _do_call(call: dict[str, Any]) -> None:
142+
if abort_reason:
143+
# A previous call already tripped the terminal-billing trap.
144+
# Skip without contacting the provider; the batch loop will
145+
# break out cleanly after the current batch drains.
146+
counts["skipped_terminal_abort"] += 1
147+
return
129148
t_start = _utcnow_iso()
130149
with progress_lock:
131150
progress["started"] += 1
@@ -187,6 +206,22 @@ def _do_call(call: dict[str, Any]) -> None:
187206
f"attempts={retried_attempts} "
188207
f"wall_ms={dt_ms}"
189208
)
209+
if final_class == "terminal_billing":
210+
with abort_lock:
211+
if not abort_reason:
212+
last_err = attempts[-1] if attempts else {}
213+
abort_reason.update({
214+
"first_run_id": call["run_id"],
215+
"error_type": last_err.get("error_type"),
216+
"error": last_err.get("error"),
217+
})
218+
_log(
219+
"[bb] TERMINAL-BILLING tripped: provider has "
220+
"hit a hard billing/spend cap. Aborting the "
221+
"run; remaining calls will be recorded as "
222+
f"skipped_terminal_abort. run_id={call['run_id']} "
223+
f"err={last_err.get('error')}"
224+
)
190225
return
191226
raw_writer.write({
192227
"run_id": call["run_id"],
@@ -286,6 +321,17 @@ def _record_wall_timeout(c: dict[str, Any]) -> None:
286321

287322
try:
288323
for batch_start in range(0, len(pending), plan.batch_size):
324+
if abort_reason:
325+
# A prior batch tripped the terminal-billing trap. Stop
326+
# dispatching new work entirely; the remaining ``pending``
327+
# are intentionally not contacted.
328+
_log(
329+
f"[bb] abort: skipping remaining "
330+
f"{len(pending) - batch_start} call(s) after "
331+
f"terminal_billing first observed in "
332+
f"run_id={abort_reason.get('first_run_id')}"
333+
)
334+
break
289335
batch = pending[batch_start:batch_start + plan.batch_size]
290336
if plan.concurrency <= 1:
291337
for call in batch:
@@ -357,6 +403,8 @@ def _worker() -> None:
357403
"test": "test_b_bundles",
358404
"counts": counts,
359405
"total_attempted": len(pending),
406+
"aborted": bool(abort_reason),
407+
"abort_reason": dict(abort_reason) if abort_reason else None,
360408
"by_condition": by_condition,
361409
"by_phase": by_phase,
362410
"by_bundle": by_bundle,
@@ -402,6 +450,8 @@ def _worker() -> None:
402450
"fixtures": fixtures_manifest,
403451
"repo_commit": repo_commit,
404452
"counts": counts,
453+
"aborted": bool(abort_reason),
454+
"abort_reason": dict(abort_reason) if abort_reason else None,
405455
"n_run_specs_total": len(pending) + counts["skipped_resumed"],
406456
"n_already_done_on_start": counts["skipped_resumed"],
407457
"conditions": list(conditions),

0 commit comments

Comments
 (0)