Skip to content

Commit 1ba680c

Browse files
authored
feat(bench): structured-outputs predictor + overfit controls (Tracer-Cloud#2794)
* full experiment package (revert + new variant + overfit controls)
* added overfit into bench framework
* fix(bench): address greptile review on structured-outputs PR
* fixed A/A variant issue
* fixed float division
* the same experiment but for N=100
1 parent 0db3bd3 commit 1ba680c

12 files changed

Lines changed: 2283 additions & 62 deletions

tests/benchmarks/_framework/config.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,19 @@ class BenchmarkConfig(BaseModel):
131131
# produces less adjacent-token bias.
132132
agent_variant: Literal["default", "trimmed_prompt"] = "default"
133133

134+
# Predictor variant for adapters with a paper-format predictor stage.
135+
# ``"default"`` (the default) uses the text-emit predictor in
136+
# ``predictor/llm_call.py`` — fed back through opensre's LLM client wrapper.
137+
# ``"structured"`` swaps in the OpenAI structured-outputs variant in
138+
# ``predictor/llm_call_structured_openai.py`` — grammar-constrained sampling at
139+
# the API level, so ``root_cause`` and ``fault_taxonomy`` are emitted
140+
# from the closed vocabulary by construction (no off-vocab fallout).
141+
#
142+
# OpenAI-only (gpt-4o-2024-08-06+, gpt-5, or o-series). Honored only by the
143+
# CloudOpsBench adapter — cross-field lint refuses ``"structured"`` on
144+
# other adapters or with non-OpenAI llms.
145+
predictor_variant: Literal["default", "structured"] = "default"
146+
134147
# ----------------------------------------------------------------------- #
135148
# Pydantic-level validation #
136149
# ----------------------------------------------------------------------- #
@@ -214,6 +227,35 @@ def lint(self) -> list[str]:
214227
"or run against the cloudopsbench adapter."
215228
)
216229

230+
# Cross-field guard: predictor_variant="structured" requires the
231+
# cloudopsbench adapter (only adapter with a predictor stage) AND
232+
# an OpenAI model, matched by model-name prefix (structured outputs is OpenAI-only).
233+
if self.predictor_variant == "structured":
234+
if self.benchmark != "cloudopsbench":
235+
errors.append(
236+
f"predictor_variant=structured is honored only by the "
237+
f"cloudopsbench adapter, but benchmark={self.benchmark!r}. "
238+
"Set predictor_variant: default or run against cloudopsbench."
239+
)
240+
# Prefixes for OpenAI models that support structured outputs.
241+
# Includes the o-series (o1, o3, o4-mini) and gpt-series. Other
242+
# providers may add structured-output support — when they do, a
243+
# peer ``llm_call_structured_<provider>.py`` module lands and
244+
# the dispatcher routes by LLM provider. Until then, this guard
245+
# refuses non-OpenAI llms with a clear error.
246+
openai_prefixes = ("gpt-", "openai", "o1", "o3", "o4")
247+
non_openai_llms = [llm for llm in self.llms if not llm.startswith(openai_prefixes)]
248+
if non_openai_llms:
249+
errors.append(
250+
f"predictor_variant=structured currently supports OpenAI "
251+
f"models only (gpt-4o-2024-08-06+, gpt-5, o-series). "
252+
f"Found non-OpenAI llms: {non_openai_llms}. Either set "
253+
"predictor_variant: default or restrict llms to OpenAI "
254+
"models. Other-provider peer variants "
255+
"(llm_call_structured_anthropic.py, "
256+
"llm_call_structured_deepseek.py) are planned follow-ups."
257+
)
258+
217259
# Output dir must not be a managed system path. Compare BOTH the lexical
218260
# form and the resolved form (on macOS /etc → /private/etc symlink would
219261
# bypass a check against only one). The narrow prefix list intentionally

tests/benchmarks/_framework/overfit.py

Lines changed: 662 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)