Skip to content

Commit 63e1cd7

Browse files
authored
[OAI] Allow forcing Responses API for non-gpt-5 model names (#190)
## Summary **[OAI] Allow forcing Responses API for non-gpt-5 model names** * per-call `use_responses_api` (py) / `useResponsesApi` (js) flag forces the Responses API. routing becomes `isGPT5Model(model) || useResponsesApi`; flag is stripped before the request. * motivation: internal proxies may rewrite the model name for routing (e.g. a service-tier prefix), so a model that *requires* the Responses API can arrive under a name that doesn't start with `gpt-5`. the name check then sends it to Chat Completions and it fails, with no way to override. this flag lets such a model work regardless of its name. * per-call, not global: the model is chosen per call, so a global switch can't say "this model yes, that model no". keeps it next to `model`, like `temperature`/`maxTokens`. * also fixes a Responses-API bug found while testing: `reasoning_effort` was sent top-level (the API wants `reasoning.effort`), so any reasoning call routed to Responses 400'd. PTAL: FYI: ## Test plan * [x] unit tests (js + py, incl. built-in named scorers and reasoning.effort) * [x] manual smoke test — scratch scripts below, each runs a scorer 3 ways and prints the endpoint hit: ```bash OPENAI_API_KEY=sk-... [OPENAI_BASE_URL=https://us.api.openai.com/v1] python test.py OPENAI_API_KEY=sk-... [OPENAI_BASE_URL=https://us.api.openai.com/v1] node test.mjs # after `pnpm run build` ``` <details><summary><code>test.py</code></summary> ```python """Scratch check: gpt-4.1 supports both Chat Completions and Responses APIs. Run with OPENAI_API_KEY set. The request hook prints which endpoint each call hits. If your org is region-pinned, also set OPENAI_BASE_URL (e.g. https://us.api.openai.com/v1): OPENAI_API_KEY=sk-... 
OPENAI_BASE_URL=https://us.api.openai.com/v1 python test.py """ import os import httpx from openai import OpenAI from autoevals import Factuality, LLMClassifier, init init( OpenAI( base_url=os.environ.get("OPENAI_BASE_URL"), # None → SDK default (api.openai.com) http_client=httpx.Client(event_hooks={"request": [lambda r: print(" request →", r.url.path)]}), ) ) data = dict(output="6", expected="6", input="Add the numbers 1, 2, 3") print("gpt-4.1 (default → expect /chat/completions):") print(" score =", Factuality(model="gpt-4.1").eval(**data).score) print("gpt-4.1 + use_responses_api=True (→ expect /responses):") print(" score =", Factuality(model="gpt-4.1", use_responses_api=True).eval(**data).score) # Built-in named scorers don't forward reasoning_effort yet, so use LLMClassifier here. print("gpt-5.4 + medium reasoning (gpt-5 family → expect /responses):") clf = LLMClassifier( name="match", prompt_template="Is the submission {{output}} equal to {{expected}}? Answer Y or N.", choice_scores={"Y": 1, "N": 0}, model="gpt-5.4", reasoning_effort="medium", ) print(" score =", clf.eval(**data).score) ``` </details> <details><summary><code>test.mjs</code></summary> ```js // Scratch check: gpt-4.1 supports both Chat Completions and Responses APIs. // Run with OPENAI_API_KEY set. The fetch wrapper prints which endpoint each call hits. // If your org is region-pinned, also set OPENAI_BASE_URL (e.g. https://us.api.openai.com/v1): // OPENAI_API_KEY=sk-... OPENAI_BASE_URL=https://us.api.openai.com/v1 node test.mjs import { OpenAI } from "openai"; import { Factuality, LLMClassifierFromTemplate, init } from "./jsdist/index.mjs"; const client = new OpenAI({ baseURL: process.env.OPENAI_BASE_URL, // undefined → SDK default (api.openai.com) fetch: (url, opts) => { const u = typeof url === "string" ? 
url : url.url; console.log(" request →", new URL(u).pathname); return fetch(url, opts); }, }); init({ client }); const data = { output: "6", expected: "6", input: "Add the numbers 1, 2, 3" }; console.log("gpt-4.1 (default → expect /chat/completions):"); console.log(" score =", (await Factuality({ ...data, model: "gpt-4.1" })).score); console.log("gpt-4.1 + useResponsesApi:true (→ expect /responses):"); console.log( " score =", (await Factuality({ ...data, model: "gpt-4.1", useResponsesApi: true })).score, ); // Built-in named scorers don't forward reasoningEffort yet, so use LLMClassifierFromTemplate here. console.log("gpt-5.4 + medium reasoning (gpt-5 family → expect /responses):"); const clf = LLMClassifierFromTemplate({ name: "match", promptTemplate: "Is the submission {{output}} equal to {{expected}}? Answer Y or N.", choiceScores: { Y: 1, N: 0 }, model: "gpt-5.4", reasoningEffort: "medium", }); console.log(" score =", (await clf({ ...data })).score); ``` </details>
1 parent f861688 commit 63e1cd7

6 files changed

Lines changed: 311 additions & 12 deletions

File tree

js/llm.test.ts

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,13 +329,172 @@ Issue Description: {{page_content}}
329329
choiceScores: { "1": 1, "2": 0 },
330330
maxTokens: 256,
331331
temperature: 0.5,
332+
reasoningEffort: "medium",
332333
});
333334

334335
await classifier({ output: "test output", expected: "test expected" });
335336

336337
// Verify that temperature is forwarded and max_tokens is dropped (the Responses API does not support max_tokens)
337338
expect(capturedRequestBody.temperature).toBe(0.5);
338339
expect(capturedRequestBody.max_tokens).toBeUndefined();
340+
// The Responses API nests reasoning effort under reasoning.effort.
341+
expect(capturedRequestBody.reasoning).toEqual({ effort: "medium" });
342+
expect(capturedRequestBody.reasoning_effort).toBeUndefined();
343+
});
344+
345+
test("useResponsesApi forces the Responses API for a non-gpt-5 model", async () => {
346+
let responsesHit = false;
347+
let chatCompletionsHit = false;
348+
349+
server.use(
350+
http.post("https://api.openai.com/v1/responses", async ({ request }) => {
351+
responsesHit = true;
352+
const body = (await request.json()) as any;
353+
// The control flag must be stripped before reaching the API.
354+
expect(body.use_responses_api).toBeUndefined();
355+
expect(body.useResponsesApi).toBeUndefined();
356+
return HttpResponse.json({
357+
id: "resp-test",
358+
object: "response",
359+
created: 1234567890,
360+
model: body.model,
361+
output: [
362+
{
363+
type: "function_call",
364+
call_id: "call_test",
365+
name: "select_choice",
366+
arguments: JSON.stringify({ choice: "1" }),
367+
},
368+
],
369+
});
370+
}),
371+
http.post(
372+
"https://api.openai.com/v1/chat/completions",
373+
async ({ request }) => {
374+
chatCompletionsHit = true;
375+
const body = (await request.json()) as any;
376+
return HttpResponse.json({
377+
id: "chatcmpl-test",
378+
object: "chat.completion",
379+
created: 1234567890,
380+
model: body.model,
381+
choices: [
382+
{
383+
index: 0,
384+
message: {
385+
role: "assistant",
386+
content: null,
387+
tool_calls: [
388+
{
389+
id: "call_test",
390+
type: "function",
391+
function: {
392+
name: "select_choice",
393+
arguments: JSON.stringify({ choice: "1" }),
394+
},
395+
},
396+
],
397+
},
398+
finish_reason: "stop",
399+
},
400+
],
401+
});
402+
},
403+
),
404+
);
405+
406+
init({
407+
client: new OpenAI({
408+
apiKey: "test-api-key",
409+
baseURL: "https://api.openai.com/v1",
410+
}),
411+
});
412+
413+
const classifier = LLMClassifierFromTemplate({
414+
name: "test",
415+
promptTemplate: "Test prompt: {{output}} vs {{expected}}",
416+
choiceScores: { "1": 1, "2": 0 },
417+
});
418+
419+
// A proxy-served model that does NOT start with "gpt-5".
420+
const result = await classifier({
421+
output: "test output",
422+
expected: "test expected",
423+
model: "internal-proxy-model",
424+
useResponsesApi: true,
425+
});
426+
427+
expect(result.error).toBeUndefined();
428+
expect(responsesHit).toBe(true);
429+
expect(chatCompletionsHit).toBe(false);
430+
});
431+
432+
test("non-gpt-5 model uses Chat Completions when useResponsesApi is not set", async () => {
433+
let responsesHit = false;
434+
let chatCompletionsHit = false;
435+
436+
server.use(
437+
http.post("https://api.openai.com/v1/responses", async () => {
438+
responsesHit = true;
439+
return HttpResponse.json({});
440+
}),
441+
http.post(
442+
"https://api.openai.com/v1/chat/completions",
443+
async ({ request }) => {
444+
chatCompletionsHit = true;
445+
const body = (await request.json()) as any;
446+
return HttpResponse.json({
447+
id: "chatcmpl-test",
448+
object: "chat.completion",
449+
created: 1234567890,
450+
model: body.model,
451+
choices: [
452+
{
453+
index: 0,
454+
message: {
455+
role: "assistant",
456+
content: null,
457+
tool_calls: [
458+
{
459+
id: "call_test",
460+
type: "function",
461+
function: {
462+
name: "select_choice",
463+
arguments: JSON.stringify({ choice: "1" }),
464+
},
465+
},
466+
],
467+
},
468+
finish_reason: "stop",
469+
},
470+
],
471+
});
472+
},
473+
),
474+
);
475+
476+
init({
477+
client: new OpenAI({
478+
apiKey: "test-api-key",
479+
baseURL: "https://api.openai.com/v1",
480+
}),
481+
});
482+
483+
const classifier = LLMClassifierFromTemplate({
484+
name: "test",
485+
promptTemplate: "Test prompt: {{output}} vs {{expected}}",
486+
choiceScores: { "1": 1, "2": 0 },
487+
});
488+
489+
const result = await classifier({
490+
output: "test output",
491+
expected: "test expected",
492+
model: "gpt-4o-mini",
493+
});
494+
495+
expect(result.error).toBeUndefined();
496+
expect(chatCompletionsHit).toBe(true);
497+
expect(responsesHit).toBe(false);
339498
});
340499

341500
test("LLMClassifierFromTemplate uses configured default model", async () => {

js/llm.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,12 @@ export type LLMArgs = {
7373
reasoningEffort?: ReasoningEffort;
7474
reasoningEnabled?: boolean;
7575
reasoningBudget?: number;
76+
/**
77+
* Force the request to use the Responses API, even when the model name does
78+
* not start with "gpt-5". Useful for proxy/internal setups that serve a
79+
* Responses-only model under a non-matching name.
80+
*/
81+
useResponsesApi?: boolean;
7682
} & OpenAIAuth;
7783

7884
/**
@@ -166,6 +172,7 @@ export async function OpenAIClassifier<RenderArgs, Output>(
166172
reasoningEffort,
167173
reasoningEnabled,
168174
reasoningBudget,
175+
useResponsesApi,
169176
cache,
170177
...remainingRenderArgs
171178
} = remaining;
@@ -176,6 +183,7 @@ export async function OpenAIClassifier<RenderArgs, Output>(
176183
reasoning_effort?: ReasoningEffort;
177184
reasoning_enabled?: boolean;
178185
reasoning_budget?: number;
186+
use_responses_api?: boolean;
179187
} = {};
180188
if (temperature !== undefined) {
181189
extraArgs.temperature = temperature;
@@ -192,6 +200,9 @@ export async function OpenAIClassifier<RenderArgs, Output>(
192200
if (reasoningBudget !== undefined) {
193201
extraArgs.reasoning_budget = reasoningBudget;
194202
}
203+
if (useResponsesApi !== undefined) {
204+
extraArgs.use_responses_api = useResponsesApi;
205+
}
195206

196207
const renderArgs = {
197208
output,
@@ -293,6 +304,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
293304
reasoningEffort,
294305
reasoningEnabled,
295306
reasoningBudget,
307+
useResponsesApi,
296308
}: {
297309
name: string;
298310
promptTemplate: string;
@@ -304,6 +316,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
304316
reasoningEffort?: ReasoningEffort;
305317
reasoningEnabled?: boolean;
306318
reasoningBudget?: number;
319+
useResponsesApi?: boolean;
307320
}): Scorer<string, LLMClassifierArgs<RenderArgs>> {
308321
const choiceStrings = Object.keys(choiceScores);
309322
const ret = async (
@@ -352,6 +365,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
352365
reasoningEffort,
353366
reasoningEnabled,
354367
reasoningBudget,
368+
useResponsesApi,
355369
__choices: choiceStrings,
356370
// Thread template vars come first so explicit args can override
357371
...threadVars,

js/oai.ts

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,12 @@ export interface CachedLLMParams {
1919
temperature?: number;
2020
max_tokens?: number;
2121
reasoning_effort?: ReasoningEffort;
22+
/**
23+
* Force the request to use the Responses API, even when the model name does
24+
* not start with "gpt-5". Useful for proxy/internal setups that serve a
25+
* Responses-only model under a name that doesn't match {@link isGPT5Model}.
26+
*/
27+
use_responses_api?: boolean;
2228
span_info?: {
2329
spanAttributes?: Record<string, string>;
2430
};
@@ -295,26 +301,38 @@ function isGPT5Model(model: string): boolean {
295301
return model.startsWith("gpt-5");
296302
}
297303

304+
/**
305+
* Whether to route the request through the Responses API. GPT-5 models require
306+
* it, and callers can force it via `use_responses_api` (`useResponsesApi` on scorer args) for proxy/internal setups
307+
* that serve a Responses-only model under a name that doesn't start with "gpt-5".
308+
*/
309+
function isForcedResponsesMode(params: CachedLLMParams): boolean {
310+
return isGPT5Model(params.model) || params.use_responses_api === true;
311+
}
312+
298313
export async function cachedChatCompletion(
299314
params: CachedLLMParams,
300315
options: { cache?: ChatCache } & OpenAIAuth,
301316
): Promise<ChatCompletion> {
302317
const openai = buildOpenAIClient(options);
303318

319+
// Strip use_responses_api so it is never forwarded to either API.
320+
const { use_responses_api: _useResponsesApi, ...completionParams } = params;
321+
304322
const fullParams = globalThis.__inherited_braintrust_wrap_openai
305323
? {
306-
...params,
324+
...completionParams,
307325
span_info: {
308326
spanAttributes: {
309-
...params.span_info?.spanAttributes,
327+
...completionParams.span_info?.spanAttributes,
310328
purpose: "scorer",
311329
},
312330
},
313331
}
314-
: params;
332+
: completionParams;
315333

316-
// GPT-5 models require the Responses API
317-
if (isGPT5Model(params.model)) {
334+
// GPT-5 models require the Responses API; callers may also force it.
335+
if (isForcedResponsesMode(params)) {
318336
// Convert Chat Completions API params to Responses API params
319337
const responsesParams: any = {
320338
model: fullParams.model,
@@ -362,7 +380,8 @@ export async function cachedChatCompletion(
362380
}
363381
// Note: max_tokens is not supported by Responses API
364382
if (fullParams.reasoning_effort) {
365-
responsesParams.reasoning_effort = fullParams.reasoning_effort;
383+
// The Responses API nests this under reasoning.effort, unlike Chat Completions.
384+
responsesParams.reasoning = { effort: fullParams.reasoning_effort };
366385
}
367386
const response: any = await openai.responses.create(responsesParams);
368387

py/autoevals/llm.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ def __init__(
180180
reasoning_effort=None,
181181
reasoning_enabled=None,
182182
reasoning_budget=None,
183+
use_responses_api=None,
183184
engine=None,
184185
api_key=None,
185186
base_url=None,
@@ -210,6 +211,9 @@ def __init__(
210211
if reasoning_budget is not None:
211212
self.extra_args["reasoning_budget"] = reasoning_budget
212213

214+
if use_responses_api is not None:
215+
self.extra_args["use_responses_api"] = use_responses_api
216+
213217
self.render_args = {}
214218
if render_args:
215219
self.render_args.update(render_args)
@@ -366,6 +370,7 @@ def __init__(
366370
reasoning_effort=None,
367371
reasoning_enabled=None,
368372
reasoning_budget=None,
373+
use_responses_api=None,
369374
engine=None,
370375
api_key=None,
371376
base_url=None,
@@ -397,6 +402,7 @@ def __init__(
397402
reasoning_effort=reasoning_effort,
398403
reasoning_enabled=reasoning_enabled,
399404
reasoning_budget=reasoning_budget,
405+
use_responses_api=use_responses_api,
400406
engine=engine,
401407
api_key=api_key,
402408
base_url=base_url,
@@ -498,6 +504,7 @@ def __new__(
498504
use_cot=None,
499505
max_tokens=None,
500506
temperature=None,
507+
use_responses_api=None,
501508
api_key=None,
502509
base_url=None,
503510
client: Client | None = None,
@@ -513,6 +520,8 @@ def __new__(
513520
kwargs["max_tokens"] = max_tokens
514521
if temperature is not None:
515522
kwargs["temperature"] = temperature
523+
if use_responses_api is not None:
524+
kwargs["use_responses_api"] = use_responses_api
516525
if api_key is not None:
517526
kwargs["api_key"] = api_key
518527
if base_url is not None:

py/autoevals/oai.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -310,17 +310,21 @@ def prepare_responses_params(kwargs: dict[str, Any]) -> dict[str, Any]:
310310
responses_params["tool_choice"] = "required"
311311

312312
# Copy supported parameters
313-
for key in ["temperature", "reasoning_effort"]:
314-
if key in kwargs:
315-
responses_params[key] = kwargs[key]
313+
if "temperature" in kwargs:
314+
responses_params["temperature"] = kwargs["temperature"]
315+
# The Responses API nests this under reasoning.effort, unlike Chat Completions.
316+
if "reasoning_effort" in kwargs:
317+
responses_params["reasoning"] = {"effort": kwargs["reasoning_effort"]}
316318

317319
return responses_params
318320

319321
if self.is_async:
320322

321323
async def complete_wrapper(**kwargs: Any) -> Any:
322324
model = kwargs.get("model", "")
323-
if is_gpt5_model(model):
325+
# Strip use_responses_api so it is never forwarded to either API.
326+
use_responses_api = kwargs.pop("use_responses_api", False)
327+
if is_gpt5_model(model) or use_responses_api:
324328
responses_params = prepare_responses_params(kwargs)
325329
response = await responses_create(**responses_params)
326330
return convert_responses_to_chat_completion(response)
@@ -330,7 +334,9 @@ async def complete_wrapper(**kwargs: Any) -> Any:
330334

331335
def complete_wrapper(**kwargs: Any) -> Any:
332336
model = kwargs.get("model", "")
333-
if is_gpt5_model(model):
337+
# Strip use_responses_api so it is never forwarded to either API.
338+
use_responses_api = kwargs.pop("use_responses_api", False)
339+
if is_gpt5_model(model) or use_responses_api:
334340
responses_params = prepare_responses_params(kwargs)
335341
response = responses_create(**responses_params)
336342
return convert_responses_to_chat_completion(response)

0 commit comments

Comments
 (0)