[OAI] Allow forcing Responses API for non-gpt-5 model names (#190)

wong-codaio · web-flow · commit 63e1cd791869 · 2026-06-02T06:37:59.000-07:00
## Summary **[OAI] Allow forcing Responses API for non-gpt-5 model names** * per-call `use_responses_api` (py) / `useResponsesApi` (js) flag forces the Responses API. routing becomes `isGPT5Model(model) || useResponsesApi`; flag is stripped before the request. * motivation: internal proxies may rewrite the model name for routing (e.g. a service-tier prefix), so a model that *requires* the Responses API can arrive under a name that doesn't start with `gpt-5`. the name check then sends it to Chat Completions and it fails, with no way to override. this flag lets such a model work regardless of its name. * per-call, not global: the model is chosen per call, so a global switch can't say "this model yes, that model no". keeps it next to `model`, like `temperature`/`maxTokens`. * also fixes a Responses-API bug found while testing: `reasoning_effort` was sent top-level (the API wants `reasoning.effort`), so any reasoning call routed to Responses 400'd. PTAL: FYI: ## Test plan * [x] unit tests (js + py, incl. built-in named scorers and reasoning.effort) * [x] manual smoke test — scratch scripts below, each runs a scorer 3 ways and prints the endpoint hit: ```bash OPENAI_API_KEY=sk-... [OPENAI_BASE_URL=https://us.api.openai.com/v1] python test.py OPENAI_API_KEY=sk-... [OPENAI_BASE_URL=https://us.api.openai.com/v1] node test.mjs # after `pnpm run build` ``` <details><summary><code>test.py</code></summary> ```python """Scratch check: gpt-4.1 supports both Chat Completions and Responses APIs. Run with OPENAI_API_KEY set. The request hook prints which endpoint each call hits. If your org is region-pinned, also set OPENAI_BASE_URL (e.g. https://us.api.openai.com/v1): OPENAI_API_KEY=sk-... 
OPENAI_BASE_URL=https://us.api.openai.com/v1 python test.py """ import os import httpx from openai import OpenAI from autoevals import Factuality, LLMClassifier, init init( OpenAI( base_url=os.environ.get("OPENAI_BASE_URL"), # None → SDK default (api.openai.com) http_client=httpx.Client(event_hooks={"request": [lambda r: print(" request →", r.url.path)]}), ) ) data = dict(output="6", expected="6", input="Add the numbers 1, 2, 3") print("gpt-4.1 (default → expect /chat/completions):") print(" score =", Factuality(model="gpt-4.1").eval(**data).score) print("gpt-4.1 + use_responses_api=True (→ expect /responses):") print(" score =", Factuality(model="gpt-4.1", use_responses_api=True).eval(**data).score) # Built-in named scorers don't forward reasoning_effort yet, so use LLMClassifier here. print("gpt-5.4 + medium reasoning (gpt-5 family → expect /responses):") clf = LLMClassifier( name="match", prompt_template="Is the submission {{output}} equal to {{expected}}? Answer Y or N.", choice_scores={"Y": 1, "N": 0}, model="gpt-5.4", reasoning_effort="medium", ) print(" score =", clf.eval(**data).score) ``` </details> <details><summary><code>test.mjs</code></summary> ```js // Scratch check: gpt-4.1 supports both Chat Completions and Responses APIs. // Run with OPENAI_API_KEY set. The fetch wrapper prints which endpoint each call hits. // If your org is region-pinned, also set OPENAI_BASE_URL (e.g. https://us.api.openai.com/v1): // OPENAI_API_KEY=sk-... OPENAI_BASE_URL=https://us.api.openai.com/v1 node test.mjs import { OpenAI } from "openai"; import { Factuality, LLMClassifierFromTemplate, init } from "./jsdist/index.mjs"; const client = new OpenAI({ baseURL: process.env.OPENAI_BASE_URL, // undefined → SDK default (api.openai.com) fetch: (url, opts) => { const u = typeof url === "string" ? 
url : url.url; console.log(" request →", new URL(u).pathname); return fetch(url, opts); }, }); init({ client }); const data = { output: "6", expected: "6", input: "Add the numbers 1, 2, 3" }; console.log("gpt-4.1 (default → expect /chat/completions):"); console.log(" score =", (await Factuality({ ...data, model: "gpt-4.1" })).score); console.log("gpt-4.1 + useResponsesApi:true (→ expect /responses):"); console.log( " score =", (await Factuality({ ...data, model: "gpt-4.1", useResponsesApi: true })).score, ); // Built-in named scorers don't forward reasoningEffort yet, so use LLMClassifierFromTemplate here. console.log("gpt-5.4 + medium reasoning (gpt-5 family → expect /responses):"); const clf = LLMClassifierFromTemplate({ name: "match", promptTemplate: "Is the submission {{output}} equal to {{expected}}? Answer Y or N.", choiceScores: { Y: 1, N: 0 }, model: "gpt-5.4", reasoningEffort: "medium", }); console.log(" score =", (await clf({ ...data })).score); ``` </details>
diff --git a/js/llm.test.ts b/js/llm.test.ts
@@ -329,13 +329,172 @@ Issue Description: {{page_content}}
       choiceScores: { "1": 1, "2": 0 },
       maxTokens: 256,
       temperature: 0.5,
+      reasoningEffort: "medium",
     });
 
     await classifier({ output: "test output", expected: "test expected" });
 
     // Verify that temperature is in the request (max_tokens not supported by Responses API)
     expect(capturedRequestBody.temperature).toBe(0.5);
     expect(capturedRequestBody.max_tokens).toBeUndefined();
+    // The Responses API nests reasoning effort under reasoning.effort.
+    expect(capturedRequestBody.reasoning).toEqual({ effort: "medium" });
+    expect(capturedRequestBody.reasoning_effort).toBeUndefined();
+  });
+
+  test("useResponsesApi forces the Responses API for a non-gpt-5 model", async () => {
+    let responsesHit = false; // flipped by the /responses handler below
+    let chatCompletionsHit = false; // flipped by the /chat/completions handler; asserted false at the end
+
+    server.use(
+      http.post("https://api.openai.com/v1/responses", async ({ request }) => {
+        responsesHit = true;
+        const body = (await request.json()) as any;
+        // Flag must be absent in both spellings. NOTE(review): a failure here throws inside the handler — confirm it surfaces in the test.
+        expect(body.use_responses_api).toBeUndefined();
+        expect(body.useResponsesApi).toBeUndefined();
+        return HttpResponse.json({
+          id: "resp-test",
+          object: "response",
+          created: 1234567890,
+          model: body.model,
+          output: [
+            {
+              type: "function_call",
+              call_id: "call_test",
+              name: "select_choice",
+              arguments: JSON.stringify({ choice: "1" }),
+            },
+          ],
+        });
+      }),
+      http.post(
+        "https://api.openai.com/v1/chat/completions",
+        async ({ request }) => {
+          chatCompletionsHit = true;
+          const body = (await request.json()) as any;
+          return HttpResponse.json({
+            id: "chatcmpl-test",
+            object: "chat.completion",
+            created: 1234567890,
+            model: body.model,
+            choices: [
+              {
+                index: 0,
+                message: {
+                  role: "assistant",
+                  content: null,
+                  tool_calls: [
+                    {
+                      id: "call_test",
+                      type: "function",
+                      function: {
+                        name: "select_choice",
+                        arguments: JSON.stringify({ choice: "1" }),
+                      },
+                    },
+                  ],
+                },
+                finish_reason: "stop",
+              },
+            ],
+          });
+        },
+      ),
+    );
+
+    init({
+      client: new OpenAI({
+        apiKey: "test-api-key",
+        baseURL: "https://api.openai.com/v1",
+      }),
+    });
+
+    const classifier = LLMClassifierFromTemplate({
+      name: "test",
+      promptTemplate: "Test prompt: {{output}} vs {{expected}}",
+      choiceScores: { "1": 1, "2": 0 },
+    });
+
+    // Model name deliberately does NOT start with "gpt-5"; the flag is passed per call alongside it.
+    const result = await classifier({
+      output: "test output",
+      expected: "test expected",
+      model: "internal-proxy-model",
+      useResponsesApi: true,
+    });
+
+    expect(result.error).toBeUndefined();
+    expect(responsesHit).toBe(true);
+    expect(chatCompletionsHit).toBe(false);
+  });
+
+  test("non-gpt-5 model uses Chat Completions when useResponsesApi is not set", async () => {
+    let responsesHit = false; // asserted false at the end of this test
+    let chatCompletionsHit = false; // asserted true at the end of this test
+
+    server.use(
+      http.post("https://api.openai.com/v1/responses", async () => {
+        responsesHit = true;
+        return HttpResponse.json({}); // empty body; this handler only records the hit
+      }),
+      http.post(
+        "https://api.openai.com/v1/chat/completions",
+        async ({ request }) => {
+          chatCompletionsHit = true;
+          const body = (await request.json()) as any;
+          return HttpResponse.json({
+            id: "chatcmpl-test",
+            object: "chat.completion",
+            created: 1234567890,
+            model: body.model, // echoes the requested model name back
+            choices: [
+              {
+                index: 0,
+                message: {
+                  role: "assistant",
+                  content: null,
+                  tool_calls: [
+                    {
+                      id: "call_test",
+                      type: "function",
+                      function: {
+                        name: "select_choice",
+                        arguments: JSON.stringify({ choice: "1" }),
+                      },
+                    },
+                  ],
+                },
+                finish_reason: "stop",
+              },
+            ],
+          });
+        },
+      ),
+    );
+
+    init({
+      client: new OpenAI({
+        apiKey: "test-api-key",
+        baseURL: "https://api.openai.com/v1",
+      }),
+    });
+
+    const classifier = LLMClassifierFromTemplate({
+      name: "test",
+      promptTemplate: "Test prompt: {{output}} vs {{expected}}",
+      choiceScores: { "1": 1, "2": 0 },
+    });
+
+    const result = await classifier({
+      output: "test output",
+      expected: "test expected",
+      model: "gpt-4o-mini", // no "gpt-5" prefix, and useResponsesApi is not passed
+    });
+
+    expect(result.error).toBeUndefined();
+    expect(chatCompletionsHit).toBe(true);
+    expect(responsesHit).toBe(false);
   });
 
   test("LLMClassifierFromTemplate uses configured default model", async () => {
diff --git a/js/llm.ts b/js/llm.ts
@@ -73,6 +73,12 @@ export type LLMArgs = {
   reasoningEffort?: ReasoningEffort;
   reasoningEnabled?: boolean;
   reasoningBudget?: number;
+  /**
+   * Force the request to use the Responses API, even when the model name does
+   * not start with "gpt-5". Useful for proxy/internal setups that serve a
+   * Responses-only model under a non-matching name.
+   */
+  useResponsesApi?: boolean;
 } & OpenAIAuth;
 
 /**
@@ -166,6 +172,7 @@ export async function OpenAIClassifier<RenderArgs, Output>(
     reasoningEffort,
     reasoningEnabled,
     reasoningBudget,
+    useResponsesApi,
     cache,
     ...remainingRenderArgs
   } = remaining;
@@ -176,6 +183,7 @@ export async function OpenAIClassifier<RenderArgs, Output>(
     reasoning_effort?: ReasoningEffort;
     reasoning_enabled?: boolean;
     reasoning_budget?: number;
+    use_responses_api?: boolean;
   } = {};
   if (temperature !== undefined) {
     extraArgs.temperature = temperature;
@@ -192,6 +200,9 @@ export async function OpenAIClassifier<RenderArgs, Output>(
   if (reasoningBudget !== undefined) {
     extraArgs.reasoning_budget = reasoningBudget;
   }
+  if (useResponsesApi !== undefined) {
+    extraArgs.use_responses_api = useResponsesApi;
+  }
 
   const renderArgs = {
     output,
@@ -293,6 +304,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
   reasoningEffort,
   reasoningEnabled,
   reasoningBudget,
+  useResponsesApi,
 }: {
   name: string;
   promptTemplate: string;
@@ -304,6 +316,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
   reasoningEffort?: ReasoningEffort;
   reasoningEnabled?: boolean;
   reasoningBudget?: number;
+  useResponsesApi?: boolean;
 }): Scorer<string, LLMClassifierArgs<RenderArgs>> {
   const choiceStrings = Object.keys(choiceScores);
   const ret = async (
@@ -352,6 +365,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
       reasoningEffort,
       reasoningEnabled,
       reasoningBudget,
+      useResponsesApi,
       __choices: choiceStrings,
       // Thread template vars come first so explicit args can override
       ...threadVars,
diff --git a/js/oai.ts b/js/oai.ts
@@ -19,6 +19,12 @@ export interface CachedLLMParams {
   temperature?: number;
   max_tokens?: number;
   reasoning_effort?: ReasoningEffort;
+  /**
+   * Force the request to use the Responses API, even when the model name does
+   * not start with "gpt-5". Useful for proxy/internal setups that serve a
+   * Responses-only model under a name that doesn't match {@link isGPT5Model}.
+   */
+  use_responses_api?: boolean;
   span_info?: {
     spanAttributes?: Record<string, string>;
   };
@@ -295,26 +301,38 @@ function isGPT5Model(model: string): boolean {
   return model.startsWith("gpt-5");
 }
 
+/**
+ * Whether to route the request through the Responses API: true when the model
+ * name starts with "gpt-5" (see isGPT5Model) or when `params.use_responses_api`
+ * is strictly `true`; an undefined or false flag defers to the model-name check.
+ */
+function isForcedResponsesMode(params: CachedLLMParams): boolean {
+  return isGPT5Model(params.model) || params.use_responses_api === true;
+}
+
 export async function cachedChatCompletion(
   params: CachedLLMParams,
   options: { cache?: ChatCache } & OpenAIAuth,
 ): Promise<ChatCompletion> {
   const openai = buildOpenAIClient(options);
 
+  // Strip use_responses_api so it is never forwarded to either API.
+  const { use_responses_api: _useResponsesApi, ...completionParams } = params;
+
   const fullParams = globalThis.__inherited_braintrust_wrap_openai
     ? {
-        ...params,
+        ...completionParams,
         span_info: {
           spanAttributes: {
-            ...params.span_info?.spanAttributes,
+            ...completionParams.span_info?.spanAttributes,
             purpose: "scorer",
           },
         },
       }
-    : params;
+    : completionParams;
 
-  // GPT-5 models require the Responses API
-  if (isGPT5Model(params.model)) {
+  // GPT-5 models require the Responses API; callers may also force it.
+  if (isForcedResponsesMode(params)) {
     // Convert Chat Completions API params to Responses API params
     const responsesParams: any = {
       model: fullParams.model,
@@ -362,7 +380,8 @@ export async function cachedChatCompletion(
     }
     // Note: max_tokens is not supported by Responses API
     if (fullParams.reasoning_effort) {
-      responsesParams.reasoning_effort = fullParams.reasoning_effort;
+      // The Responses API nests this under reasoning.effort, unlike Chat Completions.
+      responsesParams.reasoning = { effort: fullParams.reasoning_effort };
     }
     const response: any = await openai.responses.create(responsesParams);
 
diff --git a/py/autoevals/llm.py b/py/autoevals/llm.py
@@ -180,6 +180,7 @@ def __init__(
         reasoning_effort=None,
         reasoning_enabled=None,
         reasoning_budget=None,
+        use_responses_api=None,
         engine=None,
         api_key=None,
         base_url=None,
@@ -210,6 +211,9 @@ def __init__(
         if reasoning_budget is not None:
             self.extra_args["reasoning_budget"] = reasoning_budget
 
+        if use_responses_api is not None:
+            self.extra_args["use_responses_api"] = use_responses_api
+
         self.render_args = {}
         if render_args:
             self.render_args.update(render_args)
@@ -366,6 +370,7 @@ def __init__(
         reasoning_effort=None,
         reasoning_enabled=None,
         reasoning_budget=None,
+        use_responses_api=None,
         engine=None,
         api_key=None,
         base_url=None,
@@ -397,6 +402,7 @@ def __init__(
             reasoning_effort=reasoning_effort,
             reasoning_enabled=reasoning_enabled,
             reasoning_budget=reasoning_budget,
+            use_responses_api=use_responses_api,
             engine=engine,
             api_key=api_key,
             base_url=base_url,
@@ -498,6 +504,7 @@ def __new__(
         use_cot=None,
         max_tokens=None,
         temperature=None,
+        use_responses_api=None,
         api_key=None,
         base_url=None,
         client: Client | None = None,
@@ -513,6 +520,8 @@ def __new__(
             kwargs["max_tokens"] = max_tokens
         if temperature is not None:
             kwargs["temperature"] = temperature
+        if use_responses_api is not None:
+            kwargs["use_responses_api"] = use_responses_api
         if api_key is not None:
             kwargs["api_key"] = api_key
         if base_url is not None:
diff --git a/py/autoevals/oai.py b/py/autoevals/oai.py
@@ -310,17 +310,21 @@ def prepare_responses_params(kwargs: dict[str, Any]) -> dict[str, Any]:
                         responses_params["tool_choice"] = "required"
 
                 # Copy supported parameters
-                for key in ["temperature", "reasoning_effort"]:
-                    if key in kwargs:
-                        responses_params[key] = kwargs[key]
+                if "temperature" in kwargs:
+                    responses_params["temperature"] = kwargs["temperature"]
+                # The Responses API nests this under reasoning.effort, unlike Chat Completions.
+                if "reasoning_effort" in kwargs:
+                    responses_params["reasoning"] = {"effort": kwargs["reasoning_effort"]}
 
                 return responses_params
 
             if self.is_async:
 
                 async def complete_wrapper(**kwargs: Any) -> Any:
                     model = kwargs.get("model", "")
-                    if is_gpt5_model(model):
+                    # Strip use_responses_api so it is never forwarded to either API.
+                    use_responses_api = kwargs.pop("use_responses_api", False)
+                    if is_gpt5_model(model) or use_responses_api:
                         responses_params = prepare_responses_params(kwargs)
                         response = await responses_create(**responses_params)
                         return convert_responses_to_chat_completion(response)
@@ -330,7 +334,9 @@ async def complete_wrapper(**kwargs: Any) -> Any:
 
                 def complete_wrapper(**kwargs: Any) -> Any:
                     model = kwargs.get("model", "")
-                    if is_gpt5_model(model):
+                    # Strip use_responses_api so it is never forwarded to either API.
+                    use_responses_api = kwargs.pop("use_responses_api", False)
+                    if is_gpt5_model(model) or use_responses_api:
                         responses_params = prepare_responses_params(kwargs)
                         response = responses_create(**responses_params)
                         return convert_responses_to_chat_completion(response)
diff --git a/py/autoevals/test_llm.py b/py/autoevals/test_llm.py