fix(api): align /v1/responses max_output_tokens cap to 128k for all models (refs #112)

james-6-23 · james-6-23 · commit 8fa4087d384b · 2026-05-03T04:59:59.000+08:00
The translator strips max_output_tokens before forwarding to Codex (which
rejects the field), so the local cap is only a sanity bound. Holding
non-gpt-5.5 models at 65536 rejected legitimate SDK defaults in the
100k+ range without any upstream benefit; neither CLIProxyAPI nor
sub2api applies any local ceiling on the Codex path.

Collapse the per-model branch into a single 128000 cap and update the
OpenAPI description to make it explicit that the ceiling is enforced
upstream, not here. ResponsesMaxOutputTokensForModel keeps its signature
for API stability.
diff --git a/api/openapi.yaml b/api/openapi.yaml
@@ -342,7 +342,11 @@ components:
           type: integer
           minimum: 1
           maximum: 128000
-          description: Maximum output tokens. gpt-5.5 supports up to 128000; other models may be capped lower.
+          description: |
+            Client-side hint for maximum output tokens. The proxy strips this
+            field before forwarding to the Codex upstream (which does not
+            accept it), so the effective ceiling is enforced upstream, not here.
+            The 128000 cap above is a sanity bound for obviously-absurd values.
         temperature:
           type: number
           minimum: 0
diff --git a/api/validation.go b/api/validation.go
@@ -14,10 +14,14 @@ import (
 	"github.com/tidwall/gjson"
 )
 
-const (
-	defaultResponsesMaxOutputTokens = 65536
-	gpt55ResponsesMaxOutputTokens   = 128000
-)
+// responsesMaxOutputTokensCap is the upper bound enforced by the local
+// validator on the /v1/responses max_output_tokens field. The translator
+// strips the field before forwarding to the Codex upstream (which does not
+// accept it), so this cap only guards against obviously-absurd client values
+// — it does not control the actual output length, which is decided upstream.
+// Aligned to 128000 to match the highest cap OpenAI currently advertises on
+// any Codex-served model, so SDKs that default to 100k+ pass through.
+const responsesMaxOutputTokensCap = 128000
 
 // ValidationRule represents a validation rule function
 type ValidationRule func(value gjson.Result, path string) *ValidationError
@@ -424,16 +428,12 @@ func ChatCompletionValidationRules() map[string][]ValidationRule {
 	}
 }
 
-// ResponsesMaxOutputTokensForModel returns the downstream validation cap for
-// max_output_tokens. Most Codex models still use the legacy 64k output cap,
-// while gpt-5.5 clients may legitimately request up to 128k.
-func ResponsesMaxOutputTokensForModel(model string) int {
-	switch strings.ToLower(strings.TrimSpace(model)) {
-	case "gpt-5.5":
-		return gpt55ResponsesMaxOutputTokens
-	default:
-		return defaultResponsesMaxOutputTokens
-	}
+// ResponsesMaxOutputTokensForModel returns the local validation cap for
+// max_output_tokens. The cap is intentionally model-agnostic: the translator
+// drops the field before forwarding to Codex, so the real upstream ceiling is
+// enforced server-side. The model argument is kept for API compatibility.
+func ResponsesMaxOutputTokensForModel(_ string) int {
+	return responsesMaxOutputTokensCap
 }
 
 // ResponsesAPIValidationRules returns validation rules for responses API request
@@ -442,12 +442,11 @@ func ResponsesAPIValidationRules() map[string][]ValidationRule {
 	return ResponsesAPIValidationRulesForModel("")
 }
 
-func ResponsesAPIValidationRulesForModel(model string) map[string][]ValidationRule {
-	maxOutputTokens := ResponsesMaxOutputTokensForModel(model)
+func ResponsesAPIValidationRulesForModel(_ string) map[string][]ValidationRule {
 	return map[string][]ValidationRule{
 		"model": {Required(), TypeString(), MaxLength(64)},
 		// input validation is handled separately to support both string and array formats
-		"max_output_tokens": {TypeNumber(), MinValue(1), MaxValue(float64(maxOutputTokens))},
+		"max_output_tokens": {TypeNumber(), MinValue(1), MaxValue(float64(responsesMaxOutputTokensCap))},
 		"temperature":       {TypeNumber(), Range(0, 2)},
 		"top_p":             {TypeNumber(), Range(0, 1)},
 		"stream":            {TypeBoolean()},
diff --git a/api/validation_test.go b/api/validation_test.go
@@ -75,7 +75,7 @@ func TestValidateResponsesAPIRequestAllowsCompactionInputType(t *testing.T) {
 	}
 }
 
-func TestValidateResponsesAPIRequestUsesModelAwareMaxOutputTokens(t *testing.T) {
+func TestValidateResponsesAPIRequestMaxOutputTokensCap(t *testing.T) {
 	tests := []struct {
 		name  string
 		body  []byte
@@ -92,8 +92,13 @@ func TestValidateResponsesAPIRequestUsesModelAwareMaxOutputTokens(t *testing.T)
 			valid: false,
 		},
 		{
-			name:  "other models keep 64k output cap",
-			body:  []byte(`{"model":"gpt-5.4","input":"hello","max_output_tokens":65537}`),
+			name:  "other models also allow up to 128k (aligned cap, upstream decides actual ceiling)",
+			body:  []byte(`{"model":"gpt-5.4","input":"hello","max_output_tokens":100000}`),
+			valid: true,
+		},
+		{
+			name:  "other models reject above 128k",
+			body:  []byte(`{"model":"gpt-5.4","input":"hello","max_output_tokens":128001}`),
 			valid: false,
 		},
 	}

Original file line number	Diff line number	Diff line change
`@@ -75,7 +75,7 @@ func TestValidateResponsesAPIRequestAllowsCompactionInputType(t *testing.T) {`
`75`	`75`	`}`
`76`	`76`	`}`
`77`	`77`
`78`		`-func TestValidateResponsesAPIRequestUsesModelAwareMaxOutputTokens(t *testing.T) {`
	`78`	`+func TestValidateResponsesAPIRequestMaxOutputTokensCap(t *testing.T) {`
`79`	`79`	`tests := []struct {`
`80`	`80`	`name string`
`81`	`81`	`body []byte`
`@@ -92,8 +92,13 @@ func TestValidateResponsesAPIRequestUsesModelAwareMaxOutputTokens(t *testing.T)`
`92`	`92`	`valid: false,`
`93`	`93`	`},`
`94`	`94`	`{`
`95`		`- name: "other models keep 64k output cap",`
`96`		- body: []byte(`{"model":"gpt-5.4","input":"hello","max_output_tokens":65537}`),
	`95`	`+ name: "other models also allow up to 128k (aligned cap, upstream decides actual ceiling)",`
	`96`	+ body: []byte(`{"model":"gpt-5.4","input":"hello","max_output_tokens":100000}`),
	`97`	`+ valid: true,`
	`98`	`+ },`
	`99`	`+ {`
	`100`	`+ name: "other models reject above 128k",`
	`101`	+ body: []byte(`{"model":"gpt-5.4","input":"hello","max_output_tokens":128001}`),
`97`	`102`	`valid: false,`
`98`	`103`	`},`
`99`	`104`	`}`