fix: surface real upstream cooldowns

dwgx · dwgx · commit 736eefb093ef · 2026-06-07T00:37:04.000+09:00
diff --git a/README.md b/README.md
@@ -406,6 +406,15 @@ A: 看模型。Claude family `<tool_use>` 协议训练扎实最稳（free 账号
 **Q: 31 个 trial 账号一会儿就全 unavailable**
 A: 八成是用了周限模型 — `claude-opus-4-7-max` / `gpt-5.5-xhigh` / `claude-sonnet-4-7-thinking` 这类高 reasoning effort 变体每个账号每周只有 5 次配额，31 号 × 5 次 ≈ 150 次就到顶。换 `claude-sonnet-4.6` / `claude-haiku-4.5` daily 配额比较宽松。`docker logs windsurfapi-windsurf-api-1 | grep rate_limit` 看每个账号的 cooldown 字段验证。
 
+**Q: All accounts temporarily rate-limited / IP-level cooldown 是不是代理坏了**
+A: 通常不是。Windsurf 上游会对同一出口 IP + 同一模型的密集请求施加 cooldown，多个账号绑在同一出口时会一起被限流。WindsurfAPI 会停止继续烧账号并返回 `429 + Retry-After`；v2.0.140 起这个等待时间会按上游 `Resets in: 27m12s` 这类真实值返回，而不是固定提示 30 秒。解决方向是降并发、换更宽松模型、给账号绑定不同出口 IP，或者等上游 cooldown 到期。
+
+**Q: free 账号是不是本地限制成 1 分钟 1 次**
+A: 不是。本地 free tier RPM 默认是 10/min。如果你观察到的是约 1 次/min，或者要等一段时间才恢复，通常是 Windsurf 上游 free-tier 动态限频或模型 entitlement 限制造成的。可以在 Dashboard 里查看账号状态和模型可用清单；请求无权限模型时，错误里的 `available_in_pool` 会列出当前账号池能用的模型。
+
+**Q: context deadline exceeded / Client.Timeout 能靠调大 .env timeout 解决吗**
+A: 不能。长 thinking / 长输出在约 236-243 秒断流，是 Windsurf provider/Cascade 单次 stream 窗口。WindsurfAPI 会把它标成 `upstream_deadline_exceeded` / `windsurf_provider_deadline`，并丢弃半截 Cascade 复用轨迹，避免下一轮上下文错乱。实际规避只能拆任务、降低 reasoning/max output，或换更快模型。
+
 ## 贡献者
 
 特别感谢下面的朋友，他们提交过 PR 或系统性地审了代码，让这个项目变得更稳：
diff --git a/docs/releases/RELEASE_NOTES_2.0.140.md b/docs/releases/RELEASE_NOTES_2.0.140.md
@@ -0,0 +1,25 @@
+# v2.0.140 - truthful upstream cooldowns
+
+## What changed
+
+- IP-level rate-limit burst short-circuiting now carries the real upstream cooldown instead of always telling clients to wait 30 seconds; 30 seconds is kept only as the minimum when upstream reports no longer cooldown.
+- When Windsurf returns messages like `Resets in: 27m12s`, the 429 response now uses the same value in `Retry-After`, `error.retry_after_ms`, and the user-facing message.
+- Non-stream chat handling now supports the same dependency injection hooks as the stream path (`ensureLs`, `getLsFor`, `WindsurfClient`), so rate-limit behavior can be exercised in tests without starting a real language server.
+- Non-stream Cascade reuse invalidation now also recognizes structured `upstream_deadline_exceeded` / `windsurf_provider_deadline` responses, not only the raw upstream error text.
+- README FAQ now separates local RPM limits, upstream free-tier throttling, IP cooldowns, and the upstream ~240s provider deadline.
+
+## Context
+
+Issues #176 and #189 showed real upstream cooldowns around 26-30 minutes, but the IP-burst guard surfaced a fixed 30-second retry hint. The guard was already doing the right thing by stopping account burn; this release makes the operator/client-facing cooldown truthful.
+
+This does not bypass Windsurf upstream rate limits. It prevents misleading retry timing and reduces repeated hammering during an upstream IP cooldown.
+
+## Validation
+
+- `node --test test/rate-limit.test.js`
+- `npm.cmd run test:release`
+- `node --test test/stream-error.test.js test/cascade-timeout-invalidation.test.js test/stream-pool-exhausted-error.test.js`
+- `npm.cmd run test:shard -- 0 4 --timeout-ms=90000`
+- `npm.cmd run test:shard -- 1 4 --timeout-ms=90000`
+- `npm.cmd run test:shard -- 2 4 --timeout-ms=90000`
+- `npm.cmd run test:shard -- 3 4 --timeout-ms=90000`
diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "windsurf-api",
-  "version": "2.0.139",
+  "version": "2.0.140",
   "description": "Windsurf to OpenAI + Anthropic compatible API proxy. Turns Windsurf's 107 AI models (Claude, GPT, Gemini, DeepSeek, Grok, Qwen, Kimi, GLM, SWE) into dual-protocol API endpoints. Zero npm deps.",
   "type": "module",
   "main": "src/index.js",
diff --git a/src/handlers/chat.js b/src/handlers/chat.js
@@ -52,6 +52,7 @@ import {
 const HEARTBEAT_MS = 15_000;
 const QUEUE_RETRY_MS = 1_000;
 const QUEUE_MAX_WAIT_MS = 30_000;
+const IP_RATE_LIMIT_BURST_FLOOR_MS = 30_000;
 
 // Build the option bag the v2.0.25 semantic key needs. tools / tool_choice /
 // preamble are baked into the digest so a tool schema change misses instead
@@ -826,7 +827,7 @@ export function repairToolCallArguments(tc, messages) {
   return tc;
 }
 
-export function rateLimitCooldownMs(message = '') {
+export function parseRateLimitCooldownMs(message = '') {
   const reset = String(message || '').match(/resets?\s+in\s*:?\s*((?:(?:\d+)\s*[hms]\s*)+)/i);
   if (reset) {
     let total = 0;
@@ -848,7 +849,41 @@ export function rateLimitCooldownMs(message = '') {
     return n * 1000;
   }
   if (/about an hour|in an hour|try again in.*hour/i.test(message)) return 60 * 60 * 1000;
-  return 60 * 1000;
+  return null;
+}
+
+export function rateLimitCooldownMs(message = '') {
+  return parseRateLimitCooldownMs(message) || 60 * 1000;
+}
+
+function formatRetryAfter(ms) {
+  const seconds = Math.max(1, Math.ceil(Number(ms) / 1000));
+  if (seconds >= 3600) {
+    const h = Math.floor(seconds / 3600);
+    const m = Math.ceil((seconds - h * 3600) / 60);
+    return m > 0 ? `${h}h${m}m` : `${h}h`;
+  }
+  if (seconds >= 60) {
+    const m = Math.floor(seconds / 60);
+    const s = seconds - m * 60;
+    return s > 0 ? `${m}m${s}s` : `${m}m`;
+  }
+  return `${seconds}s`;
+}
+
+export function rateLimitBurstCooldownMs({ message = '', retryAfterMs = 0, apiKey = '', modelKey = '' } = {}) {
+  const candidates = [IP_RATE_LIMIT_BURST_FLOOR_MS];
+  const retry = Number(retryAfterMs);
+  if (Number.isFinite(retry) && retry > 0) candidates.push(retry);
+  const parsed = parseRateLimitCooldownMs(message);
+  if (Number.isFinite(parsed) && parsed > 0) candidates.push(parsed);
+  if (apiKey) {
+    const availability = getAccountAvailability(apiKey, modelKey);
+    if (!availability.available && Number.isFinite(availability.retryAfterMs) && availability.retryAfterMs > 0) {
+      candidates.push(availability.retryAfterMs);
+    }
+  }
+  return Math.max(...candidates);
 }
 
 function genId() {
@@ -1492,6 +1527,9 @@ async function _handleChatCompletionsInner(body, context = {}) {
   const cachePolicy = body.__cachePolicy || null;
   const checkMessageRateLimitFn = context.checkMessageRateLimit || checkMessageRateLimit;
   const waitForAccountFn = context.waitForAccount || waitForAccount;
+  const ensureLsFn = context.ensureLs || ensureLs;
+  const getLsForFn = context.getLsFor || getLsFor;
+  const WindsurfClientClass = context.WindsurfClient || WindsurfClient;
 
   // Probe diagnostics: dump compact request shape for every call, plus a
   // tail of the last user turn. Keeps us able to see how third-party
@@ -2179,11 +2217,11 @@ async function _handleChatCompletionsInner(body, context = {}) {
       }
     }
 
-    try { await ensureLs(acct.proxy); } catch (e) {
+    try { await ensureLsFn(acct.proxy); } catch (e) {
       lastErr = isLsPoolExhausted(e) ? lsPoolExhaustedResponse(e) : { status: e.status || 503, body: { error: { message: e.message || String(e), type: e.type || 'ls_unavailable' } } };
       break;
     }
-    const ls = getLsFor(acct.proxy);
+    const ls = getLsForFn(acct.proxy);
     if (!ls) { lastErr = { status: 503, body: { error: { message: 'No LS instance available', type: 'ls_unavailable' } } }; break; }
     // Cascade pins cascade_id to a specific LS port too; if the LS it was
     // born on has been replaced, the cascade_id is dead.
@@ -2197,7 +2235,7 @@ async function _handleChatCompletionsInner(body, context = {}) {
       return n + (typeof c === 'string' ? c.length : Array.isArray(c) ? c.reduce((k, p) => k + (typeof p?.text === 'string' ? p.text.length : 0), 0) : 0);
     }, 0);
     log.info(`Chat[${reqId}]: model=${displayModel} flow=${useCascade ? 'cascade' : 'legacy'} attempt=${attempt + 1} account=${acct.email} ls=${ls.port} turns=${(messages||[]).length} chars=${_msgChars}${reuseEntry ? ' reuse=1' : ''}${emulateTools ? ' tools=emu' : ''}`);
-    const client = new WindsurfClient(acct.apiKey, ls.port, ls.csrfToken);
+    const client = new WindsurfClientClass(acct.apiKey, ls.port, ls.csrfToken);
     const result = await nonStreamResponse(
       client, chatId, created, displayModel, routingModelKey, messages, cascadeMessages, modelEnum, modelUid,
       useCascade, acct.apiKey, ckey,
@@ -2224,8 +2262,14 @@ async function _handleChatCompletionsInner(body, context = {}) {
     // see the matching catch block in streamResponse for the full
     // rationale (cascade trajectory left half-broken, next reuse hits
     // it and the model "loses" the prior conversation).
-    const _resultMsg = String(result.body?.error?.message || '');
-    if (isUpstreamDeadlineExceeded(_resultMsg)) {
+    const _resultError = result.body?.error || {};
+    const _resultMsg = String(_resultError.upstream_message || _resultError.message || '');
+    if (
+      result.status === 504
+      || _resultError.type === 'upstream_deadline_exceeded'
+      || _resultError.code === 'windsurf_provider_deadline'
+      || isUpstreamDeadlineExceeded(_resultMsg)
+    ) {
       reuseEntryDead = true;
     }
     lastErr = result;
@@ -2245,22 +2289,32 @@ async function _handleChatCompletionsInner(body, context = {}) {
       if (!context.__rateLimitEvents) context.__rateLimitEvents = [];
       const RL_WINDOW_MS = 8_000;
       const RL_BURST_THRESHOLD = 3;
-      context.__rateLimitEvents.push({ time: Date.now(), model: routingModelKey, account: acct.id });
+      context.__rateLimitEvents.push({
+        time: Date.now(),
+        model: routingModelKey,
+        account: acct.id,
+        cooldownMs: rateLimitBurstCooldownMs({
+          message: result.body?.error?.message || '',
+          retryAfterMs: result.body?.error?.retry_after_ms,
+          apiKey: acct.apiKey,
+          modelKey: routingModelKey,
+        }),
+      });
       // Prune old events
       const cutoff = Date.now() - RL_WINDOW_MS;
       while (context.__rateLimitEvents.length && context.__rateLimitEvents[0].time < cutoff) {
         context.__rateLimitEvents.shift();
       }
       const sameModelBurst = context.__rateLimitEvents.filter(e => e.model === routingModelKey);
       if (sameModelBurst.length >= RL_BURST_THRESHOLD) {
-        const maxCooldown = Math.max(...sameModelBurst.map(() => 30_000));
+        const maxCooldown = Math.max(...sameModelBurst.map(e => e.cooldownMs || IP_RATE_LIMIT_BURST_FLOOR_MS));
         log.warn(`Chat[${reqId}]: IP-rate-limit burst detected — ${sameModelBurst.length} accounts rate-limited on ${displayModel} within ${RL_WINDOW_MS}ms. Short-circuiting.`);
         return {
           status: 429,
           headers: { 'Retry-After': String(Math.ceil(maxCooldown / 1000)) },
           body: {
             error: {
-              message: `All accounts temporarily rate-limited on ${displayModel}. Windsurf upstream is applying IP-level cooldown. Wait ${Math.ceil(maxCooldown / 1000)}s before retrying, or switch to a different model.`,
+              message: `All accounts temporarily rate-limited on ${displayModel}. Windsurf upstream is applying IP-level cooldown. Wait ${formatRetryAfter(maxCooldown)} before retrying, or switch to a different model.`,
               type: 'rate_limit_exceeded',
               retry_after_ms: maxCooldown,
             },
@@ -3618,7 +3672,17 @@ function streamResponse(id, created, model, modelKey, provider, messages, cascad
               const RL_WINDOW_MS = 8_000;
               const RL_BURST_THRESHOLD = 3;
               const now = Date.now();
-              ctx.__rateLimitEvents.push({ time: now, model: modelKey, account: acct?.id });
+              ctx.__rateLimitEvents.push({
+                time: now,
+                model: modelKey,
+                account: acct?.id,
+                cooldownMs: rateLimitBurstCooldownMs({
+                  message: err.message || '',
+                  retryAfterMs: err.retry_after_ms,
+                  apiKey: currentApiKey,
+                  modelKey,
+                }),
+              });
               const cutoff = now - RL_WINDOW_MS;
               while (ctx.__rateLimitEvents.length && ctx.__rateLimitEvents[0].time < cutoff) {
                 ctx.__rateLimitEvents.shift();
@@ -3627,8 +3691,8 @@ function streamResponse(id, created, model, modelKey, provider, messages, cascad
               if (sameModelBurst.length >= RL_BURST_THRESHOLD) {
                 ctx.__rlAborted = true;
                 log.warn(`Chat[${reqId}] stream: IP-rate-limit burst — ${sameModelBurst.length} accounts rate-limited on ${model} within ${RL_WINDOW_MS}ms. Short-circuiting.`);
-                const cooldown = Math.max(...sameModelBurst.map(() => 30_000));
-                lastErr = Object.assign(new Error(`All accounts temporarily rate-limited on ${model}. Windsurf upstream is applying IP-level cooldown. Wait ~${Math.ceil(cooldown / 1000)}s before retrying.`), { type: 'rate_limit_exceeded', retry_after_ms: cooldown });
+                const cooldown = Math.max(...sameModelBurst.map(e => e.cooldownMs || IP_RATE_LIMIT_BURST_FLOOR_MS));
+                lastErr = Object.assign(new Error(`All accounts temporarily rate-limited on ${model}. Windsurf upstream is applying IP-level cooldown. Wait ~${formatRetryAfter(cooldown)} before retrying.`), { type: 'rate_limit_exceeded', retry_after_ms: cooldown });
                 break;
               }
             }
diff --git a/test/rate-limit.test.js b/test/rate-limit.test.js
@@ -10,7 +10,7 @@ import {
   removeAccount,
   setAccountTier,
 } from '../src/auth.js';
-import { handleChatCompletions, rateLimitCooldownMs } from '../src/handlers/chat.js';
+import { handleChatCompletions, rateLimitBurstCooldownMs, rateLimitCooldownMs } from '../src/handlers/chat.js';
 import { getExperimental, setExperimental } from '../src/runtime-config.js';
 
 const createdAccountIds = [];
@@ -73,6 +73,55 @@ describe('rate-limit handling', () => {
     assert.equal(rateLimitCooldownMs('resets in 3h'), 3 * 60 * 60 * 1000);
   });
 
+  it('keeps real upstream cooldowns for IP-level burst short-circuiting', () => {
+    assert.equal(
+      rateLimitBurstCooldownMs({
+        message: 'Reached message rate limit for this model. Please try again later. Resets in: 27m12s (trace ID: abc)',
+        retryAfterMs: 30000,
+      }),
+      (27 * 60 * 1000) + (12 * 1000)
+    );
+    assert.equal(rateLimitBurstCooldownMs({ message: 'rate limit exceeded' }), 30000);
+  });
+
+  it('uses real upstream cooldowns in the non-stream IP burst response', async () => {
+    const accounts = [
+      addTestAccount('ip-burst-a'),
+      addTestAccount('ip-burst-b'),
+      addTestAccount('ip-burst-c'),
+    ];
+    for (const account of accounts) setAccountTier(account.id, 'free');
+
+    class RateLimitedClient {
+      async cascadeChat() {
+        throw new Error('Reached message rate limit for this model. Please try again later. Resets in: 27m12s (trace ID: abc)');
+      }
+      async rawGetChatMessage() {
+        throw new Error('Reached message rate limit for this model. Please try again later. Resets in: 27m12s (trace ID: abc)');
+      }
+    }
+
+    const result = await handleChatCompletions({
+      model: 'gemini-2.5-flash',
+      messages: [{ role: 'user', content: `hi ${Date.now()}` }],
+    }, {
+      async waitForAccount(tried, signal, maxWaitMs, modelKey) {
+        return getApiKey(tried, modelKey);
+      },
+      async ensureLs() {},
+      getLsFor() {
+        return { port: 12345, csrfToken: 'csrf-test' };
+      },
+      WindsurfClient: RateLimitedClient,
+    });
+
+    assert.equal(result.status, 429);
+    assert.equal(result.body.error.type, 'rate_limit_exceeded');
+    assert.equal(result.body.error.retry_after_ms, (27 * 60 * 1000) + (12 * 1000));
+    assert.equal(result.headers['Retry-After'], '1632');
+    assert.match(result.body.error.message, /27m12s/);
+  });
+
   it('does not extend an existing cooldown when a later 429 arrives for the same model', async () => {
     const account = addTestAccount('max-extend');
     const modelKey = 'gemini-2.5-flash';

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,6 +1,6 @@` |
| `1` | `1` | `{` |
| `2` | `2` | `"name": "windsurf-api",` |
| `3` |  | `- "version": "2.0.139",` |
|  | `3` | `+ "version": "2.0.140",` |
| `4` | `4` | `"description": "Windsurf to OpenAI + Anthropic compatible API proxy. Turns Windsurf's 107 AI models (Claude, GPT, Gemini, DeepSeek, Grok, Qwen, Kimi, GLM, SWE) into dual-protocol API endpoints. Zero npm deps.",` |
| `5` | `5` | `"type": "module",` |
| `6` | `6` | `"main": "src/index.js",` |