Skip to content

Commit c15318e

Browse files
committed
fix: govern rate limit retries and cooldowns
1 parent d47e531 commit c15318e

22 files changed

Lines changed: 1102 additions & 329 deletions

admin/handler.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,8 @@ type accountResponse struct {
304304
LastUsedAt string `json:"last_used_at"`
305305
SuccessRequests int64 `json:"success_requests"`
306306
ErrorRequests int64 `json:"error_requests"`
307+
RetryErrorRequests int64 `json:"retry_error_requests"`
308+
RateLimitAttempts int64 `json:"rate_limit_attempts"`
307309
UsagePercent7d *float64 `json:"usage_percent_7d"`
308310
UsagePercent5h *float64 `json:"usage_percent_5h"`
309311
Usage5hDetail *accountUsageWindow `json:"usage_5h_detail,omitempty"`
@@ -315,6 +317,9 @@ type accountResponse struct {
315317
LastRateLimitedAt string `json:"last_rate_limited_at,omitempty"`
316318
LastTimeoutAt string `json:"last_timeout_at,omitempty"`
317319
LastServerErrorAt string `json:"last_server_error_at,omitempty"`
320+
CooldownReason string `json:"cooldown_reason,omitempty"`
321+
CooldownUntil string `json:"cooldown_until,omitempty"`
322+
ModelCooldowns []modelCooldownResponse `json:"model_cooldowns,omitempty"`
318323
Enabled bool `json:"enabled"`
319324
Locked bool `json:"locked"`
320325
AllowedAPIKeyIDs []int64 `json:"allowed_api_key_ids"`
@@ -325,6 +330,13 @@ type accountResponse struct {
325330
ImageQuotaResetAt string `json:"image_quota_reset_at,omitempty"`
326331
}
327332

333+
type modelCooldownResponse struct {
334+
Model string `json:"model"`
335+
Reason string `json:"reason"`
336+
ResetAt string `json:"reset_at"`
337+
Remaining int64 `json:"remaining_seconds"`
338+
}
339+
328340
type accountUsageWindow struct {
329341
Requests int64 `json:"requests"`
330342
Tokens int64 `json:"tokens"`
@@ -442,15 +454,32 @@ func (h *Handler) ListAccounts(c *gin.Context) {
442454
if !debug.LastServerErrorAt.IsZero() {
443455
resp.LastServerErrorAt = debug.LastServerErrorAt.Format(time.RFC3339)
444456
}
457+
if reason, until := acc.GetCooldownSnapshot(); !until.IsZero() && until.After(time.Now()) {
458+
resp.CooldownReason = reason
459+
resp.CooldownUntil = until.Format(time.RFC3339)
460+
}
461+
for _, cooldown := range acc.ActiveModelCooldowns() {
462+
resp.ModelCooldowns = append(resp.ModelCooldowns, modelCooldownResponse{
463+
Model: cooldown.Model,
464+
Reason: cooldown.Reason,
465+
ResetAt: cooldown.ResetAt.Format(time.RFC3339),
466+
Remaining: int64(time.Until(cooldown.ResetAt).Seconds()),
467+
})
468+
}
445469
// 使用运行时状态(优先于 DB 状态)
446470
resp.Status = acc.RuntimeStatus()
471+
} else if row.CooldownUntil.Valid && row.CooldownUntil.Time.After(time.Now()) {
472+
resp.CooldownReason = row.CooldownReason
473+
resp.CooldownUntil = row.CooldownUntil.Time.Format(time.RFC3339)
447474
}
448475
if resp.DispatchScore == 0 {
449476
resp.DispatchScore = dispatchScoreFallback(resp.SchedulerScore, resp.ScoreBiasEffective, resp.HealthTier, resp.Status)
450477
}
451478
if rc, ok := reqCounts[row.ID]; ok {
452479
resp.SuccessRequests = rc.SuccessCount
453480
resp.ErrorRequests = rc.ErrorCount
481+
resp.RetryErrorRequests = rc.RetryErrorCount
482+
resp.RateLimitAttempts = rc.RateLimitAttemptCount
454483
}
455484
if usage, ok := usage5h[row.ID]; ok {
456485
resp.Usage5hDetail = &accountUsageWindow{
@@ -2142,6 +2171,7 @@ type settingsResponse struct {
21422171
ProxyPoolEnabled bool `json:"proxy_pool_enabled"`
21432172
FastSchedulerEnabled bool `json:"fast_scheduler_enabled"`
21442173
MaxRetries int `json:"max_retries"`
2174+
MaxRateLimitRetries int `json:"max_rate_limit_retries"`
21452175
AllowRemoteMigration bool `json:"allow_remote_migration"`
21462176
DatabaseDriver string `json:"database_driver"`
21472177
DatabaseLabel string `json:"database_label"`
@@ -2182,6 +2212,7 @@ type updateSettingsReq struct {
21822212
ProxyPoolEnabled *bool `json:"proxy_pool_enabled"`
21832213
FastSchedulerEnabled *bool `json:"fast_scheduler_enabled"`
21842214
MaxRetries *int `json:"max_retries"`
2215+
MaxRateLimitRetries *int `json:"max_rate_limit_retries"`
21852216
AllowRemoteMigration *bool `json:"allow_remote_migration"`
21862217
ModelMapping *string `json:"model_mapping"`
21872218
ResinURL *string `json:"resin_url"`
@@ -2234,6 +2265,7 @@ func (h *Handler) GetSettings(c *gin.Context) {
22342265
ProxyPoolEnabled: h.store.GetProxyPoolEnabled(),
22352266
FastSchedulerEnabled: h.store.FastSchedulerEnabled(),
22362267
MaxRetries: h.store.GetMaxRetries(),
2268+
MaxRateLimitRetries: h.store.GetMaxRateLimitRetries(),
22372269
AllowRemoteMigration: h.store.GetAllowRemoteMigration() && adminAuthSource != "disabled",
22382270
DatabaseDriver: h.databaseDriver,
22392271
DatabaseLabel: h.databaseLabel,
@@ -2436,6 +2468,18 @@ func (h *Handler) UpdateSettings(c *gin.Context) {
24362468
log.Printf("设置已更新: max_retries = %d", v)
24372469
}
24382470

2471+
if req.MaxRateLimitRetries != nil {
2472+
v := *req.MaxRateLimitRetries
2473+
if v < 0 {
2474+
v = 0
2475+
}
2476+
if v > 10 {
2477+
v = 10
2478+
}
2479+
h.store.SetMaxRateLimitRetries(v)
2480+
log.Printf("设置已更新: max_rate_limit_retries = %d", v)
2481+
}
2482+
24392483
if req.AllowRemoteMigration != nil {
24402484
if *req.AllowRemoteMigration && !hasAdminSecret {
24412485
writeError(c, http.StatusBadRequest, "请先设置管理密钥,再启用远程迁移")
@@ -2560,6 +2604,7 @@ func (h *Handler) UpdateSettings(c *gin.Context) {
25602604
ProxyPoolEnabled: h.store.GetProxyPoolEnabled(),
25612605
FastSchedulerEnabled: h.store.FastSchedulerEnabled(),
25622606
MaxRetries: h.store.GetMaxRetries(),
2607+
MaxRateLimitRetries: h.store.GetMaxRateLimitRetries(),
25632608
AllowRemoteMigration: h.store.GetAllowRemoteMigration() && hasAdminSecret,
25642609
ModelMapping: h.store.GetModelMapping(),
25652610
ResinURL: resinURL,
@@ -2612,6 +2657,7 @@ func (h *Handler) UpdateSettings(c *gin.Context) {
26122657
ProxyPoolEnabled: h.store.GetProxyPoolEnabled(),
26132658
FastSchedulerEnabled: h.store.FastSchedulerEnabled(),
26142659
MaxRetries: h.store.GetMaxRetries(),
2660+
MaxRateLimitRetries: h.store.GetMaxRateLimitRetries(),
26152661
AllowRemoteMigration: h.store.GetAllowRemoteMigration() && adminAuthSource != "disabled",
26162662
DatabaseDriver: h.databaseDriver,
26172663
DatabaseLabel: h.databaseLabel,

admin/test_connection.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ func (h *Handler) TestConnection(c *gin.Context) {
9393
case http.StatusUnauthorized:
9494
h.store.MarkCooldown(account, 24*time.Hour, "unauthorized")
9595
case http.StatusTooManyRequests:
96-
proxy.Apply429Cooldown(h.store, account, errBody, resp)
96+
proxy.Apply429Cooldown(h.store, account, errBody, resp, testModel)
9797
}
9898
sendTestEvent(c, testEvent{Type: "error", Error: fmt.Sprintf("上游返回 %d: %s", resp.StatusCode, truncate(string(errBody), 500))})
9999
return
@@ -443,7 +443,7 @@ func (h *Handler) BatchTest(c *gin.Context) {
443443
atomic.AddInt64(&bannedCount, 1)
444444
case http.StatusTooManyRequests:
445445
proxy.SyncCodexUsageState(h.store, acc, resp)
446-
proxy.Apply429Cooldown(h.store, acc, body, resp)
446+
proxy.Apply429Cooldown(h.store, acc, body, resp, testModel)
447447
atomic.AddInt64(&rateLimitCount, 1)
448448
default:
449449
if shouldMarkBatchTestAccountError(resp.StatusCode, body) {

admin/usage_probe.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ func (h *Handler) ProbeUsageSnapshot(ctx context.Context, account *auth.Account)
4949
return nil
5050
case http.StatusTooManyRequests:
5151
h.store.ReportRequestFailure(account, "client", 0)
52-
proxy.Apply429Cooldown(h.store, account, nil, resp)
52+
proxy.Apply429Cooldown(h.store, account, nil, resp, h.store.GetTestModel())
5353
return nil
5454
default:
5555
if resp.StatusCode >= 500 {

auth/fast_scheduler.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,10 @@ func (a *Account) fastSchedulerSnapshot(baseLimit int64, now time.Time) (Account
405405
if atomic.LoadInt32(&a.DispatchPaused) != 0 {
406406
available = false
407407
}
408-
if a.Status == StatusCooldown && now.Before(a.CooldownUtil) && !a.premium5hCooldownSuppressedLocked(now) {
408+
if a.Status == StatusCooldown && now.Before(a.CooldownUtil) {
409+
available = false
410+
}
411+
if a.premium5hRateLimitedLocked(now) {
409412
available = false
410413
}
411414
// Free 账号 7d 用量耗尽,不参与调度

auth/fast_scheduler_test.go

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,7 @@ func TestStoreFastSchedulerTracksCooldownTransition(t *testing.T) {
434434
store.Release(got)
435435
}
436436

437-
func TestFastSchedulerPremium5hRateLimitUsesSingleConcurrencyAndRecoversAfterReset(t *testing.T) {
437+
func TestFastSchedulerPremium5hRateLimitIsFencedAndRecoversAfterReset(t *testing.T) {
438438
acc := &Account{
439439
DBID: 1,
440440
AccessToken: "token",
@@ -450,24 +450,19 @@ func TestFastSchedulerPremium5hRateLimitUsesSingleConcurrencyAndRecoversAfterRes
450450
scheduler.Rebuild([]*Account{acc})
451451

452452
sizes := scheduler.BucketSizes()
453-
if sizes[HealthTierRisky] != 1 {
454-
t.Fatalf("risky bucket size = %d, want 1", sizes[HealthTierRisky])
453+
if sizes[HealthTierRisky] != 0 {
454+
t.Fatalf("risky bucket size = %d, want 0 while premium 5h rate limit is active", sizes[HealthTierRisky])
455455
}
456456

457457
first := scheduler.Acquire()
458-
if first == nil {
459-
t.Fatal("first Acquire() returned nil")
460-
}
461-
462-
second := scheduler.Acquire()
463-
if second != nil {
464-
t.Fatal("second Acquire() should be nil while premium 5h rate limit is active")
458+
if first != nil {
459+
t.Fatal("Acquire() should be nil while premium 5h rate limit is active")
465460
}
466461

467462
acc.mu.Lock()
468463
acc.Reset5hAt = time.Now().Add(-time.Minute)
469464
acc.mu.Unlock()
470-
scheduler.Release(first)
465+
scheduler.Rebuild([]*Account{acc})
471466

472467
third := scheduler.Acquire()
473468
if third == nil {

auth/premium_rate_limit.go

Lines changed: 5 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -63,21 +63,6 @@ func (a *Account) premium5hRateLimitedLocked(now time.Time) bool {
6363
return a.Reset5hAt.After(now)
6464
}
6565

66-
func (a *Account) premium5hRateLimitWindowLocked(now time.Time) (bool, time.Time) {
67-
if !a.premium5hRateLimitedLocked(now) {
68-
return false, time.Time{}
69-
}
70-
return true, a.Reset5hAt
71-
}
72-
73-
func (a *Account) premium5hCooldownSuppressedLocked(now time.Time) bool {
74-
if a.Status != StatusCooldown || a.CooldownReason != "rate_limited" {
75-
return false
76-
}
77-
active, _ := a.premium5hRateLimitWindowLocked(now)
78-
return active
79-
}
80-
8166
// IsPremium5hRateLimited 判断账号当前是否处于 premium 5h 限流态。
8267
func (a *Account) IsPremium5hRateLimited() bool {
8368
a.mu.RLock()
@@ -135,11 +120,9 @@ func (s *Store) MarkPremium5hRateLimited(acc *Account, resetAt time.Time) {
135120
acc.Reset5hAt = resetAt
136121
acc.UsageUpdatedAt = now
137122
acc.LastRateLimitedAt = now
138-
if acc.Status == StatusCooldown && acc.CooldownReason == "rate_limited" {
139-
acc.Status = StatusReady
140-
acc.CooldownUtil = time.Time{}
141-
acc.CooldownReason = ""
142-
}
123+
acc.Status = StatusCooldown
124+
acc.CooldownUtil = resetAt
125+
acc.CooldownReason = "rate_limited"
143126
if acc.HealthTier != HealthTierBanned {
144127
acc.HealthTier = HealthTierRisky
145128
}
@@ -154,8 +137,8 @@ func (s *Store) MarkPremium5hRateLimited(acc *Account, resetAt time.Time) {
154137

155138
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
156139
defer cancel()
157-
if err := s.db.ClearCooldown(ctx, acc.DBID); err != nil {
158-
log.Printf("[账号 %d] 清理 premium 5h 限流冷却状态失败: %v", acc.DBID, err)
140+
if err := s.db.SetCooldown(ctx, acc.DBID, "rate_limited", resetAt); err != nil {
141+
log.Printf("[账号 %d] 持久化 premium 5h 限流冷却状态失败: %v", acc.DBID, err)
159142
}
160143
if err := s.db.UpdateUsageSnapshot5h(ctx, acc.DBID, 100, resetAt, now); err != nil {
161144
log.Printf("[账号 %d] 持久化 premium 5h 限流快照失败: %v", acc.DBID, err)

auth/premium_rate_limit_test.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,15 @@ func newPremium5hTestAccount(plan string, resetAt time.Time) *Account {
2020
}
2121
}
2222

23-
func TestPremium5hRateLimitedAccountRemainsSchedulable(t *testing.T) {
23+
func TestPremium5hRateLimitedAccountIsFencedFromScheduling(t *testing.T) {
2424
acc := newPremium5hTestAccount("plus", time.Now().Add(45*time.Minute))
2525

2626
snapshot := acc.GetSchedulerDebugSnapshot(4)
2727
if got := acc.RuntimeStatus(); got != "rate_limited" {
2828
t.Fatalf("RuntimeStatus() = %q, want rate_limited", got)
2929
}
30-
if !acc.IsAvailable() {
31-
t.Fatal("IsAvailable() = false, want true for premium 5h rate limited account")
30+
if acc.IsAvailable() {
31+
t.Fatal("IsAvailable() = true, want false for premium 5h rate limited account")
3232
}
3333
if snapshot.HealthTier != string(HealthTierRisky) {
3434
t.Fatalf("HealthTier = %q, want %q", snapshot.HealthTier, HealthTierRisky)
@@ -40,6 +40,9 @@ func TestPremium5hRateLimitedAccountRemainsSchedulable(t *testing.T) {
4040

4141
func TestPremium5hRateLimitExpiresAndUsageProbeResumes(t *testing.T) {
4242
acc := newPremium5hTestAccount("team", time.Now().Add(-time.Minute))
43+
acc.Status = StatusCooldown
44+
acc.CooldownReason = "rate_limited"
45+
acc.CooldownUtil = time.Now().Add(-time.Minute)
4346

4447
snapshot := acc.GetSchedulerDebugSnapshot(4)
4548
if got := acc.RuntimeStatus(); got != "active" {

0 commit comments

Comments
 (0)