Skip to content

Commit d33ac5e

Browse files
committed
feat(auth): add transient error cooldown configuration and adjust retry logic
- Introduced `SetTransientErrorCooldownSeconds` to enable configurable cooldowns for transient errors (e.g., 408/500/502/503/504).
- Updated retry scheduling logic to use the new `nextTransientErrorRetryAfter` function.
- Modified config parsing to include `transient-error-cooldown-seconds` with support for disabling or defaulting to legacy behavior.
- Expanded tests to validate transient cooldown logic with various configurations and edge cases.

Closes: router-for-me#3315
1 parent 4926630 commit d33ac5e

10 files changed

Lines changed: 250 additions & 50 deletions

File tree

cmd/server/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,7 @@ func main() {
505505
redisqueue.SetUsageStatisticsEnabled(cfg.UsageStatisticsEnabled)
506506
redisqueue.SetRetentionSeconds(cfg.RedisUsageQueueRetentionSeconds)
507507
coreauth.SetQuotaCooldownDisabled(cfg.DisableCooling)
508+
coreauth.SetTransientErrorCooldownSeconds(cfg.TransientErrorCooldownSeconds)
508509

509510
if err = logging.ConfigureLogOutput(cfg); err != nil {
510511
log.Errorf("failed to configure log output: %v", err)

config.example.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,10 @@ max-retry-interval: 30
119119
# When true, disable auth/model cooldown scheduling globally (prevents blackout windows after failure states).
120120
disable-cooling: false
121121

122+
# Cooldown duration in seconds for transient upstream errors (408/500/502/503/504).
123+
# Set to 0 to keep the legacy 60-second cooldown, to a positive value to use that many seconds,
# or to any negative value (e.g. -1) to disable transient error cooldowns.
124+
transient-error-cooldown-seconds: 0
125+
122126
# When true, globally disable Claude request cloaking (the Claude Code CLI disguise and
123127
# system prompt replacement), so the original system prompt is passed through to Claude as-is.
124128
# Individual credentials can still override this: a claude-api-key entry via its "cloak.mode",

internal/api/server.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,7 @@ func NewServer(cfg *config.Config, authManager *auth.Manager, accessManager *sdk
328328
}
329329
managementasset.SetCurrentConfig(cfg)
330330
auth.SetQuotaCooldownDisabled(cfg.DisableCooling)
331+
auth.SetTransientErrorCooldownSeconds(cfg.TransientErrorCooldownSeconds)
331332
applySignatureCacheConfig(nil, cfg)
332333
// Initialize management handler
333334
s.mgmt = managementHandlers.NewHandler(cfg, configFilePath, authManager)
@@ -1596,6 +1597,9 @@ func (s *Server) UpdateClients(cfg *config.Config) {
15961597
if oldCfg == nil || oldCfg.DisableCooling != cfg.DisableCooling {
15971598
auth.SetQuotaCooldownDisabled(cfg.DisableCooling)
15981599
}
1600+
if oldCfg == nil || oldCfg.TransientErrorCooldownSeconds != cfg.TransientErrorCooldownSeconds {
1601+
auth.SetTransientErrorCooldownSeconds(cfg.TransientErrorCooldownSeconds)
1602+
}
15991603

16001604
if oldCfg != nil && oldCfg.DisableImageGeneration != cfg.DisableImageGeneration {
16011605
log.Infof("disable-image-generation updated: %v -> %v", oldCfg.DisableImageGeneration, cfg.DisableImageGeneration)

internal/config/config.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ type Config struct {
8080
// DisableCooling disables quota cooldown scheduling when true.
8181
DisableCooling bool `yaml:"disable-cooling" json:"disable-cooling"`
8282

83+
// TransientErrorCooldownSeconds controls cooldowns for transient upstream errors.
84+
// 0 keeps the legacy 60-second default cooldown, positive values set the cooldown
// in seconds, and negative values disable these cooldowns.
85+
TransientErrorCooldownSeconds int `yaml:"transient-error-cooldown-seconds" json:"transient-error-cooldown-seconds"`
86+
8387
// AuthAutoRefreshWorkers overrides the size of the core auth auto-refresh worker pool.
8488
// When <= 0, the default worker count is used.
8589
AuthAutoRefreshWorkers int `yaml:"auth-auto-refresh-workers" json:"auth-auto-refresh-workers"`
@@ -684,6 +688,7 @@ func LoadConfigOptional(configFile string, optional bool) (*Config, error) {
684688
cfg.UsageStatisticsEnabled = false
685689
cfg.RedisUsageQueueRetentionSeconds = 60
686690
cfg.DisableCooling = false
691+
cfg.TransientErrorCooldownSeconds = 0
687692
cfg.DisableImageGeneration = DisableImageGenerationOff
688693
cfg.Pprof.Enable = false
689694
cfg.Pprof.Addr = DefaultPprofAddr

internal/config/parse.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ func ParseConfigBytes(data []byte) (*Config, error) {
2525
cfg.UsageStatisticsEnabled = false
2626
cfg.RedisUsageQueueRetentionSeconds = 60
2727
cfg.DisableCooling = false
28+
cfg.TransientErrorCooldownSeconds = 0
2829
cfg.DisableImageGeneration = DisableImageGenerationOff
2930
cfg.Pprof.Enable = false
3031
cfg.Pprof.Addr = DefaultPprofAddr

internal/watcher/diff/config_diff.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ func BuildConfigChangeDetails(oldCfg, newCfg *config.Config) []string {
4545
if oldCfg.DisableCooling != newCfg.DisableCooling {
4646
changes = append(changes, fmt.Sprintf("disable-cooling: %t -> %t", oldCfg.DisableCooling, newCfg.DisableCooling))
4747
}
48+
if oldCfg.TransientErrorCooldownSeconds != newCfg.TransientErrorCooldownSeconds {
49+
changes = append(changes, fmt.Sprintf("transient-error-cooldown-seconds: %d -> %d", oldCfg.TransientErrorCooldownSeconds, newCfg.TransientErrorCooldownSeconds))
50+
}
4851
if oldCfg.DisableClaudeCloakMode != newCfg.DisableClaudeCloakMode {
4952
changes = append(changes, fmt.Sprintf("disable-claude-cloak-mode: %t -> %t", oldCfg.DisableClaudeCloakMode, newCfg.DisableClaudeCloakMode))
5053
}

internal/watcher/diff/config_diff_test.go

Lines changed: 53 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -187,20 +187,21 @@ func TestBuildConfigChangeDetails_SecretsAndCounts(t *testing.T) {
187187

188188
func TestBuildConfigChangeDetails_FlagsAndKeys(t *testing.T) {
189189
oldCfg := &config.Config{
190-
Port: 1000,
191-
AuthDir: "/old",
192-
Debug: false,
193-
LoggingToFile: false,
194-
UsageStatisticsEnabled: false,
195-
DisableCooling: false,
196-
RequestRetry: 1,
197-
MaxRetryCredentials: 1,
198-
MaxRetryInterval: 1,
199-
WebsocketAuth: false,
200-
QuotaExceeded: config.QuotaExceeded{SwitchProject: false, SwitchPreviewModel: false, AntigravityCredits: false},
201-
ClaudeKey: []config.ClaudeKey{{APIKey: "c1"}},
202-
CodexKey: []config.CodexKey{{APIKey: "x1"}},
203-
RemoteManagement: config.RemoteManagement{DisableControlPanel: false, PanelGitHubRepository: "old/repo", SecretKey: "keep"},
190+
Port: 1000,
191+
AuthDir: "/old",
192+
Debug: false,
193+
LoggingToFile: false,
194+
UsageStatisticsEnabled: false,
195+
DisableCooling: false,
196+
TransientErrorCooldownSeconds: 0,
197+
RequestRetry: 1,
198+
MaxRetryCredentials: 1,
199+
MaxRetryInterval: 1,
200+
WebsocketAuth: false,
201+
QuotaExceeded: config.QuotaExceeded{SwitchProject: false, SwitchPreviewModel: false, AntigravityCredits: false},
202+
ClaudeKey: []config.ClaudeKey{{APIKey: "c1"}},
203+
CodexKey: []config.CodexKey{{APIKey: "x1"}},
204+
RemoteManagement: config.RemoteManagement{DisableControlPanel: false, PanelGitHubRepository: "old/repo", SecretKey: "keep"},
204205
SDKConfig: sdkconfig.SDKConfig{
205206
RequestLog: false,
206207
ProxyURL: "http://old-proxy",
@@ -210,17 +211,18 @@ func TestBuildConfigChangeDetails_FlagsAndKeys(t *testing.T) {
210211
},
211212
}
212213
newCfg := &config.Config{
213-
Port: 2000,
214-
AuthDir: "/new",
215-
Debug: true,
216-
LoggingToFile: true,
217-
UsageStatisticsEnabled: true,
218-
DisableCooling: true,
219-
RequestRetry: 2,
220-
MaxRetryCredentials: 3,
221-
MaxRetryInterval: 3,
222-
WebsocketAuth: true,
223-
QuotaExceeded: config.QuotaExceeded{SwitchProject: true, SwitchPreviewModel: true, AntigravityCredits: true},
214+
Port: 2000,
215+
AuthDir: "/new",
216+
Debug: true,
217+
LoggingToFile: true,
218+
UsageStatisticsEnabled: true,
219+
DisableCooling: true,
220+
TransientErrorCooldownSeconds: -1,
221+
RequestRetry: 2,
222+
MaxRetryCredentials: 3,
223+
MaxRetryInterval: 3,
224+
WebsocketAuth: true,
225+
QuotaExceeded: config.QuotaExceeded{SwitchProject: true, SwitchPreviewModel: true, AntigravityCredits: true},
224226
ClaudeKey: []config.ClaudeKey{
225227
{APIKey: "c1", BaseURL: "http://new", ProxyURL: "http://p", Headers: map[string]string{"H": "1"}, ExcludedModels: []string{"a"}},
226228
{APIKey: "c2"},
@@ -250,6 +252,7 @@ func TestBuildConfigChangeDetails_FlagsAndKeys(t *testing.T) {
250252
expectContains(t, details, "logging-to-file: false -> true")
251253
expectContains(t, details, "usage-statistics-enabled: false -> true")
252254
expectContains(t, details, "disable-cooling: false -> true")
255+
expectContains(t, details, "transient-error-cooldown-seconds: 0 -> -1")
253256
expectContains(t, details, "disable-image-generation: false -> true")
254257
expectContains(t, details, "request-log: false -> true")
255258
expectContains(t, details, "request-retry: 1 -> 2")
@@ -273,17 +276,18 @@ func TestBuildConfigChangeDetails_FlagsAndKeys(t *testing.T) {
273276

274277
func TestBuildConfigChangeDetails_AllBranches(t *testing.T) {
275278
oldCfg := &config.Config{
276-
Port: 1,
277-
AuthDir: "/a",
278-
Debug: false,
279-
LoggingToFile: false,
280-
UsageStatisticsEnabled: false,
281-
DisableCooling: false,
282-
RequestRetry: 1,
283-
MaxRetryCredentials: 1,
284-
MaxRetryInterval: 1,
285-
WebsocketAuth: false,
286-
QuotaExceeded: config.QuotaExceeded{SwitchProject: false, SwitchPreviewModel: false, AntigravityCredits: false},
279+
Port: 1,
280+
AuthDir: "/a",
281+
Debug: false,
282+
LoggingToFile: false,
283+
UsageStatisticsEnabled: false,
284+
DisableCooling: false,
285+
TransientErrorCooldownSeconds: 0,
286+
RequestRetry: 1,
287+
MaxRetryCredentials: 1,
288+
MaxRetryInterval: 1,
289+
WebsocketAuth: false,
290+
QuotaExceeded: config.QuotaExceeded{SwitchProject: false, SwitchPreviewModel: false, AntigravityCredits: false},
287291
GeminiKey: []config.GeminiKey{
288292
{APIKey: "g-old", BaseURL: "http://g-old", ProxyURL: "http://gp-old", Headers: map[string]string{"A": "1"}},
289293
},
@@ -320,17 +324,18 @@ func TestBuildConfigChangeDetails_AllBranches(t *testing.T) {
320324
},
321325
}
322326
newCfg := &config.Config{
323-
Port: 2,
324-
AuthDir: "/b",
325-
Debug: true,
326-
LoggingToFile: true,
327-
UsageStatisticsEnabled: true,
328-
DisableCooling: true,
329-
RequestRetry: 2,
330-
MaxRetryCredentials: 3,
331-
MaxRetryInterval: 3,
332-
WebsocketAuth: true,
333-
QuotaExceeded: config.QuotaExceeded{SwitchProject: true, SwitchPreviewModel: true, AntigravityCredits: true},
327+
Port: 2,
328+
AuthDir: "/b",
329+
Debug: true,
330+
LoggingToFile: true,
331+
UsageStatisticsEnabled: true,
332+
DisableCooling: true,
333+
TransientErrorCooldownSeconds: -1,
334+
RequestRetry: 2,
335+
MaxRetryCredentials: 3,
336+
MaxRetryInterval: 3,
337+
WebsocketAuth: true,
338+
QuotaExceeded: config.QuotaExceeded{SwitchProject: true, SwitchPreviewModel: true, AntigravityCredits: true},
334339
GeminiKey: []config.GeminiKey{
335340
{APIKey: "g-new", BaseURL: "http://g-new", ProxyURL: "http://gp-new", Headers: map[string]string{"A": "2"}, ExcludedModels: []string{"x", "y"}},
336341
},
@@ -380,6 +385,7 @@ func TestBuildConfigChangeDetails_AllBranches(t *testing.T) {
380385
expectContains(t, changes, "logging-to-file: false -> true")
381386
expectContains(t, changes, "usage-statistics-enabled: false -> true")
382387
expectContains(t, changes, "disable-cooling: false -> true")
388+
expectContains(t, changes, "transient-error-cooldown-seconds: 0 -> -1")
383389
expectContains(t, changes, "disable-image-generation: false -> true")
384390
expectContains(t, changes, "request-retry: 1 -> 2")
385391
expectContains(t, changes, "max-retry-credentials: 1 -> 3")

sdk/cliproxy/auth/conductor.go

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,15 +84,23 @@ const (
8484
refreshIneffectiveBackoff = 30 * time.Second
8585
quotaBackoffBase = time.Second
8686
quotaBackoffMax = 30 * time.Minute
87+
transientErrorCooldown = time.Minute
8788
)
8889

8990
var quotaCooldownDisabled atomic.Bool
91+
var transientErrorCooldownSeconds atomic.Int64
9092

9193
// SetQuotaCooldownDisabled toggles quota cooldown scheduling globally.
9294
func SetQuotaCooldownDisabled(disable bool) {
9395
quotaCooldownDisabled.Store(disable)
9496
}
9597

98+
// SetTransientErrorCooldownSeconds configures cooldowns for 408/500/502/503/504.
99+
// 0 keeps the legacy 60-second default; positive values set the cooldown in seconds;
// negative values disable transient error cooldowns.
100+
func SetTransientErrorCooldownSeconds(seconds int) {
101+
transientErrorCooldownSeconds.Store(int64(seconds))
102+
}
103+
96104
func quotaCooldownDisabledForAuth(auth *Auth) bool {
97105
if auth != nil {
98106
if override, ok := auth.DisableCoolingOverride(); ok {
@@ -102,6 +110,17 @@ func quotaCooldownDisabledForAuth(auth *Auth) bool {
102110
return quotaCooldownDisabled.Load()
103111
}
104112

113+
func nextTransientErrorRetryAfter(now time.Time) time.Time {
114+
seconds := transientErrorCooldownSeconds.Load()
115+
if seconds < 0 {
116+
return time.Time{}
117+
}
118+
if seconds == 0 {
119+
return now.Add(transientErrorCooldown)
120+
}
121+
return now.Add(time.Duration(seconds) * time.Second)
122+
}
123+
105124
// Result captures execution outcome used to adjust auth state.
106125
type Result struct {
107126
// AuthID references the auth that produced this result.
@@ -2909,8 +2928,7 @@ func (m *Manager) MarkResult(ctx context.Context, result Result) {
29092928
if disableCooling {
29102929
state.NextRetryAfter = time.Time{}
29112930
} else {
2912-
next := now.Add(1 * time.Minute)
2913-
state.NextRetryAfter = next
2931+
state.NextRetryAfter = nextTransientErrorRetryAfter(now)
29142932
}
29152933
default:
29162934
state.NextRetryAfter = time.Time{}
@@ -3414,7 +3432,7 @@ func applyAuthFailureState(auth *Auth, resultErr *Error, retryAfter *time.Durati
34143432
if disableCooling {
34153433
auth.NextRetryAfter = time.Time{}
34163434
} else {
3417-
auth.NextRetryAfter = now.Add(1 * time.Minute)
3435+
auth.NextRetryAfter = nextTransientErrorRetryAfter(now)
34183436
}
34193437
default:
34203438
if auth.StatusMessage == "" {

0 commit comments

Comments
 (0)