Skip to content

Commit 07c297a

Browse files
committed
feat(auth): add persistent cooldown state management with file-backed store
- Introduced `CooldownStateStore` interface for managing independent cooldown state persistence.
- Implemented `FileCooldownStateStore` for storing cooldown states as per-auth `.cds` files with atomic writes and stale file cleanup.
- Enhanced `Manager` to support restoring state from `CooldownStateStore` and persisting state changes during auth updates.
- Updated tests to validate cooldown state saving, loading, concurrency handling, and error scenarios.

Closes: router-for-me#3368
1 parent d33ac5e commit 07c297a

10 files changed

Lines changed: 1129 additions & 11 deletions

File tree

config.example.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,10 @@ max-retry-interval: 30
119119
# When true, disable auth/model cooldown scheduling globally (prevents blackout windows after failure states).
120120
disable-cooling: false
121121

122+
# When true, persist per-auth cooldown status as .cds files next to auth files.
123+
# Default is false; when false, cooldown status is kept in memory only.
124+
save-cooldown-status: false
125+
122126
# Cooldown duration in seconds for transient upstream errors (408/500/502/503/504).
123127
# Set to 0 to keep the legacy 60-second cooldown; set to -1 to disable transient error cooldowns.
124128
transient-error-cooldown-seconds: 0

internal/config/config.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ type Config struct {
8080
// DisableCooling disables quota cooldown scheduling when true.
8181
DisableCooling bool `yaml:"disable-cooling" json:"disable-cooling"`
8282

83+
// SaveCooldownStatus, when true, persists per-auth runtime cooldown status as .cds files next to auth files.
84+
SaveCooldownStatus bool `yaml:"save-cooldown-status" json:"save-cooldown-status"`
85+
8386
// TransientErrorCooldownSeconds controls cooldowns for transient upstream errors.
8487
// 0 keeps the legacy default cooldown. Negative values disable these cooldowns.
8588
TransientErrorCooldownSeconds int `yaml:"transient-error-cooldown-seconds" json:"transient-error-cooldown-seconds"`
@@ -688,6 +691,7 @@ func LoadConfigOptional(configFile string, optional bool) (*Config, error) {
688691
cfg.UsageStatisticsEnabled = false
689692
cfg.RedisUsageQueueRetentionSeconds = 60
690693
cfg.DisableCooling = false
694+
cfg.SaveCooldownStatus = false
691695
cfg.TransientErrorCooldownSeconds = 0
692696
cfg.DisableImageGeneration = DisableImageGenerationOff
693697
cfg.Pprof.Enable = false

internal/config/parse.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ func ParseConfigBytes(data []byte) (*Config, error) {
2525
cfg.UsageStatisticsEnabled = false
2626
cfg.RedisUsageQueueRetentionSeconds = 60
2727
cfg.DisableCooling = false
28+
cfg.SaveCooldownStatus = false
2829
cfg.TransientErrorCooldownSeconds = 0
2930
cfg.DisableImageGeneration = DisableImageGenerationOff
3031
cfg.Pprof.Enable = false

internal/watcher/diff/config_diff.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ func BuildConfigChangeDetails(oldCfg, newCfg *config.Config) []string {
4545
if oldCfg.DisableCooling != newCfg.DisableCooling {
4646
changes = append(changes, fmt.Sprintf("disable-cooling: %t -> %t", oldCfg.DisableCooling, newCfg.DisableCooling))
4747
}
48+
if oldCfg.SaveCooldownStatus != newCfg.SaveCooldownStatus {
49+
changes = append(changes, fmt.Sprintf("save-cooldown-status: %t -> %t", oldCfg.SaveCooldownStatus, newCfg.SaveCooldownStatus))
50+
}
4851
if oldCfg.TransientErrorCooldownSeconds != newCfg.TransientErrorCooldownSeconds {
4952
changes = append(changes, fmt.Sprintf("transient-error-cooldown-seconds: %d -> %d", oldCfg.TransientErrorCooldownSeconds, newCfg.TransientErrorCooldownSeconds))
5053
}

internal/watcher/diff/config_diff_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ func TestBuildConfigChangeDetails_FlagsAndKeys(t *testing.T) {
193193
LoggingToFile: false,
194194
UsageStatisticsEnabled: false,
195195
DisableCooling: false,
196+
SaveCooldownStatus: false,
196197
TransientErrorCooldownSeconds: 0,
197198
RequestRetry: 1,
198199
MaxRetryCredentials: 1,
@@ -217,6 +218,7 @@ func TestBuildConfigChangeDetails_FlagsAndKeys(t *testing.T) {
217218
LoggingToFile: true,
218219
UsageStatisticsEnabled: true,
219220
DisableCooling: true,
221+
SaveCooldownStatus: true,
220222
TransientErrorCooldownSeconds: -1,
221223
RequestRetry: 2,
222224
MaxRetryCredentials: 3,
@@ -252,6 +254,7 @@ func TestBuildConfigChangeDetails_FlagsAndKeys(t *testing.T) {
252254
expectContains(t, details, "logging-to-file: false -> true")
253255
expectContains(t, details, "usage-statistics-enabled: false -> true")
254256
expectContains(t, details, "disable-cooling: false -> true")
257+
expectContains(t, details, "save-cooldown-status: false -> true")
255258
expectContains(t, details, "transient-error-cooldown-seconds: 0 -> -1")
256259
expectContains(t, details, "disable-image-generation: false -> true")
257260
expectContains(t, details, "request-log: false -> true")
@@ -282,6 +285,7 @@ func TestBuildConfigChangeDetails_AllBranches(t *testing.T) {
282285
LoggingToFile: false,
283286
UsageStatisticsEnabled: false,
284287
DisableCooling: false,
288+
SaveCooldownStatus: false,
285289
TransientErrorCooldownSeconds: 0,
286290
RequestRetry: 1,
287291
MaxRetryCredentials: 1,
@@ -330,6 +334,7 @@ func TestBuildConfigChangeDetails_AllBranches(t *testing.T) {
330334
LoggingToFile: true,
331335
UsageStatisticsEnabled: true,
332336
DisableCooling: true,
337+
SaveCooldownStatus: true,
333338
TransientErrorCooldownSeconds: -1,
334339
RequestRetry: 2,
335340
MaxRetryCredentials: 3,
@@ -385,6 +390,7 @@ func TestBuildConfigChangeDetails_AllBranches(t *testing.T) {
385390
expectContains(t, changes, "logging-to-file: false -> true")
386391
expectContains(t, changes, "usage-statistics-enabled: false -> true")
387392
expectContains(t, changes, "disable-cooling: false -> true")
393+
expectContains(t, changes, "save-cooldown-status: false -> true")
388394
expectContains(t, changes, "transient-error-cooldown-seconds: 0 -> -1")
389395
expectContains(t, changes, "disable-image-generation: false -> true")
390396
expectContains(t, changes, "request-retry: 1 -> 2")

0 commit comments

Comments
 (0)