Skip to content

Commit c67f875

Browse files
authored
Merge pull request #3 from hex-ci/devin/1780645724-eof-per-auth-retry
Honor per-auth request_retry override for unexpected-EOF retries
2 parents 5a0adba + 6e08223 commit c67f875

2 files changed

Lines changed: 112 additions & 21 deletions

File tree

sdk/cliproxy/auth/conductor.go

Lines changed: 64 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1295,13 +1295,14 @@ func (m *Manager) Execute(ctx context.Context, providers []string, req cliproxye
12951295
_, maxRetryCredentials, maxWait := m.retrySettings()
12961296

12971297
var lastErr error
1298+
var failedAuthID string
12981299
for attempt := 0; ; attempt++ {
1299-
resp, errExec := m.executeMixedOnce(ctx, normalized, req, opts, maxRetryCredentials)
1300+
resp, errExec := m.executeMixedOnce(ctx, normalized, req, opts, maxRetryCredentials, &failedAuthID)
13001301
if errExec == nil {
13011302
return resp, nil
13021303
}
13031304
lastErr = errExec
1304-
wait, shouldRetry := m.shouldRetryAfterError(errExec, attempt, normalized, req.Model, maxWait)
1305+
wait, shouldRetry := m.shouldRetryAfterError(errExec, attempt, normalized, req.Model, maxWait, failedAuthID)
13051306
if !shouldRetry {
13061307
break
13071308
}
@@ -1330,13 +1331,14 @@ func (m *Manager) ExecuteCount(ctx context.Context, providers []string, req clip
13301331
_, maxRetryCredentials, maxWait := m.retrySettings()
13311332

13321333
var lastErr error
1334+
var failedAuthID string
13331335
for attempt := 0; ; attempt++ {
1334-
resp, errExec := m.executeCountMixedOnce(ctx, normalized, req, opts, maxRetryCredentials)
1336+
resp, errExec := m.executeCountMixedOnce(ctx, normalized, req, opts, maxRetryCredentials, &failedAuthID)
13351337
if errExec == nil {
13361338
return resp, nil
13371339
}
13381340
lastErr = errExec
1339-
wait, shouldRetry := m.shouldRetryAfterError(errExec, attempt, normalized, req.Model, maxWait)
1341+
wait, shouldRetry := m.shouldRetryAfterError(errExec, attempt, normalized, req.Model, maxWait, failedAuthID)
13401342
if !shouldRetry {
13411343
break
13421344
}
@@ -1361,13 +1363,14 @@ func (m *Manager) ExecuteStream(ctx context.Context, providers []string, req cli
13611363
_, maxRetryCredentials, maxWait := m.retrySettings()
13621364

13631365
var lastErr error
1366+
var failedAuthID string
13641367
for attempt := 0; ; attempt++ {
1365-
result, errStream := m.executeStreamMixedOnce(ctx, normalized, req, opts, maxRetryCredentials)
1368+
result, errStream := m.executeStreamMixedOnce(ctx, normalized, req, opts, maxRetryCredentials, &failedAuthID)
13661369
if errStream == nil {
13671370
return result, nil
13681371
}
13691372
lastErr = errStream
1370-
wait, shouldRetry := m.shouldRetryAfterError(errStream, attempt, normalized, req.Model, maxWait)
1373+
wait, shouldRetry := m.shouldRetryAfterError(errStream, attempt, normalized, req.Model, maxWait, failedAuthID)
13711374
if !shouldRetry {
13721375
break
13731376
}
@@ -1390,7 +1393,7 @@ func (m *Manager) ExecuteStream(ctx context.Context, providers []string, req cli
13901393
return nil, &Error{Code: "auth_not_found", Message: "no auth available"}
13911394
}
13921395

1393-
func (m *Manager) executeMixedOnce(ctx context.Context, providers []string, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, maxRetryCredentials int) (cliproxyexecutor.Response, error) {
1396+
func (m *Manager) executeMixedOnce(ctx context.Context, providers []string, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, maxRetryCredentials int, failedAuthID *string) (cliproxyexecutor.Response, error) {
13941397
if len(providers) == 0 {
13951398
return cliproxyexecutor.Response{}, &Error{Code: "provider_not_found", Message: "no provider supplied"}
13961399
}
@@ -1423,6 +1426,9 @@ func (m *Manager) executeMixedOnce(ctx context.Context, providers []string, req
14231426
entry := logEntryWithRequestID(ctx)
14241427
debugLogAuthSelection(entry, auth, provider, req.Model)
14251428
publishSelectedAuthMetadata(opts.Metadata, auth.ID)
1429+
if failedAuthID != nil {
1430+
*failedAuthID = auth.ID
1431+
}
14261432

14271433
tried[auth.ID] = struct{}{}
14281434
execCtx := ctx
@@ -1489,7 +1495,7 @@ func (m *Manager) executeMixedOnce(ctx context.Context, providers []string, req
14891495
}
14901496
}
14911497

1492-
func (m *Manager) executeCountMixedOnce(ctx context.Context, providers []string, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, maxRetryCredentials int) (cliproxyexecutor.Response, error) {
1498+
func (m *Manager) executeCountMixedOnce(ctx context.Context, providers []string, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, maxRetryCredentials int, failedAuthID *string) (cliproxyexecutor.Response, error) {
14931499
if len(providers) == 0 {
14941500
return cliproxyexecutor.Response{}, &Error{Code: "provider_not_found", Message: "no provider supplied"}
14951501
}
@@ -1522,6 +1528,9 @@ func (m *Manager) executeCountMixedOnce(ctx context.Context, providers []string,
15221528
entry := logEntryWithRequestID(ctx)
15231529
debugLogAuthSelection(entry, auth, provider, req.Model)
15241530
publishSelectedAuthMetadata(opts.Metadata, auth.ID)
1531+
if failedAuthID != nil {
1532+
*failedAuthID = auth.ID
1533+
}
15251534

15261535
tried[auth.ID] = struct{}{}
15271536
execCtx := ctx
@@ -1588,7 +1597,7 @@ func (m *Manager) executeCountMixedOnce(ctx context.Context, providers []string,
15881597
}
15891598
}
15901599

1591-
func (m *Manager) executeStreamMixedOnce(ctx context.Context, providers []string, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, maxRetryCredentials int) (*cliproxyexecutor.StreamResult, error) {
1600+
func (m *Manager) executeStreamMixedOnce(ctx context.Context, providers []string, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, maxRetryCredentials int, failedAuthID *string) (*cliproxyexecutor.StreamResult, error) {
15921601
if len(providers) == 0 {
15931602
return nil, &Error{Code: "provider_not_found", Message: "no provider supplied"}
15941603
}
@@ -1621,6 +1630,9 @@ func (m *Manager) executeStreamMixedOnce(ctx context.Context, providers []string
16211630
entry := logEntryWithRequestID(ctx)
16221631
debugLogAuthSelection(entry, auth, provider, req.Model)
16231632
publishSelectedAuthMetadata(opts.Metadata, auth.ID)
1633+
if failedAuthID != nil {
1634+
*failedAuthID = auth.ID
1635+
}
16241636

16251637
tried[auth.ID] = struct{}{}
16261638
execCtx := ctx
@@ -2305,7 +2317,46 @@ func (m *Manager) retryAllowed(attempt int, providers []string) bool {
23052317
return false
23062318
}
23072319

2308-
func (m *Manager) shouldRetryAfterError(err error, attempt int, providers []string, model string, maxWait time.Duration) (time.Duration, bool) {
2320+
// eofRetryAllowed decides whether a statusless "unexpected EOF" may be retried.
2321+
// It honors the auth-file scoped request_retry override of the credential that
2322+
// actually failed (failedAuthID) so a credential that disabled retries is not
2323+
// retried just because another credential for the provider still allows it.
2324+
// When the failing auth is unknown it falls back to the provider-wide check.
2325+
func (m *Manager) eofRetryAllowed(attempt int, providers []string, failedAuthID string) bool {
2326+
if allowed, known := m.authRetryAllowed(attempt, failedAuthID); known {
2327+
return allowed
2328+
}
2329+
return m.retryAllowed(attempt, providers)
2330+
}
2331+
2332+
// authRetryAllowed reports whether the specific auth permits another attempt,
2333+
// honoring its auth-file scoped request_retry override. The second return value
2334+
// is false when the auth is unknown (empty ID or not registered).
2335+
func (m *Manager) authRetryAllowed(attempt int, authID string) (bool, bool) {
2336+
if m == nil || attempt < 0 {
2337+
return false, false
2338+
}
2339+
authID = strings.TrimSpace(authID)
2340+
if authID == "" {
2341+
return false, false
2342+
}
2343+
m.mu.RLock()
2344+
auth := m.auths[authID]
2345+
m.mu.RUnlock()
2346+
if auth == nil {
2347+
return false, false
2348+
}
2349+
effectiveRetry := int(m.requestRetry.Load())
2350+
if override, ok := auth.RequestRetryOverride(); ok {
2351+
effectiveRetry = override
2352+
}
2353+
if effectiveRetry < 0 {
2354+
effectiveRetry = 0
2355+
}
2356+
return attempt < effectiveRetry, true
2357+
}
2358+
2359+
func (m *Manager) shouldRetryAfterError(err error, attempt int, providers []string, model string, maxWait time.Duration, failedAuthID string) (time.Duration, bool) {
23092360
if err == nil {
23102361
return 0, false
23112362
}
@@ -2327,9 +2378,10 @@ func (m *Manager) shouldRetryAfterError(err error, attempt int, providers []stri
23272378
// body merely contains "unexpected EOF") has already had a cooldown applied
23282379
// by MarkResult, so it must fall through to the cooldown-aware path below
23292380
// rather than force an immediate retry that selection would reject with a
2330-
// model-cooldown error.
2381+
// model-cooldown error. eofRetryAllowed also honors the failed credential's
2382+
// auth-file scoped request_retry override.
23312383
if status == 0 && isUnexpectedEOFError(err) {
2332-
if !m.retryAllowed(attempt, providers) {
2384+
if !m.eofRetryAllowed(attempt, providers, failedAuthID) {
23332385
return 0, false
23342386
}
23352387
return 0, true

sdk/cliproxy/auth/conductor_overrides_test.go

Lines changed: 48 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ func TestManager_ShouldRetryAfterError_RespectsAuthRequestRetryOverride(t *testi
4343
}
4444

4545
_, _, maxWait := m.retrySettings()
46-
wait, shouldRetry := m.shouldRetryAfterError(&Error{HTTPStatus: 500, Message: "boom"}, 0, []string{"claude"}, model, maxWait)
46+
wait, shouldRetry := m.shouldRetryAfterError(&Error{HTTPStatus: 500, Message: "boom"}, 0, []string{"claude"}, model, maxWait, "")
4747
if shouldRetry {
4848
t.Fatalf("expected shouldRetry=false for request_retry=0, got true (wait=%v)", wait)
4949
}
@@ -53,15 +53,15 @@ func TestManager_ShouldRetryAfterError_RespectsAuthRequestRetryOverride(t *testi
5353
t.Fatalf("update auth: %v", errUpdate)
5454
}
5555

56-
wait, shouldRetry = m.shouldRetryAfterError(&Error{HTTPStatus: 500, Message: "boom"}, 0, []string{"claude"}, model, maxWait)
56+
wait, shouldRetry = m.shouldRetryAfterError(&Error{HTTPStatus: 500, Message: "boom"}, 0, []string{"claude"}, model, maxWait, "")
5757
if !shouldRetry {
5858
t.Fatalf("expected shouldRetry=true for request_retry=1, got false")
5959
}
6060
if wait <= 0 {
6161
t.Fatalf("expected wait > 0, got %v", wait)
6262
}
6363

64-
_, shouldRetry = m.shouldRetryAfterError(&Error{HTTPStatus: 500, Message: "boom"}, 1, []string{"claude"}, model, maxWait)
64+
_, shouldRetry = m.shouldRetryAfterError(&Error{HTTPStatus: 500, Message: "boom"}, 1, []string{"claude"}, model, maxWait, "")
6565
if shouldRetry {
6666
t.Fatalf("expected shouldRetry=false on attempt=1 for request_retry=1, got true")
6767
}
@@ -101,7 +101,7 @@ func TestManager_ShouldRetryAfterError_UsesOAuthModelAliasForCooldown(t *testing
101101
}
102102

103103
_, _, maxWait := m.retrySettings()
104-
wait, shouldRetry := m.shouldRetryAfterError(&Error{HTTPStatus: 429, Message: "quota"}, 0, []string{"kimi"}, routeModel, maxWait)
104+
wait, shouldRetry := m.shouldRetryAfterError(&Error{HTTPStatus: 429, Message: "quota"}, 0, []string{"kimi"}, routeModel, maxWait, "")
105105
if !shouldRetry {
106106
t.Fatalf("expected shouldRetry=true, got false (wait=%v)", wait)
107107
}
@@ -124,7 +124,7 @@ func TestManager_ShouldRetryAfterError_RetriesUnexpectedEOF(t *testing.T) {
124124

125125
// A statusless "unexpected EOF" error (as produced by a truncated upstream
126126
// stream) must be retried immediately rather than surfaced to the client.
127-
wait, shouldRetry := m.shouldRetryAfterError(&Error{Message: "unexpected EOF"}, 0, []string{"claude"}, model, maxWait)
127+
wait, shouldRetry := m.shouldRetryAfterError(&Error{Message: "unexpected EOF"}, 0, []string{"claude"}, model, maxWait, "")
128128
if !shouldRetry {
129129
t.Fatalf("expected shouldRetry=true for unexpected EOF, got false")
130130
}
@@ -134,12 +134,12 @@ func TestManager_ShouldRetryAfterError_RetriesUnexpectedEOF(t *testing.T) {
134134

135135
// io.ErrUnexpectedEOF (possibly wrapped) is detected as well.
136136
wrapped := fmt.Errorf("read stream: %w", io.ErrUnexpectedEOF)
137-
if _, shouldRetry = m.shouldRetryAfterError(wrapped, 0, []string{"claude"}, model, maxWait); !shouldRetry {
137+
if _, shouldRetry = m.shouldRetryAfterError(wrapped, 0, []string{"claude"}, model, maxWait, ""); !shouldRetry {
138138
t.Fatalf("expected shouldRetry=true for wrapped io.ErrUnexpectedEOF, got false")
139139
}
140140

141141
// Retries stop once the configured request-retry count is exhausted.
142-
if _, shouldRetry = m.shouldRetryAfterError(&Error{Message: "unexpected EOF"}, 3, []string{"claude"}, model, maxWait); shouldRetry {
142+
if _, shouldRetry = m.shouldRetryAfterError(&Error{Message: "unexpected EOF"}, 3, []string{"claude"}, model, maxWait, ""); shouldRetry {
143143
t.Fatalf("expected shouldRetry=false on attempt=3 for request_retry=3, got true")
144144
}
145145
}
@@ -171,7 +171,7 @@ func TestManager_ShouldRetryAfterError_StatusBearingEOFUsesCooldownPath(t *testi
171171
// immediate fast path; with a single cooled-down credential and a cooldown
172172
// exceeding max-retry-interval, the cooldown-aware path declines the retry.
173173
statusErr := &Error{HTTPStatus: 500, Message: "internal error: unexpected EOF"}
174-
if wait, shouldRetry := m.shouldRetryAfterError(statusErr, 0, []string{"claude"}, model, maxWait); shouldRetry {
174+
if wait, shouldRetry := m.shouldRetryAfterError(statusErr, 0, []string{"claude"}, model, maxWait, ""); shouldRetry {
175175
t.Fatalf("expected shouldRetry=false for status-bearing EOF beyond cooldown, got true (wait=%v)", wait)
176176
}
177177
}
@@ -186,11 +186,50 @@ func TestManager_ShouldRetryAfterError_UnexpectedEOFRespectsRetryDisabled(t *tes
186186
}
187187

188188
_, _, maxWait := m.retrySettings()
189-
if _, shouldRetry := m.shouldRetryAfterError(&Error{Message: "unexpected EOF"}, 0, []string{"claude"}, "test-model", maxWait); shouldRetry {
189+
if _, shouldRetry := m.shouldRetryAfterError(&Error{Message: "unexpected EOF"}, 0, []string{"claude"}, "test-model", maxWait, ""); shouldRetry {
190190
t.Fatalf("expected shouldRetry=false when request-retry=0, got true")
191191
}
192192
}
193193

194+
func TestManager_ShouldRetryAfterError_UnexpectedEOFRespectsAuthRequestRetryOverride(t *testing.T) {
195+
m := NewManager(nil, nil, nil)
196+
m.SetRetryConfig(3, 30*time.Second, 0)
197+
198+
model := "test-model"
199+
// The credential that fails has retries disabled via its auth-file override.
200+
disabled := &Auth{
201+
ID: "auth-disabled",
202+
Provider: "claude",
203+
Metadata: map[string]any{"request_retry": float64(0)},
204+
}
205+
// A sibling credential for the same provider still allows retries.
206+
enabled := &Auth{ID: "auth-enabled", Provider: "claude"}
207+
for _, a := range []*Auth{disabled, enabled} {
208+
if _, errRegister := m.Register(context.Background(), a); errRegister != nil {
209+
t.Fatalf("register %s: %v", a.ID, errRegister)
210+
}
211+
}
212+
213+
_, _, maxWait := m.retrySettings()
214+
eof := &Error{Message: "unexpected EOF"}
215+
216+
// When the failing credential disabled retries, its statusless EOF must not
217+
// be retried even though a sibling credential still allows retries.
218+
if _, shouldRetry := m.shouldRetryAfterError(eof, 0, []string{"claude"}, model, maxWait, "auth-disabled"); shouldRetry {
219+
t.Fatalf("expected shouldRetry=false for EOF on request_retry=0 credential, got true")
220+
}
221+
222+
// The retry-enabled credential still retries its EOF.
223+
if _, shouldRetry := m.shouldRetryAfterError(eof, 0, []string{"claude"}, model, maxWait, "auth-enabled"); !shouldRetry {
224+
t.Fatalf("expected shouldRetry=true for EOF on retry-enabled credential, got false")
225+
}
226+
227+
// An unknown failing auth falls back to the provider-wide retry check.
228+
if _, shouldRetry := m.shouldRetryAfterError(eof, 0, []string{"claude"}, model, maxWait, ""); !shouldRetry {
229+
t.Fatalf("expected shouldRetry=true for EOF with unknown auth fallback, got false")
230+
}
231+
}
232+
194233
type credentialRetryLimitExecutor struct {
195234
id string
196235

0 commit comments

Comments
 (0)