From 6e082236e4156c0509aa601063e3016e60dadfe9 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 5 Jun 2026 07:51:10 +0000 Subject: [PATCH] Honor per-auth request_retry override for unexpected-EOF retries Co-Authored-By: Hex --- sdk/cliproxy/auth/conductor.go | 76 ++++++++++++++++--- sdk/cliproxy/auth/conductor_overrides_test.go | 57 +++++++++++--- 2 files changed, 112 insertions(+), 21 deletions(-) diff --git a/sdk/cliproxy/auth/conductor.go b/sdk/cliproxy/auth/conductor.go index 97da9f1f03..2a0b402c77 100644 --- a/sdk/cliproxy/auth/conductor.go +++ b/sdk/cliproxy/auth/conductor.go @@ -1295,13 +1295,14 @@ func (m *Manager) Execute(ctx context.Context, providers []string, req cliproxye _, maxRetryCredentials, maxWait := m.retrySettings() var lastErr error + var failedAuthID string for attempt := 0; ; attempt++ { - resp, errExec := m.executeMixedOnce(ctx, normalized, req, opts, maxRetryCredentials) + resp, errExec := m.executeMixedOnce(ctx, normalized, req, opts, maxRetryCredentials, &failedAuthID) if errExec == nil { return resp, nil } lastErr = errExec - wait, shouldRetry := m.shouldRetryAfterError(errExec, attempt, normalized, req.Model, maxWait) + wait, shouldRetry := m.shouldRetryAfterError(errExec, attempt, normalized, req.Model, maxWait, failedAuthID) if !shouldRetry { break } @@ -1330,13 +1331,14 @@ func (m *Manager) ExecuteCount(ctx context.Context, providers []string, req clip _, maxRetryCredentials, maxWait := m.retrySettings() var lastErr error + var failedAuthID string for attempt := 0; ; attempt++ { - resp, errExec := m.executeCountMixedOnce(ctx, normalized, req, opts, maxRetryCredentials) + resp, errExec := m.executeCountMixedOnce(ctx, normalized, req, opts, maxRetryCredentials, &failedAuthID) if errExec == nil { return resp, nil } lastErr = errExec - wait, shouldRetry := m.shouldRetryAfterError(errExec, attempt, normalized, req.Model, maxWait) + wait, shouldRetry := m.shouldRetryAfterError(errExec, attempt, normalized, req.Model, maxWait, failedAuthID) if !shouldRetry { break } @@ -1361,13 +1363,14 @@ func (m *Manager) ExecuteStream(ctx context.Context, providers []string, req cli _, maxRetryCredentials, maxWait := m.retrySettings() var lastErr error + var failedAuthID string for attempt := 0; ; attempt++ { - result, errStream := m.executeStreamMixedOnce(ctx, normalized, req, opts, maxRetryCredentials) + result, errStream := m.executeStreamMixedOnce(ctx, normalized, req, opts, maxRetryCredentials, &failedAuthID) if errStream == nil { return result, nil } lastErr = errStream - wait, shouldRetry := m.shouldRetryAfterError(errStream, attempt, normalized, req.Model, maxWait) + wait, shouldRetry := m.shouldRetryAfterError(errStream, attempt, normalized, req.Model, maxWait, failedAuthID) if !shouldRetry { break } @@ -1390,7 +1393,7 @@ func (m *Manager) ExecuteStream(ctx context.Context, providers []string, req cli return nil, &Error{Code: "auth_not_found", Message: "no auth available"} } -func (m *Manager) executeMixedOnce(ctx context.Context, providers []string, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, maxRetryCredentials int) (cliproxyexecutor.Response, error) { +func (m *Manager) executeMixedOnce(ctx context.Context, providers []string, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, maxRetryCredentials int, failedAuthID *string) (cliproxyexecutor.Response, error) { if len(providers) == 0 { return cliproxyexecutor.Response{}, &Error{Code: "provider_not_found", Message: "no provider supplied"} } @@ -1423,6 +1426,9 @@ func (m *Manager) executeMixedOnce(ctx context.Context, providers []string, req entry := logEntryWithRequestID(ctx) debugLogAuthSelection(entry, auth, provider, req.Model) publishSelectedAuthMetadata(opts.Metadata, auth.ID) + if failedAuthID != nil { + *failedAuthID = auth.ID + } tried[auth.ID] = struct{}{} execCtx := ctx @@ -1489,7 +1495,7 @@ func (m *Manager) executeMixedOnce(ctx context.Context, providers []string, req } } -func (m *Manager) executeCountMixedOnce(ctx context.Context, providers []string, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, maxRetryCredentials int) (cliproxyexecutor.Response, error) { +func (m *Manager) executeCountMixedOnce(ctx context.Context, providers []string, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, maxRetryCredentials int, failedAuthID *string) (cliproxyexecutor.Response, error) { if len(providers) == 0 { return cliproxyexecutor.Response{}, &Error{Code: "provider_not_found", Message: "no provider supplied"} } @@ -1522,6 +1528,9 @@ func (m *Manager) executeCountMixedOnce(ctx context.Context, providers []string, entry := logEntryWithRequestID(ctx) debugLogAuthSelection(entry, auth, provider, req.Model) publishSelectedAuthMetadata(opts.Metadata, auth.ID) + if failedAuthID != nil { + *failedAuthID = auth.ID + } tried[auth.ID] = struct{}{} execCtx := ctx @@ -1588,7 +1597,7 @@ func (m *Manager) executeCountMixedOnce(ctx context.Context, providers []string, } } -func (m *Manager) executeStreamMixedOnce(ctx context.Context, providers []string, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, maxRetryCredentials int) (*cliproxyexecutor.StreamResult, error) { +func (m *Manager) executeStreamMixedOnce(ctx context.Context, providers []string, req cliproxyexecutor.Request, opts cliproxyexecutor.Options, maxRetryCredentials int, failedAuthID *string) (*cliproxyexecutor.StreamResult, error) { if len(providers) == 0 { return nil, &Error{Code: "provider_not_found", Message: "no provider supplied"} } @@ -1621,6 +1630,9 @@ func (m *Manager) executeStreamMixedOnce(ctx context.Context, providers []string entry := logEntryWithRequestID(ctx) debugLogAuthSelection(entry, auth, provider, req.Model) publishSelectedAuthMetadata(opts.Metadata, auth.ID) + if failedAuthID != nil { + *failedAuthID = auth.ID + } tried[auth.ID] = struct{}{} execCtx := ctx @@ -2305,7 +2317,46 @@ func (m *Manager) retryAllowed(attempt int, providers []string) bool { return false } -func (m *Manager) shouldRetryAfterError(err error, attempt int, providers []string, model string, maxWait time.Duration) (time.Duration, bool) { +// eofRetryAllowed decides whether a statusless "unexpected EOF" may be retried. +// It honors the auth-file scoped request_retry override of the credential that +// actually failed (failedAuthID) so a credential that disabled retries is not +// retried just because another credential for the provider still allows it. +// When the failing auth is unknown it falls back to the provider-wide check. +func (m *Manager) eofRetryAllowed(attempt int, providers []string, failedAuthID string) bool { + if allowed, known := m.authRetryAllowed(attempt, failedAuthID); known { + return allowed + } + return m.retryAllowed(attempt, providers) +} + +// authRetryAllowed reports whether the specific auth permits another attempt, +// honoring its auth-file scoped request_retry override. The second return value +// is false when the auth is unknown (empty ID or not registered). +func (m *Manager) authRetryAllowed(attempt int, authID string) (bool, bool) { + if m == nil || attempt < 0 { + return false, false + } + authID = strings.TrimSpace(authID) + if authID == "" { + return false, false + } + m.mu.RLock() + auth := m.auths[authID] + m.mu.RUnlock() + if auth == nil { + return false, false + } + effectiveRetry := int(m.requestRetry.Load()) + if override, ok := auth.RequestRetryOverride(); ok { + effectiveRetry = override + } + if effectiveRetry < 0 { + effectiveRetry = 0 + } + return attempt < effectiveRetry, true +} + +func (m *Manager) shouldRetryAfterError(err error, attempt int, providers []string, model string, maxWait time.Duration, failedAuthID string) (time.Duration, bool) { if err == nil { return 0, false } @@ -2327,9 +2378,10 @@ func (m *Manager) shouldRetryAfterError(err error, attempt int, providers []stri // body merely contains "unexpected EOF") has already had a cooldown applied // by MarkResult, so it must fall through to the cooldown-aware path below // rather than force an immediate retry that selection would reject with a - // model-cooldown error. + // model-cooldown error. eofRetryAllowed also honors the failed credential's + // auth-file scoped request_retry override. if status == 0 && isUnexpectedEOFError(err) { - if !m.retryAllowed(attempt, providers) { + if !m.eofRetryAllowed(attempt, providers, failedAuthID) { return 0, false } return 0, true diff --git a/sdk/cliproxy/auth/conductor_overrides_test.go b/sdk/cliproxy/auth/conductor_overrides_test.go index 3bf9a6a529..c74ee644bc 100644 --- a/sdk/cliproxy/auth/conductor_overrides_test.go +++ b/sdk/cliproxy/auth/conductor_overrides_test.go @@ -43,7 +43,7 @@ func TestManager_ShouldRetryAfterError_RespectsAuthRequestRetryOverride(t *testi } _, _, maxWait := m.retrySettings() - wait, shouldRetry := m.shouldRetryAfterError(&Error{HTTPStatus: 500, Message: "boom"}, 0, []string{"claude"}, model, maxWait) + wait, shouldRetry := m.shouldRetryAfterError(&Error{HTTPStatus: 500, Message: "boom"}, 0, []string{"claude"}, model, maxWait, "") if shouldRetry { t.Fatalf("expected shouldRetry=false for request_retry=0, got true (wait=%v)", wait) } @@ -53,7 +53,7 @@ func TestManager_ShouldRetryAfterError_RespectsAuthRequestRetryOverride(t *testi t.Fatalf("update auth: %v", errUpdate) } - wait, shouldRetry = m.shouldRetryAfterError(&Error{HTTPStatus: 500, Message: "boom"}, 0, []string{"claude"}, model, maxWait) + wait, shouldRetry = m.shouldRetryAfterError(&Error{HTTPStatus: 500, Message: "boom"}, 0, []string{"claude"}, model, maxWait, "") if !shouldRetry { t.Fatalf("expected shouldRetry=true for request_retry=1, got false") } @@ -61,7 +61,7 @@ func TestManager_ShouldRetryAfterError_RespectsAuthRequestRetryOverride(t *testi t.Fatalf("expected wait > 0, got %v", wait) } - _, shouldRetry = m.shouldRetryAfterError(&Error{HTTPStatus: 500, Message: "boom"}, 1, []string{"claude"}, model, maxWait) + _, shouldRetry = m.shouldRetryAfterError(&Error{HTTPStatus: 500, Message: "boom"}, 1, []string{"claude"}, model, maxWait, "") if shouldRetry { t.Fatalf("expected shouldRetry=false on attempt=1 for request_retry=1, got true") } @@ -101,7 +101,7 @@ func TestManager_ShouldRetryAfterError_UsesOAuthModelAliasForCooldown(t *testing } _, _, maxWait := m.retrySettings() - wait, shouldRetry := m.shouldRetryAfterError(&Error{HTTPStatus: 429, Message: "quota"}, 0, []string{"kimi"}, routeModel, maxWait) + wait, shouldRetry := m.shouldRetryAfterError(&Error{HTTPStatus: 429, Message: "quota"}, 0, []string{"kimi"}, routeModel, maxWait, "") if !shouldRetry { t.Fatalf("expected shouldRetry=true, got false (wait=%v)", wait) } @@ -124,7 +124,7 @@ func TestManager_ShouldRetryAfterError_RetriesUnexpectedEOF(t *testing.T) { // A statusless "unexpected EOF" error (as produced by a truncated upstream // stream) must be retried immediately rather than surfaced to the client. - wait, shouldRetry := m.shouldRetryAfterError(&Error{Message: "unexpected EOF"}, 0, []string{"claude"}, model, maxWait) + wait, shouldRetry := m.shouldRetryAfterError(&Error{Message: "unexpected EOF"}, 0, []string{"claude"}, model, maxWait, "") if !shouldRetry { t.Fatalf("expected shouldRetry=true for unexpected EOF, got false") } @@ -134,12 +134,12 @@ func TestManager_ShouldRetryAfterError_RetriesUnexpectedEOF(t *testing.T) { // io.ErrUnexpectedEOF (possibly wrapped) is detected as well. wrapped := fmt.Errorf("read stream: %w", io.ErrUnexpectedEOF) - if _, shouldRetry = m.shouldRetryAfterError(wrapped, 0, []string{"claude"}, model, maxWait); !shouldRetry { + if _, shouldRetry = m.shouldRetryAfterError(wrapped, 0, []string{"claude"}, model, maxWait, ""); !shouldRetry { t.Fatalf("expected shouldRetry=true for wrapped io.ErrUnexpectedEOF, got false") } // Retries stop once the configured request-retry count is exhausted. - if _, shouldRetry = m.shouldRetryAfterError(&Error{Message: "unexpected EOF"}, 3, []string{"claude"}, model, maxWait); shouldRetry { + if _, shouldRetry = m.shouldRetryAfterError(&Error{Message: "unexpected EOF"}, 3, []string{"claude"}, model, maxWait, ""); shouldRetry { t.Fatalf("expected shouldRetry=false on attempt=3 for request_retry=3, got true") } } @@ -171,7 +171,7 @@ func TestManager_ShouldRetryAfterError_StatusBearingEOFUsesCooldownPath(t *testi // immediate fast path; with a single cooled-down credential and a cooldown // exceeding max-retry-interval, the cooldown-aware path declines the retry. statusErr := &Error{HTTPStatus: 500, Message: "internal error: unexpected EOF"} - if wait, shouldRetry := m.shouldRetryAfterError(statusErr, 0, []string{"claude"}, model, maxWait); shouldRetry { + if wait, shouldRetry := m.shouldRetryAfterError(statusErr, 0, []string{"claude"}, model, maxWait, ""); shouldRetry { t.Fatalf("expected shouldRetry=false for status-bearing EOF beyond cooldown, got true (wait=%v)", wait) } } @@ -186,11 +186,50 @@ func TestManager_ShouldRetryAfterError_UnexpectedEOFRespectsRetryDisabled(t *tes } _, _, maxWait := m.retrySettings() - if _, shouldRetry := m.shouldRetryAfterError(&Error{Message: "unexpected EOF"}, 0, []string{"claude"}, "test-model", maxWait); shouldRetry { + if _, shouldRetry := m.shouldRetryAfterError(&Error{Message: "unexpected EOF"}, 0, []string{"claude"}, "test-model", maxWait, ""); shouldRetry { t.Fatalf("expected shouldRetry=false when request-retry=0, got true") } } +func TestManager_ShouldRetryAfterError_UnexpectedEOFRespectsAuthRequestRetryOverride(t *testing.T) { + m := NewManager(nil, nil, nil) + m.SetRetryConfig(3, 30*time.Second, 0) + + model := "test-model" + // The credential that fails has retries disabled via its auth-file override. + disabled := &Auth{ + ID: "auth-disabled", + Provider: "claude", + Metadata: map[string]any{"request_retry": float64(0)}, + } + // A sibling credential for the same provider still allows retries. + enabled := &Auth{ID: "auth-enabled", Provider: "claude"} + for _, a := range []*Auth{disabled, enabled} { + if _, errRegister := m.Register(context.Background(), a); errRegister != nil { + t.Fatalf("register %s: %v", a.ID, errRegister) + } + } + + _, _, maxWait := m.retrySettings() + eof := &Error{Message: "unexpected EOF"} + + // When the failing credential disabled retries, its statusless EOF must not + // be retried even though a sibling credential still allows retries. + if _, shouldRetry := m.shouldRetryAfterError(eof, 0, []string{"claude"}, model, maxWait, "auth-disabled"); shouldRetry { + t.Fatalf("expected shouldRetry=false for EOF on request_retry=0 credential, got true") + } + + // The retry-enabled credential still retries its EOF. + if _, shouldRetry := m.shouldRetryAfterError(eof, 0, []string{"claude"}, model, maxWait, "auth-enabled"); !shouldRetry { + t.Fatalf("expected shouldRetry=true for EOF on retry-enabled credential, got false") + } + + // An unknown failing auth falls back to the provider-wide retry check. + if _, shouldRetry := m.shouldRetryAfterError(eof, 0, []string{"claude"}, model, maxWait, ""); !shouldRetry { + t.Fatalf("expected shouldRetry=true for EOF with unknown auth fallback, got false") + } +} + type credentialRetryLimitExecutor struct { id string