Skip to content

Commit 63ee62a

Browse files
reyortiz3claude
andauthored
Propagate auth errors through stale cache fallback (#4981)
refreshCache() was returning stale cached data for any API error, including auth failures (401/403 from registry and OAuth flow timeouts). Users saw previously-cached registry content with no error after their credentials expired or the OAuth browser flow timed out. Fix in two parts: - auth.Transport.RoundTrip now wraps all Token() errors with ErrRegistryAuthRequired, classifying token-acquisition failures as auth errors regardless of the underlying cause. - refreshCache() checks for ErrRegistryAuthRequired/ErrRegistryUnauthorized before the stale-cache fallback; transient errors (network, 5xx) still degrade gracefully. Fixes #4950 Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 81e23e2 commit 63ee62a

3 files changed

Lines changed: 185 additions & 3 deletions

File tree

pkg/registry/auth/transport.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,10 @@ func (t *Transport) RoundTrip(req *http.Request) (*http.Response, error) {
2323
// Get token from source
2424
token, err := t.Source.Token(req.Context())
2525
if err != nil {
26-
return nil, fmt.Errorf("failed to get auth token: %w", err)
26+
// Any token acquisition failure is an auth error regardless of cause.
27+
// Wrapping with ErrRegistryAuthRequired ensures refreshCache() and other
28+
// callers can distinguish auth failures from transient network errors.
29+
return nil, fmt.Errorf("%w: failed to get auth token: %w", ErrRegistryAuthRequired, err)
2730
}
2831

2932
// If token is empty, pass through without auth

pkg/registry/provider_cached.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package registry
66
import (
77
"context"
88
"encoding/json"
9+
"errors"
910
"fmt"
1011
"os"
1112
"path/filepath"
@@ -112,15 +113,21 @@ func (p *CachedAPIRegistryProvider) GetRegistry() (*types.Registry, error) {
112113
}
113114

114115
// refreshCache fetches fresh data from the API and updates the cache.
115-
// If the API fetch fails, returns stale cache if available.
116+
// Auth errors (ErrRegistryAuthRequired, ErrRegistryUnauthorized) are always
117+
// propagated — stale cache must never mask a changed authentication state.
118+
// For transient failures (network blip, 5xx) stale cache is returned if available.
116119
func (p *CachedAPIRegistryProvider) refreshCache() (*types.Registry, error) {
117120
p.cacheMu.Lock()
118121
defer p.cacheMu.Unlock()
119122

120123
// Fetch from API
121124
registry, err := p.APIRegistryProvider.GetRegistry()
122125
if err != nil {
123-
// If fetch fails and we have stale cache, return it
126+
// Auth errors must propagate — stale cache must not mask a changed auth state.
127+
if errors.Is(err, auth.ErrRegistryAuthRequired) || errors.Is(err, api.ErrRegistryUnauthorized) {
128+
return nil, err
129+
}
130+
// Transient failures (network blip, 5xx): degrade gracefully to stale cache.
124131
if p.cachedData != nil {
125132
return p.cachedData, nil
126133
}
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package registry
5+
6+
import (
7+
"context"
8+
"errors"
9+
"fmt"
10+
"net/http"
11+
"net/http/httptest"
12+
"sync/atomic"
13+
"testing"
14+
15+
"github.com/stretchr/testify/require"
16+
17+
"github.com/stacklok/toolhive/pkg/registry/api"
18+
"github.com/stacklok/toolhive/pkg/registry/auth"
19+
)
20+
21+
// TestCachedProvider_AuthErrorNotMaskedByStaleCache reproduces a bug in
22+
// refreshCache() at provider_cached.go:116.
23+
//
24+
// Scenario: a user has an existing cache populated from a previous successful
25+
// fetch. Later, their registry returns 401 (token revoked, credentials
26+
// rejected server-side). The CLI should surface the auth error, because the
27+
// user's authorization state may have changed since the cache was populated.
28+
//
29+
// Current behavior: refreshCache() treats ANY error from the upstream fetch
30+
// as a signal to fall back to stale cached data, including authentication
31+
// errors. The CLI silently prints stale registry contents with no error.
32+
//
33+
// This test covers the 401/403 branch (server-side rejection).
34+
func TestCachedProvider_AuthErrorNotMaskedByStaleCache(t *testing.T) {
35+
t.Parallel()
36+
37+
var returnUnauthorized atomic.Bool
38+
39+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
40+
if returnUnauthorized.Load() {
41+
w.WriteHeader(http.StatusUnauthorized)
42+
return
43+
}
44+
w.Header().Set("Content-Type", "application/json")
45+
w.WriteHeader(http.StatusOK)
46+
_, _ = w.Write([]byte(`{"servers":[],"metadata":{"next_cursor":""}}`))
47+
}))
48+
t.Cleanup(srv.Close)
49+
50+
// allowPrivateIp=true because httptest binds to 127.0.0.1
51+
// usePersistent=false exercises only the in-memory cache path
52+
// tokenSource=nil so the constructor's validation probe runs against the healthy server
53+
provider, err := NewCachedAPIRegistryProvider(srv.URL, true, false, nil)
54+
require.NoError(t, err, "constructor should succeed while server is healthy")
55+
56+
// Populate the in-memory cache with a successful fetch.
57+
_, err = provider.ListServers()
58+
require.NoError(t, err, "first fetch should succeed and populate cache")
59+
60+
// Flip the server to 401 — equivalent to the registry now rejecting
61+
// the user's credentials server-side.
62+
returnUnauthorized.Store(true)
63+
64+
// Force a refresh — mirrors what happens when the in-memory cache TTL
65+
// expires (default 1h) and the next `thv registry list` triggers a
66+
// fresh fetch. The upstream API call will fail with 401.
67+
err = provider.ForceRefresh()
68+
69+
require.Error(t, err,
70+
"expected auth error to propagate on refresh; got nil — "+
71+
"stale cache is masking the auth failure (bug)")
72+
require.True(t,
73+
errors.Is(err, api.ErrRegistryUnauthorized) ||
74+
errors.Is(err, auth.ErrRegistryAuthRequired),
75+
"expected ErrRegistryUnauthorized or ErrRegistryAuthRequired; got: %v", err)
76+
}
77+
78+
// failingTokenSource is a test double for auth.TokenSource.
79+
// It returns an empty token (no Authorization header added) when fail is
80+
// false, and returns an error wrapped like oauthTokenSource.Token() would
81+
// when its browser OAuth flow fails, when fail is true.
82+
type failingTokenSource struct {
83+
fail atomic.Bool
84+
}
85+
86+
// Token mirrors the exact error-wrapping oauthTokenSource.Token() applies
87+
// when its browser flow times out / is cancelled. See:
88+
// - pkg/registry/auth/oauth_token_source.go:61-67 (outer wrap)
89+
// - pkg/registry/auth/oauth_token_source.go:110-113 (inner wrap)
90+
//
91+
// The point of matching the wrapping is to verify that the eventual
92+
// refreshCache() fallback behavior does not depend on the specific error
93+
// type — any error causes stale cache to be served.
94+
func (s *failingTokenSource) Token(_ context.Context) (string, error) {
95+
if s.fail.Load() {
96+
inner := fmt.Errorf("oauth flow start failed: %w",
97+
errors.New("authorization timed out waiting for browser callback"))
98+
return "", fmt.Errorf("oauth flow failed: %w", inner)
99+
}
100+
// No auth header needed — the httptest server does not validate credentials.
101+
return "", nil
102+
}
103+
104+
// TestCachedProvider_OAuthFlowFailureNotMaskedByStaleCache reproduces the
105+
// same masking bug via the OAuth-browser-flow-failure code path — the one
106+
// actually described in the bug report:
107+
//
108+
// "thv registry list sent me a URL to OAuth. I did nothing. After a minute
109+
// it exited the OAuth flow and went to another OAuth flow. I also did
110+
// nothing. In another minute it exited that OAuth flow and then it
111+
// showed me everything in the registry."
112+
//
113+
// This path differs from the 401/403 path because the error returned by
114+
// oauthTokenSource.Token() when the browser flow times out is a generic
115+
// wrapped error — it does NOT match errors.Is(err, auth.ErrRegistryAuthRequired)
116+
// or errors.Is(err, api.ErrRegistryUnauthorized). It is propagated through:
117+
//
118+
// oauthTokenSource.Token ("oauth flow failed: ...")
119+
// -> auth.Transport.RoundTrip ("failed to get auth token: ...")
120+
// -> api.Client.fetchServersPage ("failed to fetch servers: ...")
121+
// -> APIRegistryProvider.GetRegistry (wrapped into *UnavailableError)
122+
// -> refreshCache (swallowed; stale cache returned with nil err)
123+
//
124+
// A sentinel-check fix like
125+
//
126+
// if errors.Is(err, auth.ErrRegistryAuthRequired) ||
127+
// errors.Is(err, api.ErrRegistryUnauthorized) { return nil, err }
128+
//
129+
// will NOT catch this path — the OAuth browser-flow error carries neither
130+
// sentinel. A correct fix has to classify token-acquisition failures as
131+
// auth errors before they are flattened into UnavailableError.
132+
func TestCachedProvider_OAuthFlowFailureNotMaskedByStaleCache(t *testing.T) {
133+
t.Parallel()
134+
135+
// Server always serves a valid empty list. All failures in this test
136+
// come from the token source, not the server — simulating an OAuth
137+
// flow that never completes while the registry itself is reachable.
138+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
139+
w.Header().Set("Content-Type", "application/json")
140+
w.WriteHeader(http.StatusOK)
141+
_, _ = w.Write([]byte(`{"servers":[],"metadata":{"next_cursor":""}}`))
142+
}))
143+
t.Cleanup(srv.Close)
144+
145+
ts := &failingTokenSource{}
146+
147+
// Non-nil tokenSource causes NewAPIRegistryProvider to skip its
148+
// construction-time validation probe (provider_api.go:57) — matching
149+
// the real OAuth-configured code path where the probe is skipped
150+
// because a browser flow cannot complete within 10 seconds.
151+
provider, err := NewCachedAPIRegistryProvider(srv.URL, true, false, ts)
152+
require.NoError(t, err, "constructor should succeed (probe skipped when tokenSource != nil)")
153+
154+
// Populate the in-memory cache with a successful fetch.
155+
// ts.fail == false, so Token() returns ("", nil) and the request passes
156+
// through the auth transport without an Authorization header.
157+
_, err = provider.ListServers()
158+
require.NoError(t, err, "first fetch should succeed and populate cache")
159+
160+
// Simulate the OAuth browser flow failing / timing out on the next
161+
// token acquisition.
162+
ts.fail.Store(true)
163+
164+
err = provider.ForceRefresh()
165+
166+
// DESIRED: an OAuth-flow failure must not be hidden by stale cache.
167+
// CURRENT BUG: refreshCache returns cached data with nil error.
168+
require.Error(t, err,
169+
"expected OAuth flow failure to propagate on refresh; got nil — "+
170+
"stale cache is masking the OAuth failure (bug, "+
171+
"matches user-reported scenario)")
172+
}

0 commit comments

Comments
 (0)