Skip to content

Commit c419c75

Browse files
author
bgagent
committed
fix(oauth): refresh-token race recovery + log gaps from review
Addresses the blocker + critical items from PR review: - Refresh-token race (review blocker). Linear rotates refresh_tokens on every use; concurrent Lambdas/agents racing the same secret will all read the same expiring token and one's refresh will succeed while the others get `invalid_grant`. On `invalid_grant`, re-read the secret from Secrets Manager (bypassing cache). If the refresh_token has changed, another caller already rotated; use the freshly-read token (or retry refresh once if it's also expiring). If unchanged, the refresh_token is permanently rejected and the workspace needs re-onboarding. Implemented in both the TS resolver (linear-oauth-resolver.ts) and Python resolver (config.py). - Unguarded bedrock_agentcore import in agent/src/server.py (review critical). The bare `from bedrock_agentcore.runtime.context import BedrockAgentCoreContext` inside `_run_task_background` killed the entire pipeline thread with no diagnostic if the SDK was missing or its module structure changed. Wrap in try/except (ImportError, AttributeError) and log via _warn_cw — the Linear token resolver has its own SM fallback, so the agent can proceed without the workload-token bridge. - Cache invalidation on fetch-level refresh failure (review high). The TS resolver's `invalidateLinearOauthCache()` only ran in the `!resp.ok` branch; if `fetch()` itself threw (timeout, DNS), the catch returned null without invalidating, leaving the stale expiring token cached for 60s and hammering Linear's token endpoint. Move invalidate into the fetch-level catch too. - Malformed expires_at log (review medium). The Python `_is_expiring` caught `ValueError` and silently returned True, masking consistently-bad writes. Add a WARN log so operators see the bad data instead of just an unexplained refresh on every task. - Positive-path refresh log (review non-blocking aws-samples#5). Added INFO-level breadcrumb on successful refresh in both resolvers so operators diagnosing intermittent 401s have a trace of which workspace refreshed and to what expiry. 11/11 existing resolver unit tests still pass; will add tests for the new race-recovery branch in a followup commit.
1 parent eaf7c55 commit c419c75

3 files changed

Lines changed: 219 additions & 17 deletions

File tree

agent/src/config.py

Lines changed: 97 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -104,15 +104,32 @@ def _is_expiring(expires_at_iso: str, threshold_seconds: int = 60) -> bool:
104104
try:
105105
expiry = datetime.fromisoformat(expires_at_iso.replace("Z", "+00:00"))
106106
except ValueError:
107+
# Malformed timestamp: treat as expiring so the refresh path runs.
108+
# Log so a bad write earlier in the chain doesn't silently trigger
109+
# a refresh on every single task with no diagnostic trace.
110+
log(
111+
"WARN",
112+
f"_is_expiring: malformed expires_at '{expires_at_iso}'; treating as expiring",
113+
)
107114
return True
108115
return (expiry - datetime.now(UTC)).total_seconds() < threshold_seconds
109116

110-
def _refresh(current: dict) -> dict | None:
117+
def _try_refresh_once(current: dict) -> tuple[str, dict | None]:
118+
"""Single Linear /oauth/token POST.
119+
120+
Returns one of:
121+
- ("success", new_token_dict)
122+
- ("invalid_grant", None) — Linear rejected the refresh_token,
123+
usually because another caller rotated it first
124+
- ("failure", None) — any other error (network, 5xx, missing
125+
fields). No retry; surface upward.
126+
"""
111127
try:
128+
import urllib.error
112129
import urllib.parse
113130
import urllib.request
114131
except ImportError:
115-
return None
132+
return ("failure", None)
116133

117134
body = urllib.parse.urlencode(
118135
{
@@ -131,12 +148,26 @@ def _refresh(current: dict) -> dict | None:
131148
try:
132149
with urllib.request.urlopen(req, timeout=10) as resp: # noqa: S310
133150
payload = json.loads(resp.read().decode("utf-8"))
151+
except urllib.error.HTTPError as e:
152+
# Body may carry `{"error": "invalid_grant", ...}` even on 400.
153+
try:
154+
err_payload = json.loads(e.read().decode("utf-8"))
155+
err_code = err_payload.get("error")
156+
except Exception:
157+
err_code = None
158+
log(
159+
"WARN",
160+
f"resolve_linear_api_token refresh rejected: status={e.code} error={err_code}",
161+
)
162+
if err_code == "invalid_grant":
163+
return ("invalid_grant", None)
164+
return ("failure", None)
134165
except Exception as e:
135166
log("WARN", f"resolve_linear_api_token refresh failed: {type(e).__name__}: {e}")
136-
return None
167+
return ("failure", None)
137168

138169
if "access_token" not in payload:
139-
return None
170+
return ("failure", None)
140171

141172
now = datetime.now(UTC)
142173
# Linear's `expires_in` is documented and reliably sent; if it's
@@ -161,7 +192,68 @@ def _refresh(current: dict) -> dict | None:
161192
except (ClientError, BotoCoreError) as e:
162193
log("WARN", f"resolve_linear_api_token: failed to persist refreshed token: {e}")
163194
# Even without persistence the in-memory token works for THIS run.
164-
return next_token
195+
196+
# Positive-path log so operators diagnosing intermittent 401s have
197+
# a breadcrumb showing which workspace refreshed and to what expiry.
198+
ws_id = next_token.get("workspace_id", "?")
199+
ws_slug = next_token.get("workspace_slug", "?")
200+
log(
201+
"INFO",
202+
f"linear_oauth_refresh_ok workspace_id={ws_id} "
203+
f"workspace_slug={ws_slug} new_expires_at={expires_at_iso}",
204+
)
205+
return ("success", next_token)
206+
207+
def _refresh(current: dict) -> dict | None:
208+
"""Refresh with one retry on invalid_grant after re-reading the secret.
209+
210+
Linear rotates refresh_tokens on every use. Concurrent callers
211+
(Lambda + agent + CLI) racing the same secret will see one
212+
succeed and the rest get `invalid_grant`. On invalid_grant,
213+
re-read SM (bypassing the just-failed token) and retry once if
214+
the refresh_token actually changed.
215+
"""
216+
kind, refreshed = _try_refresh_once(current)
217+
if kind == "success":
218+
return refreshed
219+
if kind == "failure":
220+
return None
221+
222+
# invalid_grant: maybe a concurrent caller refreshed first.
223+
log(
224+
"WARN",
225+
"resolve_linear_api_token: invalid_grant — re-reading secret to check "
226+
"for concurrent refresh",
227+
)
228+
try:
229+
fresh = _fetch_token()
230+
except (ClientError, BotoCoreError) as e:
231+
log("WARN", f"resolve_linear_api_token: re-read after invalid_grant failed: {e}")
232+
return None
233+
234+
if fresh.get("refresh_token") == current.get("refresh_token"):
235+
# No race — Linear truly rejected this refresh_token.
236+
log(
237+
"ERROR",
238+
"resolve_linear_api_token: refresh_token permanently rejected; re-onboard required",
239+
)
240+
return None
241+
242+
# Concurrent caller rotated the token. If the freshly-read value
243+
# is itself usable, just take it.
244+
if not _is_expiring(fresh.get("expires_at", "")):
245+
log(
246+
"INFO",
247+
"resolve_linear_api_token: concurrent refresh detected; using freshly-read token",
248+
)
249+
return fresh
250+
251+
# Concurrent refresh produced a token that's also already
252+
# expiring (rare). Retry once with the new refresh_token.
253+
kind2, refreshed2 = _try_refresh_once(fresh)
254+
if kind2 == "success":
255+
return refreshed2
256+
return None
165257

166258
try:
167259
token_obj = _fetch_token()

agent/src/server.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -398,9 +398,25 @@ def _run_task_background(
398398
# one. See aws/bedrock-agentcore-sdk-python#219 for the upstream design
399399
# constraint that motivates this manual propagation.
400400
if workload_access_token:
401-
from bedrock_agentcore.runtime.context import BedrockAgentCoreContext
401+
# Vestigial path from the parked AgentCore Identity flow. If the
402+
# `bedrock-agentcore` SDK is missing or its module structure
403+
# changes, fail open: the Linear token resolver falls back to
404+
# reading per-workspace Secrets Manager directly, so the agent
405+
# can still proceed without this ContextVar set. Catching
406+
# (ImportError, AttributeError) here keeps the pipeline alive
407+
# instead of bricking the entire task with no diagnostic when
408+
# the upstream SDK rearranges modules.
409+
try:
410+
from bedrock_agentcore.runtime.context import BedrockAgentCoreContext
402411

403-
BedrockAgentCoreContext.set_workload_access_token(workload_access_token)
412+
BedrockAgentCoreContext.set_workload_access_token(workload_access_token)
413+
except (ImportError, AttributeError) as e:
414+
_warn_cw(
415+
f"bedrock_agentcore workload-token bridge unavailable "
416+
f"({type(e).__name__}: {e}); Linear MCP will resolve via "
417+
"Secrets Manager fallback",
418+
task_id=task_id,
419+
)
404420

405421
_debug_cw(
406422
f"_run_task_background ENTERED task_id={task_id!r} "

cdk/src/handlers/shared/linear-oauth-resolver.ts

Lines changed: 104 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -237,17 +237,95 @@ async function getOauthSecret(
237237
}
238238
}
239239

240+
/**
241+
* Outcome of a single Linear /oauth/token POST. Three terminal states:
242+
* - `success` — refreshed token (caller persists + caches)
243+
* - `invalid_grant` — Linear rejected the refresh_token, likely
244+
* because another caller rotated it first. Caller can retry once
245+
* after re-reading the secret.
246+
* - `failure` — any other error (network, 5xx, missing fields). No
247+
* retry; surface null upward.
248+
*/
249+
type RefreshOutcome =
250+
| { kind: 'success'; token: StoredOauthToken }
251+
| { kind: 'invalid_grant' }
252+
| { kind: 'failure' };
253+
240254
async function refreshLinearToken(
241255
current: StoredOauthToken,
242256
sm: SecretsManagerClient,
243257
secretArn: string,
244258
options: ResolverOptions,
245259
): Promise<StoredOauthToken | null> {
260+
// First attempt with whatever refresh_token we have.
261+
const first = await tryRefreshOnce(current, sm, secretArn, options);
262+
if (first.kind === 'success') return first.token;
263+
if (first.kind === 'failure') return null;
264+
265+
// `invalid_grant`: Linear rotates refresh_tokens on every use, so a
266+
// concurrent Lambda may have refreshed before us. Re-read the secret
267+
// from SM (bypassing cache) and retry once if the refresh_token
268+
// changed. This avoids permanently bricking the workspace's token
269+
// chain when two Lambdas race the same refresh.
270+
logger.warn('Linear token refresh got invalid_grant — re-reading secret to check for concurrent refresh', {
271+
secret_arn: secretArn,
272+
workspace_id: current.workspace_id,
273+
});
274+
275+
const fresh = await getOauthSecret(sm, secretArn);
276+
if (!fresh) {
277+
invalidateLinearOauthCache(current.workspace_id, secretArn);
278+
return null;
279+
}
280+
if (fresh.refresh_token === current.refresh_token) {
281+
// No race — Linear truly rejected this refresh_token. Caller needs
282+
// a fresh OAuth dance.
283+
logger.error('Linear token refresh permanently rejected — workspace requires re-onboarding', {
284+
secret_arn: secretArn,
285+
workspace_id: current.workspace_id,
286+
});
287+
invalidateLinearOauthCache(current.workspace_id, secretArn);
288+
return null;
289+
}
290+
291+
// Another caller rotated the token. If the freshly-read token is
292+
// itself not expiring, just use it — no second refresh needed.
293+
if (!isTokenExpiring(fresh.expires_at)) {
294+
logger.info('Linear OAuth token was refreshed by a concurrent caller; using freshly-read value', {
295+
secret_arn: secretArn,
296+
workspace_id: fresh.workspace_id,
297+
new_expires_at: fresh.expires_at,
298+
});
299+
tokenCache.set(secretArn, { value: fresh, expiresAt: Date.now() + SECRET_CACHE_TTL_MS });
300+
return fresh;
301+
}
302+
303+
// Concurrent caller refreshed but the new token is also already
304+
// expiring (rare but possible if both Lambdas raced and the second
305+
// got a tiny TTL). Retry refresh once with the new refresh_token.
306+
const second = await tryRefreshOnce(fresh, sm, secretArn, options);
307+
if (second.kind === 'success') return second.token;
308+
if (second.kind === 'invalid_grant') {
309+
logger.error('Linear token refresh failed even after re-reading freshly-rotated secret', {
310+
secret_arn: secretArn,
311+
workspace_id: fresh.workspace_id,
312+
});
313+
}
314+
invalidateLinearOauthCache(current.workspace_id, secretArn);
315+
return null;
316+
}
317+
318+
async function tryRefreshOnce(
319+
current: StoredOauthToken,
320+
sm: SecretsManagerClient,
321+
secretArn: string,
322+
options: ResolverOptions,
323+
): Promise<RefreshOutcome> {
246324
if (!current.client_id || !current.client_secret) {
247325
logger.error('Cannot refresh Linear OAuth token: stored secret is missing client_id/client_secret', {
248326
secret_arn: secretArn,
249327
});
250-
return null;
328+
return { kind: 'failure' };
251329
}
252330

253331
const fetchImpl = options.fetchImpl ?? fetch;
@@ -269,15 +347,21 @@ async function refreshLinearToken(
269347
logger.error('Linear token refresh fetch failed', {
270348
error: err instanceof Error ? err.message : String(err),
271349
});
272-
return null;
350+
// Network-level failure: invalidate cache so the next call
351+
// re-reads from Secrets Manager instead of looping on a stale
352+
// expiring token. Without this the catch returned null without
353+
// invalidating, hammering Linear in a tight loop until the cache
354+
// TTL expires.
355+
invalidateLinearOauthCache(current.workspace_id, secretArn);
356+
return { kind: 'failure' };
273357
}
274358

275359
let parsed: unknown;
276360
try {
277361
parsed = await resp.json();
278362
} catch {
279363
logger.error('Linear token refresh returned non-JSON', { status: resp.status });
280-
return null;
364+
return { kind: 'failure' };
281365
}
282366

283367
if (!resp.ok) {
@@ -287,9 +371,11 @@ async function refreshLinearToken(
287371
error: errObj.error,
288372
error_description: errObj.error_description,
289373
});
290-
// Caller can attempt a fresh OAuth dance; we don't recover automatically.
291374
invalidateLinearOauthCache(current.workspace_id, secretArn);
292-
return null;
375+
if (errObj.error === 'invalid_grant') {
376+
return { kind: 'invalid_grant' };
377+
}
378+
return { kind: 'failure' };
293379
}
294380

295381
const tokenResp = parsed as {
@@ -300,7 +386,7 @@ async function refreshLinearToken(
300386
};
301387
if (!tokenResp.access_token || !tokenResp.expires_in) {
302388
logger.error('Linear token refresh response missing required fields');
303-
return null;
389+
return { kind: 'failure' };
304390
}
305391

306392
const now = new Date();
@@ -328,14 +414,22 @@ async function refreshLinearToken(
328414
error: err instanceof Error ? err.message : String(err),
329415
});
330416
// Even if persistence fails, the in-memory token still works for
331-
// the rest of THIS Lambda invocation. Other concurrent Lambdas may
332-
// race-refresh; Linear's idempotency-on-replay grace window
333-
// (30 min documented) absorbs the duplicate.
417+
// THIS Lambda invocation. Other concurrent Lambdas may race-refresh
418+
// and one will get invalid_grant; the re-read-and-retry path above
419+
// will recover.
334420
}
335421

422+
// Positive-path log so operators diagnosing intermittent 401s have
423+
// a breadcrumb showing which workspace refreshed and to what expiry.
424+
logger.info('Linear OAuth token refreshed', {
425+
workspace_id: next.workspace_id,
426+
workspace_slug: next.workspace_slug,
427+
new_expires_at: next.expires_at,
428+
});
429+
336430
// Cache the freshest value.
337431
tokenCache.set(secretArn, { value: next, expiresAt: Date.now() + SECRET_CACHE_TTL_MS });
338-
return next;
432+
return { kind: 'success', token: next };
339433
}
340434

341435
/** Test-only: clear all caches. */

0 commit comments

Comments
 (0)