fix(oauth): refresh-token race recovery + log gaps from review

bgagent · bgagent · commit c419c75f1e8f · 2026-05-25T22:32:19.000-04:00
Addresses the blocker + critical items from PR review:

- Refresh-token race (review blocker). Linear rotates refresh_tokens on every use; concurrent Lambdas/agents racing the same secret will all read the same expiring token, so one refresh will succeed while the others get `invalid_grant`. On `invalid_grant`, re-read the secret from Secrets Manager (bypassing cache). If the refresh_token has changed, another caller already rotated it; use the freshly-read token (or retry the refresh once if that token is also expiring). If it is unchanged, the refresh_token is permanently rejected and the workspace needs re-onboarding. Implemented in both the TS resolver (linear-oauth-resolver.ts) and the Python resolver (config.py).

- Unguarded bedrock_agentcore import in agent/src/server.py (review critical). The bare `from bedrock_agentcore.runtime.context import BedrockAgentCoreContext` inside `_run_task_background` killed the entire pipeline thread with no diagnostic if the SDK was missing or its module structure changed. Wrap it in try/except (ImportError, AttributeError) and log via _warn_cw — the Linear token resolver has its own SM fallback, so the agent can proceed without the workload-token bridge.

- Cache invalidation on fetch-level refresh failure (review high). The TS resolver's `invalidateLinearOauthCache()` only ran in the `!resp.ok` branch; if `fetch()` itself threw (timeout, DNS), the catch returned null without invalidating, leaving the stale expiring token cached for 60s and hammering Linear's token endpoint. Move the invalidate call into the fetch-level catch too.

- Malformed expires_at log (review medium). The Python `_is_expiring` caught `ValueError` and silently returned True, masking consistently-bad writes. Add a WARN log so operators see the bad data instead of just an unexplained refresh on every task.

- Positive-path refresh log (review non-blocking aws-samples#5). Added an INFO-level breadcrumb on successful refresh in both resolvers so operators diagnosing intermittent 401s have a trace of which workspace refreshed and to what expiry.

11/11 existing resolver unit tests still pass; tests for the new race-recovery branch will be added in a follow-up commit.
diff --git a/agent/src/config.py b/agent/src/config.py
@@ -104,15 +104,32 @@ def _is_expiring(expires_at_iso: str, threshold_seconds: int = 60) -> bool:
         try:
             expiry = datetime.fromisoformat(expires_at_iso.replace("Z", "+00:00"))
         except ValueError:
+            # Malformed timestamp: treat as expiring so the refresh path runs.
+            # Log so a bad write earlier in the chain doesn't silently trigger
+            # a refresh on every single task with no diagnostic trace.
+            log(
+                "WARN",
+                f"_is_expiring: malformed expires_at '{expires_at_iso}'; treating as expiring",
+            )
             return True
         return (expiry - datetime.now(UTC)).total_seconds() < threshold_seconds
 
-    def _refresh(current: dict) -> dict | None:
+    def _try_refresh_once(current: dict) -> tuple[str, dict | None]:
+        """Single Linear /oauth/token POST.
+
+        Returns one of:
+          - ("success", new_token_dict)
+          - ("invalid_grant", None) — Linear rejected the refresh_token,
+            usually because another caller rotated it first
+          - ("failure", None) — any other error (network, 5xx, missing
+            fields). No retry; surface upward.
+        """
         try:
+            import urllib.error
             import urllib.parse
             import urllib.request
         except ImportError:
-            return None
+            return ("failure", None)
 
         body = urllib.parse.urlencode(
             {
@@ -131,12 +148,26 @@ def _refresh(current: dict) -> dict | None:
         try:
             with urllib.request.urlopen(req, timeout=10) as resp:  # noqa: S310
                 payload = json.loads(resp.read().decode("utf-8"))
+        except urllib.error.HTTPError as e:
+            # Body may carry `{"error": "invalid_grant", ...}` even on 400.
+            try:
+                err_payload = json.loads(e.read().decode("utf-8"))
+                err_code = err_payload.get("error")
+            except Exception:
+                err_code = None
+            log(
+                "WARN",
+                f"resolve_linear_api_token refresh rejected: status={e.code} error={err_code}",
+            )
+            if err_code == "invalid_grant":
+                return ("invalid_grant", None)
+            return ("failure", None)
         except Exception as e:
             log("WARN", f"resolve_linear_api_token refresh failed: {type(e).__name__}: {e}")
-            return None
+            return ("failure", None)
 
         if "access_token" not in payload:
-            return None
+            return ("failure", None)
 
         now = datetime.now(UTC)
         # Linear's `expires_in` is documented and reliably sent; if it's
@@ -161,7 +192,68 @@ def _refresh(current: dict) -> dict | None:
         except (ClientError, BotoCoreError) as e:
             log("WARN", f"resolve_linear_api_token: failed to persist refreshed token: {e}")
             # Even without persistence the in-memory token works for THIS run.
-        return next_token
+
+        # Positive-path log so operators diagnosing intermittent 401s have
+        # a breadcrumb showing which workspace refreshed and to what expiry.
+        ws_id = next_token.get("workspace_id", "?")
+        ws_slug = next_token.get("workspace_slug", "?")
+        log(
+            "INFO",
+            f"linear_oauth_refresh_ok workspace_id={ws_id} "
+            f"workspace_slug={ws_slug} new_expires_at={expires_at_iso}",
+        )
+        return ("success", next_token)
+
+    def _refresh(current: dict) -> dict | None:
+        """Refresh with one retry on invalid_grant after re-reading the secret.
+
+        Linear rotates refresh_tokens on every use. Concurrent callers
+        (Lambda + agent + CLI) racing the same secret will see one
+        succeed and the rest get `invalid_grant`. On invalid_grant,
+        re-read SM (bypassing the just-failed token) and retry once if
+        the refresh_token actually changed.
+        """
+        kind, refreshed = _try_refresh_once(current)
+        if kind == "success":
+            return refreshed
+        if kind == "failure":
+            return None
+
+        # invalid_grant: maybe a concurrent caller refreshed first.
+        log(
+            "WARN",
+            "resolve_linear_api_token: invalid_grant — re-reading secret to check "
+            "for concurrent refresh",
+        )
+        try:
+            fresh = _fetch_token()
+        except (ClientError, BotoCoreError) as e:
+            log("WARN", f"resolve_linear_api_token: re-read after invalid_grant failed: {e}")
+            return None
+
+        if fresh.get("refresh_token") == current.get("refresh_token"):
+            # No race — Linear truly rejected this refresh_token.
+            log(
+                "ERROR",
+                "resolve_linear_api_token: refresh_token permanently rejected; re-onboard required",
+            )
+            return None
+
+        # Concurrent caller rotated the token. If the freshly-read value
+        # is itself usable, just take it.
+        if not _is_expiring(fresh.get("expires_at", "")):
+            log(
+                "INFO",
+                "resolve_linear_api_token: concurrent refresh detected; using freshly-read token",
+            )
+            return fresh
+
+        # Concurrent refresh produced a token that's also already
+        # expiring (rare). Retry once with the new refresh_token.
+        kind2, refreshed2 = _try_refresh_once(fresh)
+        if kind2 == "success":
+            return refreshed2
+        return None
 
     try:
         token_obj = _fetch_token()
diff --git a/agent/src/server.py b/agent/src/server.py
@@ -398,9 +398,25 @@ def _run_task_background(
     # one. See aws/bedrock-agentcore-sdk-python#219 for the upstream design
     # constraint that motivates this manual propagation.
     if workload_access_token:
-        from bedrock_agentcore.runtime.context import BedrockAgentCoreContext
+        # Vestigial path from the parked AgentCore Identity flow. If the
+        # `bedrock-agentcore` SDK is missing or its module structure
+        # changes, fail open: the Linear token resolver falls back to
+        # reading per-workspace Secrets Manager directly, so the agent
+        # can still proceed without this ContextVar set. Catching
+        # (ImportError, AttributeError) here keeps the pipeline alive
+        # instead of bricking the entire task with no diagnostic when
+        # the upstream SDK rearranges modules.
+        try:
+            from bedrock_agentcore.runtime.context import BedrockAgentCoreContext
 
-        BedrockAgentCoreContext.set_workload_access_token(workload_access_token)
+            BedrockAgentCoreContext.set_workload_access_token(workload_access_token)
+        except (ImportError, AttributeError) as e:
+            _warn_cw(
+                f"bedrock_agentcore workload-token bridge unavailable "
+                f"({type(e).__name__}: {e}); Linear MCP will resolve via "
+                "Secrets Manager fallback",
+                task_id=task_id,
+            )
 
     _debug_cw(
         f"_run_task_background ENTERED task_id={task_id!r} "
diff --git a/cdk/src/handlers/shared/linear-oauth-resolver.ts b/cdk/src/handlers/shared/linear-oauth-resolver.ts
@@ -237,17 +237,95 @@ async function getOauthSecret(
   }
 }
 
+/**
+ * Outcome of a single Linear /oauth/token POST. Three terminal states:
+ * - `success` — refreshed token (caller persists + caches)
+ * - `invalid_grant` — Linear rejected the refresh_token, likely
+ *    because another caller rotated it first. Caller can retry once
+ *    after re-reading the secret.
+ * - `failure` — any other error (network, 5xx, missing fields). No
+ *    retry; surface null upward.
+ */
+type RefreshOutcome =
+  | { kind: 'success'; token: StoredOauthToken }
+  | { kind: 'invalid_grant' }
+  | { kind: 'failure' };
+
 async function refreshLinearToken(
   current: StoredOauthToken,
   sm: SecretsManagerClient,
   secretArn: string,
   options: ResolverOptions,
 ): Promise<StoredOauthToken | null> {
+  // First attempt with whatever refresh_token we have.
+  const first = await tryRefreshOnce(current, sm, secretArn, options);
+  if (first.kind === 'success') return first.token;
+  if (first.kind === 'failure') return null;
+
+  // `invalid_grant`: Linear rotates refresh_tokens on every use, so a
+  // concurrent Lambda may have refreshed before us. Re-read the secret
+  // from SM (bypassing cache) and retry once if the refresh_token
+  // changed. This avoids permanently bricking the workspace's token
+  // chain when two Lambdas race the same refresh.
+  logger.warn('Linear token refresh got invalid_grant — re-reading secret to check for concurrent refresh', {
+    secret_arn: secretArn,
+    workspace_id: current.workspace_id,
+  });
+
+  const fresh = await getOauthSecret(sm, secretArn);
+  if (!fresh) {
+    invalidateLinearOauthCache(current.workspace_id, secretArn);
+    return null;
+  }
+  if (fresh.refresh_token === current.refresh_token) {
+    // No race — Linear truly rejected this refresh_token. Caller needs
+    // a fresh OAuth dance.
+    logger.error('Linear token refresh permanently rejected — workspace requires re-onboarding', {
+      secret_arn: secretArn,
+      workspace_id: current.workspace_id,
+    });
+    invalidateLinearOauthCache(current.workspace_id, secretArn);
+    return null;
+  }
+
+  // Another caller rotated the token. If the freshly-read token is
+  // itself not expiring, just use it — no second refresh needed.
+  if (!isTokenExpiring(fresh.expires_at)) {
+    logger.info('Linear OAuth token was refreshed by a concurrent caller; using freshly-read value', {
+      secret_arn: secretArn,
+      workspace_id: fresh.workspace_id,
+      new_expires_at: fresh.expires_at,
+    });
+    tokenCache.set(secretArn, { value: fresh, expiresAt: Date.now() + SECRET_CACHE_TTL_MS });
+    return fresh;
+  }
+
+  // Concurrent caller refreshed but the new token is also already
+  // expiring (rare but possible if both Lambdas raced and the second
+  // got a tiny TTL). Retry refresh once with the new refresh_token.
+  const second = await tryRefreshOnce(fresh, sm, secretArn, options);
+  if (second.kind === 'success') return second.token;
+  if (second.kind === 'invalid_grant') {
+    logger.error('Linear token refresh failed even after re-reading freshly-rotated secret', {
+      secret_arn: secretArn,
+      workspace_id: fresh.workspace_id,
+    });
+  }
+  invalidateLinearOauthCache(current.workspace_id, secretArn);
+  return null;
+}
+
+async function tryRefreshOnce(
+  current: StoredOauthToken,
+  sm: SecretsManagerClient,
+  secretArn: string,
+  options: ResolverOptions,
+): Promise<RefreshOutcome> {
   if (!current.client_id || !current.client_secret) {
     logger.error('Cannot refresh Linear OAuth token: stored secret is missing client_id/client_secret', {
       secret_arn: secretArn,
     });
-    return null;
+    return { kind: 'failure' };
   }
 
   const fetchImpl = options.fetchImpl ?? fetch;
@@ -269,15 +347,21 @@ async function refreshLinearToken(
     logger.error('Linear token refresh fetch failed', {
       error: err instanceof Error ? err.message : String(err),
     });
-    return null;
+    // Network-level failure: invalidate cache so the next call
+    // re-reads from Secrets Manager instead of looping on a stale
+    // expiring token. Without this the catch returned null without
+    // invalidating, hammering Linear in a tight loop until the cache
+    // TTL expires.
+    invalidateLinearOauthCache(current.workspace_id, secretArn);
+    return { kind: 'failure' };
   }
 
   let parsed: unknown;
   try {
     parsed = await resp.json();
   } catch {
     logger.error('Linear token refresh returned non-JSON', { status: resp.status });
-    return null;
+    return { kind: 'failure' };
   }
 
   if (!resp.ok) {
@@ -287,9 +371,11 @@ async function refreshLinearToken(
       error: errObj.error,
       error_description: errObj.error_description,
     });
-    // Caller can attempt a fresh OAuth dance; we don't recover automatically.
     invalidateLinearOauthCache(current.workspace_id, secretArn);
-    return null;
+    if (errObj.error === 'invalid_grant') {
+      return { kind: 'invalid_grant' };
+    }
+    return { kind: 'failure' };
   }
 
   const tokenResp = parsed as {
@@ -300,7 +386,7 @@ async function refreshLinearToken(
   };
   if (!tokenResp.access_token || !tokenResp.expires_in) {
     logger.error('Linear token refresh response missing required fields');
-    return null;
+    return { kind: 'failure' };
   }
 
   const now = new Date();
@@ -328,14 +414,22 @@ async function refreshLinearToken(
       error: err instanceof Error ? err.message : String(err),
     });
     // Even if persistence fails, the in-memory token still works for
-    // the rest of THIS Lambda invocation. Other concurrent Lambdas may
-    // race-refresh; Linear's idempotency-on-replay grace window
-    // (30 min documented) absorbs the duplicate.
+    // THIS Lambda invocation. Other concurrent Lambdas may race-refresh
+    // and one will get invalid_grant; the re-read-and-retry path above
+    // will recover.
   }
 
+  // Positive-path log so operators diagnosing intermittent 401s have
+  // a breadcrumb showing which workspace refreshed and to what expiry.
+  logger.info('Linear OAuth token refreshed', {
+    workspace_id: next.workspace_id,
+    workspace_slug: next.workspace_slug,
+    new_expires_at: next.expires_at,
+  });
+
   // Cache the freshest value.
   tokenCache.set(secretArn, { value: next, expiresAt: Date.now() + SECRET_CACHE_TTL_MS });
-  return next;
+  return { kind: 'success', token: next };
 }
 
 /** Test-only: clear all caches. */