|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Credential helper for Claude Code's Bedrock calls (#215, cost attribution). |
| 3 | +
|
| 4 | +Claude Code (``CLAUDE_CODE_USE_BEDROCK=1``) makes every ``InvokeModel`` call — |
| 5 | +not the agent's boto3 — so the per-task tenant-data SessionRole in |
| 6 | +``aws_session.py`` cannot tag those calls. Instead Claude Code's |
| 7 | +``awsCredentialExport`` setting (in the image's managed-settings layer) runs |
| 8 | +this script, captures its JSON stdout, and signs Bedrock requests with the |
| 9 | +returned credentials. With a real ``Expiration`` it re-runs ~5 min before |
| 10 | +expiry, so an 8 h task survives the 1 h role-chaining cap. |
| 11 | +
|
| 12 | +Goal: assume the per-task SessionRole with ``{user_id, repo, task_id}`` STS |
| 13 | +session tags so Bedrock spend is attributable per user/repo in AWS Cost |
| 14 | +Explorer / CUR 2.0 (``iamPrincipal/*`` dimensions, after the operator activates |
| 15 | +the cost-allocation tags). The same role already carries the tenant-data grants; |
| 16 | +Track-1 only adds ``bedrock:InvokeModel*`` to it (see ``agent-session-role.ts``). |
| 17 | +
|
| 18 | +**Fails OPEN.** Bedrock attribution is a billing/observability control, not a |
| 19 | +tenant-isolation one (contrast ``aws_session.py``, which fails closed). If the |
| 20 | +attribution config is absent or the assume-role fails, this helper emits the |
| 21 | +**ambient** compute-role credentials so Bedrock keeps working untagged — losing |
| 22 | +chargeback granularity is not a security incident, and the compute role retains |
| 23 | +``InvokeModel`` precisely so this fallback works. |
| 24 | +
|
| 25 | +The role ARN and tag values are read from a 0600 JSON file the agent writes at |
| 26 | +startup (``write_attribution_file``), not from the environment — so the tenant |
| 27 | +identifiers are not inherited by the untrusted repo subprocesses the agent |
| 28 | +spawns, matching the discipline in ``aws_session.py``. |
| 29 | +
|
| 30 | +Output shape (consumed by Claude Code's awsCredentialExport): |
| 31 | +
|
| 32 | + {"Credentials": {"AccessKeyId": "...", "SecretAccessKey": "...", |
| 33 | + "SessionToken": "...", "Expiration": "<ISO8601>"}} |
| 34 | +""" |
| 35 | + |
| 36 | +from __future__ import annotations |
| 37 | + |
| 38 | +import json |
| 39 | +import os |
| 40 | +import sys |
| 41 | +from typing import Any |
| 42 | + |
# Path the agent writes (0600) and this helper reads. A well-known location is
# required because the managed-settings ``awsCredentialExport`` command is
# static (baked into the image) and cannot carry per-task arguments.
# ``ATTRIBUTION_FILE_ENV`` names an environment variable that, when set to a
# non-blank value, overrides the default location (see
# ``attribution_file_path``); otherwise ``DEFAULT_ATTRIBUTION_FILE`` is used.
ATTRIBUTION_FILE_ENV = "BEDROCK_ATTRIBUTION_FILE"
DEFAULT_ATTRIBUTION_FILE = "/home/agent/.bedrock-attribution.json"

# Role chaining caps the assumed session at 1 hour; request the max the cap
# allows (value is in seconds, passed as ``DurationSeconds`` to AssumeRole).
# Claude Code refreshes ~5 min before the returned Expiration.
_CHAINED_SESSION_DURATION_S = 3600
| 52 | + |
| 53 | + |
def attribution_file_path() -> str:
    """Return the path of the attribution JSON file.

    The value of the ``ATTRIBUTION_FILE_ENV`` environment variable wins when it
    is set to something other than whitespace; otherwise the baked-in
    ``DEFAULT_ATTRIBUTION_FILE`` is returned.
    """
    override = os.environ.get(ATTRIBUTION_FILE_ENV, "").strip()
    if override:
        return override
    return DEFAULT_ATTRIBUTION_FILE
| 56 | + |
| 57 | + |
def write_attribution_file(
    role_arn: str, tags: list[dict[str, str]], path: str | None = None
) -> str:
    """Persist the SessionRole ARN + STS tags for the helper to read.

    Called by the agent at startup (see ``runner._setup_agent_env``) only when
    a SessionRole is configured; absence of the file is the fail-open signal.

    The payload is written to a fresh temporary file in the target's directory
    and then atomically renamed over the target. This guarantees that:

    * the resulting file is always 0600, even when a file with wider
      permissions already existed at ``path`` (``O_CREAT``'s mode argument is
      only honored when the file is actually created);
    * a concurrently running credential helper never observes a truncated or
      half-written file;
    * a symlink planted at ``path`` is replaced rather than followed.

    Args:
        role_arn: ARN of the per-task SessionRole to assume.
        tags: STS session tags, as ``[{"Key": ..., "Value": ...}, ...]``.
        path: Destination file. Defaults to ``attribution_file_path()``.

    Returns:
        The path that was written.

    Raises:
        OSError: if the target directory is not writable or the rename fails.
        TypeError: if ``role_arn``/``tags`` are not JSON-serializable.
    """
    # Local import: the module header does not import tempfile.
    import tempfile

    target = path or attribution_file_path()
    # Serialize before touching the filesystem so a bad payload leaves no debris.
    payload = json.dumps({"role_arn": role_arn, "tags": tags})

    directory = os.path.dirname(target) or "."
    # mkstemp creates the file 0600 and with O_EXCL, so the secret-adjacent
    # content is never briefly world-readable. Same directory => same
    # filesystem, which is what makes os.replace atomic.
    fd, tmp_path = tempfile.mkstemp(
        prefix=os.path.basename(target) + ".", suffix=".tmp", dir=directory
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as fh:
            fh.write(payload)
        os.replace(tmp_path, target)
    except BaseException:
        # Best-effort cleanup of the temp file; the original error is what
        # the caller needs to see.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
    return target
| 75 | + |
| 76 | + |
def _warn(message: str) -> None:
    """Write a tagged diagnostic line to stderr.

    stdout is reserved: Claude Code parses it as the ``awsCredentialExport``
    JSON result, so anything else printed there would corrupt the credential
    envelope. (For the same reason ``shell.log``, which writes to fd 1, cannot
    be used in this process.) All fail-open branches report through this
    function so that a loss of cost attribution leaves a visible, correlatable
    trace instead of going unnoticed — the fallback stays open, but never
    silent.
    """
    line = "[bedrock-creds] {}".format(message)
    print(line, file=sys.stderr)
| 89 | + |
| 90 | + |
def _emit(creds: dict[str, str]) -> None:
    """Write the credential envelope ``{"Credentials": creds}`` as JSON to stdout.

    No trailing newline is written; stdout carries nothing but the envelope.
    """
    envelope = {"Credentials": creds}
    sys.stdout.write(json.dumps(envelope))
| 93 | + |
| 94 | + |
def _frozen_to_creds(frozen: Any, expiry_iso: str | None) -> dict[str, str]:
    """Convert a frozen-credentials object into the envelope's inner mapping.

    ``frozen`` must expose ``access_key``, ``secret_key`` and ``token``
    attributes. A falsy token is emitted as an empty ``SessionToken``. The
    ``Expiration`` key is included only when ``expiry_iso`` is truthy.
    """
    session_token = frozen.token
    if not session_token:
        session_token = ""
    creds = dict(
        AccessKeyId=frozen.access_key,
        SecretAccessKey=frozen.secret_key,
        SessionToken=session_token,
    )
    if not expiry_iso:
        return creds
    creds["Expiration"] = expiry_iso
    return creds
| 104 | + |
| 105 | + |
def _ambient_credentials() -> dict[str, str]:
    """Frozen ambient (compute-role) credentials — the fail-open fallback.

    This is the last resort of every fail-open path, so it must never raise:
    an uncaught exception here would leave stdout empty and hand Claude Code
    no envelope at all. Any failure (botocore not importable, a credential
    provider erroring, a refresh failing while freezing) is reported on stderr
    and degrades to the empty mapping.

    Returns:
        ``AccessKeyId`` / ``SecretAccessKey`` / ``SessionToken`` of the ambient
        credentials (no ``Expiration``), or ``{}`` when none can be resolved.
    """
    try:
        # Imported lazily so a broken/missing botocore is handled here rather
        # than crashing the module at import time.
        import botocore.session

        creds = botocore.session.get_session().get_credentials()
        frozen = None if creds is None else creds.get_frozen_credentials()
    except Exception as exc:
        # Deliberately broad: this is the outermost fallback boundary.
        _warn(
            f"ambient credential resolution failed ({type(exc).__name__}: {exc}); "
            "emitting empty envelope, Claude Code will use its default chain"
        )
        return {}
    if frozen is None:
        # No resolvable credentials at all — the deepest degradation. Emit an
        # empty object; Claude Code then falls back to its own default-chain
        # resolution. Surface it: if that fallback also fails, this stderr line
        # is the only breadcrumb.
        _warn(
            "no resolvable AWS credentials; emitting empty envelope, "
            "Claude Code will use its default chain"
        )
        return {}
    return _frozen_to_creds(frozen, None)
| 122 | + |
| 123 | + |
def _role_session_name(tags: Any) -> str:
    """Build a valid STS ``RoleSessionName`` from the ``task_id`` tag, if any."""
    task_id = ""
    if isinstance(tags, list):
        for tag in tags:
            if isinstance(tag, dict) and tag.get("Key") == "task_id":
                task_id = str(tag.get("Value", ""))
                break
    # STS only accepts [A-Za-z0-9_+=,.@-] in a session name; map anything else
    # to "-" so an unusual task id cannot, by itself, fail the assume-role.
    safe = "".join(
        ch if (ch.isascii() and ch.isalnum()) or ch in "_+=,.@-" else "-"
        for ch in task_id
    )
    if not safe:
        return "abca-bedrock"
    return f"abca-bedrock-{safe}"[:64]


def resolve_credentials() -> dict[str, str]:
    """Return tagged assumed-role creds, or ambient creds on any failure.

    Reads the role ARN and STS session tags from the attribution file, then
    assumes that role with the tags attached. Every failure mode fails open to
    ``_ambient_credentials()`` after logging a distinct stderr diagnostic:

    * attribution file absent (attribution not configured);
    * attribution file unreadable, not valid JSON, not a JSON object, or
      missing ``role_arn``;
    * boto3 not importable;
    * the assume-role call failing (expected: AccessDenied, unprovisioned role,
      transient STS error) or anything unexpected while minting credentials.

    Returns:
        A mapping with ``AccessKeyId``, ``SecretAccessKey``, ``SessionToken``
        and ``Expiration`` (ISO 8601) on success; otherwise whatever
        ``_ambient_credentials()`` returns.
    """
    path = attribution_file_path()
    try:
        with open(path, encoding="utf-8") as fh:
            cfg = json.load(fh)
        role_arn = cfg["role_arn"]
        tags = cfg.get("tags", [])
    except FileNotFoundError:
        # Attribution not configured (local/dev, or pre-provisioning). Expected
        # and benign — debug-level signal only.
        _warn("attribution file absent; not configured — using ambient creds")
        return _ambient_credentials()
    except (OSError, ValueError, KeyError, TypeError, AttributeError) as exc:
        # File present but unreadable/malformed/schema-drifted. This is NOT the
        # benign "not configured" case — it points at a write_attribution_file
        # bug or a partial write, so it warrants a louder signal.
        # TypeError/AttributeError cover valid JSON whose top level is not an
        # object (e.g. ``[]`` or ``null``), which would otherwise crash here.
        _warn(
            f"attribution file present but unreadable ({type(exc).__name__}: {exc}); "
            "using ambient creds"
        )
        return _ambient_credentials()

    try:
        import boto3
        from botocore.exceptions import BotoCoreError, ClientError
    except ImportError as exc:
        # boto3 missing/broken in the image is a packaging defect, not the
        # expected assume-role failure — name it explicitly so it can't hide.
        _warn(f"boto3 unavailable ({exc}); using ambient creds — fix the image")
        return _ambient_credentials()

    region = os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION")
    try:
        # Session-name derivation lives inside the try so that a malformed
        # ``tags`` value degrades to ambient creds instead of raising.
        resp = boto3.client("sts", region_name=region).assume_role(
            RoleArn=role_arn,
            RoleSessionName=_role_session_name(tags),
            DurationSeconds=_CHAINED_SESSION_DURATION_S,
            Tags=tags,
        )
        c = resp["Credentials"]
        return {
            "AccessKeyId": c["AccessKeyId"],
            "SecretAccessKey": c["SecretAccessKey"],
            "SessionToken": c["SessionToken"],
            "Expiration": c["Expiration"].isoformat(),
        }
    except (ClientError, BotoCoreError) as exc:
        # Expected assume failure: role not yet provisioned, AccessDenied,
        # transient STS error. Fail open so Bedrock keeps working on the
        # compute role; spend for this task is untagged.
        _warn(
            f"assume_role failed ({type(exc).__name__}: {exc}); using ambient creds "
            "— Bedrock spend will be UNTAGGED"
        )
        return _ambient_credentials()
    except Exception as exc:
        # Anything else (unexpected STS response shape, a logic bug here) is NOT
        # the expected fallback. Still fail open — this is a billing control, not
        # isolation — but flag it distinctly so it isn't mistaken for AccessDenied.
        _warn(
            f"UNEXPECTED error minting tagged creds ({type(exc).__name__}: {exc}); "
            "using ambient creds"
        )
        return _ambient_credentials()
| 191 | + |
| 192 | + |
def main() -> int:
    """Resolve credentials, print the envelope on stdout, and return exit code 0."""
    credentials = resolve_credentials()
    _emit(credentials)
    return 0


if __name__ == "__main__":
    # Script entry point: the process exit status is main()'s return value.
    raise SystemExit(main())
0 commit comments