|
| 1 | +"""Per-task scoped AWS credentials for tenant-data access. |
| 2 | +
|
| 3 | +This module centralizes how the agent obtains boto3 clients/resources for |
| 4 | +**tenant data** (the task's own DynamoDB rows and its S3 trace/attachment |
| 5 | +objects). Instead of using the long-lived compute role (AgentCore Runtime |
| 6 | +``ExecutionRole`` or ECS Fargate task role) directly, the agent assumes a |
| 7 | +per-task **SessionRole** with session tags ``{user_id, repo, task_id}`` and |
| 8 | +uses the resulting short-lived, tag-scoped credentials. The SessionRole's IAM |
| 9 | +policy self-constrains via ``aws:PrincipalTag/*`` conditions |
| 10 | +(``dynamodb:LeadingKeys`` on ``task_id`` for the task tables, an S3 prefix |
| 11 | +condition on ``user_id`` for the trace bucket), so a compromised session can |
| 12 | +only reach its own task's data — not other tenants'. |
| 13 | +
|
| 14 | +Two properties matter for correctness: |
| 15 | +
|
| 16 | +1. **Refreshable, not one-shot.** The agent runs under credentials that are |
| 17 | + themselves an assumed role, so the agent's own ``sts:AssumeRole`` is *role |
| 18 | + chaining*, which AWS hard-caps at 1 hour regardless of the role's |
| 19 | + ``MaxSessionDuration``. Tasks can run up to ``maxLifetime`` (8 h), so a |
| 20 | + single ``assume_role()`` call would yield credentials that expire mid-task |
| 21 | + and fail every subsequent call with ``ExpiredToken``. We wrap the assume |
| 22 | + call in botocore ``RefreshableCredentials`` so boto3 transparently |
| 23 | + re-assumes before expiry. |
| 24 | +
|
| 25 | +2. **Backend-agnostic.** The same code path works whether the agent boots |
| 26 | + under an AgentCore execution role or an ECS task role — both are valid |
| 27 | + assuming principals in the SessionRole trust policy. |
| 28 | +
|
| 29 | +**Fail-safe vs. fail-closed:** |
| 30 | +
|
| 31 | +- When ``AGENT_SESSION_ROLE_ARN`` is **unset** (local dev, tests, or a |
| 32 | + deployment that has not yet provisioned the SessionRole), this module returns |
| 33 | + plain boto3 clients/resources backed by the ambient credential chain — |
| 34 | + identical to the pre-feature behavior. Scoping is opt-in and its absence does |
| 35 | + not block task execution. |
| 36 | +- When ``AGENT_SESSION_ROLE_ARN`` **is set**, scoping has been *requested*, so a |
| 37 | + failure to build the scoped session is treated as a hard error: this module |
| 38 | + raises :class:`SessionScopingError` rather than silently degrading to the |
| 39 | + ambient (cross-tenant) compute role. For a tenant-isolation control, silently |
| 40 | + running unscoped is the most dangerous failure mode — the agent keeps working |
| 41 | + but stops isolating tenants — so we fail **closed** (abort the task) instead. |
| 42 | +""" |
| 43 | + |
| 44 | +from __future__ import annotations |
| 45 | + |
| 46 | +import os |
| 47 | +import threading |
| 48 | +from typing import Any |
| 49 | + |
# Name of the environment variable that carries the per-task SessionRole ARN.
# The orchestrator sets it on the compute environment (AgentCore runtime env /
# ECS container overrides).
SESSION_ROLE_ARN_ENV = "AGENT_SESSION_ROLE_ARN"

# A role-chained session is capped at 1 hour, so request exactly that much;
# botocore refreshes well before this elapses.
_CHAINED_SESSION_DURATION_S = 60 * 60

# IAM session-tag limits: keys <=128 chars, values <=256 chars.
_MAX_TAG_VALUE_LEN = 256

# Module-level cache of the resolved session, guarded by ``_lock``.
_lock = threading.Lock()
_session: Any = None  # cached boto3.Session (scoped or plain)
_scoped: bool | None = None  # None until first resolution; True if tag-scoped

# Session-tag values recorded by ``configure_session`` from the resolved
# TaskConfig and read by ``_session_tags`` at assume time. Deliberately held
# in private module state rather than os.environ, so the tenant identifiers
# are not inherited by the untrusted repo subprocesses the agent spawns
# (build/test/tooling).
_tags: dict[str, str] = dict()
| 70 | + |
| 71 | + |
class SessionScopingError(RuntimeError):
    """Signal that requested per-session IAM scoping could not be set up.

    This is raised when ``AGENT_SESSION_ROLE_ARN`` is set yet the scoped
    session cannot be built. The task is failed closed instead of quietly
    continuing on the ambient (cross-tenant) compute role.
    """
| 79 | + |
| 80 | + |
def configure_session(user_id: str, repo: str, task_id: str) -> None:
    """Store the session-tag values in private module state.

    Intended to be called once at agent startup with values taken from the
    resolved ``TaskConfig``. Calling it again simply replaces the stored
    mapping. No role is assumed here; this function only records the values
    that are later turned into the ``AssumeRole`` ``Tags`` list. The values
    live in a module global (not ``os.environ``) so tenant identifiers do not
    leak into spawned subprocesses.

    Args:
        user_id: Value for the ``user_id`` session tag.
        repo: Value for the ``repo`` session tag.
        task_id: Value for the ``task_id`` session tag.

    Falsy values (e.g. empty strings) are dropped rather than stored.
    """
    global _tags
    # Insertion order (user_id, repo, task_id) is kept so the derived tag
    # list has a stable order.
    candidates = {"user_id": user_id, "repo": repo, "task_id": task_id}
    _tags = {name: val for name, val in candidates.items() if val}
| 97 | + |
| 98 | + |
def reset_session_cache() -> None:
    """Forget the cached session, its scoped flag, and the recorded tags.

    Meant for tests that toggle configuration between cases. All three
    module globals are cleared while holding the module lock.
    """
    global _session, _scoped, _tags
    with _lock:
        _session, _scoped, _tags = None, None, {}
| 106 | + |
| 107 | + |
def _session_tags() -> list[dict[str, str]]:
    """Translate the recorded tag values into an AssumeRole ``Tags`` list.

    Empty values never reach this point because ``configure_session`` filters
    them out. Each value is clipped to ``_MAX_TAG_VALUE_LEN`` characters so an
    over-long value (e.g. a long repo slug) cannot push the request past the
    IAM tag-value length limit.

    Returns:
        One ``{"Key": ..., "Value": ...}`` dict per recorded tag, in the
        order the tags were recorded.
    """
    tag_list: list[dict[str, str]] = []
    for name, raw_value in _tags.items():
        tag_list.append({"Key": name, "Value": raw_value[:_MAX_TAG_VALUE_LEN]})
    return tag_list
| 116 | + |
| 117 | + |
def _build_scoped_session(role_arn: str) -> Any:
    """Build a boto3 Session backed by refreshable assumed-role credentials.

    The refresh callback re-invokes ``sts:AssumeRole`` (with session tags) each
    time botocore decides the cached credentials are near expiry, so a task
    running past the 1-hour role-chaining cap keeps working.

    Args:
        role_arn: ARN of the per-task SessionRole to assume.

    Returns:
        A ``boto3.Session`` whose credentials are obtained lazily: no
        ``AssumeRole`` call is made here, only when credentials are first
        needed. An STS failure therefore surfaces from the first client call
        that needs credentials, not from this function.
    """
    import re

    import boto3
    from botocore.credentials import DeferredRefreshableCredentials
    from botocore.session import get_session as get_botocore_session

    region = os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION")
    task_id = _tags.get("task_id", "")
    # Role session name must be <=64 chars and match [\w+=,.@-]. task_id is a
    # short slug (a ULID, ~26 chars, in the API path; a 12-char hex fallback
    # when the agent generates its own) — well under 64. The ``abca-`` prefix
    # keeps CloudTrail entries identifiable. Sanitize and truncate defensively
    # regardless: the name is only an audit label, so a malformed task_id
    # should not be able to make AssumeRole reject the request.
    if task_id:
        # re.ASCII keeps ``\w`` to [A-Za-z0-9_] instead of all Unicode word
        # characters.
        safe_task_id = re.sub(r"[^\w+=,.@-]", "-", task_id, flags=re.ASCII)
        session_name = f"abca-{safe_task_id}"[:64]
    else:
        # No task_id recorded: use a fixed, readable name. (The prefixed
        # f-string is never empty, so an ``or`` fallback on it cannot fire.)
        session_name = "abca-session"

    # A dedicated STS client built from the *ambient* (compute-role) chain.
    # This is the role-chaining caller; the assumed SessionRole credentials it
    # returns must NOT be used to build it, or refresh would recurse.
    sts_client = boto3.client("sts", region_name=region)

    def _refresh() -> dict[str, str]:
        """Assume the SessionRole and return credentials in botocore's shape."""
        resp = sts_client.assume_role(
            RoleArn=role_arn,
            RoleSessionName=session_name,
            DurationSeconds=_CHAINED_SESSION_DURATION_S,
            # Tags are read at assume time, so each refresh uses the values
            # currently recorded in module state.
            Tags=_session_tags(),
        )
        creds = resp["Credentials"]
        return {
            "access_key": creds["AccessKeyId"],
            "secret_key": creds["SecretAccessKey"],
            "token": creds["SessionToken"],
            # botocore expects an ISO8601 string; the SDK returns a datetime.
            "expiry_time": creds["Expiration"].isoformat(),
        }

    botocore_session = get_botocore_session()
    # Deferred: the first assume_role happens on first credential use, not now,
    # so a transient STS hiccup at startup doesn't crash the agent before it
    # has even begun.
    botocore_session._credentials = DeferredRefreshableCredentials(
        method="sts-assume-role-session-tags",
        refresh_using=_refresh,
    )
    if region:
        botocore_session.set_config_variable("region", region)
    return boto3.Session(botocore_session=botocore_session)
| 171 | + |
| 172 | + |
def get_session() -> Any:
    """Return the cached boto3 Session for tenant-data access.

    Tag-scoped (assumed SessionRole) when ``AGENT_SESSION_ROLE_ARN`` is set;
    otherwise a plain session on the ambient credential chain (fail-safe).
    The session is built at most once and cached in module state.

    Returns:
        The cached ``boto3.Session``.

    Raises:
        SessionScopingError: ``AGENT_SESSION_ROLE_ARN`` is set but the scoped
            session could not be built. The original exception is chained as
            ``__cause__``.
    """
    global _session, _scoped
    # Lock-free fast path. ``_session`` is always published *after*
    # ``_scoped`` (see below), so a caller that sees a session here also sees
    # the matching scoped flag.
    if _session is not None:
        return _session
    with _lock:
        # Re-check under the lock: another thread may have built the session
        # while this one was waiting.
        if _session is not None:
            return _session
        import boto3

        role_arn = os.environ.get(SESSION_ROLE_ARN_ENV, "").strip()
        if role_arn:
            # Scoping was requested. Build the scoped session or FAIL CLOSED —
            # never silently downgrade to the ambient compute role, which can
            # reach every tenant's data (that is exactly what this control
            # prevents).
            try:
                new_session = _build_scoped_session(role_arn)
            except Exception as exc:
                # Logging is best-effort: a failure to import or call the
                # logger must not replace the SessionScopingError below.
                try:
                    from shell import log_error_cw

                    log_error_cw(
                        "SESSION_SCOPING_FAILED: AGENT_SESSION_ROLE_ARN is set but the "
                        f"scoped session could not be built ({type(exc).__name__}: {exc}). "
                        "Failing closed — refusing to run on unscoped ambient credentials, "
                        "which would disable tenant isolation.",
                        task_id=_tags.get("task_id") or None,
                    )
                except Exception:
                    pass
                raise SessionScopingError(
                    "per-session IAM scoping requested via "
                    f"{SESSION_ROLE_ARN_ENV} but could not be established"
                ) from exc
            new_scoped = True
        else:
            # Scoping not requested (local/dev/tests, or pre-provisioning):
            # plain ambient session, behaviorally identical to pre-feature code.
            new_session = boto3.Session(
                region_name=os.environ.get("AWS_REGION") or os.environ.get("AWS_DEFAULT_REGION")
            )
            new_scoped = False
        # Publish the flag BEFORE the session. The fast path above reads
        # ``_session`` without the lock; if the session became visible first,
        # a concurrent caller could observe it while ``_scoped`` is still
        # None, be told "not scoped", and build an ambient (unscoped) client.
        _scoped = new_scoped
        _session = new_session
        return _session
| 218 | + |
| 219 | + |
def is_scoped() -> bool:
    """Report whether tenant-data access uses tag-scoped assumed-role credentials.

    If the session has not been resolved yet, resolving it is triggered first
    via ``get_session`` (which may raise if scoping was requested but cannot
    be established).
    """
    unresolved = _scoped is None
    if unresolved:
        get_session()
    return True if _scoped else False
| 225 | + |
| 226 | + |
def tenant_client(service_name: str, **kwargs: Any) -> Any:
    """Create a boto3 client for tenant data.

    With the per-task SessionRole configured, the client comes from the
    tag-scoped, refreshable session. Without it, the call is forwarded
    straight to ``boto3.client`` — the same behavior as before this feature
    existed, and transparent to callers/tests that mock ``boto3.client``.

    Args:
        service_name: AWS service name passed through to boto3.
        **kwargs: Extra keyword arguments passed through unchanged.
    """
    # Resolve the session first so a scoping failure surfaces before any
    # client is built.
    tenant_session = get_session()
    if not is_scoped():
        import boto3

        return boto3.client(service_name, **kwargs)
    return tenant_session.client(service_name, **kwargs)
| 241 | + |
| 242 | + |
def tenant_resource(service_name: str, **kwargs: Any) -> Any:
    """Create a boto3 resource for tenant data.

    Mirrors :func:`tenant_client`: the scoped session is used when the
    per-task SessionRole is configured, otherwise the call is forwarded
    straight to ``boto3.resource``.

    Args:
        service_name: AWS service name passed through to boto3.
        **kwargs: Extra keyword arguments passed through unchanged.
    """
    # Resolve the session first so a scoping failure surfaces before any
    # resource is built.
    tenant_session = get_session()
    if not is_scoped():
        import boto3

        return boto3.resource(service_name, **kwargs)
    return tenant_session.resource(service_name, **kwargs)
0 commit comments