telemetryflow
diff --git a/‎.github/workflows/benchmark-promote-image.yml‎
Lines changed: 128 additions & 0 deletions b/‎.github/workflows/benchmark-promote-image.yml‎
Lines changed: 128 additions & 0 deletions
diff --git a/‎.github/workflows/benchmark-run.yml‎
Lines changed: 28 additions & 0 deletions b/‎.github/workflows/benchmark-run.yml‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎app/agent/investigation.py‎
Lines changed: 93 additions & 0 deletions b/‎app/agent/investigation.py‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎app/agent/llm_invoke_errors.py‎
Lines changed: 14 additions & 1 deletion b/‎app/agent/llm_invoke_errors.py‎
Lines changed: 14 additions & 1 deletion
@@ -0,0 +1,128 @@
+name: Benchmark image — promote tag to task definition
+
+# Manually-triggered workflow that runs `terraform apply -var=image_tag=<TAG>`
+# in infra/bench/ to register a new ECS task definition revision pointing at
+# the chosen ECR image. This is the privileged "deploy" step that comes
+# between an image push (automatic) and a bench run (manual).
+#
+# Why not auto-promote on every image push? An image build is a code-change
+# event. A task-def update is a deploy event. Decoupling them lets you
+# stage many images and choose deliberately which one production runs.
+#
+# Trigger from the GitHub UI:
+#   Actions → "Benchmark image — promote tag to task definition" → Run
+#
+# Pre-reqs (one-time):
+#   - infra/bench/ Terraform applied at least once
+#   - Repo secrets seeded into AWS Secrets Manager
+#   - Repo vars set (AWS_ACCOUNT_ID etc., see AWS_BENCH_SETUP.md step 4)
+#   - The opensre-bench-github-actions OIDC role MUST be granted these
+#     additional permissions before this workflow can succeed:
+#       - ecs:RegisterTaskDefinition
+#       - ecs:DescribeTaskDefinition
+#       - ecr:DescribeImages on the opensre-bench repository (used by the
+#         "Verify image tag exists in ECR" step below)
+#       - iam:PassRole on the bench task + execution roles
+#       - s3:GetObject, s3:PutObject on the Terraform state bucket
+#       - dynamodb:GetItem, PutItem, DeleteItem on the lock table
+#     The existing role currently has RunTask + Seed only. Extending it is
+#     a 10-line addition to infra/bench/iam_oidc.tf — see the comment in
+#     that file. This workflow will FAIL with AccessDenied until that
+#     IAM diff is applied.
+
+on:
+  # Manual trigger only: there is no push/schedule trigger in this workflow,
+  # so a promotion happens only when an operator runs it with a chosen tag.
+  workflow_dispatch:
+    inputs:
+      image_tag:
+        description: 'ECR image tag to promote (e.g. 3792493)'
+        required: true
+
+# Least-privilege token: read the repo, plus mint an OIDC token for AWS.
+permissions:
+  contents: read
+  id-token: write           # required for AWS OIDC role assumption
+
+# One fixed group for every run of this workflow, so promotions never run
+# concurrently. With cancel-in-progress disabled a newly triggered run waits
+# for the in-progress one instead of cancelling it (presumably so that a
+# `terraform apply` is never interrupted part-way — confirm).
+concurrency:
+  group: benchmark-promote-image
+  cancel-in-progress: false
+
+jobs:
+  promote:
+    name: terraform apply image_tag=${{ inputs.image_tag }}
+    # Only run in the canonical repository; forks skip the job entirely.
+    if: github.repository == 'Tracer-Cloud/opensre'
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+
+    env:
+      # Single source of truth for the region: exported to every step's
+      # shell and reused by the credentials step below.
+      AWS_REGION: us-east-1
+
+    steps:
+      - name: Verify required repo variables
+        # Fail with a readable error before attempting AWS auth; an empty
+        # AWS_ACCOUNT_ID would otherwise produce a malformed role ARN.
+        env:
+          AWS_ACCOUNT_ID: ${{ vars.AWS_ACCOUNT_ID }}
+        run: |
+          if [ -z "${AWS_ACCOUNT_ID:-}" ]; then
+            echo "::error::Missing repo variable AWS_ACCOUNT_ID. See AWS_BENCH_SETUP.md."
+            exit 1
+          fi
+
+      - uses: actions/checkout@v5
+
+      - name: Configure AWS credentials (OIDC role assumption)
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::${{ vars.AWS_ACCOUNT_ID }}:role/opensre-bench-github-actions
+          role-session-name: bench-promote-${{ github.run_id }}
+          # Reuse the job-level AWS_REGION instead of repeating the literal,
+          # so the region cannot drift between the two places.
+          aws-region: ${{ env.AWS_REGION }}
+
+      - name: Verify image tag exists in ECR
+        # Fail loudly if the operator typos the tag, before Terraform tries
+        # to register a task definition pointing at a missing image.
+        #
+        # stderr is captured (stdout is discarded) so that a missing tag can
+        # be told apart from every other failure — AccessDenied, a missing
+        # repository, throttling, network errors. Only ImageNotFoundException
+        # produces the "push it first" hint; anything else prints the real
+        # AWS CLI error instead of a misleading "not found".
+        env:
+          IMAGE_TAG: ${{ inputs.image_tag }}
+        run: |
+          if ! err=$(aws ecr describe-images \
+                --repository-name opensre-bench \
+                --image-ids imageTag="$IMAGE_TAG" \
+                2>&1 >/dev/null); then
+            case "$err" in
+              *ImageNotFoundException*)
+                echo "::error::Image tag $IMAGE_TAG not found in ECR repo opensre-bench."
+                echo "::error::Push it first via 'Benchmark image — build + push to ECR'."
+                ;;
+              *)
+                echo "::error::Could not look up image tag $IMAGE_TAG in ECR repo opensre-bench (this is not a missing-tag error; see the AWS CLI output below)."
+                printf '%s\n' "$err"
+                ;;
+            esac
+            exit 1
+          fi
+
+      # Install a pinned Terraform CLI so every promotion uses the same version.
+      - uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: 1.7.5
+
+      - name: Terraform init
+        # -input=false: never block on an interactive prompt in CI.
+        working-directory: infra/bench
+        run: terraform init -input=false
+
+      - name: Terraform apply
+        # Plan is captured in the workflow log; review it in the run page.
+        # -auto-approve is intentional — this workflow IS the human approval
+        # (the operator triggered it manually with a specific tag).
+        # The tag is passed through the IMAGE_TAG environment variable rather
+        # than interpolated into the script text, so the input is never
+        # parsed as shell code.
+        working-directory: infra/bench
+        env:
+          IMAGE_TAG: ${{ inputs.image_tag }}
+        run: |
+          terraform apply -input=false -auto-approve -var="image_tag=$IMAGE_TAG"
+
+      - name: Surface the new task definition revision in the job summary
+        # Reads the `task_definition_arn` Terraform output, asks ECS which
+        # image the first container of that task definition points at, and
+        # appends both (plus the requested tag) to the run's summary page so
+        # the operator can confirm the promotion without reading the log.
+        working-directory: infra/bench
+        env:
+          IMAGE_TAG: ${{ inputs.image_tag }}
+        run: |
+          TASK_DEF_ARN=$(terraform output -raw task_definition_arn)
+          IMAGE_URI=$(aws ecs describe-task-definition \
+            --task-definition "$TASK_DEF_ARN" \
+            --query 'taskDefinition.containerDefinitions[0].image' \
+            --output text)
+          {
+            echo "## Image promoted"
+            echo ""
+            echo "- Promoted tag: \`$IMAGE_TAG\`"
+            echo "- New task definition ARN: \`$TASK_DEF_ARN\`"
+            echo "- Image now in task definition: \`$IMAGE_URI\`"
+            echo ""
+            echo "### Next step"
+            echo ""
+            echo "Trigger **Benchmark — run on Fargate** to launch a run against this image."
+          } >> "$GITHUB_STEP_SUMMARY"
@@ -60,6 +60,34 @@ jobs:
       AWS_REGION: us-east-1
 
     steps:
+      - name: Verify required repo variables
+        # Fail loudly BEFORE the AWS auth step if any required repo variable
+        # is missing. Without this, an unset var would surface downstream as
+        # an opaque error like "Task Definition can not be blank" from
+        # describe-task-definition. See AWS_BENCH_SETUP.md step 4 for how
+        # to set these.
+        #
+        # The `required` array in the script is the single list of names:
+        # both the check loop and the count in the success message are
+        # derived from it, so adding a variable means touching one place
+        # (plus the env mapping below).
+        env:
+          AWS_ACCOUNT_ID: ${{ vars.AWS_ACCOUNT_ID }}
+          BENCH_ECS_CLUSTER: ${{ vars.BENCH_ECS_CLUSTER }}
+          BENCH_TASK_DEFINITION_FAMILY: ${{ vars.BENCH_TASK_DEFINITION_FAMILY }}
+          BENCH_SUBNET_IDS: ${{ vars.BENCH_SUBNET_IDS }}
+          BENCH_SECURITY_GROUP_ID: ${{ vars.BENCH_SECURITY_GROUP_ID }}
+        run: |
+          required=(AWS_ACCOUNT_ID BENCH_ECS_CLUSTER BENCH_TASK_DEFINITION_FAMILY
+                    BENCH_SUBNET_IDS BENCH_SECURITY_GROUP_ID)
+          missing=()
+          for var in "${required[@]}"; do
+            if [ -z "${!var:-}" ]; then
+              missing+=("$var")
+            fi
+          done
+          if [ "${#missing[@]}" -gt 0 ]; then
+            echo "::error::Missing repo variable(s): ${missing[*]}"
+            echo "::error::Set them under Settings → Secrets and variables → Actions → Variables."
+            echo "::error::Values come from \`cd infra/bench && terraform output\` — see tests/benchmarks/AWS_BENCH_SETUP.md step 4."
+            exit 1
+          fi
+          echo "All ${#required[@]} required repo variables are set."
+
       - name: Configure AWS credentials (OIDC role assumption)
         uses: aws-actions/configure-aws-credentials@v4
         with:
 
@@ -25,6 +25,18 @@
 _TOOL_EXECUTOR_WORKERS = 10
 _UNSET: object = object()  # sentinel distinguishing "not yet started" from a None tool result
 
+# Defensive context-window ceiling. Below this we never trim; above this we
+# drop the oldest tool_use/tool_result pair until back under the ceiling.
+#
+# Anthropic's 200k prompt limit is the hard cap. The estimator at
+# ``_estimate_message_tokens`` covers messages + system + tool schemas
+# (all three count toward the limit). 170k ceiling leaves ~30k headroom
+# for the response. ratio=0.40 absorbs JSON-structural overhead in tool
+# payloads — empirically tuned from overflow logs where Anthropic landed
+# at 0.32–0.40 tokens/char for opensre's tool-result mix.
+_TOKEN_BUDGET_CEILING = 170_000
+_TOKENS_PER_CHAR = 0.40
+
 # Maps alert_source → tool source keys. Tools from these sources are auto-called
 # before the LLM loop starts when the alert source is known.
 _ALERT_SOURCE_TO_TOOL_SOURCES: dict[str, list[str]] = {
@@ -167,6 +179,7 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None:
         for iteration in range(MAX_INVESTIGATION_LOOPS):
             logger.debug("[agent] iteration=%d", iteration)
             _emit("llm_start", {"iteration": iteration})
+            _enforce_context_budget(messages, system=system, tools=tool_schemas)
             try:
                 response = llm.invoke(messages, system=system, tools=tool_schemas)
 
@@ -263,6 +276,86 @@ def _record_tool_end(tc: ToolCall, output: Any) -> None:
 InvestigationAgent = ConnectedInvestigationAgent
 
 
+def _estimate_message_tokens(
+    messages: list[dict[str, Any]],
+    *,
+    system: str | None = None,
+    tools: list[dict[str, Any]] | None = None,
+) -> int:
+    """Cheap upper-bound token estimate covering everything Anthropic sees.
+
+    Anthropic counts ``messages`` + ``system`` + ``tools`` toward the 200k
+    prompt limit. Earlier versions counted only ``messages`` and trimmed
+    aggressively while system + tools (tens of thousands of tokens for
+    opensre's 100+ tool registry) silently pushed us over the line.
+    """
+    total = 0
+    for message in messages:
+        content = message.get("content", "")
+        if isinstance(content, str):
+            total += int(len(content) * _TOKENS_PER_CHAR)
+        elif isinstance(content, list):
+            for block in content:
+                if isinstance(block, dict):
+                    total += int(len(json.dumps(block, default=str)) * _TOKENS_PER_CHAR)
+                elif isinstance(block, str):
+                    total += int(len(block) * _TOKENS_PER_CHAR)
+    if system:
+        total += int(len(system) * _TOKENS_PER_CHAR)
+    if tools:
+        for schema in tools:
+            total += int(len(json.dumps(schema, default=str)) * _TOKENS_PER_CHAR)
+    return total
+
+
+def _trim_oldest_tool_pair(messages: list[dict[str, Any]]) -> bool:
+    """Drop the oldest assistant tool_use message together with the
+    immediate next user message carrying its tool_results. Anthropic
+    requires every ``tool_use`` block to be followed by a matching
+    ``tool_result`` block, so the pair must be removed together to keep
+    the conversation valid.
+
+    Returns True if a pair was dropped, False if nothing trimmable
+    remains (e.g. only the initial user prompt is left).
+    """
+    for index, message in enumerate(messages):
+        if message.get("role") != "assistant":
+            continue
+        content = message.get("content", [])
+        if not isinstance(content, list):
+            continue
+        has_tool_use = any(
+            isinstance(block, dict) and block.get("type") == "tool_use" for block in content
+        )
+        if not has_tool_use:
+            continue
+        # Drop the assistant turn AND its paired user turn (the tool_results).
+        # If the user turn isn't present (e.g. truncated mid-iteration),
+        # del messages[i:i+2] safely just drops the assistant turn.
+        del messages[index : index + 2]
+        return True
+    return False
+
+
+def _enforce_context_budget(
+    messages: list[dict[str, Any]],
+    *,
+    system: str | None = None,
+    tools: list[dict[str, Any]] | None = None,
+) -> None:
+    """Trim oldest tool pairs until prompt fits under the budget ceiling.
+
+    No-op on the happy path: the estimate covers messages + system + tools
+    in one pass and returns under the ceiling for normal investigations.
+    Only fires on long CloudOpsBench cases where unbounded tool history
+    has pushed the prompt past the model's limit.
+    """
+    while _estimate_message_tokens(messages, system=system, tools=tools) > _TOKEN_BUDGET_CEILING:
+        if not _trim_oldest_tool_pair(messages):
+            return
+        logger.warning("[agent] trimmed oldest tool pair to fit context budget")
+
+
 def _degraded_investigation_from_llm_failure(
     failure: LLMInvokeFailure,
     *,
 
@@ -64,12 +64,25 @@ def _looks_like_timeout(exc: BaseException) -> bool:
 
 
 def classify_llm_invoke_failure(exc: BaseException) -> LLMInvokeFailure | None:
-    """Return a structured failure when *exc* is a known operational LLM error."""
+    """Return a structured failure when *exc* is a known operational LLM error.
+
+    Returns ``None`` to signal the caller should re-raise. In particular,
+    :class:`LLMCreditExhaustedError` is intentionally NOT classified — it
+    represents a non-recoverable billing condition that the bench runner
+    (and production agent) must halt on, not wrap into a degraded result.
+    """
     from app.integrations.llm_cli.errors import (
         CLIAuthenticationRequired,
         CLIInterruptedError,
         CLITimeoutError,
     )
+    from app.utils.llm_retry import LLMCreditExhaustedError
+
+    # Fatal — propagate to the runner / operator. Do NOT wrap into the
+    # generic "rate-limited" classification (which the text branch below
+    # would otherwise match against "credit balance too low" / "quota").
+    if isinstance(exc, LLMCreditExhaustedError):
+        return None
 
     if isinstance(exc, CLIAuthenticationRequired):
         return LLMInvokeFailure(