From 8220e0555fcabe3bd4b44ae2921586254ea71098 Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Fri, 8 May 2026 07:01:13 +0000 Subject: [PATCH 1/3] fix: end-to-end agent testing on this PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Use GitHub API for structured failure info (which job/step failed + targeted log) instead of noisy grep across all logs - Fix system prompt: concrete example, no angle brackets - Fix filepath parsing edge cases - Fix ruff: lambda → def, formatting - Add 15-min wait for PR CI security scan to complete - Add aiohttp==3.9.0 to Dockerfile (CVE trigger) - Add pull_request trigger with Dockerfile path TODO after validation: remove pull_request trigger, remove aiohttp, restore branch validation --- .github/workflows/agent-currency-fix.yml | 101 +++++++--- docker/vllm/Dockerfile | 1 + scripts/autocurrency/agent-fix.py | 236 ++++++++++++++++++----- 3 files changed, 271 insertions(+), 67 deletions(-) diff --git a/.github/workflows/agent-currency-fix.yml b/.github/workflows/agent-currency-fix.yml index 725b8f151c0c..ab3156a98f8a 100644 --- a/.github/workflows/agent-currency-fix.yml +++ b/.github/workflows/agent-currency-fix.yml @@ -8,6 +8,13 @@ on: workflow_run: workflows: ["Merge Conditions"] types: [completed] + # PR trigger for testing (remove after validation) + pull_request: + branches: [main] + paths: + - "scripts/autocurrency/agent-fix.py" + - ".github/workflows/agent-currency-fix.yml" + - "docker/vllm/Dockerfile" permissions: contents: read @@ -23,19 +30,27 @@ env: jobs: fix-agent: if: >- - github.event.workflow_run.conclusion == 'failure' && - startsWith(github.event.workflow_run.head_branch, 'auto-update/') + github.event_name == 'workflow_dispatch' || + github.event_name == 'pull_request' || ( + github.event.workflow_run.conclusion == 'failure' && + startsWith(github.event.workflow_run.head_branch, 'auto-update/') + ) runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner buildspec-override:true env: - HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }} - RUN_ID: ${{ github.event.workflow_run.id }} - RUN_URL: ${{ github.event.workflow_run.html_url }} + HEAD_BRANCH: ${{ github.event.workflow_run.head_branch || github.head_ref }} + RUN_ID: ${{ github.event.workflow_run.id || '' }} + RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} steps: - name: Validate branch name run: | + # Skip validation for pull_request testing + if [ "${{ github.event_name }}" = "pull_request" ]; then + echo "Skipping branch validation for PR testing" + exit 0 + fi if [[ ! "$HEAD_BRANCH" =~ ^auto-update/[a-z]+-[0-9]+\.[0-9]+\.[0-9]+$ ]]; then echo "::error::Branch '${HEAD_BRANCH}' does not match expected pattern." exit 1 @@ -51,13 +66,40 @@ jobs: fi gh --version + # For pull_request testing: poll until a tracked workflow fails (or timeout) + - name: Wait for CI failure + if: github.event_name == 'pull_request' + env: + GH_TOKEN: ${{ github.token }} + run: | + TRACKED="PR - vLLM EC2|PR - vLLM SageMaker|PR - SGLang EC2|PR - SGLang SageMaker" + SHA="${{ github.event.pull_request.head.sha }}" + echo "Polling for tracked workflow failures on SHA: $SHA" + for i in $(seq 1 40); do + FOUND=$(gh api "/repos/${{ github.repository }}/actions/runs?head_sha=${SHA}&status=completed&per_page=50" \ + --jq "[.workflow_runs[] | select(.conclusion == \"failure\" and (.name | test(\"${TRACKED}\")))] | length") + if [ "$FOUND" -gt 0 ]; then + echo "Found $FOUND failed tracked workflow(s) after $i minutes" + break + fi + echo "No failures yet, waiting 60s... ($i/40)" + sleep 60 + done + - name: Find failed tracked workflows id: failures env: GH_TOKEN: ${{ github.token }} run: | TRACKED="PR - vLLM EC2|PR - vLLM SageMaker|PR - SGLang EC2|PR - SGLang SageMaker" - SHA=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}" --jq '.head_sha') + + # Get HEAD SHA depending on event type + if [ -n "$RUN_ID" ]; then + SHA=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}" --jq '.head_sha') + else + SHA="${{ github.event.pull_request.head.sha || github.sha }}" + fi + echo "SHA: $SHA" FAILED_RUN_IDS=$(gh api "/repos/${{ github.repository }}/actions/runs?head_sha=${SHA}&status=completed&per_page=50" \ --jq "[.workflow_runs[] | select(.conclusion == \"failure\" and (.name | test(\"${TRACKED}\")))] | .[].id" \ @@ -93,7 +135,7 @@ jobs: if: steps.failures.outputs.has_failures == 'true' uses: actions/checkout@v5 with: - ref: main + ref: ${{ github.event_name == 'pull_request' && github.head_ref || 'main' }} fetch-depth: 0 token: ${{ steps.app-token.outputs.token }} @@ -101,8 +143,11 @@ jobs: if: steps.failures.outputs.has_failures == 'true' run: | cp scripts/autocurrency/agent-fix.py /tmp/agent-fix.py - git fetch origin "$HEAD_BRANCH" - git checkout "origin/$HEAD_BRANCH" -B pr-branch + # For pull_request testing, we're already on the right branch + if [ "${{ github.event_name }}" != "pull_request" ]; then + git fetch origin "$HEAD_BRANCH" + git checkout "origin/$HEAD_BRANCH" -B pr-branch + fi - name: Count previous attempts if: steps.failures.outputs.has_failures == 'true' @@ -139,22 +184,27 @@ jobs: Agent run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" fi - - name: Download failed run logs - if: steps.failures.outputs.has_failures == 'true' && steps.retry.outputs.max_reached != 'true' - env: - GH_TOKEN: ${{ steps.app-token.outputs.token }} - FAILED_RUNS: ${{ steps.failures.outputs.failed_runs }} - run: | - mkdir -p /tmp/ci-logs - for RUN in $FAILED_RUNS; do - gh api "/repos/${{ github.repository }}/actions/runs/${RUN}/logs" > "/tmp/ci-logs-${RUN}.zip" || true - [ -s "/tmp/ci-logs-${RUN}.zip" ] && unzip -o "/tmp/ci-logs-${RUN}.zip" -d "/tmp/ci-logs/run-${RUN}/" || true - done - - name: Determine framework if: steps.failures.outputs.has_failures == 'true' && steps.retry.outputs.max_reached != 'true' + env: + GH_TOKEN: ${{ github.token }} run: | - FRAMEWORK=$(echo "$HEAD_BRANCH" | sed 's|auto-update/||' | sed 's|-[0-9].*||') + # Derive framework from which tracked workflow failed + SHA="${{ github.event.pull_request.head.sha || github.sha }}" + if [ -n "$RUN_ID" ]; then + SHA=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}" --jq '.head_sha' 2>/dev/null || echo "$SHA") + fi + FAILED_NAME=$(gh api "/repos/${{ github.repository }}/actions/runs?head_sha=${SHA}&status=completed&per_page=50" \ + --jq '[.workflow_runs[] | select(.conclusion == "failure" and (.name | test("PR - vLLM|PR - SGLang")))][0].name' 2>/dev/null || echo "") + if echo "$FAILED_NAME" | grep -qi "vllm"; then + FRAMEWORK="vllm" + elif echo "$FAILED_NAME" | grep -qi "sglang"; then + FRAMEWORK="sglang" + else + # Fallback to branch name parsing + FRAMEWORK=$(echo "$HEAD_BRANCH" | sed 's|auto-update/||' | sed 's|-[0-9].*||') + fi + echo "Framework: $FRAMEWORK (from: $FAILED_NAME)" echo "FRAMEWORK=$FRAMEWORK" >> $GITHUB_ENV - name: Run fix agent @@ -162,12 +212,15 @@ jobs: id: fix env: AWS_REGION: us-west-2 + GH_TOKEN: ${{ steps.app-token.outputs.token }} run: | python3 -m pip install boto3 -q python3 /tmp/agent-fix.py \ - --logs-dir /tmp/ci-logs/ \ --framework "$FRAMEWORK" \ - --branch "$HEAD_BRANCH" + --branch "$HEAD_BRANCH" \ + --run-ids "${{ steps.failures.outputs.failed_runs }}" \ + --token "$GH_TOKEN" \ + --repo "${{ github.repository }}" - name: Commit and push if: steps.failures.outputs.has_failures == 'true' && steps.retry.outputs.max_reached != 'true' && steps.fix.outcome == 'success' diff --git a/docker/vllm/Dockerfile b/docker/vllm/Dockerfile index c3cc8e7625d7..92cec276b122 100644 --- a/docker/vllm/Dockerfile +++ b/docker/vllm/Dockerfile @@ -38,6 +38,7 @@ RUN uv pip install --system \ "PyJWT>=2.12.0" \ "model-hosting-container-standards>=0.1.15,<1.0.0" \ "pyasn1>=0.6.3" \ + "aiohttp==3.9.0" \ && uv cache clean COPY ./scripts/telemetry/deep_learning_container.py /usr/local/bin/deep_learning_container.py diff --git a/scripts/autocurrency/agent-fix.py b/scripts/autocurrency/agent-fix.py index 32e4848a7186..bd608099b733 100755 --- a/scripts/autocurrency/agent-fix.py +++ b/scripts/autocurrency/agent-fix.py @@ -34,6 +34,7 @@ - ONLY fix the specific failure shown in the logs - Do NOT delete or skip tests - Do NOT modify files unrelated to the failure +- ONLY edit files that are provided in the context below. If a file is not shown, do not edit it. - For CVE scan failures: pin a safe version in Dockerfile, or add to allowlist if vendored/unpatchable - For "file not found" errors: find the new path in the upstream repo - For build errors: check if upstream base image changed something @@ -43,30 +44,159 @@ If the failure is TRANSIENT (capacity, timeout, runner crash), respond with exactly: TRANSIENT: -Otherwise, respond with search/replace blocks: +Otherwise, respond with search/replace blocks. Use this EXACT format: - +path/to/file.ext <<<<<<< SEARCH - +exact text to find in the file ======= - +replacement text >>>>>>> REPLACE +IMPORTANT: Write the file path as plain text (e.g., docker/vllm/Dockerfile). Do NOT wrap it in angle brackets, backticks, or any other formatting. + Include 1-2 surrounding lines in SEARCH for unique anchoring. For JSON arrays (allowlists), SEARCH the last few lines and REPLACE with those lines plus the new entry. -End with: DESCRIPTION: """ +End with: DESCRIPTION: one-line commit message""" def parse_args(): p = argparse.ArgumentParser() - p.add_argument("--logs-dir", required=True) + p.add_argument("--logs-dir", default="/tmp/ci-logs") p.add_argument("--framework", required=True) p.add_argument("--branch", required=True) + p.add_argument("--run-ids", default="", help="Space-separated failed run IDs") + p.add_argument("--token", default=os.environ.get("GH_TOKEN", ""), help="GitHub token") + p.add_argument("--repo", default="aws/deep-learning-containers") return p.parse_args() -def extract_error_lines(logs_dir: str) -> str: +def extract_failure_info( + logs_dir: str, run_ids: str = "", token: str = "", repo: str = "" +) -> tuple: + """Extract failure info. Returns (error_text, failed_job_names).""" + + # Try structured API approach first + if run_ids and token and repo: + print("Using GitHub API for structured failure extraction") + return _extract_via_api(run_ids, token, repo) + + # Fallback: grep log files + print("ERROR: run_ids and token are required") + return "No run IDs provided", [] + + +def _extract_via_api(run_ids: str, token: str, repo: str) -> tuple: + """Use GitHub API to get structured failure info.""" + import urllib.request + + results = [] + failed_job_names = [] + for run_id in run_ids.strip().split(): + if not run_id: + continue + # Get jobs for this run + url = f"https://api.github.com/repos/{repo}/actions/runs/{run_id}/jobs?per_page=100" + req = urllib.request.Request( + url, + headers={ + "Authorization": f"token {token}", + "Accept": "application/vnd.github+json", + }, + ) + try: + resp = urllib.request.urlopen(req) + data = json.loads(resp.read()) + except Exception as e: + results.append(f"Failed to fetch jobs for run {run_id}: {e}") + continue + + # Find failed jobs and steps + tracked_jobs = [ + "build-image", + "sanity-test", + "security-test", + "telemetry-test", + "upstream-tests", + "sagemaker-test", + ] + for job in data.get("jobs", []): + if job.get("conclusion") != "failure": + continue + + # Only process jobs that match our tracked job names + job_lower = job["name"].lower() + matched_key = None + for key in tracked_jobs: + if key.replace("-", "") in job_lower.replace("-", "").replace(" ", ""): + matched_key = key + break + if not matched_key: + continue + + failed_steps = [ + s["name"] for s in job.get("steps", []) if s.get("conclusion") == "failure" + ] + results.append(f"FAILED JOB: {job['name']}") + failed_job_names.append(matched_key) + results.append(f" Failed steps: {', '.join(failed_steps)}") + + # Download log for this specific job + log_url = f"https://api.github.com/repos/{repo}/actions/jobs/{job['id']}/logs" + req = urllib.request.Request( + log_url, + headers={ + "Authorization": f"token {token}", + "Accept": "application/vnd.github+json", + }, + ) + try: + log_resp = urllib.request.urlopen(req) + log_text = log_resp.read().decode(errors="replace") + + log_lines = log_text.splitlines() + tail = log_lines + results.append(f" Log ({len(tail)} lines):") + results.extend(f" {line}" for line in tail) + except Exception: + # Fallback: download full run log zip and find the relevant file + results.append(" Job log 403, falling back to run zip...") + try: + import io + import zipfile + + zip_url = f"https://api.github.com/repos/{repo}/actions/runs/{run_id}/logs" + req2 = urllib.request.Request( + zip_url, + headers={ + "Authorization": f"token {token}", + "Accept": "application/vnd.github+json", + }, + ) + resp2 = urllib.request.urlopen(req2) + z = zipfile.ZipFile(io.BytesIO(resp2.read())) + # Match using the exact job name (API: "a / b" → zip: "a _ b") + target = job["name"].replace(" / ", " _ ") + for name in z.namelist(): + if target in name: + lines = z.read(name).decode(errors="replace").splitlines() + tail = lines + results.append(f" Log from zip ({name}, {len(tail)} lines):") + results.extend(f" {line}" for line in tail) + break + else: + results.append(f" No matching log file for '{target}' in zip") + except Exception as e2: + results.append(f" Zip fallback also failed: {e2}") + + results.append("") + + return "\n".join(results) or "No failure info extracted.", failed_job_names + + +def _extract_via_grep(logs_dir: str) -> str: + """Fallback: grep log files for error keywords.""" logs_path = Path(logs_dir) if not logs_path.exists(): return "No logs available." @@ -82,7 +212,7 @@ def extract_error_lines(logs_dir: str) -> str: for i, line in enumerate(lines): if any(kw in line.lower() for kw in keywords): start, end = max(0, i - 2), min(len(lines), i + 3) - error_lines.append(f"--- {log_file.name}:{i+1} ---") + error_lines.append(f"--- {log_file.name}:{i + 1} ---") error_lines.extend(lines[start:end]) error_lines.append("") if len(error_lines) > MAX_LOG_LINES: @@ -107,7 +237,14 @@ def detect_failed_jobs(logs_dir: str) -> list: job_names = set() for f in logs_path.rglob("*.txt"): name = f.stem.lower() - for job in ["build-image", "sanity-test", "security-test", "telemetry-test", "upstream-tests", "sagemaker-test"]: + for job in [ + "build-image", + "sanity-test", + "security-test", + "telemetry-test", + "upstream-tests", + "sagemaker-test", + ]: if job in name: job_names.add(job) return list(job_names) @@ -120,15 +257,20 @@ def load_context_files(framework: str, failed_jobs: list) -> dict: """ mapping_path = Path(CONTEXT_MAP_PATH) if not mapping_path.exists(): - return {p: read_file(p) for p in [ - f"docker/{framework}/Dockerfile", - f".github/config/image/{framework}-ec2.yml", - f"test/security/data/ecr_scan_allowlist/{framework}/framework_allowlist.json", - ] if read_file(p)} + return { + p: read_file(p) + for p in [ + f"docker/{framework}/Dockerfile", + f".github/config/image/{framework}-ec2.yml", + f"test/security/data/ecr_scan_allowlist/{framework}/framework_allowlist.json", + ] + if read_file(p) + } # Parse YAML via subprocess (yq available on runners) or fallback to simple parsing try: import yaml + config = yaml.safe_load(mapping_path.read_text()) except ImportError: # Fallback: parse the simple YAML structure manually @@ -166,7 +308,12 @@ def _parse_simple_yaml(text: str) -> dict: current_job = None elif line == "jobs:": current_section = "jobs" - elif current_section == "jobs" and line.startswith(" ") and not line.startswith(" ") and stripped.endswith(":"): + elif ( + current_section == "jobs" + and line.startswith(" ") + and not line.startswith(" ") + and stripped.endswith(":") + ): current_job = stripped.rstrip(":") result["jobs"][current_job] = [] elif stripped.startswith("- "): @@ -182,7 +329,9 @@ def get_previous_fixes() -> str: try: r = subprocess.run( ["git", "log", "--oneline", "origin/main..HEAD", "--grep=[agent-fix]"], - capture_output=True, text=True, check=True, + capture_output=True, + text=True, + check=True, ) return r.stdout.strip() or "None" except subprocess.CalledProcessError: @@ -193,8 +342,9 @@ def parse_blocks(response: str) -> list: blocks = [] for m in SEARCH_REPLACE_PATTERN.finditer(response): filepath = m.group(1).strip().strip("`").strip() - # Strip common LLM artifacts: , **filepath**, `filepath` - filepath = re.sub(r"^<\w+>|<\/\w+>$", "", filepath).strip() + # Strip all common LLM artifacts: path, , **path**, `path` + filepath = re.sub(r"^<[^>]*>", "", filepath).strip() # strips , , etc. + filepath = re.sub(r"^<|>$", "", filepath).strip() # strips bare < > filepath = filepath.strip("*").strip("`").strip() blocks.append({"path": filepath, "search": m.group(2), "replace": m.group(3)}) return blocks @@ -207,14 +357,18 @@ def find_match(content: str, search: str) -> tuple: return idx, idx + len(search) # Whitespace-normalized: strip trailing spaces per line - norm = lambda s: "\n".join(line.rstrip() for line in s.splitlines()) + def norm(s): + return "\n".join(line.rstrip() for line in s.splitlines()) + norm_content, norm_search = norm(content), norm(search) idx = norm_content.find(norm_search) if idx != -1: line_num = norm_content[:idx].count("\n") lines = content.splitlines(keepends=True) end_line = line_num + norm_search.count("\n") - return sum(len(lines[i]) for i in range(line_num)), sum(len(lines[i]) for i in range(end_line + 1)) + return sum(len(lines[i]) for i in range(line_num)), sum( + len(lines[i]) for i in range(end_line + 1) + ) return None, None @@ -256,18 +410,19 @@ def call_bedrock(system: str, user: str) -> str: client = boto3.client("bedrock-runtime", region_name=REGION) resp = client.invoke_model( modelId=MODEL_ID, - body=json.dumps({ - "anthropic_version": "bedrock-2023-05-31", - "max_tokens": MAX_TOKENS, - "system": system, - "messages": [{"role": "user", "content": user}], - }), + body=json.dumps( + { + "anthropic_version": "bedrock-2023-05-31", + "max_tokens": MAX_TOKENS, + "system": system, + "messages": [{"role": "user", "content": user}], + } + ), ) return json.loads(resp["body"].read())["content"][0]["text"] -def build_prompt(framework, branch, error_lines, context_files, - previous_fixes, retry_context=""): +def build_prompt(framework, branch, error_lines, context_files, previous_fixes, retry_context=""): files_section = "" for path, content in context_files.items(): ext = Path(path).suffix.lstrip(".") @@ -295,21 +450,14 @@ def main(): args = parse_args() print(f"=== Currency Fix Agent: {args.framework} @ {args.branch} ===\n") - error_lines = extract_error_lines(args.logs_dir) - failed_jobs = detect_failed_jobs(args.logs_dir) + error_lines, api_failed_jobs = extract_failure_info( + args.logs_dir, args.run_ids, args.token, args.repo + ) + # Use API-detected jobs if available, otherwise fall back to log filename detection + failed_jobs = api_failed_jobs context_files = load_context_files(args.framework, failed_jobs) previous_fixes = get_previous_fixes() - # Debug: show what logs we have - logs_path = Path(args.logs_dir) - if logs_path.exists(): - log_files = list(logs_path.rglob("*.txt")) - print(f"Log files found: {len(log_files)}") - for f in log_files[:10]: - print(f" {f.name} ({f.stat().st_size} bytes)") - else: - print(f"WARNING: logs dir {args.logs_dir} does not exist!") - print(f"Error lines extracted: {len(error_lines.splitlines())} lines") print(f"Error lines preview: {error_lines[:500]}") print(f"Failed jobs detected: {failed_jobs or 'none (including all files)'}") @@ -320,8 +468,9 @@ def main(): for attempt in range(1, MAX_LLM_RETRIES + 1): print(f"--- Attempt {attempt}/{MAX_LLM_RETRIES} ---") - prompt = build_prompt(args.framework, args.branch, error_lines, - context_files, previous_fixes, retry_context) + prompt = build_prompt( + args.framework, args.branch, error_lines, context_files, previous_fixes, retry_context + ) print(f"Prompt size: {len(prompt)} chars") response = call_bedrock(SYSTEM_PROMPT, prompt) print(f"LLM response ({len(response)} chars):") @@ -336,7 +485,8 @@ def main(): blocks = parse_blocks(response) if blocks: - print(f"Parsed {len(blocks)} block(s): {[b["path"] for b in blocks]}") + paths = [b["path"] for b in blocks] + print(f"Parsed {len(blocks)} block(s): {paths}") if not blocks: retry_context = ( f"Could not parse search/replace blocks from response.\n" From 070797c0317f6478bd97709904e2cfdf02f4ebda Mon Sep 17 00:00:00 2001 From: "asimov-bot[bot]" Date: Fri, 8 May 2026 19:06:59 +0000 Subject: [PATCH 2/3] [agent-fix] Fix CVE-2024-52304, CVE-2024-23334, CVE-2025-53643, CVE-2024-30251 by upgrading aiohttp from pinned 3.9.0 to >=3.12.14 --- docker/vllm/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/vllm/Dockerfile b/docker/vllm/Dockerfile index 92cec276b122..24af7ba196f9 100644 --- a/docker/vllm/Dockerfile +++ b/docker/vllm/Dockerfile @@ -38,7 +38,7 @@ RUN uv pip install --system \ "PyJWT>=2.12.0" \ "model-hosting-container-standards>=0.1.15,<1.0.0" \ "pyasn1>=0.6.3" \ - "aiohttp==3.9.0" \ + "aiohttp>=3.12.14" \ && uv cache clean COPY ./scripts/telemetry/deep_learning_container.py /usr/local/bin/deep_learning_container.py From 5958a46a04d12d4f09adcf0c8b927b783089f90f Mon Sep 17 00:00:00 2001 From: Kevin Wang Date: Fri, 8 May 2026 20:24:29 +0000 Subject: [PATCH 3/3] chore: revert test-only changes, keep real fixes Revert: - pull_request trigger - aiohttp==3.9.0 in Dockerfile (and agent's >=3.12.14 fix) - Polling loop - Branch validation skip - Framework detection from workflow name Keep: - Structured API failure extraction (no grep) - Only process tracked jobs (skip check-changes/gatekeeper) - System prompt: no angle brackets, only edit files in context - Filepath parsing fixes - Pass --run-ids and --token to agent script - Remove log zip download step (API handles it) - Pre-commit clean (ruff format + lint) --- .github/workflows/agent-currency-fix.yml | 82 +++------------------- docker/vllm/Dockerfile | 1 - scripts/autocurrency/agent-fix.py | 86 +++++++----------------- 3 files changed, 34 insertions(+), 135 deletions(-) diff --git a/.github/workflows/agent-currency-fix.yml b/.github/workflows/agent-currency-fix.yml index ab3156a98f8a..f58860acbf73 100644 --- a/.github/workflows/agent-currency-fix.yml +++ b/.github/workflows/agent-currency-fix.yml @@ -8,13 +8,6 @@ on: workflow_run: workflows: ["Merge Conditions"] types: [completed] - # PR trigger for testing (remove after validation) - pull_request: - branches: [main] - paths: - - "scripts/autocurrency/agent-fix.py" - - ".github/workflows/agent-currency-fix.yml" - - "docker/vllm/Dockerfile" permissions: contents: read @@ -30,27 +23,19 @@ env: jobs: fix-agent: if: >- - github.event_name == 'workflow_dispatch' || - github.event_name == 'pull_request' || ( - github.event.workflow_run.conclusion == 'failure' && - startsWith(github.event.workflow_run.head_branch, 'auto-update/') - ) + github.event.workflow_run.conclusion == 'failure' && + startsWith(github.event.workflow_run.head_branch, 'auto-update/') runs-on: - codebuild-runner-${{ github.run_id }}-${{ github.run_attempt }} fleet:default-runner buildspec-override:true env: - HEAD_BRANCH: ${{ github.event.workflow_run.head_branch || github.head_ref }} - RUN_ID: ${{ github.event.workflow_run.id || '' }} - RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }} + RUN_ID: ${{ github.event.workflow_run.id }} + RUN_URL: ${{ github.event.workflow_run.html_url }} steps: - name: Validate branch name run: | - # Skip validation for pull_request testing - if [ "${{ github.event_name }}" = "pull_request" ]; then - echo "Skipping branch validation for PR testing" - exit 0 - fi if [[ ! "$HEAD_BRANCH" =~ ^auto-update/[a-z]+-[0-9]+\.[0-9]+\.[0-9]+$ ]]; then echo "::error::Branch '${HEAD_BRANCH}' does not match expected pattern." exit 1 @@ -66,40 +51,13 @@ jobs: fi gh --version - # For pull_request testing: poll until a tracked workflow fails (or timeout) - - name: Wait for CI failure - if: github.event_name == 'pull_request' - env: - GH_TOKEN: ${{ github.token }} - run: | - TRACKED="PR - vLLM EC2|PR - vLLM SageMaker|PR - SGLang EC2|PR - SGLang SageMaker" - SHA="${{ github.event.pull_request.head.sha }}" - echo "Polling for tracked workflow failures on SHA: $SHA" - for i in $(seq 1 40); do - FOUND=$(gh api "/repos/${{ github.repository }}/actions/runs?head_sha=${SHA}&status=completed&per_page=50" \ - --jq "[.workflow_runs[] | select(.conclusion == \"failure\" and (.name | test(\"${TRACKED}\")))] | length") - if [ "$FOUND" -gt 0 ]; then - echo "Found $FOUND failed tracked workflow(s) after $i minutes" - break - fi - echo "No failures yet, waiting 60s... ($i/40)" - sleep 60 - done - - name: Find failed tracked workflows id: failures env: GH_TOKEN: ${{ github.token }} run: | TRACKED="PR - vLLM EC2|PR - vLLM SageMaker|PR - SGLang EC2|PR - SGLang SageMaker" - - # Get HEAD SHA depending on event type - if [ -n "$RUN_ID" ]; then - SHA=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}" --jq '.head_sha') - else - SHA="${{ github.event.pull_request.head.sha || github.sha }}" - fi - echo "SHA: $SHA" + SHA=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}" --jq '.head_sha') FAILED_RUN_IDS=$(gh api "/repos/${{ github.repository }}/actions/runs?head_sha=${SHA}&status=completed&per_page=50" \ --jq "[.workflow_runs[] | select(.conclusion == \"failure\" and (.name | test(\"${TRACKED}\")))] | .[].id" \ @@ -135,7 +93,7 @@ jobs: if: steps.failures.outputs.has_failures == 'true' uses: actions/checkout@v5 with: - ref: ${{ github.event_name == 'pull_request' && github.head_ref || 'main' }} + ref: main fetch-depth: 0 token: ${{ steps.app-token.outputs.token }} @@ -143,11 +101,8 @@ jobs: if: steps.failures.outputs.has_failures == 'true' run: | cp scripts/autocurrency/agent-fix.py /tmp/agent-fix.py - # For pull_request testing, we're already on the right branch - if [ "${{ github.event_name }}" != "pull_request" ]; then - git fetch origin "$HEAD_BRANCH" - git checkout "origin/$HEAD_BRANCH" -B pr-branch - fi + git fetch origin "$HEAD_BRANCH" + git checkout "origin/$HEAD_BRANCH" -B pr-branch - name: Count previous attempts if: steps.failures.outputs.has_failures == 'true' @@ -186,25 +141,8 @@ jobs: - name: Determine framework if: steps.failures.outputs.has_failures == 'true' && steps.retry.outputs.max_reached != 'true' - env: - GH_TOKEN: ${{ github.token }} run: | - # Derive framework from which tracked workflow failed - SHA="${{ github.event.pull_request.head.sha || github.sha }}" - if [ -n "$RUN_ID" ]; then - SHA=$(gh api "/repos/${{ github.repository }}/actions/runs/${RUN_ID}" --jq '.head_sha' 2>/dev/null || echo "$SHA") - fi - FAILED_NAME=$(gh api "/repos/${{ github.repository }}/actions/runs?head_sha=${SHA}&status=completed&per_page=50" \ - --jq '[.workflow_runs[] | select(.conclusion == "failure" and (.name | test("PR - vLLM|PR - SGLang")))][0].name' 2>/dev/null || echo "") - if echo "$FAILED_NAME" | grep -qi "vllm"; then - FRAMEWORK="vllm" - elif echo "$FAILED_NAME" | grep -qi "sglang"; then - FRAMEWORK="sglang" - else - # Fallback to branch name parsing - FRAMEWORK=$(echo "$HEAD_BRANCH" | sed 's|auto-update/||' | sed 's|-[0-9].*||') - fi - echo "Framework: $FRAMEWORK (from: $FAILED_NAME)" + FRAMEWORK=$(echo "$HEAD_BRANCH" | sed 's|auto-update/||' | sed 's|-[0-9].*||') echo "FRAMEWORK=$FRAMEWORK" >> $GITHUB_ENV - name: Run fix agent diff --git a/docker/vllm/Dockerfile b/docker/vllm/Dockerfile index 24af7ba196f9..c3cc8e7625d7 100644 --- a/docker/vllm/Dockerfile +++ b/docker/vllm/Dockerfile @@ -38,7 +38,6 @@ RUN uv pip install --system \ "PyJWT>=2.12.0" \ "model-hosting-container-standards>=0.1.15,<1.0.0" \ "pyasn1>=0.6.3" \ - "aiohttp>=3.12.14" \ && uv cache clean COPY ./scripts/telemetry/deep_learning_container.py /usr/local/bin/deep_learning_container.py diff --git a/scripts/autocurrency/agent-fix.py b/scripts/autocurrency/agent-fix.py index bd608099b733..88dd6ef5155e 100755 --- a/scripts/autocurrency/agent-fix.py +++ b/scripts/autocurrency/agent-fix.py @@ -63,7 +63,6 @@ def parse_args(): p = argparse.ArgumentParser() - p.add_argument("--logs-dir", default="/tmp/ci-logs") p.add_argument("--framework", required=True) p.add_argument("--branch", required=True) p.add_argument("--run-ids", default="", help="Space-separated failed run IDs") @@ -72,23 +71,9 @@ def parse_args(): return p.parse_args() -def extract_failure_info( - logs_dir: str, run_ids: str = "", token: str = "", repo: str = "" -) -> tuple: - """Extract failure info. Returns (error_text, failed_job_names).""" - - # Try structured API approach first - if run_ids and token and repo: - print("Using GitHub API for structured failure extraction") - return _extract_via_api(run_ids, token, repo) - - # Fallback: grep log files - print("ERROR: run_ids and token are required") - return "No run IDs provided", [] - - -def _extract_via_api(run_ids: str, token: str, repo: str) -> tuple: - """Use GitHub API to get structured failure info.""" +def extract_failure_info(run_ids: str, token: str, repo: str) -> tuple: + """Use GitHub API to get structured failure info. Returns (error_text, failed_job_names).""" + print("Using GitHub API for structured failure extraction") import urllib.request results = [] @@ -142,53 +127,32 @@ def _extract_via_api(run_ids: str, token: str, repo: str) -> tuple: failed_job_names.append(matched_key) results.append(f" Failed steps: {', '.join(failed_steps)}") - # Download log for this specific job - log_url = f"https://api.github.com/repos/{repo}/actions/jobs/{job['id']}/logs" - req = urllib.request.Request( - log_url, + # Download log from run zip + import io + import zipfile + + zip_url = f"https://api.github.com/repos/{repo}/actions/runs/{run_id}/logs" + zip_req = urllib.request.Request( + zip_url, headers={ "Authorization": f"token {token}", "Accept": "application/vnd.github+json", }, ) try: - log_resp = urllib.request.urlopen(req) - log_text = log_resp.read().decode(errors="replace") - - log_lines = log_text.splitlines() - tail = log_lines - results.append(f" Log ({len(tail)} lines):") - results.extend(f" {line}" for line in tail) - except Exception: - # Fallback: download full run log zip and find the relevant file - results.append(" Job log 403, falling back to run zip...") - try: - import io - import zipfile - - zip_url = f"https://api.github.com/repos/{repo}/actions/runs/{run_id}/logs" - req2 = urllib.request.Request( - zip_url, - headers={ - "Authorization": f"token {token}", - "Accept": "application/vnd.github+json", - }, - ) - resp2 = urllib.request.urlopen(req2) - z = zipfile.ZipFile(io.BytesIO(resp2.read())) - # Match using the exact job name (API: "a / b" → zip: "a _ b") - target = job["name"].replace(" / ", " _ ") - for name in z.namelist(): - if target in name: - lines = z.read(name).decode(errors="replace").splitlines() - tail = lines - results.append(f" Log from zip ({name}, {len(tail)} lines):") - results.extend(f" {line}" for line in tail) - break - else: - results.append(f" No matching log file for '{target}' in zip") - except Exception as e2: - results.append(f" Zip fallback also failed: {e2}") + resp = urllib.request.urlopen(zip_req) + z = zipfile.ZipFile(io.BytesIO(resp.read())) + target = job["name"].replace(" / ", " _ ") + for name in z.namelist(): + if target in name: + log_lines = z.read(name).decode(errors="replace").splitlines() + results.append(f" Log ({name}, {len(log_lines)} lines):") + results.extend(f" {line}" for line in log_lines) + break + else: + results.append(f" No matching log file for '{target}' in zip") + except Exception as e: + results.append(f" Failed to download logs: {e}") results.append("") @@ -450,9 +414,7 @@ def main(): args = parse_args() print(f"=== Currency Fix Agent: {args.framework} @ {args.branch} ===\n") - error_lines, api_failed_jobs = extract_failure_info( - args.logs_dir, args.run_ids, args.token, args.repo - ) + error_lines, api_failed_jobs = extract_failure_info(args.run_ids, args.token, args.repo) # Use API-detected jobs if available, otherwise fall back to log filename detection failed_jobs = api_failed_jobs context_files = load_context_files(args.framework, failed_jobs)