Skip to content

Commit 7fc98d0

Browse files
authored
Merge branch 'main' into khazic/fix/vlm-packed-mrope-position-ids
2 parents 768bb46 + 3d437a9 commit 7fc98d0

10 files changed

Lines changed: 830 additions & 40 deletions

File tree

.github/workflows/cicd-approve-test-queue.yml

Lines changed: 145 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,18 @@ name: Approve Test Queue
1616

1717
on:
1818
schedule:
19-
- cron: '*/5 * * * *' # Runs every 5 minutes
20-
workflow_dispatch: # Allows manual triggering
19+
- cron: "*/5 * * * *" # Runs every 5 minutes
20+
workflow_dispatch: # Allows manual triggering
2121

2222
jobs:
2323
approve-queue:
2424
runs-on: ubuntu-latest
2525
environment: main
26+
if: github.repository == 'NVIDIA-NeMo/Automodel'
27+
strategy:
28+
matrix:
29+
branch: [main, others, workflow_dispatch]
30+
contributor_type: [internal, external]
2631
steps:
2732
- name: Checkout repository
2833
uses: actions/checkout@v6
@@ -37,22 +42,53 @@ jobs:
3742
python -m pip install --upgrade pip
3843
pip install requests
3944
45+
- name: Download SSO users list
46+
run: |
47+
gh release download v0.1.0 \
48+
--repo NVIDIA-GitHub-Management/github-audits \
49+
--pattern users_sso.json \
50+
--output users_sso.json || echo '{}' > users_sso.json
51+
env:
52+
GH_TOKEN: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
53+
4054
- name: Approve waiting deployments
4155
env:
4256
GITHUB_TOKEN: ${{ secrets.PAT }}
4357
MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
58+
MAX_CONCURRENCY_EXTERNAL: ${{ vars.MAX_CONCURRENCY_EXTERNAL || 3 }}
59+
MAX_CONCURRENCY_WORKFLOW_DISPATCH: ${{ vars.MAX_CONCURRENCY || 1 }}
60+
CONTRIBUTOR_TYPE: ${{ matrix.contributor_type }}
61+
MATRIX_BRANCH: ${{ matrix.branch }}
62+
SSO_USERS_FILE: users_sso.json
63+
PYTHONUNBUFFERED: 1
64+
shell: python
4465
run: |
45-
python - <<EOF
4666
import os
67+
import json
4768
import requests
69+
import re
4870
import time
4971
50-
5172
# GitHub API configuration
5273
GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
5374
REPO = os.environ["GITHUB_REPOSITORY"]
54-
MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"])
55-
API_BASE = f"https://api.github.com/repos/{REPO}"
75+
CONTRIBUTOR_TYPE = os.environ["CONTRIBUTOR_TYPE"]
76+
MATRIX_BRANCH = os.environ["MATRIX_BRANCH"]
77+
if MATRIX_BRANCH == "workflow_dispatch":
78+
MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY_WORKFLOW_DISPATCH"])
79+
API_BASE = f"https://api.github.com/repos/{REPO}"
80+
WORKFLOW_NAME = "CICD NeMo"
81+
else:
82+
if CONTRIBUTOR_TYPE == "external":
83+
MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY_EXTERNAL"])
84+
else:
85+
MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"])
86+
API_BASE = "https://api.github.com/repos/NVIDIA-NeMo/Automodel"
87+
WORKFLOW_NAME = "CICD NeMo"
88+
89+
# Load SSO users for internal/external classification
90+
with open(os.environ["SSO_USERS_FILE"]) as f:
91+
sso_users = json.load(f)
5692
5793
# Headers for GitHub API
5894
headers = {
@@ -94,7 +130,91 @@ jobs:
94130
print(f"Max retries ({max_retries}) exceeded for {endpoint}")
95131
return None
96132
133+
def is_internal_contributor(pr_info):
134+
"""Return True if the PR author is a member of NVIDIA or NVIDIA-NeMo org (is_org_member)."""
135+
login = pr_info.get("user", {}).get("login", "")
136+
org_roles = sso_users.get(login, {}).get("org_roles", [])
137+
return any(role in ("NVIDIA:Member", "NVIDIA-NeMo:Member") for role in org_roles)
138+
139+
def get_pr_base_branch(workflow_run):
140+
"""
141+
Return the base branch of the PR associated with a workflow run, or None.
142+
Extracts PR number from head branch like 'pull-request/1913' and fetches PR info.
143+
Returns (base_branch, pr_info) tuple, or (None, None) if not a PR run.
144+
"""
145+
print(workflow_run.get("head_branch", ""))
146+
head_branch = workflow_run.get("head_branch", "")
147+
match = re.match(r"pull-request/(\d+)", head_branch)
148+
if not match:
149+
return None, None # Not a PR branch pattern
150+
151+
pr_number = int(match.group(1))
152+
153+
# Fetch PR info from GitHub API
154+
pr_info = make_request(f"pulls/{pr_number}")
155+
if not pr_info:
156+
print(f"Failed to fetch PR #{pr_number}")
157+
return None, None
158+
159+
base_branch = pr_info.get("base", {}).get("ref")
160+
return base_branch, pr_info
161+
162+
def is_internal_actor(workflow_run):
163+
"""Return True if the actor who triggered the workflow run is an NVIDIA/NVIDIA-NeMo member."""
164+
login = (workflow_run.get("triggering_actor") or workflow_run.get("actor") or {}).get("login", "")
165+
org_roles = sso_users.get(login, {}).get("org_roles", [])
166+
return any(role in ("NVIDIA:Member", "NVIDIA-NeMo:Member") for role in org_roles)
167+
168+
def is_pr_run(workflow_run):
169+
"""Return True if this run was triggered by a PR (head_branch matches pull-request/<number>)."""
170+
return bool(re.match(r"pull-request/\d+", workflow_run.get("head_branch", "")))
171+
172+
def is_workflow_dispatch_run(workflow_run):
173+
"""Return True if this run was manually triggered (head_branch starts with mcore-testing-)."""
174+
return workflow_run.get("head_branch", "").startswith("mcore-testing-")
175+
176+
def matches_queue(workflow_run, target_branch, contributor_type):
177+
"""
178+
Return True if the workflow run belongs to this queue cell:
179+
matching target branch AND matching contributor type (internal/external).
180+
181+
workflow_dispatch runs (head_branch: mcore-testing-*) are routed to the 'workflow_dispatch' queue only.
182+
PR runs (head_branch: pull-request/<n>) are routed to 'main' or 'others' queues only.
183+
"""
184+
if target_branch == "workflow_dispatch":
185+
if not is_workflow_dispatch_run(workflow_run):
186+
return False
187+
internal = is_internal_actor(workflow_run)
188+
contributor_match = (contributor_type == "internal") == internal
189+
if contributor_match:
190+
actor = (workflow_run.get("triggering_actor") or workflow_run.get("actor") or {}).get("login", "unknown")
191+
print(f"workflow_dispatch run by {actor}, contributor_type={contributor_type} (internal={internal})")
192+
return contributor_match
193+
194+
# PR queue: skip non-PR runs
195+
if not is_pr_run(workflow_run):
196+
return False
197+
198+
base_branch, pr_info = get_pr_base_branch(workflow_run)
199+
if base_branch is None:
200+
return False
201+
202+
branch_match = (
203+
(base_branch == target_branch) or
204+
(base_branch != "main" and base_branch != "dev" and target_branch == "others")
205+
)
206+
if not branch_match:
207+
return False
208+
209+
pr_number = re.match(r"pull-request/(\d+)", workflow_run.get("head_branch", "")).group(1)
210+
internal = is_internal_contributor(pr_info)
211+
contributor_match = (contributor_type == "internal") == internal
212+
if branch_match and contributor_match:
213+
print(f"PR #{pr_number} targets {target_branch}, contributor_type={contributor_type} (internal={internal})")
214+
return branch_match and contributor_match
215+
97216
# Get current running and queued workflows
217+
print(f"\n=== Queue cell: branch=${{ matrix.branch }}, contributor_type={CONTRIBUTOR_TYPE} ===")
98218
print("Fetching workflow runs...")
99219
queued_resp = make_request("actions/runs?status=queued")
100220
if queued_resp is None:
@@ -107,13 +227,25 @@ jobs:
107227
exit(1)
108228
in_progress_workflow_runs = in_progress_resp.get("workflow_runs", [])
109229
230+
def log_and_filter(runs, label):
231+
cicd_runs = [r for r in runs if r["name"] == WORKFLOW_NAME]
232+
print(f"{label}: {len(runs)} total, {len(cicd_runs)} {WORKFLOW_NAME}")
233+
for r in cicd_runs:
234+
actor = (r.get("triggering_actor") or r.get("actor") or {}).get("login", "unknown")
235+
matched = matches_queue(r, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)
236+
print(f" run={r['id']} head_branch={r.get('head_branch')} event={r.get('event')} actor={actor} -> matched={matched}")
237+
return [r for r in cicd_runs if matches_queue(r, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)]
238+
239+
queued_workflow_runs = log_and_filter(queued_workflow_runs, "queued")
240+
in_progress_workflow_runs = log_and_filter(in_progress_workflow_runs, "in_progress")
241+
110242
# Count running and queued workflows
111-
queued_workflows = sum(1 for run in queued_workflow_runs if run["name"] == "CICD NeMo")
112-
in_progress_workflows = sum(1 for run in in_progress_workflow_runs if run["name"] == "CICD NeMo")
243+
queued_workflows = len(queued_workflow_runs)
244+
in_progress_workflows = len(in_progress_workflow_runs)
113245
114246
total_workflows = queued_workflows + in_progress_workflows
115-
print(f"Current queued workflows: {queued_workflows}")
116-
print(f"Current running workflows: {in_progress_workflows}")
247+
print(f"Current queued workflows (PRs targeting ${{ matrix.branch }}, {CONTRIBUTOR_TYPE}): {queued_workflows}")
248+
print(f"Current running workflows (PRs targeting ${{ matrix.branch }}, {CONTRIBUTOR_TYPE}): {in_progress_workflows}")
117249
print(f"Total workflows: {total_workflows}")
118250
print(f"Max concurrency: {MAX_CONCURRENCY}")
119251
@@ -122,20 +254,19 @@ jobs:
122254
exit(0)
123255
124256
# Get waiting CI workflows for test environment
125-
print("Fetching deployments...")
257+
print("Fetching waiting deployments...")
126258
waiting_resp = make_request("actions/runs?status=waiting")
127259
if waiting_resp is None:
128260
print("Failed to fetch waiting workflow runs after retries, exiting")
129261
exit(1)
130-
pending_workflows = waiting_resp.get("workflow_runs", [])
131-
pending_workflows = [run for run in pending_workflows if run["name"] == "CICD NeMo"]
262+
pending_workflows = log_and_filter(waiting_resp.get("workflow_runs", []), "waiting")
132263
133264
# Sort deployments by creation date (oldest first)
134265
print("Sorting workflows...")
135266
pending_workflows = sorted(pending_workflows, key=lambda x: x["created_at"])
136267
137268
# Process each deployment
138-
print("Processing ...")
269+
print(f"Processing {len(pending_workflows)} pending workflows...")
139270
for workflow in pending_workflows:
140271
if total_workflows >= MAX_CONCURRENCY:
141272
print("Maximum concurrency reached, stopping approvals")
@@ -166,8 +297,6 @@ jobs:
166297
else:
167298
print(f"Failed to approve deployment {deployment['id']}")
168299
exit(1)
169-
170-
EOF
171300
notify:
172301
if: failure()
173302
runs-on: ubuntu-latest

.github/workflows/cicd-main.yml

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -611,6 +611,7 @@ jobs:
611611
(
612612
(needs.pre-flight.outputs.is_ci_workload == 'true' && !failure())
613613
|| success()
614+
|| (needs.pre-flight.outputs.is_member != 'true' && needs.Nemo_CICD_Test.result == 'success')
614615
)
615616
&& !cancelled()
616617
&& vars.ENABLE_CODECOV == 'true'
@@ -664,20 +665,3 @@ jobs:
664665
path: |
665666
.coverage
666667
include-hidden-files: true
667-
668-
cleanup-taint-node:
669-
runs-on: ${{ needs.pre-flight.outputs.runner_prefix }}
670-
needs:
671-
- pre-flight
672-
- Nemo_CICD_Test
673-
- Coverage
674-
- Coverage_Fake
675-
if: |
676-
always()
677-
&& !cancelled()
678-
&& needs.pre-flight.outputs.is_deployment_workflow != 'true'
679-
steps:
680-
- name: Taint node for cleanup
681-
if: contains(runner.name, 'ephemeral')
682-
shell: bash
683-
run: taint-node.sh

docs/conf.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,17 @@
2020
# -- Project information -----------------------------------------------------
2121
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
2222

23+
import datetime
2324
import os
2425
import sys
2526

2627
# flake8: noqa
2728
# pylint: skip-file
2829

30+
# Embed a build timestamp so every docs build produces unique content,
31+
# ensuring Akamai ECCU revalidate detects changed ETags on S3.
32+
build_timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
33+
2934
project = "NeMo-AutoModel"
3035
copyright = "2026, NVIDIA Corporation"
3136
author = "NVIDIA Corporation"
@@ -105,7 +110,8 @@
105110
"version_match": release,
106111
},
107112
"extra_head": {
108-
"""
113+
f"""
114+
<meta name="build-timestamp" content="{build_timestamp}">
109115
<script src="https://assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js" ></script>
110116
"""
111117
},

0 commit comments

Comments
 (0)