Skip to content

Commit ea1d0b2

Browse files
authored
Merge branch 'main' into hungyueh/modelopt-nvfp4-w4a16
2 parents 9409858 + 1c2085f commit ea1d0b2

17 files changed

Lines changed: 2467 additions & 3 deletions

.github/workflows/claude.yml

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
name: Claude
2+
3+
# Interactive `@claude` assistant. Triggered by mentioning `@claude` in:
4+
# - PR or issue comments
5+
# - PR review comments (line-level review threads)
6+
# - PR review bodies
7+
# - Issue body or title (when the issue is opened)
8+
#
9+
# Restricted to NVIDIA org members / collaborators to prevent abuse from
10+
# external contributors (the repo is public).
11+
#
12+
# Permissions are read-only on `contents`, so Claude can answer questions
13+
# and post comments / suggestions, but cannot push commits. To enable
14+
# commit creation, raise `contents` to `write`.
15+
#
16+
# For deep code review, use `/claude review` (see claude_review.yml).
17+
18+
on:
19+
issue_comment:
20+
types: [created]
21+
pull_request_review_comment:
22+
types: [created]
23+
pull_request_review:
24+
types: [submitted]
25+
issues:
26+
types: [opened, assigned]
27+
28+
jobs:
29+
claude:
30+
name: Claude (interactive)
31+
# Fire only when @claude is mentioned by an NVIDIA org member, owner,
32+
# or collaborator. Skip if the comment is also a `/claude review`
33+
# invocation — that's handled by claude_review.yml.
34+
if: |
35+
(
36+
(github.event_name == 'issue_comment' &&
37+
contains(github.event.comment.body, '@claude') &&
38+
!contains(github.event.comment.body, '/claude review') &&
39+
contains(fromJson('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.comment.author_association)
40+
) ||
41+
(github.event_name == 'pull_request_review_comment' &&
42+
contains(github.event.comment.body, '@claude') &&
43+
contains(fromJson('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.comment.author_association)
44+
) ||
45+
(github.event_name == 'pull_request_review' &&
46+
contains(github.event.review.body, '@claude') &&
47+
contains(fromJson('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.review.author_association)
48+
) ||
49+
(github.event_name == 'issues' &&
50+
(contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude')) &&
51+
contains(fromJson('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)
52+
)
53+
)
54+
runs-on: ubuntu-latest
55+
permissions:
56+
contents: read
57+
pull-requests: write
58+
issues: write
59+
id-token: write
60+
steps:
61+
- name: Checkout repository
62+
uses: actions/checkout@v6
63+
with:
64+
fetch-depth: 1
65+
66+
- name: Run Claude
67+
uses: anthropics/claude-code-action@v1
68+
env:
69+
ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }}
70+
with:
71+
anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
72+
claude_args: |
73+
--allowedTools "mcp__github_inline_comment__create_inline_comment,Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh issue view:*),Bash(gh issue comment:*),Bash(git diff:*),Bash(git show:*),Bash(git log:*),Read,Grep,Glob"
74+
--model "${{ vars.CLAUDE_MODEL }}"
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
name: Claude Code Review
2+
3+
on:
4+
issue_comment:
5+
types: [created]
6+
7+
jobs:
8+
# ──────────────────────────────────────────────────────────────────
9+
# Deep ModelOpt-focused review covering numerical correctness,
10+
# mode/state composition, export compatibility, and backward
11+
# compatibility for saved checkpoints / recipes.
12+
#
13+
# CodeRabbit (.coderabbit.yaml) auto-reviews every PR for routine
14+
# bugs, typos, style, and security anti-patterns — this Claude job
15+
# is on-demand and complements that with deeper architectural
16+
# analysis. Trigger: /claude review
17+
# ──────────────────────────────────────────────────────────────────
18+
review:
19+
name: Claude Review
20+
if: |
21+
github.event_name == 'issue_comment' &&
22+
github.event.issue.pull_request &&
23+
contains(github.event.comment.body, '/claude review') &&
24+
contains(fromJson('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.comment.author_association)
25+
runs-on: ubuntu-latest
26+
permissions:
27+
contents: read
28+
pull-requests: write
29+
issues: write
30+
id-token: write
31+
env:
32+
GH_TOKEN: ${{ github.token }}
33+
REPO: ${{ github.repository }}
34+
PR_NUMBER: ${{ github.event.issue.number }}
35+
steps:
36+
- name: Get PR info
37+
id: pr-info
38+
run: |
39+
PR_DATA=$(gh pr view $PR_NUMBER --repo $REPO --json headRefOid,baseRefName)
40+
echo "sha=$(echo $PR_DATA | jq -r .headRefOid)" >> $GITHUB_OUTPUT
41+
echo "base_ref=$(echo $PR_DATA | jq -r .baseRefName)" >> $GITHUB_OUTPUT
42+
43+
- name: Checkout repository
44+
uses: actions/checkout@v6
45+
with:
46+
fetch-depth: 1
47+
ref: ${{ steps.pr-info.outputs.sha }}
48+
49+
- name: Fetch base branch for diff analysis
50+
run: git fetch origin ${{ steps.pr-info.outputs.base_ref }}
51+
52+
- name: React to trigger comment
53+
run: |
54+
gh api repos/$REPO/issues/comments/${{ github.event.comment.id }}/reactions \
55+
--method POST \
56+
-f content='eyes'
57+
58+
- name: Run Claude Review
59+
uses: anthropics/claude-code-action@v1
60+
env:
61+
ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }}
62+
with:
63+
anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
64+
trigger_phrase: "/claude review"
65+
show_full_output: true
66+
claude_args: |
67+
--allowedTools "mcp__github_inline_comment__create_inline_comment,Bash(gh pr comment:*),Bash(gh pr diff:*),Bash(gh pr view:*),Bash(gh pr review:*),Bash(git diff:*),Bash(git show:*),Bash(git log:*),Read,Grep,Glob"
68+
--model "${{ vars.CLAUDE_MODEL }}"
69+
prompt: |
70+
REPO: ${{ env.REPO }}
71+
PR NUMBER: ${{ env.PR_NUMBER }}
72+
BASE REF: origin/${{ steps.pr-info.outputs.base_ref }}
73+
74+
Mandatory workflow — never skip or reorder:
75+
1. Read the PR diff first (gh pr diff).
76+
2. Read CLAUDE.md and CONTRIBUTING.md for project conventions and architecture.
77+
3. For changed files under `modelopt/torch/<sub-package>/`, read the sub-package's
78+
`__init__.py` plus any `mode.py` / `config.py` to understand mode registration
79+
and config schema.
80+
4. Only then perform the review using that context.
81+
82+
You are performing a deep code review on a **NVIDIA Model Optimizer (ModelOpt)** PR.
83+
ModelOpt is NVIDIA's open-source library for model optimization (quantization, pruning,
84+
distillation, sparsity, speculative decoding, NAS, PEFT) targeting PyTorch / ONNX /
85+
HuggingFace / Megatron, with deployment into TRT-LLM / vLLM / SGLang.
86+
87+
Apply general correctness reasoning to whichever ModelOpt sub-package the diff touches —
88+
you already know the algorithms (quantization formulas, distillation losses, N:M sparsity
89+
mask selection, speculative draft-token alignment, etc.). The prompt below covers the
90+
**ModelOpt-specific structural concerns** that you can't infer from the diff alone.
91+
92+
## Division of labor with CodeRabbit
93+
94+
CodeRabbit (`.coderabbit.yaml`) already auto-reviews every PR with the `chill` profile
95+
and runs a hard pre-merge gate on security anti-patterns. **Do NOT duplicate its work.**
96+
Specifically, do NOT comment on:
97+
- Style, formatting, or naming nits (handled by ruff + CodeRabbit)
98+
- Simple typos in code/comments/strings (CodeRabbit catches these)
99+
- The security anti-patterns enumerated in `.coderabbit.yaml`:
100+
`torch.load(weights_only=False)` without justification, `numpy.load(allow_pickle=True)`
101+
without justification, hardcoded `trust_remote_code=True`, `eval`/`exec` on external
102+
input, `# nosec` bypasses, non-permissive new PIP dependencies — these are already
103+
gated. Skip them entirely.
104+
- Generic "consider adding a test" suggestions for trivial changes.
105+
106+
Your value is in things CodeRabbit's pattern-matching cannot do well: tracing dataflow
107+
across multiple files, reasoning about mode/state composition, judging export
108+
compatibility, and catching algorithm-level correctness bugs.
109+
110+
## Review Procedure
111+
112+
1. Get PR metadata: `gh pr view $PR_NUMBER --repo $REPO --json title,body,baseRefName,headRefName,files,additions,deletions,changedFiles,author`
113+
2. Get the full diff: `gh pr diff $PR_NUMBER --repo $REPO`
114+
- For large PRs (>50 files), prioritize source code over config/lock/auto-generated files.
115+
3. For each significant changed file, read the full file for surrounding context.
116+
4. Trace the algorithm end-to-end through the diff. Verify the math/logic matches the
117+
intended technique (whatever sub-package it belongs to).
118+
5. For each newly introduced variable/argument/field, verify it has a meaningful runtime
119+
use path — not just declaration/docstring or discard assignment (`_ = new_arg`).
120+
Use Grep to search for usage beyond declaration sites.
121+
6. Post findings as inline comments with severity and category tags.
122+
123+
## Critical Issues (Must Fix)
124+
125+
### Algorithm Correctness
126+
- Verify the implementation matches the intended technique. Apply your knowledge of the
127+
relevant algorithm family (quantization scales/rounding/saturation, distillation loss
128+
composition, sparsity mask selection, pruning importance scoring, speculative draft
129+
acceptance, NAS supernet weight sharing, etc.).
130+
- Watch for silent numerical bugs: missing fp32 upcast in reductions, wrong reduction
131+
dimension, division-by-zero guards, casts that wrap instead of saturate, gradient
132+
flow through stop_gradient boundaries.
133+
- Watch for state corruption across calibration / search / training passes — leftover
134+
statistics from a previous run are a common foot-gun.
135+
136+
### Mode & State Composability (ModelOpt-specific)
137+
- **Mode registration**: New modes must register correctly with `apply_mode()` /
138+
`restore()`, declare their dependencies, and produce a `modelopt_state` entry that
139+
round-trips through save/restore.
140+
- **State dict schema**: Modified `modelopt_state` schema must include a migration path
141+
or version bump — silently changing keys breaks restore for existing checkpoints.
142+
- **Restore fidelity**: After `restore(model, state)`, the model must be functionally
143+
identical to the saved one. Verify module replacements, hooks, and parameters are
144+
re-applied.
145+
- **Plugin laziness**: Optional integrations (HF, Megatron, TRT-LLM, ONNX) must not
146+
hard-import at module load — gate behind `import_plugin()` so users without those
147+
extras don't break.
148+
149+
### Export Compatibility (ModelOpt-specific)
150+
- HF export (`unified_export_hf.py`) must produce a checkpoint that loads cleanly in
151+
transformers and matches the on-device dtype.
152+
- TRT-LLM export (`model_config_export.py`) must emit a valid `config.json` with
153+
correct `quant_algo`, `kv_cache_quant_algo`, scale tensor names, and weight layout.
154+
- ONNX export must use opsets and operator versions supported by the target consumer
155+
(TRT, ORT).
156+
157+
## Important Issues (Should Fix)
158+
159+
### Backward Compatibility
160+
- Renamed or removed arguments / config fields without deprecation path — breaks
161+
existing user scripts.
162+
- `modelopt_recipes/*.yaml` schema changes without a version bump — old recipes
163+
silently misparse.
164+
- Changed defaults silently alter behavior for users relying on them.
165+
- Changed function signatures, return types, or side effects in
166+
`modelopt/torch/*/__init__.py` (public API) without a backward-compat shim.
167+
- Modified `modelopt_state` keys/structure without migration — makes existing
168+
optimized checkpoints unloadable.
169+
170+
### Performance
171+
- Unnecessary CPU-GPU synchronization in hot paths: `.item()`, `.cpu()`,
172+
`torch.cuda.synchronize()`, Python-side tensor value checks.
173+
- Memory regressions: double-allocating weights, holding tensors past their lifetime.
174+
175+
## Suggestions (Nice to Have)
176+
- Stale, imprecise, or misleading comments/docstrings — a wrong docstring is worse
177+
than none.
178+
- Missing shape/dtype assertions at module/parallelism boundaries where they would
179+
catch real bugs.
180+
- Functions mixing many unrelated responsibilities that would benefit from splitting.
181+
182+
## Comment Format
183+
184+
Prefix each comment with severity and category tag:
185+
- `**[CRITICAL Algorithm]**`, `**[CRITICAL ModeState]**`, `**[CRITICAL Export]**`
186+
- `**[IMPORTANT Compatibility]**`, `**[IMPORTANT Performance]**`
187+
- `**[SUGGESTION]**`
188+
189+
For each finding, explain: (1) what the issue is, (2) why it matters (impact/risk), (3) specific suggestion for fix.
190+
191+
Only use inline ```suggestion blocks for simple, self-contained line replacements (typos,
192+
renames, single-line fixes). For structural changes that add, remove, or reorganize blocks
193+
of code, use a top-level PR comment with a code block showing the proposed change instead.
194+
195+
## Completion
196+
197+
After posting all inline comments, post a summary PR comment:
198+
- List total findings by severity (CRITICAL: N, IMPORTANT: N, SUGGESTION: N)
199+
- Highlight the most impactful findings
200+
- Overall assessment of the PR's risk level
201+
202+
If no significant issues are found, approve the PR:
203+
gh pr review $PR_NUMBER --repo $REPO --approve --body "Claude review passed — no significant issues found. LGTM"
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "27446984",
6+
"metadata": {},
7+
"source": "# Prerequisites: Data Preparation & Baseline Evaluation (~15 min on 2x H200)\n\nThis notebook has two goals:\n1. **Prepare the distillation dataset** — download [WikiText-103](https://huggingface.co/datasets/Salesforce/wikitext/tree/main/wikitext-103-v1) and tokenize it into the binary format expected by Megatron-Bridge. This dataset is used during the distillation step (after pruning) in all scenario notebooks.\n2. **Establish the teacher baseline** — evaluate the original Qwen3-8B on MMLU before any compression.\n\n> **Why prepare the dataset before compression?** The distillation step (which comes *after* pruning) requires a pre-tokenized dataset in Megatron binary format. Preparing it upfront avoids interrupting the compression pipeline and ensures a consistent dataset across all scenarios.\n\n> **Note on calibration datasets:** Pruning also requires calibration data to score the importance of each component — the model runs forward passes on a small dataset to measure how much each neuron, head, or layer contributes to the output. This calibration data (we use [`nvidia/Nemotron-Post-Training-Dataset-v2`](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2)) is handled separately in each scenario notebook. Minitron downloads it automatically under the hood, while Puzzletron requires an explicit preparation step. See the respective notebooks ([`scenario1_minitron.ipynb`](scenario1_minitron.ipynb), [`scenario2_puzzletron.ipynb`](scenario2_puzzletron.ipynb), etc.) for details.\n\n**Prerequisites:** Run this notebook from inside the NeMo container with the base model already downloaded at `/workspace/models/Qwen3-8B` (see the guide's Prerequisites section)."
8+
},
9+
{
10+
"cell_type": "markdown",
11+
"id": "ea318822",
12+
"metadata": {},
13+
"source": "## 1. Download and tokenize distillation dataset\n\nFor distillation we use [WikiText-103](https://huggingface.co/datasets/Salesforce/wikitext/tree/main/wikitext-103-v1) — a small, generic language modeling dataset.\n\nThe `megatron_preprocess_data` utility downloads the dataset directly from the HuggingFace Hub and tokenizes it into the binary `.bin` / `.idx` format expected by Megatron-Bridge in a single step (~2 min)."
14+
},
15+
{
16+
"cell_type": "code",
17+
"execution_count": null,
18+
"id": "22112d26",
19+
"metadata": {},
20+
"outputs": [],
21+
"source": "!python -m modelopt.torch.utils.plugins.megatron_preprocess_data \\\n --hf_dataset wikitext \\\n --hf_name wikitext-103-v1 \\\n --hf_split train \\\n --json_keys text \\\n --tokenizer /workspace/models/Qwen3-8B \\\n --output_dir /workspace/datasets/tokenized_qwen3 \\\n --workers 32 \\\n --append_eod \\\n --strip_newlines"
22+
},
23+
{
24+
"cell_type": "markdown",
25+
"id": "ynlx6sgkqr",
26+
"metadata": {},
27+
"source": "## 2. Evaluate teacher model (baseline)\n\nBefore compressing, we establish the baseline MMLU score for the original Qwen3-8B. Results in the guide are expressed as a percentage of this number.\n\nWe use [`lm-eval`](https://github.com/EleutherAI/lm-evaluation-harness) — a standard open-source evaluation harness — to run the MMLU benchmark. MMLU (Massive Multitask Language Understanding) covers 57 subjects across STEM, humanities, and social sciences, using 4-choice questions. The 5-shot setting provides 5 in-context examples per question, which is the standard configuration for comparing LLMs on this benchmark."
28+
},
29+
{
30+
"cell_type": "code",
31+
"execution_count": null,
32+
"id": "zzansswg3zq",
33+
"metadata": {},
34+
"outputs": [],
35+
"source": [
36+
"!lm_eval --model hf \\\n",
37+
" --model_args pretrained=/workspace/models/Qwen3-8B,dtype=bfloat16 \\\n",
38+
" --tasks mmlu \\\n",
39+
" --num_fewshot 5 \\\n",
40+
" --batch_size 4"
41+
]
42+
},
43+
{
44+
"cell_type": "markdown",
45+
"id": "281mvurl3op",
46+
"metadata": {},
47+
"source": [
48+
"**Expected result:** MMLU (5-shot) = **0.7493**. This is the teacher baseline used throughout the guide."
49+
]
50+
}
51+
],
52+
"metadata": {
53+
"kernelspec": {
54+
"display_name": "Python 3 (ipykernel)",
55+
"language": "python",
56+
"name": "python3"
57+
},
58+
"language_info": {
59+
"codemirror_mode": {
60+
"name": "ipython",
61+
"version": 3
62+
},
63+
"file_extension": ".py",
64+
"mimetype": "text/x-python",
65+
"name": "python",
66+
"nbconvert_exporter": "python",
67+
"pygments_lexer": "ipython3",
68+
"version": "3.12.3"
69+
}
70+
},
71+
"nbformat": 4,
72+
"nbformat_minor": 5
73+
}

0 commit comments

Comments
 (0)