diff --git a/.claude/skills/common/environment-setup.md b/.claude/skills/common/environment-setup.md
index 2eee2cd2a2b..a40ace2a5f9 100644
--- a/.claude/skills/common/environment-setup.md
+++ b/.claude/skills/common/environment-setup.md
@@ -29,6 +29,8 @@ cat ~/.config/modelopt/clusters.yaml 2>/dev/null || cat .claude/clusters.yaml 2>
 
 If a cluster config exists with content → **use the remote cluster** (do not fall back to local even if local GPUs are available — the cluster config indicates the user's preferred execution environment). Otherwise → **local execution**.
 
+If the cluster config contains multiple clusters and the user did not name the target cluster, ask which cluster to use before calling `remote_load_cluster`. Do not silently fall back to `default_cluster` in multi-cluster configs; different clusters can have different filesystems, GPU types, auth paths, and SSH setup.
+
 For remote, connect:
 
 ```bash
diff --git a/.claude/skills/common/remote-execution.md b/.claude/skills/common/remote-execution.md
index be770aef936..caaf0ce4db1 100644
--- a/.claude/skills/common/remote-execution.md
+++ b/.claude/skills/common/remote-execution.md
@@ -33,10 +33,10 @@ default_cluster: my-cluster
 Workstation filesystems (`/home/scratch.*`, local NFS) are **not** mounted on the cluster. If a checkpoint was produced on your workstation, copy it to the cluster's own storage before submitting any job that references it — NEL and SLURM do NOT sync checkpoints automatically.
 
 ```bash
-rsync -av /path/to/local/checkpoint <cluster-login>:<cluster-workspace>/checkpoints/
+rsync -av /path/to/local/checkpoint <cluster-login>:<cluster-workspace>/<session_id>/<model>/checkpoints/
 ```
 
-Use the `workspace` path from your cluster config as the destination. Compute nodes on a given cluster share the same storage as its login node, so once staged, the path works everywhere on that cluster.
+Use the `workspace` path from your cluster config as the destination root, and keep staged checkpoints under the session/model directory. Compute nodes on a given cluster share the same storage as its login node, so once staged, the path works everywhere on that cluster.
 
 See `.claude/clusters.yaml.example` for a fully annotated example with multiple cluster types.
 
@@ -118,8 +118,8 @@ When submitting SLURM jobs remotely, write **two files** locally to avoid shell
 Then upload both and submit:
 
 ```bash
-remote_sync_to /local/scripts/ scripts/
-JOBID=$(remote_run "sbatch /remote/path/scripts/job_slurm.sh" | grep -o '[0-9]\+' | tail -1)
+remote_sync_to /local/scripts/ <session_id>/<model>/scripts/
+JOBID=$(remote_run "sbatch <remote_workspace>/<session_id>/<model>/scripts/job_slurm.sh" | grep -o '[0-9]\+' | tail -1)
 ```
 
 ---
diff --git a/.claude/skills/common/workspace-management.md b/.claude/skills/common/workspace-management.md
index f797e7870ee..29356bca101 100644
--- a/.claude/skills/common/workspace-management.md
+++ b/.claude/skills/common/workspace-management.md
@@ -1,77 +1,95 @@
 # Workspace Management
 
-Organize work by model name so outputs (checkpoints, logs) are easy to find and reuse across PTQ → deploy → eval pipelines.
+Organize work by session id and model name so concurrent agents do not
+clobber each other, while outputs (checkpoints, logs) stay easy to find and
+reuse across PTQ → deploy → eval pipelines within the same session.
 
-## Single-user (default)
+## Session Workspaces
 
-Create a work directory named after the model in the current project:
+Use the same `<session_id>` convention as the monitor skill:
 
-```bash
-mkdir -p ./workspaces/<model-name>
-```
-
-Use descriptive names, not timestamps:
-
-```bash
-# Good
-workspaces/qwen3-0.6b-nvfp4/
-workspaces/llama-3.1-8b-fp8/
-
-# Bad
-workspaces/ptq-20260318-143022/
-workspaces/job-001/
-```
-
-Store outputs (checkpoints, logs) inside the workspace:
-
-```bash
-workspaces/qwen3-0.6b-nvfp4/
-  output/          # quantized checkpoint
-  logs/            # job logs
-  scripts/         # custom PTQ scripts (if unsupported model)
-```
+- Claude Code: `$CLAUDE_CODE_SESSION_ID`, or the `session_id` field from hook input
+- Codex: `$CODEX_THREAD_ID`
+- If no session id is available, create a stable id for the current terminal session and reuse it for every local and remote path created by that agent
 
 ## When to Reuse vs Create
 
-**Before starting any task**, check for an existing workspace:
+**Before starting any task**, check for an existing workspace in the current
+session:
 
 ```bash
-ls ./workspaces/ 2>/dev/null
+ls ./workspaces/<session_id>/ 2>/dev/null
 ```
 
 **Reuse** when:
 
-- Same model (e.g., deploying a model you just quantized)
+- The matching model workspace already exists under `./workspaces/<session_id>/`
 - Task requires output from a previous step (e.g., eval requires the PTQ checkpoint)
 - User says "deploy the model I just quantized"
 
 **Create new** when:
 
-- New model not seen before
+- No matching model workspace exists under `./workspaces/<session_id>/`
 - User explicitly asks for a fresh start
-- Different quantization format for same model (e.g., `qwen3-0.6b-fp8` vs `qwen3-0.6b-nvfp4`)
+
+## Model Workspace Names
+
+Within `./workspaces/<session_id>/`, create one model workspace per model or
+model variant. Include meaningful variant details in the model workspace name,
+for example quantization format or checkpoint role:
+
+```bash
+mkdir -p ./workspaces/<session_id>/<model-name>
+```
+
+Use descriptive model workspace names, not timestamps:
+
+```text
+# Good
+workspaces/<session_id>/qwen3-0.6b-nvfp4/
+workspaces/<session_id>/qwen3-0.6b-fp8/
+workspaces/<session_id>/qwen3-0.6b-baseline/
+
+# Bad
+workspaces/<session_id>/ptq-20260318-143022/
+workspaces/<session_id>/job-001/
+```
+
+Store outputs (checkpoints, logs) inside the model workspace:
+
+```text
+workspaces/<session_id>/qwen3-0.6b-nvfp4/
+  output/          # quantized checkpoint
+  logs/            # job logs
+  scripts/         # custom PTQ scripts (if unsupported model)
+```
 
 ## Remote execution
 
 When using a remote machine (clusters.yaml configured), create matching workspaces on **both** local and remote:
 
-- **Local** `./workspaces/<model>/` — write and edit scripts here
-- **Remote** `<remote_workspace>/workspaces/<model>/` — model downloads, execution, outputs
+- **Local** `./workspaces/<session_id>/<model>/` — write and edit scripts here
+- **Remote** `<remote_workspace>/<session_id>/<model>/` — model downloads, execution, outputs
+
+Place newly created remote run directories, logs, response caches,
+temporary configs, and output artifacts under the session directory. Shared
+read-only or concurrency-safe caches, such as Hugging Face model caches and
+prebuilt container image caches, can remain outside the session directory.
 
 Before running, sync the local ModelOpt source and scripts to the remote workspace:
 
 ```bash
 # Sync ModelOpt source (first time or after local changes)
-remote_sync_to ./ workspaces/<model>/Model-Optimizer/
+remote_sync_to ./ <session_id>/<model>/Model-Optimizer/
 
 # Sync custom scripts
-remote_sync_to ./workspaces/<model>/scripts/ workspaces/<model>/scripts/
+remote_sync_to ./workspaces/<session_id>/<model>/scripts/ <session_id>/<model>/scripts/
 ```
 
 Download the model on the **remote** machine (avoids transferring large model files):
 
 ```bash
-remote_run "python -c \"from huggingface_hub import snapshot_download; snapshot_download('<model_id>', local_dir='<remote_workspace>/workspaces/<model>/model')\""
+remote_run "python -c \"from huggingface_hub import snapshot_download; snapshot_download('<model_id>', local_dir='<remote_workspace>/<session_id>/<model>/model')\""
 ```
 
 Inspect remote files with `remote_run "cat ..."` — read README, config.json, tokenizer_config.json to understand requirements before writing scripts locally.
@@ -80,7 +98,7 @@ Inspect remote files with `remote_run "cat ..."` — read README, config.json, t
 
 When `MODELOPT_WORKSPACE_ROOT` is set, use it instead of `./workspaces/`:
 
-- `MODELOPT_WORKSPACE_ROOT` — user's workspace root (set by the bot)
+- `MODELOPT_WORKSPACE_ROOT` — user's workspace root (set by the bot); use `$MODELOPT_WORKSPACE_ROOT/<session_id>/<name>/`
 - `MODELOPT_REPO_DIR` — shared upstream repo (read-only, use for fresh copies)
 
 To create a workspace, copy the upstream repo (without `.git`):
@@ -89,7 +107,7 @@ To create a workspace, copy the upstream repo (without `.git`):
 rsync -a --quiet \
     --exclude .git --exclude __pycache__ --exclude '*.pyc' \
     --exclude node_modules --exclude '*.egg-info' --exclude '*.sqsh' \
-    "$MODELOPT_REPO_DIR/" "$MODELOPT_WORKSPACE_ROOT/<name>/"
+    "$MODELOPT_REPO_DIR/" "$MODELOPT_WORKSPACE_ROOT/<session_id>/<name>/"
 ```
 
 ## Cross-Skill Workspace Flow
@@ -97,7 +115,7 @@ rsync -a --quiet \
 Workspaces carry over across the PTQ → Deploy → Eval pipeline. Each stage adds to the same directory:
 
 ```text
-workspaces/model-name-format/
+workspaces/<session_id>/model-name-format/
   output/              ← PTQ: quantized checkpoint
   eval_results/        ← Evaluation: NEL artifacts (results.yml per task)
   eval_config.yaml     ← Evaluation: NEL config
@@ -109,19 +127,19 @@ workspaces/model-name-format/
 
 ```text
 User: "quantize Qwen3-0.6B with nvfp4"
-Agent: ls workspaces/ → no "qwen3-0.6b-nvfp4"
-       → mkdir workspaces/qwen3-0.6b-nvfp4
-       → run PTQ, output to workspaces/qwen3-0.6b-nvfp4/output/
+Agent: ls workspaces/<session_id>/ → no "qwen3-0.6b-nvfp4"
+       → mkdir workspaces/<session_id>/qwen3-0.6b-nvfp4
+       → run PTQ, output to workspaces/<session_id>/qwen3-0.6b-nvfp4/output/
 
 User: "deploy the model I just quantized"
-Agent: ls workspaces/ → sees "qwen3-0.6b-nvfp4"
-       → reuse, find checkpoint at workspaces/qwen3-0.6b-nvfp4/output/
+Agent: ls workspaces/<session_id>/ → sees "qwen3-0.6b-nvfp4"
+       → reuse, find checkpoint at workspaces/<session_id>/qwen3-0.6b-nvfp4/output/
 
 User: "evaluate the quantized model on MMLU and GSM8K"
-Agent: ls workspaces/ → sees "qwen3-0.6b-nvfp4"
-       → reuse, write eval_config.yaml, results to workspaces/qwen3-0.6b-nvfp4/eval_results/
+Agent: ls workspaces/<session_id>/ → sees "qwen3-0.6b-nvfp4"
+       → reuse, write eval_config.yaml, results to workspaces/<session_id>/qwen3-0.6b-nvfp4/eval_results/
 
 User: "now quantize Llama-3.1-8B with fp8"
-Agent: ls workspaces/ → no llama
-       → mkdir workspaces/llama-3.1-8b-fp8
+Agent: ls workspaces/<session_id>/ → no llama
+       → mkdir workspaces/<session_id>/llama-3.1-8b-fp8
 ```
diff --git a/.claude/skills/debugging-playbooks/SKILL.md b/.claude/skills/debugging-playbooks/SKILL.md
new file mode 100644
index 00000000000..533cb0f0c1d
--- /dev/null
+++ b/.claude/skills/debugging-playbooks/SKILL.md
@@ -0,0 +1,22 @@
+---
+name: debugging-playbooks
+description: Diagnostic playbooks for tricky failures — failures where the traceback misdirects and the first 2-3 reasonable hypotheses turn out wrong. Use when a run fails with a framework-internal-looking error (cryptic torch.compile / dynamo / NCCL / vLLM / transformers / CUDA / pyxis / enroot / NEL / SLURM / container runtime), the top frame appears to blame the wrong layer (e.g. the user's code, ModelOpt, the quantized linear, the wrapper class) but fixing that layer doesn't help, or the symptom recurs across unrelated changes. Use this skill when you've eliminated the obvious suspects and the bug hasn't budged. Don't reach for this on the first guess; reach for it when the obvious answers don't pan out. Each playbook is keyed by a literal symptom string from logs so future agents can grep for it.
+---
+
+# Debugging playbooks
+
+When a failure surfaces a symptom that doesn't clearly map to the code under change, check whether one of the documented playbooks below already describes it. Each playbook is keyed by the literal symptom string so future agents can match by grep.
+
+| Symptom (literal string from logs) | Playbook |
+| --- | --- |
+| `AttributeError: 'NoneType' object has no attribute 'size'` during vLLM `profile_run` / `_dummy_run` / CUDA-graph capture | [vllm-aot-cache-poisoning.md](references/vllm-aot-cache-poisoning.md) |
+
+## When to add a new playbook
+
+Add an entry when **all three** are true:
+
+1. The root cause was non-obvious from the traceback — the immediate frame was misleading (e.g. blames ModelOpt when the bug is in vLLM).
+2. The symptom is likely to recur across runs (different models, different containers).
+3. There is a concrete fix (config change, env var, cache invalidation) that future agents should reach for before deeper debugging.
+
+Each playbook should include: the literal symptom string, the actual mechanism, how to confirm the diagnosis, and the minimal fix.
diff --git a/.claude/skills/debugging-playbooks/references/vllm-aot-cache-poisoning.md b/.claude/skills/debugging-playbooks/references/vllm-aot-cache-poisoning.md
new file mode 100644
index 00000000000..6723b1f3f92
--- /dev/null
+++ b/.claude/skills/debugging-playbooks/references/vllm-aot-cache-poisoning.md
@@ -0,0 +1,139 @@
+# vLLM AOT compile-cache poisoning across multimodal-on / multimodal-off runs
+
+Applies to **any** model whose vLLM architecture supports multimodal input —
+this is modality-agnostic, covering image, video, audio, or any other
+modality (`vllm/multimodal/registry.py: supports_multimodal_inputs` iterates
+the model's `supported_mm_limits`, which can be `{"image": N}`,
+`{"video": N}`, `{"audio": N}`, `{"image": N, "video": N}`, etc.). The hazard
+appears when multiple vLLM runs against the **same checkpoint** share a
+`VLLM_CACHE_ROOT` and differ in whether **all** of the model's modalities
+are zeroed out via `--limit-mm-per-prompt`.
+
+## Symptom
+
+vLLM startup crashes during `profile_run` / `_dummy_run` / CUDA-graph capture
+with:
+
+```text
+AttributeError: 'NoneType' object has no attribute 'size'
+```
+
+The traceback ends inside `torch/_dynamo/utils.py call_size → x.size(i)`,
+after passing through `vllm/compilation/decorators.py: aot_compiled_fn`.
+**There is no model-layer frame** in the failing stack — no attention op,
+no MLP, no quantized linear. The compiled function is loaded from disk and
+crashes in dynamo's prologue, before any decoder layer runs. The log line
+just above the traceback is the smoking gun:
+
+```text
+INFO ... [decorators.py:...] Directly load AOT compilation from path
+  /vllm-cache/torch_compile_cache/torch_aot_compile/<hash>/rank_*/model
+```
+
+## Mechanism
+
+vLLM's `@support_torch_compile` decorator caches one compiled `forward` per
+`(aot_compile_hash_factors(vllm_config), _model_hash_key(forward))` key
+(`vllm/compilation/decorators.py`). That key includes the model config and
+quantization, but **does not include** `--limit-mm-per-prompt` or the
+derived `supports_mm_inputs` flag.
+
+`vllm/v1/worker/gpu_model_runner.py: _dummy_run` branches on
+`supports_mm_inputs`:
+
+```python
+if self.supports_mm_inputs and not self.model_config.is_encoder_decoder:
+    input_ids, inputs_embeds = self._prepare_mm_inputs(...)   # (None, Tensor)
+else:
+    input_ids = self.input_ids.gpu[:num_tokens_padded]        # (Tensor, None)
+    inputs_embeds = None
+```
+
+`supports_mm_inputs` (`vllm/multimodal/registry.py: supports_multimodal_inputs`)
+returns `False` when **every** supported modality has
+`--limit-mm-per-prompt = 0`. So:
+
+| Run config | `supports_mm_inputs` | Pattern compiled / loaded |
+| --- | --- | --- |
+| `--limit-mm-per-prompt '{"image":0}'` (and `"video":0` etc.) | False | `input_ids=Tensor, inputs_embeds=None` |
+| default, or any modality non-zero | True | `input_ids=None, inputs_embeds=Tensor` |
+
+The `@support_torch_compile` docstring explicitly forbids the same argument
+slot from being `None` on one invocation and a Tensor on another — Dynamo
+specializes on None-vs-Tensor identity per argument, so one cached graph
+cannot serve both patterns. When run A populates the cache slot and run B
+shares the slot but uses the opposite pattern, the prologue calls
+`.size()` on what is now `None` and dies.
+
+This is symmetric: a multimodal-first run followed by a text-only-via-image:0
+run fails the same way, just with the None/Tensor roles swapped.
+
+## How to confirm
+
+1. **Cache hit before the crash.** Look in the server log for
+   `Directly load AOT compilation from path ...` shortly before the
+   traceback. A cache *hit* immediately before a `NoneType.size()` is the
+   diagnostic. (A cold compile would print `Dynamo bytecode transform
+   time` and `Inductor compile took ...` instead.)
+2. **Config delta on `--limit-mm-per-prompt`.** Compare the failing run's
+   serving args against the most recent successful runs that share
+   `$VLLM_CACHE_ROOT`. If they disagree on whether **every** supported
+   modality is zero-limited (e.g. one side omits the flag while the other
+   passes `{"image":0}` for an image-only model), the cache slot is colliding.
+3. **Positive control.** Relaunch the failing config with
+   `VLLM_DISABLE_COMPILE_CACHE=1` and change nothing else. If `profile_run`
+   passes, the cache was the cause.
+
+## Fix
+
+Two parts — stop the poisoning, then heal what's already poisoned.
+
+### Stop poisoning
+
+For multimodal-architecture models, do **not** zero out a modality with
+`--limit-mm-per-prompt '{"image":0}'` (or `"video":0`, …) on runs intended
+to share a cache root with multimodal runs. The vision tower weights are
+loaded from the checkpoint regardless of this flag; zeroing only flips
+`supports_mm_inputs` and creates the cache hazard. Text-only inference
+still works without the flag because vLLM's `_preprocess` routes both
+text and multimodal prompts through the same `inputs_embeds` path when
+`supports_mm_inputs=True`:
+
+```python
+# vllm/v1/worker/gpu_model_runner.py: _preprocess
+# NOTE(woosuk): To unify token ids and soft tokens (vision embeddings),
+# we always use embeddings (rather than token ids) as input to the
+# multimodal model, even when the input is text.
+inputs_embeds_scheduled = self.model.embed_input_ids(
+    self.input_ids.gpu[:num_scheduled_tokens],
+    multimodal_embeddings=mm_embeds,
+    is_multimodal=is_mm_embed,
+)
+```
+
+A text-only prompt simply has `mm_embeds=[]` / `is_multimodal=False`; the
+call signature into the language model is unchanged. The small cost of
+keeping multimodal inputs enabled is that vLLM allocates an encoder cache
+budget at startup (e.g. a few hundred MB) and prints a vision warmup line.
+
+### Heal existing cache
+
+Either fully wipe and let the next run repopulate:
+
+```bash
+rm -rf "$VLLM_CACHE_ROOT/torch_compile_cache/torch_aot_compile/"
+```
+
+…or sidestep by separating cache roots per multimodal-ness (set a different
+`VLLM_CACHE_ROOT` for the runs that need a different pattern), or just set
+`VLLM_DISABLE_COMPILE_CACHE=1` on the affected runs and accept a fresh
+recompile (~20-30 s) at every startup.
+
+## See also
+
+- `vllm/compilation/decorators.py` — `support_torch_compile` decorator and
+  its docstring on the None-vs-Tensor invariant.
+- `vllm/v1/worker/gpu_model_runner.py` — the input-construction branch in
+  `_dummy_run` and the unified-`inputs_embeds` comment in `_preprocess`.
+- `vllm/multimodal/registry.py` — how `supports_multimodal_inputs` is
+  computed from `--limit-mm-per-prompt`.
diff --git a/.claude/skills/deployment/SKILL.md b/.claude/skills/deployment/SKILL.md
index 5210eae6c3c..f14cc0b9822 100644
--- a/.claude/skills/deployment/SKILL.md
+++ b/.claude/skills/deployment/SKILL.md
@@ -38,10 +38,10 @@ The script handles: GPU detection, quantization flag auto-detection (FP8 vs FP4)
 
 ### 0. Check workspace (multi-user / Slack bot)
 
-If `MODELOPT_WORKSPACE_ROOT` is set, read `skills/common/workspace-management.md`. Before creating a new workspace, check for existing ones — especially if deploying a checkpoint from a prior PTQ run:
+If `MODELOPT_WORKSPACE_ROOT` is set, read `skills/common/workspace-management.md`. Before creating a new workspace, check the current session for existing model workspaces — especially if deploying a checkpoint from a prior PTQ run:
 
 ```bash
-ls "$MODELOPT_WORKSPACE_ROOT/" 2>/dev/null
+ls "$MODELOPT_WORKSPACE_ROOT/<session_id>/" 2>/dev/null
 ```
 
 If the user says "deploy the model I just quantized" or references a previous PTQ, find the matching workspace and `cd` into it. The checkpoint should be in that workspace's output directory.
@@ -190,7 +190,7 @@ If a cluster config exists (`~/.config/modelopt/clusters.yaml` or `.claude/clust
    If the checkpoint path is a remote/absolute path (e.g., from a prior PTQ run on the cluster), skip sync — it's already there. Verify with `remote_run "ls <checkpoint_path>/config.json"`. Only sync if the checkpoint is local:
 
    ```bash
-   remote_sync_to <local_checkpoint_path> checkpoints/
+   remote_sync_to <local_checkpoint_path> <session_id>/<model>/checkpoints/
    ```
 
 3. **Deploy based on remote environment:**
diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index 69920814828..e224299e81c 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -14,10 +14,21 @@ You're an expert in NeMo Evaluator Launcher! Guide the user through creating pro
 
 ### Workspace and Pipeline Integration
 
-If `MODELOPT_WORKSPACE_ROOT` is set, read `skills/common/workspace-management.md`. Check for existing workspaces — especially if evaluating a model from a prior PTQ or deployment step. Reuse the existing workspace so you have access to the quantized checkpoint and any code modifications.
+If `MODELOPT_WORKSPACE_ROOT` is set, read `skills/common/workspace-management.md`. Check for existing workspaces in the current session — especially if evaluating a model from a prior PTQ or deployment step. Reuse the existing workspace so you have access to the quantized checkpoint and any code modifications.
 
 This skill is often the final stage of the PTQ → Deploy → Eval pipeline. If the model required runtime patches during deployment (transformers upgrade, framework source fixes), carry those patches into the NEL config via `deployment.command`.
 
+### NEL Timeout and Resume Behavior
+
+NEL submissions commonly create a dependency chain of SLURM jobs. The first job
+runs the evaluation and writes response/result caches. A dependent follow-on job
+resumes from those caches if the first job times out, then queues another follow-on
+job so long-running evals can continue across walltime windows.
+
+Do not assume a timeout means the evaluation failed or produced invalid results.
+Treat timeouts as expected resume events until `nel status`/`nel info`, artifacts,
+and logs show a terminal failure or invalid run.
+
 ### Workflow
 
 ```text
@@ -30,8 +41,13 @@ Config Generation Progress:
 - [ ] Step 5: Confirm tasks (iterative)
 - [ ] Step 6: Advanced - Multi-node (Data Parallel)
 - [ ] Step 7: Advanced - Interceptors
-- [ ] Step 7.5: Check container registry auth (SLURM only)
+- [ ] Step 7.5: Check container registry auth for private images (SLURM only)
 - [ ] Step 8: Run the evaluation
+  - [ ] Step 8.1: Dry-run / NEL CLI config validation
+  - [ ] Step 8.2: Limited-samples canary
+  - [ ] Step 8.3: Full evaluation
+- [ ] Step 9: Verify completed evaluation run
+- [ ] Step 10: Verify baseline-vs-quantized comparability
 ```
 
 **Step 1: Check prerequisites**
@@ -40,13 +56,16 @@ Test that `nel` is installed with `nel --version`. If not, instruct the user to
 
 If the user already has a config file (e.g., "run this config", "evaluate with my-config.yaml"), skip to Step 8. Optionally review it for common issues (missing `???` values, quantization flags) before running.
 
-**Shortcut: use pre-built task snippets.** If the user asks for a specific benchmark (e.g., "run MMLU-Pro", "evaluate with AIME"), check `recipes/tasks/` (relative to this skill's directory) for a matching task snippet. Available: mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode. Task snippets contain only the task-specific config (name, params, repeats) — not the full NEL config. To use them:
+**Shortcut: use task references.** For named benchmarks, read the matching
+`recipes/tasks/<name>.md` before creating or editing the config. Available:
+mmlu_pro, mmmu_pro, gpqa, aime2025, livecodebench, ifbench, scicode,
+aa_lcr, ns_hle_aa, tau2_bench_telecom.
 
-1. Read the task snippet(s) the user wants
+1. Read the task reference(s) the user wants.
 2. Use `recipes/examples/example_eval.yaml` as the base config template
-3. Replace the `tasks:` section with the selected snippet(s)
-4. Do Step 3 (auto-detect model settings from checkpoint) and Step 4 (fill in `???` values)
-5. Proceed to Step 7.5/8
+3. Copy the selected YAML fragment(s) into `evaluation.tasks`.
+4. Apply any notes from the reference.
+5. Do Step 3, Step 4, then Step 7.5/8.
 
 **Step 2: Build the base config file**
 
@@ -65,6 +84,8 @@ Prompt the user with "I'll ask you 5 questions to build the base config we'll ad
 - NIM
 - TRT-LLM
 
+Prefer vLLM for NEL self-deployment unless the user explicitly asks for another runtime, the model card requires another runtime, or the evaluation targets an already-running endpoint (`deployment: none`).
+
 3. Auto-export:
 
 - None (auto-export disabled)
@@ -112,24 +133,21 @@ Ask for model path. Determine type:
 
 **Auto-detect ModelOpt quantization format** (checkpoint paths only):
 
-Check for `hf_quant_config.json` in the checkpoint directory:
+Check `config.json` first for a `quantization_config` section with `quant_method: "modelopt"`. If absent, check the legacy/backward-compatible `hf_quant_config.json`:
 
 ```bash
+cat <checkpoint_path>/config.json 2>/dev/null
 cat <checkpoint_path>/hf_quant_config.json 2>/dev/null
 ```
 
-If found, read `quantization.quant_algo` and set the correct vLLM/SGLang quantization flag in `deployment.extra_args`:
+If ModelOpt quantization is detected, read the quantization algorithm from `quantization_config.quant_algo` (in `config.json`) or `quantization.quant_algo` (in `hf_quant_config.json`).
 
-| `quant_algo` | Flag to add |
-|-------------|-------------|
-| `FP8` | `--quantization modelopt` |
-| `W4A8_AWQ` | `--quantization modelopt` |
-| `NVFP4`, `NVFP4_AWQ` | `--quantization modelopt_fp4` |
-| Other values | Try `--quantization modelopt`; consult vLLM/SGLang docs if unsure |
+- **vLLM:** Do not add a `--quantization` flag by default. Recent vLLM reads `quantization_config` / `hf_quant_config.json` and selects the ModelOpt backend automatically; adding a stale or mismatched flag can cause a config mismatch. Only add an explicit flag if the model card, vLLM version, or dry-run error requires it.
+- **SGLang:** Use SGLang-specific docs/model-card guidance. For offline ModelOpt checkpoints, recent SGLang can parse the config in many cases; if an explicit flag is required, common values are `--quantization modelopt_fp8` for FP8 and `--quantization modelopt_fp4` for NVFP4. Some exported ModelOpt flows document `--quantization modelopt`; verify against the installed SGLang version.
 
-If no `hf_quant_config.json`, also check `config.json` for a `quantization_config` section with `quant_method: "modelopt"`. If neither is found, the checkpoint is unquantized — no flag needed.
+If neither file contains a ModelOpt quantization config, treat the checkpoint as unquantized — no quantization flag needed.
 
-> **Note:** Some models require additional env vars for deployment (e.g., `VLLM_NVFP4_GEMM_BACKEND=marlin` for Nemotron Super). These are not in `hf_quant_config.json` — they are discovered during model card research below.
+> **Note:** Some models require additional env vars for deployment (e.g., `VLLM_NVFP4_GEMM_BACKEND=marlin` for Nemotron Super). These may not be in the quantization config files — they are discovered during model card research below.
 
 **Auto-detect deployment settings from checkpoint:**
 
@@ -158,8 +176,14 @@ Combine all detected flags into a single `deployment.extra_args` override. The r
 
 When a quantized checkpoint is detected, read `references/quantization-benchmarks.md` for benchmark sensitivity rankings and recommended sets. Present recommendations to the user and ask which to include.
 
+**Baseline comparison preflight:**
+
+When a quantized checkpoint is detected, identify the matching baseline before launching the full quantized run. The baseline is usually the pre-quantization source model/checkpoint for this run, but it may itself be quantized (for example, an FP8 checkpoint used as the baseline for an NVFP4 checkpoint). First infer the baseline from the PTQ source model/checkpoint in the workspace or config used to create the quantized checkpoint. If it cannot be inferred, ask the user for the baseline model/checkpoint or an existing baseline invocation/run path. If no matching baseline exists, prepare a companion baseline config and launch it before or alongside the quantized config. The baseline config should match the quantized config's benchmark versions, task configs, serving args, token limits, dataset setup, credentials, cluster, and container as closely as possible; change only the model/checkpoint and adjust quantization-specific flags to match the baseline checkpoint. Do not treat the quantized score as release-ready until the baseline comparison exists.
+
 Read `references/model-card-research.md` for the full extraction checklist (sampling params, reasoning config, ARM64 compatibility, pre_cmd, etc.). Use WebSearch to research the model card, present findings, and ask the user to confirm.
 
+For reasoning-capable models, prefer reasoning mode for evaluation because it usually produces the highest task scores; configure the model-card-specific on/off control and any reasoning budget or effort setting. If the user wants lower variance/noise, lower latency/cost, or an apples-to-apples comparison against non-reasoning baselines, also consider a non-reasoning companion run.
+
 **Step 4: Fill in remaining missing values**
 
 - Find all remaining `???` missing values in the config.
@@ -171,7 +195,10 @@ Read `references/model-card-research.md` for the full extraction checklist (samp
 Show tasks in the current config. Loop until the user confirms the task list is final:
 
 1. Tell the user: "Run `nel ls tasks` to see all available tasks".
-2. Ask if they want to add/remove tasks or add/remove/modify task-specific parameter overrides.
+2. If the task list includes a benchmark with a reference in `recipes/tasks/`,
+   read it before editing the config and prefer its YAML fragment unless the user
+   asks for different settings. Keep the reference repeat counts.
+3. Ask if they want to add/remove tasks or add/remove/modify task-specific parameter overrides.
    To add per-task `nemo_evaluator_config` as specified by the user, e.g.:
 
    ```yaml
@@ -185,8 +212,8 @@ Show tasks in the current config. Loop until the user confirms the task list is
              ...
    ```
 
-3. Apply changes.
-4. Show updated list and ask: "Is the task list final, or do you want to make more changes?"
+4. Apply changes.
+5. Show updated list and ask: "Is the task list final, or do you want to make more changes?"
 
 **Known Issues**
 
@@ -215,7 +242,7 @@ If the user needs multi-node evaluation (model >120B, or more throughput), read
 
 - The docs may show incorrect parameter names for logging. Use `max_logged_requests` and `max_logged_responses` (NOT `max_saved_*` or `max_*`).
 
-**Step 7.5: Check container registry authentication (SLURM only)**
+**Step 7.5: Check container registry authentication for private images (SLURM only)**
 
 NEL's default deployment images by framework:
 
@@ -226,28 +253,33 @@ NEL's default deployment images by framework:
 | TRT-LLM | `nvcr.io/nvidia/tensorrt-llm/release:...` | NGC |
 | Evaluation tasks | `nvcr.io/nvidia/eval-factory/*:26.03` | NGC |
 
-Before submitting, verify the cluster has credentials for the deployment image. See `skills/common/slurm-setup.md` section 6 for the full procedure.
+Before submitting, identify the exact deployment and evaluation-task images that will be pulled. If the images are public, skip the registry-authentication preflight; pyxis/enroot can pull public images without stored credentials. Do not require credentials just because the registry is DockerHub or NGC.
+
+Only verify cluster credentials when an image is private or access-restricted (private DockerHub repo, private NGC repo, internal registry, or user-provided image that is not known to be public). See `skills/common/slurm-setup.md` section 6 for the credential setup procedure.
 
 ```bash
 ssh <host> "grep -E '^\s*machine\s+' ~/.config/enroot/.credentials 2>/dev/null"
 ```
 
 **Decision flow (check before submitting):**
-1. Check if the cluster has credentials for the default DockerHub image (see command above)
-2. If DockerHub credentials exist → use the default image and submit
-3. If DockerHub credentials are missing but can be added → add them (see `slurm-setup.md` section 6), then submit
-4. If DockerHub credentials cannot be added → override `deployment.image` to the NGC alternative and submit:
+1. If the selected images are public → submit without an auth preflight
+2. If any selected image is private or access-restricted → check for credentials for that image's registry (see command above)
+3. If credentials exist → use the selected image and submit
+4. If credentials are missing but can be added → add them (see `slurm-setup.md` section 6), then submit
+5. If credentials cannot be added → switch to a public image when a compatible one exists, for example:
 
    ```yaml
    deployment:
      image: nvcr.io/nvidia/vllm:<YY.MM>-py3  # check https://catalog.ngc.nvidia.com/orgs/nvidia/containers/vllm for latest tag
    ```
 
-5. **Do not retry more than once** without fixing the auth issue
+6. **Do not retry more than once** after an auth failure without fixing credentials or switching images
 
 **Step 8: Run the evaluation**
 
-Print the following commands to the user. Propose to execute them in order to confirm the config works as expected before the full run.
+Use a gated `dry-run -> canary -> full-run` sequence. Run the commands directly
+when the user has asked you to launch evals; otherwise, ask before submitting jobs.
+Do not submit the full run until the dry-run and limited-samples canary both pass.
 
 **Important**: Export required environment variables based on your config. If any tokens or keys are missing, point the user to `recipes/env.example` — it lists all possible keys with notes on which tasks need them. Ask the user to copy it, fill in their keys, and source it:
 
@@ -265,38 +297,96 @@ export NEMO_EVALUATOR_TRUST_PRE_CMD=1
 export DUMMY_API_KEY=dummy
 ```
 
-1. **Dry-run** (validates config without running):
+**Step 8.1: Dry-run / NEL CLI config validation** (validates config without running):
 
-   ```bash
-   nel run --config <config_path> --dry-run
-   ```
+```bash
+nel run --config <config_path> --dry-run
+```
 
-2. **Test with limited samples** (quick validation run):
+Check the NEL output before launching anything. Fix unresolved `???` values,
+bad Hydra overrides, missing env var references, invalid mounts, image/container
+problems, sbatch issues, and obvious deployment argument errors before moving on.
 
-   ```bash
-   nel run --config <config_path> -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=10
-   ```
+**Step 8.2: Limited-samples canary** (operational validation before production):
 
-3. **Re-run a single task** (useful for debugging or re-testing after config changes):
+```bash
+nel run --config <config_path> -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=10
+```
 
-   ```bash
-   nel run --config <config_path> -t <task_name>
-   ```
+Use the canary to tune parallelism and catch runtime failures that the dry-run
+cannot catch: judge API auth/rate-limit errors, evaluation container failures,
+code-execution sandbox/container errors, vLLM health/OOM issues, bad request
+formatting, log path problems, and unexpectedly low evaluated-sample counts.
+Inspect logs before accepting the canary, not just result files:
 
-   Combine with `-o` for limited samples: `nel run --config <config_path> -t <task_name> -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=10`
+```bash
+nel status <canary_invocation_id>
+nel info <canary_invocation_id> --logs
+ssh <user>@<host> "grep -i 'traceback\|exception\|error\|failed\|oom\|killed\|timeout\|unauthorized\|rate limit\|sandbox\|container\|judge\|parse\|scoring' <log_path>/*.log"
+```
 
-4. **Full evaluation** (production run):
+If the benchmark set mixes different dependency profiles, canary each risky
+class or task: LLM-judge tasks, code-execution tasks, and ordinary model-only
+tasks can fail for different reasons. For evals that depend on inference judges
+or code execution containers, start with conservative `parallelism` and raise it
+only after the canary logs show those dependencies are healthy. Do not over-raise
+parallelism just to saturate the model server; judge services and code containers
+often become the bottleneck or failure point first.
 
-   ```bash
-   nel run --config <config_path>
-   ```
+**Single-task rerun** (useful for canary debugging or re-testing after config changes):
 
-After the dry-run, check the output from `nel` for any problems with the config. If there are no problems, propose to first execute the test run with limited samples and then execute the full evaluation. If there are problems, resolve them before executing the full evaluation.
+```bash
+nel run --config <config_path> -t <task_name>
+```
+
+Combine with `-o` for limited samples: `nel run --config <config_path> -t <task_name> -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=10`
+
+**Step 8.3: Full evaluation** (production run after the canary passes):
+
+```bash
+nel run --config <config_path>
+```
+
+Before the full run, remove the `limit_samples` override and keep only the
+parallelism/settings that the canary validated. If the canary fails, fix the
+config, credentials, image/container, judge setup, code-execution environment, or
+parallelism, then rerun the canary before launching the full evaluation.
 
 **Monitoring Progress**
 
 After job submission, register the job per the **monitor skill** for durable cross-session tracking. For one-off queries (live status, debugging a failed run, analyzing results) use the **launching-evals skill**; for querying past runs in MLflow use **accessing-mlflow**.
 
+**Step 9: Verify completed evaluation run**
+
+Before pulling/reporting scores, validate the completed run itself. Do not accept a run as complete just because `results.yml` or a summary file exists.
+
+For each completed invocation/run directory, whether baseline, quantized, or a single-model run:
+
+1. Inspect client, server/deployment, SLURM, judge, and task-specific/code-execution logs as applicable. Search for `Traceback`, `Exception`, `ERROR`, `FAILED`, `OOM`, `Killed`, `timeout`, `rate limit`, `unauthorized`, `connection refused/reset`, `health check`, `sandbox`, `container`, `judge`, `parse`, `scoring`, and task-specific failure strings.
+2. Confirm the inference server loaded the intended checkpoint/model and stayed healthy through the run: no startup failure, mid-run crash/restart, OOM, request validation failure, max-context truncation, quantization load error, or repeated 4xx/5xx responses.
+3. For judge-backed tasks, confirm judge calls succeeded and were parsed/scored correctly: no auth/rate-limit failures, malformed judge responses, invalid JSON, missing scores, or fallback/default scores.
+4. For code-execution tasks, inspect executor/sandbox/container logs for setup failures, package install failures, timeouts, thread/process exhaustion, permission errors, harness crashes, or skipped tests that would make scores non-comparable.
+5. Confirm sample accounting: the expected sample and repeat counts match the number of completed, scored samples, with no unexpected dropped/skipped/failed samples, `unknown_agent_error`, `failed_samples_policy` aborts, empty outputs, or partial result files.
+6. If reasoning traces are present, confirm they are consistently parsed, stripped, or ignored before scoring. Check for parser errors, unmatched reasoning delimiters, `finish_reason: length`, reasoning text leaked into answers, answers stripped with the reasoning, or reasoning disabled when the config intended it to be active.
+
+Report the run-validation summary before any score: log scan status, sample accounting, reasoning/answer parsing status, and any errors or warnings found. If any validation item fails, either rerun/fix it or label the result as incomplete or invalid.
+
+For score harvesting, use the `Score Extraction` Python snippet from the matching
+task reference in `recipes/tasks/<task>.md`. Do not rely on ad hoc `results.yml`
+greps when a task reference defines the canonical score and stderr fields.
+
+**Step 10: Verify baseline-vs-quantized comparability**
+
+Before treating a baseline-vs-quantized delta as a model quality result, verify the validated runs are comparable:
+
+1. Confirm the prompt text and chat template/rendered messages match between the baseline and quantized evaluations.
+2. Confirm generation settings match, including temperature, top_p, top_k, max tokens, stop strings, reasoning mode/budget, and any task-specific overrides.
+3. Confirm reasoning-trace handling is consistent between runs.
+4. Confirm the number of evaluated/scored samples matches for each task and split.
+5. Confirm the same accuracy metric/score field is used for the baseline and quantized comparison.
+
+Report the comparability summary alongside the score: prompt/template status, generation-setting status, sample-count status, reasoning-handling status, and the exact score field used. If any item differs, either rerun with matched settings or label the result as not an apples-to-apples quantization comparison.
+
 **NEL-specific diagnostics** (for debugging failures):
 
 ```bash
@@ -311,7 +401,7 @@ nel info <invocation_id> --logs
 ssh <user>@<host> "tail -100 <log_path>/server-<slurm_job_id>-*.log"   # deployment errors
 ssh <user>@<host> "tail -100 <log_path>/client-<slurm_job_id>.log"     # evaluation errors
 ssh <user>@<host> "tail -100 <log_path>/slurm-<slurm_job_id>.log"      # scheduling/walltime
-ssh <user>@<host> "grep -i 'error\|failed' <log_path>/*.log"           # search all logs
+ssh <user>@<host> "grep -i 'traceback\|exception\|error\|failed\|oom\|killed\|timeout\|unauthorized\|rate limit\|sandbox\|container\|judge\|parse\|scoring' <log_path>/*.log"  # search all logs
 ```
 
 ---
@@ -333,6 +423,11 @@ Config Generation Progress:
 - [ ] Step 5: Confirm tasks (iterative)
 - [ ] Step 6: Advanced - Multi-node (Data Parallel)
 - [ ] Step 7: Advanced - Interceptors
-- [ ] Step 7.5: Check container registry auth (SLURM only)
+- [ ] Step 7.5: Check container registry auth for private images (SLURM only)
 - [ ] Step 8: Run the evaluation
+  - [ ] Step 8.1: Dry-run / NEL CLI config validation
+  - [ ] Step 8.2: Limited-samples canary
+  - [ ] Step 8.3: Full evaluation
+- [ ] Step 9: Verify completed evaluation run
+- [ ] Step 10: Verify baseline-vs-quantized comparability
 ```
diff --git a/.claude/skills/evaluation/recipes/env.example b/.claude/skills/evaluation/recipes/env.example
index 8d9b9bfa6d9..6cb1728e58a 100644
--- a/.claude/skills/evaluation/recipes/env.example
+++ b/.claude/skills/evaluation/recipes/env.example
@@ -18,11 +18,11 @@ NEMO_EVALUATOR_TRUST_PRE_CMD=1
 
 # --- Optional: task-specific keys ---
 
-# AIME 2025 (simple_evals variant only, not ns_aime2025)
+# AIME 2025 (simple_evals variant only, not ns_aime2025), HLE, AA-LCR, and other judge-backed tasks
 # JUDGE_API_KEY=
 
-# tau2_bench_telecom (LLM judge)
-# JUDGE_API_KEY_NVDEV_QWEN235B=
+# tau2_bench_telecom user simulator endpoint
+# USER_API_KEY=
 
 # terminal-bench-hard (AWS sandbox)
 # AWS_ACCESS_KEY_ID=
diff --git a/.claude/skills/evaluation/recipes/examples/example_eval.yaml b/.claude/skills/evaluation/recipes/examples/example_eval.yaml
index 77887b3f8c3..ad9f40b9124 100644
--- a/.claude/skills/evaluation/recipes/examples/example_eval.yaml
+++ b/.claude/skills/evaluation/recipes/examples/example_eval.yaml
@@ -2,12 +2,12 @@
 #
 # A balanced set of benchmarks for validating quantized model quality.
 # Copy this file and customize for your needs.
-# Task snippets in recipes/tasks/ define per-task configs — the agent
-# composes them into a runnable config like this one.
+# Task references in recipes/tasks/ define benchmark requirements and YAML
+# fragments — the agent composes them into a runnable config like this one.
 #
 # Includes:
 #   - MMLU-Pro (knowledge, completions)
-#   - GPQA Diamond (reasoning, chat, 5 repeats)
+#   - GPQA Diamond (reasoning, chat, 32 repeats)
 #   - LiveCodeBench v6 (code, chat, 3 repeats)
 #   - IFBench (instruction following, chat, 8 repeats)
 #
@@ -25,7 +25,7 @@
 # Run a single task:
 #   nel run --config ... -t ns_gpqa
 #
-# Smoke test (2 samples):
+# Canary (2 samples): use this before a full run to validate logs and tune parallelism.
 #   nel run --config ... -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=2
 defaults:
   - execution: slurm/default
@@ -63,7 +63,7 @@ evaluation:
         api_key_name: DUMMY_API_KEY
   tasks:
   # Knowledge (chat endpoint, short)
-    - name: ns_mmlu_pro
+    - name: nemo_skills.ns_mmlu_pro
       nemo_evaluator_config:
         config:
           params:
@@ -77,14 +77,14 @@ evaluation:
                 - max_new_tokens
                 - max_completion_tokens
 
-  # Reasoning (chat endpoint, 5 repeats, short)
+  # Reasoning (chat endpoint, 32 repeats, short)
     - name: ns_gpqa
       nemo_evaluator_config:
         config:
           params:
             extra:
               args: ++prompt_config=eval/aai/mcq-4choices
-              num_repeats: 5
+              num_repeats: 32
         target:
           api_endpoint:
             adapter_config:
diff --git a/.claude/skills/evaluation/recipes/tasks/aa_lcr.md b/.claude/skills/evaluation/recipes/tasks/aa_lcr.md
new file mode 100644
index 00000000000..93d5f4db1f9
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/aa_lcr.md
@@ -0,0 +1,72 @@
+# AA-LCR
+
+## Task Details
+
+- Task: `aa_lcr`
+- Harness: AA-LCR, chat
+- Primary metric: `accuracy.accuracy`
+- Run time: Long
+- Samples: 3
+- Requires: `HF_TOKEN`, `JUDGE_API_KEY`
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/AA-LCR.html>
+
+## Params
+
+Recommended judge: use Qwen3 235B as an OpenAI-compatible equality-checker
+judge, and keep the same judge across comparable runs.
+
+AA-LCR is long-context sensitive. For 128K-context models, avoid capping
+generation tokens for this task unless the deployment needs the cap for
+stability.
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: aa_lcr
+  container: nvcr.io/nvidia/eval-factory/aa-lcr:26.03
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+    JUDGE_API_KEY: host:JUDGE_API_KEY
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          n_samples: 3
+          judge:
+            model_id: <qwen3_235b_judge_model_id>
+            url: <openai_compatible_judge_chat_completions_url>
+            api_key: JUDGE_API_KEY
+```
+
+## Score Extraction
+
+AA-LCR accuracy is stored as a 0–1 fraction (multiply by 100 for percentage points) and comes from:
+
+```text
+results.groups.aa_lcr.metrics.accuracy.scores.accuracy.value
+results.groups.aa_lcr.metrics.accuracy.scores.accuracy.stats.stderr
+```
+
+```python
+import yaml
+
+
+def extract_aa_lcr_score(path):
+    data = yaml.safe_load(open(path))
+    scores = data["results"]["groups"]["aa_lcr"]["metrics"]["accuracy"]["scores"]
+    entry = scores["accuracy"]
+    accuracy = entry["value"] * 100
+    stderr = entry.get("stats", {}).get("stderr")
+    stderr_pp = stderr * 100 if stderr is not None else None
+
+    return {
+        "group": "aa_lcr",
+        "metric": "accuracy",
+        "score_key": "accuracy",
+        "accuracy": accuracy,
+        "stderr": stderr_pp,
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.md b/.claude/skills/evaluation/recipes/tasks/aime2025.md
new file mode 100644
index 00000000000..ed11a8b05a1
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/aime2025.md
@@ -0,0 +1,93 @@
+# AIME 2025
+
+## Task Details
+
+- Task: `ns_aime2025`
+- Harness: NeMo Skills, chat
+- Primary metric: `pass@1[avg-of-16] symbolic_correct`
+- Run time: Long for reasoning models with lengthy thinking traces
+- Repeats: 16
+- Requires: None
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_aime2025
+  nemo_evaluator_config:
+    config:
+      params:
+        request_timeout: 100000
+        max_retries: 10
+        extra:
+          num_repeats: 16
+    target:
+      api_endpoint:
+        adapter_config:
+          params_to_remove:
+            - max_new_tokens
+            - max_completion_tokens
+```
+
+## Score Extraction
+
+AIME accuracy (in percentage points) comes from:
+
+```text
+results.groups.aime25.metrics."pass@1[avg-of-N]".scores.symbolic_correct.value
+```
+
+For repeated runs, report stderr as percentage points:
+
+```text
+results.groups.aime25.metrics."pass@1[avg-of-N]".scores.symbolic_correct_statistics_std_err_across_runs.value * 100
+```
+
+Prefer the `pass@1[avg-of-N]` metric matching the configured repeat count.
+If the repeat count is unknown, use the highest available `avg-of-N`.
+
+```python
+import re
+import yaml
+
+
+def avg_of(metric_name):
+    match = re.fullmatch(r"pass@1\[avg-of-(\d+)\]", metric_name)
+    return int(match.group(1)) if match else None
+
+
+def select_metric(metrics, repeats=None):
+    if repeats is not None:
+        expected = f"pass@1[avg-of-{repeats}]"
+        if expected in metrics:
+            return expected
+
+    repeated = [name for name in metrics if avg_of(name) is not None]
+    if repeated:
+        return max(repeated, key=avg_of)
+    return "pass@1"
+
+
+def extract_aime2025_score(path, repeats=None):
+    data = yaml.safe_load(open(path))
+    metrics = data["results"]["groups"]["aime25"]["metrics"]
+    metric_name = select_metric(metrics, repeats)
+    scores = metrics[metric_name]["scores"]
+
+    accuracy = scores["symbolic_correct"]["value"]
+    stderr_value = scores.get(
+        "symbolic_correct_statistics_std_err_across_runs", {}
+    ).get("value")
+    stderr = stderr_value * 100 if stderr_value is not None else None
+
+    return {
+        "group": "aime25",
+        "metric": metric_name,
+        "score_key": "symbolic_correct",
+        "accuracy": accuracy,
+        "stderr": stderr,
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.yaml b/.claude/skills/evaluation/recipes/tasks/aime2025.yaml
deleted file mode 100644
index 1cf5643f481..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/aime2025.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-# AIME 2025 (NeMo Skills, chat)
-# Primary metric: pass@1[avg-of-16] symbolic_correct
-# Run time: Long (reasoning models generate lengthy thinking traces) | Repeats: 16
-# Note: The AA variant (simple_evals.AIME_2025) requires JUDGE_API_KEY.
-#       This NeMo Skills variant uses symbolic scoring — no external API keys needed.
-  - name: ns_aime2025
-    nemo_evaluator_config:
-      config:
-        params:
-          request_timeout: 100000
-          max_retries: 10
-          extra:
-            num_repeats: 16
-      target:
-        api_endpoint:
-          adapter_config:
-            params_to_remove:
-              - max_new_tokens
-              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/gpqa.md b/.claude/skills/evaluation/recipes/tasks/gpqa.md
new file mode 100644
index 00000000000..f9393a04118
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/gpqa.md
@@ -0,0 +1,94 @@
+# GPQA Diamond
+
+## Task Details
+
+- Task: `ns_gpqa`
+- Harness: NeMo Skills, chat
+- Primary metric: `pass@1[avg-of-32] symbolic_correct`
+- Run time: Short
+- Repeats: 32
+- Requires: None
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
+
+## Params
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_gpqa
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          args: ++prompt_config=eval/aai/mcq-4choices
+          num_repeats: 32
+    target:
+      api_endpoint:
+        adapter_config:
+          params_to_remove:
+            - max_new_tokens
+            - max_completion_tokens
+```
+
+## Score Extraction
+
+GPQA accuracy (in percentage points) comes from:
+
+```text
+results.groups.gpqa.metrics."pass@1[avg-of-N]".scores.symbolic_correct.value
+```
+
+For repeated runs, report stderr as percentage points:
+
+```text
+results.groups.gpqa.metrics."pass@1[avg-of-N]".scores.symbolic_correct_statistics_std_err_across_runs.value * 100
+```
+
+Prefer the `pass@1[avg-of-N]` metric matching the configured repeat count. If the
+repeat count is unknown, use the highest available `avg-of-N`.
+
+```python
+import re
+import yaml
+
+
+def avg_of(metric_name):
+    match = re.fullmatch(r"pass@1\[avg-of-(\d+)\]", metric_name)
+    return int(match.group(1)) if match else None
+
+
+def select_metric(metrics, repeats=None):
+    if repeats is not None:
+        expected = f"pass@1[avg-of-{repeats}]"
+        if expected in metrics:
+            return expected
+
+    repeated = [name for name in metrics if avg_of(name) is not None]
+    if repeated:
+        return max(repeated, key=avg_of)
+    return "pass@1"
+
+
+def extract_gpqa_score(path, repeats=None):
+    data = yaml.safe_load(open(path))
+    metrics = data["results"]["groups"]["gpqa"]["metrics"]
+    metric_name = select_metric(metrics, repeats)
+    scores = metrics[metric_name]["scores"]
+
+    accuracy = scores["symbolic_correct"]["value"]
+    stderr_value = scores.get(
+        "symbolic_correct_statistics_std_err_across_runs", {}
+    ).get("value")
+    stderr = stderr_value * 100 if stderr_value is not None else None
+
+    return {
+        "group": "gpqa",
+        "metric": metric_name,
+        "score_key": "symbolic_correct",
+        "accuracy": accuracy,
+        "stderr": stderr,
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/gpqa.yaml b/.claude/skills/evaluation/recipes/tasks/gpqa.yaml
deleted file mode 100644
index 3692175d987..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/gpqa.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-# GPQA Diamond (NeMo Skills, chat)
-# Primary metric: pass@1[avg-of-5] symbolic_correct
-# Run time: Short | Repeats: 5
-  - name: ns_gpqa
-    nemo_evaluator_config:
-      config:
-        params:
-          extra:
-            args: ++prompt_config=eval/aai/mcq-4choices
-            num_repeats: 5
-      target:
-        api_endpoint:
-          adapter_config:
-            params_to_remove:
-              - max_new_tokens
-              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/ifbench.md b/.claude/skills/evaluation/recipes/tasks/ifbench.md
new file mode 100644
index 00000000000..35fcf3950c0
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/ifbench.md
@@ -0,0 +1,91 @@
+# IFBench
+
+## Task Details
+
+- Task: `ns_ifbench`
+- Harness: NeMo Skills, chat
+- Primary metric: `pass@1[avg-of-8] prompt_loose_accuracy`
+- Run time: Super short
+- Repeats: 8
+- Requires: None
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
+
+## Params
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_ifbench
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          num_repeats: 8
+    target:
+      api_endpoint:
+        adapter_config:
+          params_to_remove:
+            - max_new_tokens
+            - max_completion_tokens
+```
+
+## Score Extraction
+
+IFBench primary AA-aligned accuracy (in percentage points) comes from:
+
+```text
+results.groups.ifbench.metrics."pass@1[avg-of-N]".scores.prompt_loose_accuracy.value
+```
+
+`results.yml` does **not** include a direct
+`prompt_loose_accuracy_statistics_std_err_across_runs`; the closest available
+across-run stderr is `prompt_statistics_std_err_across_runs`. It is computed
+over the strict + loose prompt-level average rather than
+`prompt_loose_accuracy` alone, so report it as an approximate uncertainty.
+
+```python
+import re
+import yaml
+
+
+def avg_of(metric_name):
+    match = re.fullmatch(r"pass@1\[avg-of-(\d+)\]", metric_name)
+    return int(match.group(1)) if match else None
+
+
+def select_metric(metrics, repeats=None):
+    if repeats is not None:
+        expected = f"pass@1[avg-of-{repeats}]"
+        if expected in metrics:
+            return expected
+
+    repeated = [name for name in metrics if avg_of(name) is not None]
+    if repeated:
+        return max(repeated, key=avg_of)
+    return "pass@1"
+
+
+def extract_ifbench_score(path, repeats=None):
+    data = yaml.safe_load(open(path))
+    metrics = data["results"]["groups"]["ifbench"]["metrics"]
+    metric_name = select_metric(metrics, repeats)
+    scores = metrics[metric_name]["scores"]
+
+    accuracy = scores["prompt_loose_accuracy"]["value"]
+    proxy_stderr_value = scores.get(
+        "prompt_statistics_std_err_across_runs", {}
+    ).get("value")
+    stderr = proxy_stderr_value * 100 if proxy_stderr_value is not None else None
+
+    return {
+        "group": "ifbench",
+        "metric": metric_name,
+        "score_key": "prompt_loose_accuracy",
+        "accuracy": accuracy,
+        "stderr": stderr,
+        "stderr_source": "prompt_statistics_std_err_across_runs (proxy)",
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/ifbench.yaml b/.claude/skills/evaluation/recipes/tasks/ifbench.yaml
deleted file mode 100644
index 46cbc2db085..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/ifbench.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-# IFBench (NeMo Skills, chat)
-# Primary metric: pass@1[avg-of-8] prompt_strict_accuracy
-# Run time: Super Short | Repeats: 8
-  - name: ns_ifbench
-    nemo_evaluator_config:
-      config:
-        params:
-          extra:
-            num_repeats: 8
-      target:
-        api_endpoint:
-          adapter_config:
-            params_to_remove:
-              - max_new_tokens
-              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/livecodebench.md b/.claude/skills/evaluation/recipes/tasks/livecodebench.md
new file mode 100644
index 00000000000..f61b04ba562
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/livecodebench.md
@@ -0,0 +1,36 @@
+# LiveCodeBench v6
+
+## Task Details
+
+- Task: `ns_livecodebench`
+- Harness: NeMo Skills, chat
+- Primary metric: `pass@1[avg-of-3] accuracy`
+- Run time: Medium
+- Repeats: 3
+- Requires: None
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
+
+## Params
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_livecodebench
+  nemo_evaluator_config:
+    config:
+      params:
+        max_retries: 10
+        extra:
+          dataset_split: test_v6_2408_2505
+          num_repeats: 3
+    target:
+      api_endpoint:
+        adapter_config:
+          params_to_remove:
+            - max_new_tokens
+            - max_completion_tokens
+```
+
+## Score Extraction
diff --git a/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml b/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml
deleted file mode 100644
index 202387a1eb6..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-# LiveCodeBench v6 (NeMo Skills, chat)
-# Primary metric: pass@1[avg-of-3] accuracy
-# Run time: Medium | Repeats: 3
-  - name: ns_livecodebench
-    nemo_evaluator_config:
-      config:
-        params:
-          max_retries: 10
-          extra:
-            dataset_split: test_v6_2408_2505
-            num_repeats: 3
-      target:
-        api_endpoint:
-          adapter_config:
-            params_to_remove:
-              - max_new_tokens
-              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
new file mode 100644
index 00000000000..4579e824889
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
@@ -0,0 +1,64 @@
+# MMLU-Pro
+
+## Task Details
+
+- Task: `nemo_skills.ns_mmlu_pro`
+- Harness: NeMo Skills, chat
+- Primary metric: `pass@1 symbolic_correct`
+- Run time: Short
+- Repeats: 1
+- Requires: None
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
+
+## Params
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: nemo_skills.ns_mmlu_pro
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          num_repeats: 1
+          args: ++prompt_config=eval/aai/mcq-10choices-boxed ++inference.tokens_to_generate=null
+    target:
+      api_endpoint:
+        adapter_config:
+          params_to_remove:
+            - max_new_tokens
+            - max_completion_tokens
+```
+
+## Score Extraction
+
+```text
+results.groups.mmlu_pro.metrics.pass@1.scores.symbolic_correct.value
+```
+
+`num_repeats: 1` is the standard setting, so `results.yml` does not include
+an across-run stderr. The score is computed over a single pass of the
+dataset (`stats.count` equals `num_problems`).
+
+```python
+import yaml
+
+
+def extract_mmlu_pro_score(path):
+    data = yaml.safe_load(open(path))
+    scores = data["results"]["groups"]["mmlu_pro"]["metrics"]["pass@1"]["scores"]
+    entry = scores["symbolic_correct"]
+    accuracy = entry["value"]
+    n = entry.get("stats", {}).get("count")
+
+    return {
+        "group": "mmlu_pro",
+        "metric": "pass@1",
+        "score_key": "symbolic_correct",
+        "accuracy": accuracy,
+        "stderr": None,
+        "n": n,
+    }
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml
deleted file mode 100644
index be16a546a39..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-# MMLU-Pro (NeMo Skills, chat)
-# Primary metric: symbolic_correct
-# Run time: Short | Repeats: 1
-  - name: ns_mmlu_pro
-    nemo_evaluator_config:
-      config:
-        params:
-          extra:
-            num_repeats: 1
-            args: ++prompt_config=eval/aai/mcq-10choices-boxed ++inference.tokens_to_generate=null
-      target:
-        api_endpoint:
-          adapter_config:
-            params_to_remove:
-              - max_new_tokens
-              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md b/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md
new file mode 100644
index 00000000000..f3490e2b046
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md
@@ -0,0 +1,63 @@
+# MMMU-Pro
+
+## Task Details
+
+- Task: `ns_mmmu_pro`
+- Harness: NeMo Skills, multimodal chat
+- Primary metric: `pass@1 symbolic_correct`
+- Run time: Medium
+- Repeats: 1
+- Requires: `HF_TOKEN`
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
+
+## Params
+
+MMMU-Pro is a multimodal task. Use a multimodal-capable endpoint.
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_mmmu_pro
+  container: nvcr.io/nvidia/eval-factory/nemo-skills:26.03
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          num_repeats: 1
+```
+
+## Score Extraction
+
+MMMU-Pro accuracy (already in percentage points) comes from:
+
+```text
+results.groups."mmmu-pro".metrics.pass@1.scores.symbolic_correct.value
+```
+
+`num_repeats: 1` is the standard setting, so `results.yml` does not include
+an across-run stderr. The score is computed over a single pass of the
+dataset (`stats.count` equals `num_problems`).
+
+```python
+import yaml
+
+
+def extract_mmmu_pro_score(path):
+    data = yaml.safe_load(open(path))
+    scores = data["results"]["groups"]["mmmu-pro"]["metrics"]["pass@1"]["scores"]
+    entry = scores["symbolic_correct"]
+    accuracy = entry["value"]
+    n = entry.get("stats", {}).get("count")
+
+    return {
+        "group": "mmmu-pro",
+        "metric": "pass@1",
+        "score_key": "symbolic_correct",
+        "accuracy": accuracy,
+        "stderr": None,
+        "n": n,
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md b/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md
new file mode 100644
index 00000000000..4c952bc3b0a
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md
@@ -0,0 +1,67 @@
+# HLE AA
+
+## Task Details
+
+- Task: `ns_hle_aa`
+- Harness: NeMo Skills, chat
+- Primary metric: `pass@1 judge_correct`
+- Run time: Long
+- Repeats: 1
+- Requires: `HF_TOKEN`, `JUDGE_API_KEY`
+- Reference: <https://docs.nvidia.com/nemo/evaluator/nightly/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
+
+## Params
+
+This is the text-only HLE task with params aligned to Artificial Analysis Index
+v2. HLE is judge-scored and requires judge credentials.
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_hle_aa
+  container: nvcr.io/nvidia/eval-factory/nemo-skills:26.03
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+    JUDGE_API_KEY: host:JUDGE_API_KEY
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          judge:
+            model_id: <hle_aa_judge_model_id>
+            url: <openai_compatible_judge_chat_completions_url>
+            api_key: JUDGE_API_KEY
+```
+
+## Score Extraction
+
+HLE AA accuracy comes from:
+
+```text
+results.groups.hle.metrics.pass@1.scores.judge_correct.value
+```
+
+```python
+import yaml
+
+
+def extract_ns_hle_aa_score(path):
+    data = yaml.safe_load(open(path))
+    scores = data["results"]["groups"]["hle"]["metrics"]["pass@1"]["scores"]
+    accuracy = scores["judge_correct"]["value"]
+    symbolic = scores.get("symbolic_correct", {}).get("value")
+    n = scores["judge_correct"].get("stats", {}).get("count")
+
+    return {
+        "group": "hle",
+        "metric": "pass@1",
+        "score_key": "judge_correct",
+        "accuracy": accuracy,
+        "symbolic_correct": symbolic,
+        "stderr": None,
+        "n": n,
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/scicode.md b/.claude/skills/evaluation/recipes/tasks/scicode.md
new file mode 100644
index 00000000000..46e21074ba3
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/scicode.md
@@ -0,0 +1,145 @@
+# SciCode
+
+## Task Details
+
+- Task: `ns_scicode`
+- Harness: NeMo Skills, chat
+- Primary metric: `pass@1[avg-of-3] subtask_accuracy`
+- Run time: Long
+- Repeats: 3
+- Requires: None
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
+
+## Params
+
+SciCode is a NeMo Skills code/reasoning benchmark with multi-step prompts and a
+code-execution sandbox. Check this reference before creating or modifying NEL
+configs for SciCode; the benchmark has deployment, parallelism, and score
+harvesting requirements beyond the task YAML fragment.
+
+## Config Requirements
+
+- Use `--max-model-len 65536` for the deployment. Do not leave the generic
+  `32768` fallback in place; SciCode multi-step prompts can exceed 32K tokens.
+- Keep `parallelism: 4` unless a canary proves a different value is safe. Higher
+  parallelism can flood the code-execution sandbox and produce resource/thread
+  failures even when the SLURM job completes.
+- Generate enough answer tokens for multi-step solutions:
+  `++inference.tokens_to_generate=32768`.
+- For reasoning-capable endpoints that support OpenAI-style effort controls, set
+  `reasoning_effort: high` through `params_to_add`, not prompt text.
+- Use repeats when runtime permits so the result file contains uncertainty
+  estimates. The intended full-run plan is `num_repeats: 3`; if using a variant
+  that expects `n_repeats`, keep it aligned at `3`. Lower repeat counts are fine
+  for canaries, but do not report stderr from a run that did not produce repeat
+  statistics.
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_scicode
+  nemo_evaluator_config:
+    config:
+      params:
+        max_retries: 10
+        parallelism: 4
+        extra:
+          args: ++inference.tokens_to_generate=32768
+          num_repeats: 3
+    target:
+      api_endpoint:
+        adapter_config:
+          params_to_remove:
+            - max_new_tokens
+            - max_completion_tokens
+          params_to_add:
+            reasoning_effort: high
+```
+
+Also make sure the deployment-level args include `--max-model-len 65536`,
+preserving any other required model-card or quantization args:
+
+```yaml
+deployment:
+  extra_args: --max-model-len 65536
+```
+
+## Score Extraction
+
+SciCode accuracy comes from:
+
+```text
+results.groups.scicode.metrics."pass@1[avg-of-N]".scores.subtask_accuracy.value
+```
+
+For repeated runs, report stderr as:
+
+```text
+subtask_accuracy_statistics_std_err_across_runs.value * 100 * num_problems / num_subtasks
+```
+
+The helper below also supports GPQA's matching layout, where accuracy comes from
+`symbolic_correct.value` and stderr is
+`symbolic_correct_statistics_std_err_across_runs.value * 100`.
+
+```python
+import re
+import yaml
+
+
+TASKS = {  # per-group score key and stderr scaling rule read by extract_score
+    "scicode": {
+        "score_key": "subtask_accuracy",
+        "stderr_scale": "subtasks",  # stderr * 100 * num_problems / num_subtasks
+    },
+    "gpqa": {
+        "score_key": "symbolic_correct",
+        "stderr_scale": "percent",  # stderr * 100
+    },
+}
+
+
+def avg_of(metric_name):
+    match = re.fullmatch(r"pass@1\[avg-of-(\d+)\]", metric_name)
+    return int(match.group(1)) if match else None
+
+
+def select_pass1_metric(metrics):
+    repeated = [name for name in metrics if avg_of(name) is not None]
+    if repeated:
+        return max(repeated, key=avg_of)
+    return "pass@1"
+
+
+def extract_score(path, group="scicode"):
+    spec = TASKS[group]
+    data = yaml.safe_load(open(path))
+    metrics = data["results"]["groups"][group]["metrics"]
+    metric_name = select_pass1_metric(metrics)
+    scores = metrics[metric_name]["scores"]
+
+    score_key = spec["score_key"]
+    accuracy = scores[score_key]["value"]
+
+    stderr_key = f"{score_key}_statistics_std_err_across_runs"
+    stderr_value = scores.get(stderr_key, {}).get("value")
+    stderr = None
+    if stderr_value is not None:
+        if spec["stderr_scale"] == "subtasks":
+            num_problems = scores["num_problems"]["value"]
+            num_subtasks = scores["num_subtasks"]["value"]
+            stderr = stderr_value * 100 * num_problems / num_subtasks
+        else:
+            stderr = stderr_value * 100
+
+    return {
+        "group": group,
+        "metric": metric_name,
+        "score_key": score_key,
+        "accuracy": accuracy,
+        "stderr": stderr,
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/scicode.yaml b/.claude/skills/evaluation/recipes/tasks/scicode.yaml
deleted file mode 100644
index 724b6935759..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/scicode.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-# SciCode (NeMo Skills, chat)
-# Primary metric: pass@1[avg-of-3] subtask_accuracy
-# Run time: Long | Repeats: 3
-  - name: ns_scicode
-    nemo_evaluator_config:
-      config:
-        params:
-          max_retries: 10
-          extra:
-            num_repeats: 3
-      target:
-        api_endpoint:
-          adapter_config:
-            params_to_remove:
-              - max_new_tokens
-              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/tau2_bench_telecom.md b/.claude/skills/evaluation/recipes/tasks/tau2_bench_telecom.md
new file mode 100644
index 00000000000..ea96cbbf17c
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/tau2_bench_telecom.md
@@ -0,0 +1,38 @@
+# Tau2 Bench Telecom
+
+## Task Details
+
+- Task: `tau2_bench_telecom`
+- Harness: tau2_bench, chat
+- Primary metric: `pass_1`
+- Run time: Long
+- Samples: 3
+- Requires: `USER_API_KEY`
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/tau2_bench.html#tau2-bench-tau2-bench-telecom>
+
+## Params
+
+Tau2 Bench uses the evaluated model as the agent and a separate LLM endpoint as
+the user simulator. Configure the user simulator explicitly and keep it fixed
+across comparable runs.
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: tau2_bench_telecom
+  container: nvcr.io/nvidia/eval-factory/tau2-bench:26.03
+  env_vars:
+    USER_API_KEY: host:USER_API_KEY
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          user:
+            model_id: <user_simulator_model_id>
+            url: <openai_compatible_user_simulator_chat_completions_url>
+            api_key: USER_API_KEY
+```
+
+## Score Extraction
diff --git a/.claude/skills/evaluation/tests/evals.json b/.claude/skills/evaluation/tests/evals.json
index 0f35dacd7a8..823d5bc6790 100644
--- a/.claude/skills/evaluation/tests/evals.json
+++ b/.claude/skills/evaluation/tests/evals.json
@@ -7,7 +7,7 @@
     "expected_behavior": [
       "Verifies nel is installed by running 'nel --version'",
       "Asks all 5 base config questions (execution, deployment, auto-export, model type, benchmarks) before generating the config",
-      "Runs 'nel skills build-config' with correct flags matching user answers: --execution slurm --deployment vllm --model-type reasoning --benchmarks standard code math_reasoning --export mlflow",
+      "Runs 'nel skills build-config' with correct flags matching user answers, using NEL's current model_type terminology for the chat/reasoning template family",
       "Searches the web for the model card on HuggingFace and extracts model-specific settings",
       "Sets correct HF handle: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
       "Sets reasoning sampling params from model card: temperature=1.0, top_p=1.0",
@@ -22,7 +22,11 @@
       "Presents task list and waits for user confirmation before proceeding",
       "Configures request and response logging interceptors under evaluation.nemo_evaluator_config.config.target.api_endpoint.adapter_config using correct field names (max_logged_requests/max_logged_responses, not max_saved_*)",
       "Handles dry-run failure for missing HF_TOKEN_FOR_GPQA_DIAMOND by offering to fix the config",
-      "Successfully submits test run with limit_samples=10 after dry-run passes",
+      "Successfully submits a limited-samples canary with limit_samples=10 after dry-run passes",
+      "Inspects canary logs for judge, evaluation container, deployment, and code-execution errors before allowing the full run",
+      "Tunes parallelism from canary results and avoids over-raising it for judge-backed or code-execution tasks",
+      "Before reporting final scores, inspects full-run logs for judge, inference server, code-execution, reasoning parsing, dropped-sample, and scoring errors",
+      "Keeps final-run validity checks separate from baseline-vs-quantized comparability checks",
       "Provides monitoring commands (nel status, nel info --logs) and inspects server logs via SSH when asked"
     ]
   },
@@ -36,13 +40,16 @@
       "Asks all 5 base config questions (execution, deployment, auto-export, model type, benchmarks)",
       "Runs nel skills build-config with correct flags matching user answers",
       "Sets deployment.checkpoint_path to ./llama-3.1-8b-fp8 and deployment.hf_model_handle to null",
-      "Auto-detects quantization format by reading ./llama-3.1-8b-fp8/hf_quant_config.json",
-      "Finds quant_algo=FP8 and adds --quantization modelopt to deployment.extra_args",
+      "Auto-detects quantization format by reading ./llama-3.1-8b-fp8/config.json first, then ./llama-3.1-8b-fp8/hf_quant_config.json if needed",
+      "Finds quant_algo=FP8 and does not add a vLLM --quantization flag unless the vLLM version, model card, or dry-run error requires one",
       "Recommends accuracy-sensitive benchmarks from references/quantization-benchmarks.md",
       "Searches web for Llama-3.1-8B model card and extracts sampling params, context length, TP settings",
       "Fills in remaining missing values by asking user",
-      "Runs dry-run, then test with limit_samples=10, then full evaluation",
-      "Reports accuracy results per benchmark"
+      "Runs dry-run, then a limited-samples canary with limit_samples=10, then full evaluation only after canary log checks pass",
+      "Tunes parallelism from the canary and avoids over-raising it when benchmarks depend on inference judges or code execution containers",
+      "Before reporting accuracy, scans final run logs and sample accounting for runtime errors, dropped/skipped samples, reasoning parsing problems, and wrong score-field usage",
+      "Runs comparability verification as a separate step after final-run validation when comparing baseline and quantized results",
+      "Reports accuracy results per benchmark only after log validation passes"
     ]
   },
   {
@@ -54,11 +61,14 @@
       "Verifies nel is installed by running nel --version",
       "Asks 5 base config questions with execution=slurm pre-selected based on user request",
       "Runs nel skills build-config with --execution slurm --deployment vllm --benchmarks standard",
-      "Detects FP8 quantization from hf_quant_config.json and sets deployment.extra_args with --quantization modelopt",
+      "Detects FP8 quantization from config.json or hf_quant_config.json and does not add a vLLM --quantization flag unless the vLLM version, model card, or dry-run error requires one",
       "Reads references/quantization-benchmarks.md and recommends accuracy-sensitive benchmarks",
       "Uses WebSearch to research model card for sampling params and context length",
       "Fills in SLURM-specific values: hostname, account, partition from user input",
-      "Runs dry-run validation before full evaluation",
+      "Runs dry-run validation and a limited-samples canary before full evaluation",
+      "Checks canary logs for judge, container, deployment, and code-execution failures before treating the run as ready",
+      "Checks final run logs and sample accounting before accepting result files as complete and comparable",
+      "Separately verifies baseline-vs-quantized comparability before presenting an accuracy delta",
       "Provides SSH-based log monitoring commands for SLURM execution"
     ]
   }
diff --git a/.claude/skills/launching-evals/references/analyze-results.md b/.claude/skills/launching-evals/references/analyze-results.md
index fd49d400468..48e0a87bb59 100644
--- a/.claude/skills/launching-evals/references/analyze-results.md
+++ b/.claude/skills/launching-evals/references/analyze-results.md
@@ -50,8 +50,8 @@ Check logs for silent errors that may invalidate results:
 5. **Max model length**: Verify `max-model-len` = 131072 (leaderboard-recommended). Long context benchmarks (AA LCR, RULER) and agentic benchmarks may require a longer `max-model-len`.
 6. **RULER tasks**: Check thinking disabled, walltime=4h, rope-scaling for Qwen models
 7. **AA baseline comparison**: Compare results against Artificial Analysis published scores. Exact match not expected — flag significant deviations.
+8. **Model baseline comparison**: For quantized runs, compare results against the matching baseline model run when available. The baseline may be unquantized or simply less quantized (for example, FP8 as the baseline for NVFP4). Use the same benchmark version, task config, serving args, token limits, dataset setup, and infrastructure before treating the delta as a quantization effect.
 
 ## Step 4: Report findings
 
 Present key metrics from `results.yml` in a table and summarize the metrics from `eval_factory_metrics.json` in a concise manner (include only the most important metrics or anomalies). If multiple runs, include side-by-side comparison of metrics (e.g. accuracy, latency, tokens count, memory). Summarize any issues found. Recommend improvements if applicable.
-
diff --git a/.claude/skills/monitor/SKILL.md b/.claude/skills/monitor/SKILL.md
index cd896c347e9..14ce4e14c32 100644
--- a/.claude/skills/monitor/SKILL.md
+++ b/.claude/skills/monitor/SKILL.md
@@ -10,13 +10,31 @@ Monitor jobs submitted to SLURM clusters — PTQ quantization, NEL evaluation, m
 ## When to use
 
 1. **Auto-monitor** — another skill (PTQ, evaluation, deployment) just submitted a job. Register the job and set up monitoring immediately.
-2. **User-initiated** — user asks about a job status, possibly in a new conversation. Check the registry, identify the job, and report.
+2. **User-initiated** — user asks about a job status. Check the current session registry first; if the job is not registered there, use the discovery steps below.
 
 ---
 
 ## Job Registry
 
-All active jobs are tracked in `.claude/active_jobs.json`. This file is the single source of truth for what's being monitored.
+Active jobs are tracked in per-session registries under `.claude/agents/`.
+This avoids multiple agents clobbering one shared registry when they run at
+the same time.
+
+Use the current agent session id as `<session_id>`:
+
+- Claude Code: `$CLAUDE_CODE_SESSION_ID`, or the `session_id` field from hook input
+- Codex: `$CODEX_THREAD_ID`
+- If no session id is available, create a stable id for the current terminal session and reuse it for every job registered by that agent
+
+Registry layout:
+
+```text
+.claude/agents/
+  <session_id>/
+    active_jobs.json
+```
+
+Each session's `active_jobs.json` is a JSON array:
 
 ```json
 [
@@ -27,7 +45,11 @@ All active jobs are tracked in `.claude/active_jobs.json`. This file is the sing
     "user": "<ssh_user>",
     "submitted": "YYYY-MM-DD HH:MM",
     "description": "<what this job does>",
-    "last_status": "<last known status>"
+    "last_status": "<last known status>",
+    "owner": {
+      "agent": "claude-code|codex|manual",
+      "session_id": "<session_id>"
+    }
   }
 ]
 ```
@@ -40,45 +62,92 @@ All active jobs are tracked in `.claude/active_jobs.json`. This file is the sing
 
 Every time a job is submitted (by any skill or manually):
 
-1. **Add an entry** to `.claude/active_jobs.json`. Create the file if it doesn't exist.
-2. **Set up a durable recurring cron** (if one isn't already running) that polls all registered jobs every 15 minutes. The cron prompt should: read the registry, check each job, report state changes to the user, remove completed jobs, and delete itself when the registry is empty.
+1. **Add an entry** to `.claude/agents/<session_id>/active_jobs.json`. Create the session directory and file if they don't exist.
+2. **Start a durable monitor** (if one isn't already watching the registry) that polls this session's registered jobs until they reach terminal status. Prefer the Claude Code `Monitor` tool when it is available: write a small watcher that reads `.claude/agents/<session_id>/active_jobs.json`, checks every job with the appropriate method below, prints state-change events, updates `last_status`, removes terminal jobs from the session registry, and exits when no active jobs remain for this session.
+
+The monitor should terminate naturally when every registered job has reached a terminal state. If the `Monitor` tool is not available in the current harness, run an equivalent background process that implements the same loop and lets the agent resume/restart when the process exits.
 
 Always do both steps. Don't try to predict job duration.
 
 ---
 
-## On Cron Fire / Status Check
+## On Monitor Event / Status Check
 
-Whether triggered by the cron or by the user asking "check status":
+Whether triggered by monitor output or by the user asking "check status":
 
-1. **Read the registry** from `.claude/active_jobs.json`
+1. **Read the registry** from `.claude/agents/<session_id>/active_jobs.json`
 2. **Check each job** using the appropriate method (see below)
 3. **Report only state changes** — compare against `last_status` in registry
-4. **Update `last_status`** in the registry
-5. **Remove completed jobs** — any job in a terminal state (COMPLETED, FAILED, CANCELLED, KILLED)
-6. **If registry is empty** — delete the recurring cron
+4. **Update `last_status`** in the session registry
+5. **Remove completed jobs** — any job in a terminal state (COMPLETED, FAILED, CANCELLED, KILLED, TIMEOUT, NODE_FAIL, OUT_OF_MEMORY, PREEMPTED, BOOT_FAIL, DEADLINE)
+6. **If no active jobs remain** — let the monitor exit
 
 ---
 
 ## How to Check Each Job Type
 
+Each check method has its **own** status vocabulary. A watcher that mixes them
+(e.g. uses SLURM's `COMPLETED` terminal-state regex against `nel status` output)
+will silently never fire terminal transitions. Always match against the
+vocabulary of the source you're polling.
+
 ### NEL jobs (`type: nel`)
 
-- **Check:** `nel status <id>`
-- **On completion:** `nel info <id>` to fetch results
-- **On failure:** `nel info <id> --logs` then inspect server/client/SLURM logs via SSH
+- **Check:** `nel status <id>`.
+
+```bash
+extract_nel_state() {  # usage: extract_nel_state <job_id>; prints the first recognized state keyword, uppercased
+  local jid="$1" nel_bin="${NEL:-nel}" output state_col  # $NEL overrides the nel binary
+  output=$("$nel_bin" status "$jid" 2>&1)
+  state_col=$(echo "$output" \
+    | awk -F'|' -v prefix="$jid." 'index($1, prefix) == 1 { print $2; exit }')
+  [ -z "$state_col" ] && state_col="$output"  # no row starts with "<job_id>.": scan the whole output instead
+  echo "$state_col" \
+    | LC_ALL=C tr '[:lower:]' '[:upper:]' \
+    | awk 'match($0, /(PENDING|RUNNING|SUCCESS|FAILED|KILLED|ERROR|NOT[[:space:]]+FOUND)/) { print substr($0, RSTART, RLENGTH); exit }' \
+    | sed 's/[[:space:]][[:space:]]*/ /g'
+}
+
+is_nel_terminal() {  # usage: is_nel_terminal <job_id>; exit status 0 when the NEL state is terminal
+  case "$(extract_nel_state "$1")" in
+    SUCCESS|FAILED|KILLED|ERROR|"NOT FOUND") return 0 ;;
+    *) return 1 ;;  # PENDING, RUNNING, or no recognizable state
+  esac
+}
+```
+
+- **On completion:** `nel info <id>` to fetch results.
+- **On failure:** `nel info <id> --logs` then inspect server/client/SLURM logs via SSH.
 
 ### Launcher jobs (`type: launcher`)
 
-- **Check:** Tail the launcher's background output file for key events
-- **Key events:** experiment ID, SLURM job ID, container import, calibration progress, export path, final status
-- **On failure:** Look for `Traceback`, `Error`, or `FAILED` in the output
+- **Check:** Tail the launcher's background output file for key events.
+- **Key events:** experiment ID, SLURM job ID, container import, calibration progress, export path, final status.
+- **On failure:** Look for `Traceback`, `Error`, or `FAILED` in the output.
 
 ### Raw SLURM jobs (`type: slurm`)
 
-- **Check:** `ssh <host> "squeue -j <id> -h -o '%T %M %R'"` — if empty, job left the queue
-- **On completion:** `ssh <host> "sacct -j <id> --format=State,ExitCode,Elapsed -n"`
-- **On failure:** Check the job's output log file
+- **Check:** poll `sacct`, not `squeue`, for termination — `squeue` can keep
+  showing `COMPLETING` after `sacct` already reports a terminal state.
+
+```bash
+extract_slurm_state() {  # usage: extract_slurm_state <job_id> <host>; prints the first State line from sacct
+  local jid="$1" host="$2"
+  ssh "$host" "sacct -j $jid -X --format=State --noheader -P 2>/dev/null | head -1" \
+    | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' \
+    | sed 's/^CANCELLED by .*/CANCELLED/'
+}
+
+is_slurm_terminal() {  # usage: is_slurm_terminal <job_id> <host>; exit status 0 when the sacct state is terminal
+  case "$(extract_slurm_state "$1" "$2")" in
+    COMPLETED|FAILED|CANCELLED|TIMEOUT|NODE_FAIL|OUT_OF_MEMORY|PREEMPTED|BOOT_FAIL|DEADLINE) return 0 ;;
+    *) return 1 ;;  # any other state, including empty sacct output
+  esac
+}
+```
+
+- **On completion:** `ssh <host> "sacct -j <id> --format=State,ExitCode,Elapsed -n"`.
+- **On failure:** Check the job's output log file.
 
 ---
 
@@ -86,7 +155,7 @@ Whether triggered by the cron or by the user asking "check status":
 
 When the user asks about a job without specifying an ID, check in order:
 
-1. `.claude/active_jobs.json` — most reliable, has context
+1. `.claude/agents/<current_session_id>/active_jobs.json` — current agent's jobs
 2. `nel ls runs --since 1d` — recent NEL runs
 3. `ssh <host> "squeue -u <user>"` — active SLURM jobs
 4. `ls -lt tools/launcher/experiments/cicd/ | head -10` — recent launcher experiments
diff --git a/.claude/skills/ptq/SKILL.md b/.claude/skills/ptq/SKILL.md
index e9e73254bd9..753917882e8 100644
--- a/.claude/skills/ptq/SKILL.md
+++ b/.claude/skills/ptq/SKILL.md
@@ -59,6 +59,10 @@ If a model-specific recipe exists, use `--recipe <path>` — it may contain tune
 
 Use `--qformat <name>` (e.g., `--qformat nvfp4`). Format definitions: `modelopt/torch/quantization/config.py`. General PTQ recipes in `modelopt_recipes/general/ptq/` correspond to the same formats — `--qformat` is the simpler way to use them.
 
+Before running PTQ, sanity-check the selected qformat/recipe against the model structure. Inspect the recipe's include/exclude patterns and summarize which layer groups will be quantized and approximately how many modules/layers match (attention projections, MLP projections, experts, etc.). If the match count is 0, or far smaller than expected for the model, stop and fix the recipe or ask the user before launching calibration.
+
+If the source checkpoint is already quantized and the requested recipe/config reduces quantization coverage, confirm that intent with the user before running. For example, if an FP8 checkpoint is used as input and the recipe excludes some layers so they would fall back to BF16 instead of staying quantized, call out the affected layer groups and ask whether that FP8-to-BF16 fallback is intended.
+
 > NVFP4 can be calibrated on Hopper but requires Blackwell for inference.
 
 ## Step 4 — Run PTQ
@@ -131,7 +135,15 @@ Report the path and size to the user.
 
 ### Post-quantization validation
 
-Validate the exported checkpoint's quantization pattern matches the recipe. Quantization config patterns can silently miss layers if the model uses non-standard naming (e.g., Gemma4 `experts.*` missed by `*mlp*` patterns) — this only surfaces later as deployment failures. Read `references/checkpoint-validation.md` for the validation script, expected patterns per recipe, and common pattern gaps.
+This is a required gate before any deployment or evaluation submission. Do not submit an eval, start a serving job, or hand off the checkpoint as ready until the gate has passed.
+
+Read `references/checkpoint-validation.md` and perform all three validation groups on the exact checkpoint path that will be deployed/evaluated:
+
+1. Check output size and estimated bits per weight against the baseline/source checkpoint.
+2. Check quantized-weight coverage against the requested qformat/recipe/config.
+3. Check metadata consistency against the baseline/source model.
+
+Report the gate result before moving on. The report must include source size, output size, output/source size ratio, layer precision counts (for example NVFP4, FP8, INT4, BF16/unquantized excluded, unexpected unquantized, declaration mismatches), and metadata diffs. If the output/source ratio is >= 1.0 for a compression recipe, if any intended layer group is missing quantization, or if metadata changed unexpectedly, stop and fix the checkpoint or ask the user before proceeding.
 
 **Next steps**: If the user wants to deploy or evaluate the quantized checkpoint, use the **deployment** or **evaluation** skill. The checkpoint workspace carries over. If the model required patches during PTQ (e.g., transformers upgrade), the same fixes will likely be needed at deployment and evaluation time.
 
@@ -160,7 +172,7 @@ Validate the exported checkpoint's quantization pattern matches the recipe. Quan
 | `references/launcher-guide.md` | Step 4B only (launcher path) |
 | `tools/launcher/CLAUDE.md` | Step 4B only, if you need more launcher detail |
 | `references/unsupported-models.md` | Step 4C only (unlisted model) |
-| `references/checkpoint-validation.md` | Step 5: validate quantization pattern matches recipe |
+| `references/checkpoint-validation.md` | Step 5: mandatory post-PTQ gate before deployment/evaluation |
 | `skills/common/remote-execution.md` | Step 4A/4C only, if target is remote |
 | `skills/common/slurm-setup.md` | Step 4A/4C only, if using SLURM manually (not launcher) |
 | `references/slurm-setup-ptq.md` | Step 4A/4C only, PTQ-specific SLURM (container, GPU sizing, FSDP2) |
diff --git a/.claude/skills/ptq/references/checkpoint-validation.md b/.claude/skills/ptq/references/checkpoint-validation.md
index 68d1ddd075c..f62ecbe0cc2 100644
--- a/.claude/skills/ptq/references/checkpoint-validation.md
+++ b/.claude/skills/ptq/references/checkpoint-validation.md
@@ -1,6 +1,29 @@
 # Post-Quantization Checkpoint Validation
 
-Verify the exported checkpoint's quantization pattern matches the recipe used. Quantization config patterns may silently miss layers if the model uses non-standard naming — this only surfaces later as deployment failures when the serving framework tries to load unquantized weights as quantized.
+Before treating an exported checkpoint as ready for deployment/evaluation, verify checkpoint size/bits, quantized-weight coverage, and metadata consistency. This is a gate, not a guideline: do not submit evals, start serving jobs, or mark the checkpoint ready until all required checks pass and the validation report is recorded.
+
+## Required checks
+
+1. The quantized checkpoint is smaller on disk than the baseline/source checkpoint and has lower estimated bits per weight. Record source size, output size, and output/source ratio. A partial-quantization recipe may not shrink every tensor, but it should still match the intended quantization coverage. If the size reduction is small or missing, explain why before proceeding.
+2. The weights that were actually quantized match what the requested qformat/recipe/config targeted. Record layer precision counts grouped by actual/declarative precision, such as NVFP4, FP8, INT4, BF16/unquantized excluded, unexpected unquantized, and declaration mismatches. Quantization config patterns may silently miss layers if the model uses non-standard naming — this only surfaces later as deployment failures when the serving framework tries to load unquantized weights as quantized.
+3. Metadata that should not change still matches the baseline/source model. Compare generation settings, tokenizer files, chat template, model architecture fields, max positions/context length, and special tokens; quantization should affect weights and quantization metadata, not silently change prompting or generation behavior. Record every diff and classify it as expected or blocking.
+
+## Gate report
+
+Before moving to deployment/evaluation, report a table in this shape:
+
+| Check | Result |
+| --- | --- |
+| Size vs source | `<output> GB / <source> GB = <ratio>x`; PASS only if the ratio matches the recipe's compression intent |
+| Layer precision counts | `<count> NVFP4 / <count> FP8 / <count> INT4 / <count> BF16-or-excluded / <count> unexpected / <count> declaration mismatches` |
+| Metadata | `no unexpected diffs` or list exact diffs |
+
+Stop instead of proceeding if:
+
+- Output/source ratio is `>= 1.0` for a compression recipe, unless the user explicitly accepts the explanation.
+- Any layer group intended to be quantized has zero or unexpectedly low coverage.
+- Any layer has quantization metadata inconsistent with its declared precision.
+- Prompting, tokenizer, generation, architecture, context-length, or special-token metadata changed unexpectedly.
 
 ## Expected quantization patterns by recipe
 
@@ -13,18 +36,49 @@ Verify the exported checkpoint's quantization pattern matches the recipe used. Q
 | `fp8` | All linear layers | lm_head, norms, embeddings |
 | `int4_awq` | All linear layers | lm_head, norms, embeddings |
 
-## Validation script
+## Size check
+
+Compare only checkpoint weight files, not cache directories or eval artifacts:
+
+```bash
+python3 -c "
+from pathlib import Path
+
+source = Path('<source_checkpoint_path>')
+output = Path('<output_path>')
+
+def safetensor_bytes(path):
+    files = list(path.glob('*.safetensors')) if path.is_dir() else [path]
+    return sum(p.stat().st_size for p in files)
+
+src = safetensor_bytes(source)
+dst = safetensor_bytes(output)
+ratio = dst / src if src else float('nan')
+print(f'Source safetensors: {src / 1e9:.2f} GB')
+print(f'Output safetensors: {dst / 1e9:.2f} GB')
+print(f'Output/source ratio: {ratio:.2f}x')
+"
+```
+
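+Treat the ratio as a first-order proxy for bits per weight unless you separately load tensors and compute exact parameter bit counts. The ratio is relative to the source checkpoint's own precision, so judge it against the expected change from the source format to the target format. For compression recipes, a ratio at or above `1.0x` is blocking unless the user explicitly accepts the explanation.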
+
+## Layer coverage and precision script
 
-Run against the exported checkpoint to check every linear layer is either quantized (has scale params) or explicitly excluded:
+Run this against the exported checkpoint to check that every linear layer is either quantized at its declared precision or explicitly excluded. It handles both uniform `quant_algo` exports and mixed-precision `quantized_layers` exports:
 
 ```bash
 python3 -c "
-import json, fnmatch
+import collections, fnmatch, json, os
 
 output = '<output_path>'
-idx = json.load(open(f'{output}/model.safetensors.index.json'))
-cfg = json.load(open(f'{output}/hf_quant_config.json'))
-excludes = cfg['quantization']['exclude_modules']
+idx = json.load(open(os.path.join(output, 'model.safetensors.index.json')))
+cfg = json.load(open(os.path.join(output, 'hf_quant_config.json')))
+q = cfg.get('quantization', {})
+excludes = q.get('exclude_modules', []) or q.get('ignore', [])
+declared_layers = q.get('quantized_layers') or {}
+uniform_algo = q.get('quant_algo')
+if uniform_algo == 'MIXED_PRECISION':
+    uniform_algo = None
 
 all_keys = set(idx['weight_map'].keys())
 # Identify linear weight params (skip norms, embeddings, scalars, scales)
@@ -32,27 +86,48 @@ skip_suffixes = ('_scale', '_scale_2', 'layernorm', 'layer_norm', 'norm.weight',
 linear_weights = sorted(k for k in all_keys
     if k.endswith('.weight') and not any(s in k.lower() for s in skip_suffixes))
 
-# Check which have quantization scales
-quantized, excluded, unexpected = [], [], []
+def is_excluded(base, weight):
+    return any(fnmatch.fnmatch(weight, p) or fnmatch.fnmatch(base, p) for p in excludes)
+
+def declared_algo(base):
+    if base in declared_layers:
+        return declared_layers[base].get('quant_algo', 'DECLARED_UNKNOWN')
+    if is_excluded(base, base + '.weight'):
+        return 'BF16/EXCLUDED'
+    if uniform_algo:
+        return uniform_algo
+    return 'UNDECLARED'
+
+precision_counts = collections.Counter()
+unexpected = []
+mismatches = []
 for w in linear_weights:
     base = w.rsplit('.weight', 1)[0]
-    has_scales = any(f'{base}.{s}' in all_keys for s in ['weight_scale', 'input_scale'])
-    is_excluded = any(fnmatch.fnmatch(w, p) or fnmatch.fnmatch(base, p) for p in excludes)
-
-    if has_scales:
-        quantized.append(w)
-    elif is_excluded:
-        excluded.append(w)
+    algo = declared_algo(base)
+    has_scales = any(f'{base}.{s}' in all_keys for s in
+                     ['weight_scale', 'weight_scale_2', 'input_scale', 'activation_scale', 'weight_scale_inv'])
+
+    if has_scales and algo not in ('BF16/EXCLUDED', 'UNDECLARED'):
+        precision_counts[algo] += 1
+    elif has_scales and algo in ('BF16/EXCLUDED', 'UNDECLARED'):
+        precision_counts['QUANTIZED_BUT_' + algo.replace('/', '_')] += 1
+        mismatches.append((w, algo, 'has quantization scales'))
+    elif not has_scales and algo == 'BF16/EXCLUDED':
+        precision_counts['BF16/EXCLUDED'] += 1
     else:
-        unexpected.append(w)
-
-print(f'Quantized layers: {len(quantized)}')
-print(f'Excluded layers (in exclude_modules): {len(excluded)}')
+        precision_counts['UNEXPECTED_UNQUANTIZED'] += 1
+        unexpected.append((w, algo, 'no quantization scales'))
+
+print('Layer precision counts:')
+for name, count in sorted(precision_counts.items()):
+    print(f'  {name}: {count}')
+print(f'Unexpected unquantized layers: {len(unexpected)}')
+print(f'Declaration mismatches: {len(mismatches)}')
 if unexpected:
     print(f'\nWARNING: {len(unexpected)} layers have NO scales and are NOT in exclude list:')
     # Group by module type for readability
     groups = {}
-    for w in unexpected:
+    for w, algo, reason in unexpected:
         parts = w.split('.')
         module_type = next((p for p in parts if p in
             ('self_attn', 'mlp', 'experts', 'router', 'lm_head', 'embed_tokens', 'vision_tower')), 'other')
@@ -64,8 +139,14 @@ if unexpected:
     print('Likely cause: quantization config patterns did not match these module names.')
     print('This WILL cause deployment failures (framework loads them as quantized but they are BF16).')
     print('Fix: add missing patterns to the config, or add to exclude_modules if intentionally unquantized.')
-else:
-    print('\nAll layers are either quantized or explicitly excluded. Checkpoint is consistent.')
+if mismatches:
+    print(f'\nWARNING: {len(mismatches)} layers have declaration/metadata mismatches:')
+    for w, algo, reason in mismatches[:20]:
+        print(f'  {w}: declared {algo}, {reason}')
+    if len(mismatches) > 20:
+        print(f'  ... {len(mismatches) - 20} more')
+if not unexpected and not mismatches:
+    print('\nAll layers are quantized at the declared precision or explicitly excluded.')
 "
 ```
 
diff --git a/.claude/skills/ptq/references/unsupported-models.md b/.claude/skills/ptq/references/unsupported-models.md
index 1a198f3e886..a2fa036362e 100644
--- a/.claude/skills/ptq/references/unsupported-models.md
+++ b/.claude/skills/ptq/references/unsupported-models.md
@@ -13,7 +13,7 @@ After download, inspect the model files on the target machine (use `remote_run`
 1. **Read `README.md`** — often lists required transformers versions, dependencies, or `trust_remote_code` requirements
 2. **Check for `modeling_*.py` or `tokenization_*.py`** — custom code shipped with the model. If found, **always use `--trust_remote_code`** with `hf_ptq.py`, and `trust_remote_code=True` in any custom scripts. Without it, `AutoConfig`, `AutoTokenizer`, and `AutoModel` will fail to resolve custom classes.
 
-Write custom scripts locally (in `./workspaces/<model>/scripts/`), then sync to remote before running.
+Write custom scripts locally (in `./workspaces/<session_id>/<model>/scripts/`), then sync to remote before running.
 
 **Check transformers compatibility** (on the target machine):
 
diff --git a/.claude/skills/ptq/tests.json b/.claude/skills/ptq/tests.json
index 706da3693b8..307a95d52fb 100644
--- a/.claude/skills/ptq/tests.json
+++ b/.claude/skills/ptq/tests.json
@@ -72,6 +72,19 @@
         "Applies manual dequantize_fp8_params for fused expert tensors",
         "Runs smoke test first, then full calibration"
       ]
+    },
+    {
+      "id": 6,
+      "prompt": "Quantize an FP8 source checkpoint with a partial NVFP4 recipe, then evaluate the quantized checkpoint",
+      "expected_output": "Agent treats post-PTQ checkpoint validation as a required gate before submitting any eval",
+      "files": [],
+      "expectations": [
+        "After PTQ completes, reads checkpoint-validation.md before creating or submitting eval jobs",
+        "Reports source safetensors size, output safetensors size, and output/source ratio",
+        "Reports layer precision counts, including NVFP4/FP8/intended quantized layers, BF16 or excluded layers, unexpected unquantized layers, and declaration mismatches",
+        "Checks metadata consistency against the source checkpoint and records any diffs",
+        "Stops before eval submission if the size ratio is >= 1.0 for a compression recipe, if intended layer coverage is missing, or if metadata changes unexpectedly"
+      ]
     }
   ]
 }
diff --git a/.gitignore b/.gitignore
index 66ce5568ee0..8303a3a28bd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,6 +61,7 @@ venv/
 
 # Ignore claude local settings
 .claude/settings.local.json
+.claude/agents/
 CLAUDE.local.md
 AGENTS.override.md
 
diff --git a/tools/launcher/core.py b/tools/launcher/core.py
index 8fd4e25ee79..2316236c10c 100644
--- a/tools/launcher/core.py
+++ b/tools/launcher/core.py
@@ -263,6 +263,7 @@ def build_slurm_executor(
     executor = run.SlurmExecutor(
         account=slurm_config.account,
         partition=slurm_config.partition,
+        qos=slurm_config.qos,
         ntasks_per_node=slurm_config.ntasks_per_node,
         gpus_per_node=slurm_config.gpus_per_node,
         nodes=slurm_config.nodes,
diff --git a/tools/launcher/slurm_config.py b/tools/launcher/slurm_config.py
index d2a8cd48d11..66fa11dc471 100644
--- a/tools/launcher/slurm_config.py
+++ b/tools/launcher/slurm_config.py
@@ -15,8 +15,11 @@
 
 """Slurm configuration and factory for the ModelOpt Launcher."""
 
+# ruff: noqa: UP045
+
 import os
 from dataclasses import dataclass
+from typing import Optional
 
 import nemo_run as run
 
@@ -29,15 +32,16 @@ class SlurmConfig:
     No internal cluster defaults are embedded here.
     """
 
-    host: str = None
+    host: Optional[str] = None
     port: int = 22
-    account: str = None
+    account: Optional[str] = None
     partition: str = "batch"
-    container: str = None
+    qos: Optional[str] = None
+    container: Optional[str] = None
     modelopt_install_path: str = "/usr/local/lib/python3.12/dist-packages/modelopt"
-    container_mounts: list[str] = None
-    srun_args: list[str] = None
-    array: str = None
+    container_mounts: Optional[list[str]] = None
+    srun_args: Optional[list[str]] = None
+    array: Optional[str] = None
     nodes: int = 1
     ntasks_per_node: int = 1
     gpus_per_node: int = 1
@@ -51,6 +55,7 @@ def slurm_factory(
     host: str = os.environ.get("SLURM_HOST", ""),
     account: str = os.environ.get("SLURM_ACCOUNT", ""),
     partition: str = os.environ.get("SLURM_PARTITION", "batch"),
+    qos: Optional[str] = os.environ.get("SLURM_QOS"),
     nodes: int = 1,
     ntasks_per_node: int = 1,
     gpus_per_node: int = 1,
@@ -60,7 +65,7 @@ def slurm_factory(
         "{}:/hf-local".format(os.environ.get("SLURM_HF_LOCAL", "/hf-local")),
     ],
     srun_args: list[str] = ["--no-container-mount-home"],
-    array: str = None,  # noqa: RUF013
+    array: Optional[str] = None,
     time: str = "04:00:00",
 ) -> SlurmConfig:
     """Generic Slurm factory — configure via environment variables or CLI overrides."""
@@ -68,6 +73,7 @@ def slurm_factory(
         host=host,
         account=account,
         partition=partition,
+        qos=qos,
         nodes=nodes,
         ntasks_per_node=ntasks_per_node,
         gpus_per_node=gpus_per_node,