From 66fd103888eed4ff8bf306479ac85960c001e27f Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Fri, 8 May 2026 13:57:59 -0500
Subject: [PATCH 01/26] Update evaluation skill guidance

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/common/environment-setup.md      |  2 ++
 .claude/skills/evaluation/SKILL.md              | 13 ++++++++++---
 .../references/gpqa_diamond_aa_v3.yaml          |  8 ++++++++
 .../references/reference-benchmark-configs.md   | 17 +++++++++++++++++
 .../evaluation/references/scicode_aa_v2.yaml    |  8 ++++++++
 .../references/analyze-results.md               |  2 +-
 6 files changed, 46 insertions(+), 4 deletions(-)
 create mode 100644 .claude/skills/evaluation/references/gpqa_diamond_aa_v3.yaml
 create mode 100644 .claude/skills/evaluation/references/reference-benchmark-configs.md
 create mode 100644 .claude/skills/evaluation/references/scicode_aa_v2.yaml

diff --git a/.claude/skills/common/environment-setup.md b/.claude/skills/common/environment-setup.md
index 2eee2cd2a2b..a40ace2a5f9 100644
--- a/.claude/skills/common/environment-setup.md
+++ b/.claude/skills/common/environment-setup.md
@@ -29,6 +29,8 @@ cat ~/.config/modelopt/clusters.yaml 2>/dev/null || cat .claude/clusters.yaml 2>
 
 If a cluster config exists with content → **use the remote cluster** (do not fall back to local even if local GPUs are available — the cluster config indicates the user's preferred execution environment). Otherwise → **local execution**.
 
+If the cluster config contains multiple clusters and the user did not name the target cluster, ask which cluster to use before calling `remote_load_cluster`. Do not silently fall back to `default_cluster` in multi-cluster configs; different clusters can have different filesystems, GPU types, auth paths, and SSH setup.
+
 For remote, connect:
 
 ```bash
diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index 69920814828..a9e6febc2fb 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -65,6 +65,8 @@ Prompt the user with "I'll ask you 5 questions to build the base config we'll ad
 - NIM
 - TRT-LLM
 
+Prefer vLLM for NEL self-deployment unless the user explicitly asks for another runtime, the model card requires another runtime, or the evaluation targets an already-running endpoint (`deployment: none`).
+
 3. Auto-export:
 
 - None (auto-export disabled)
@@ -158,6 +160,10 @@ Combine all detected flags into a single `deployment.extra_args` override. The r
 
 When a quantized checkpoint is detected, read `references/quantization-benchmarks.md` for benchmark sensitivity rankings and recommended sets. Present recommendations to the user and ask which to include.
 
+**Baseline comparison preflight:**
+
+When a quantized checkpoint is detected, identify the matching baseline before launching the full quantized run. The baseline is usually the pre-quantization source model/checkpoint for this run, but it may itself be quantized (for example, an FP8 checkpoint used as the baseline for an NVFP4 checkpoint). First infer the baseline from the PTQ source model/checkpoint in the workspace or config used to create the quantized checkpoint. If it cannot be inferred, ask the user for the baseline model/checkpoint or an existing baseline invocation/run path. If no matching baseline run exists yet, prepare a companion baseline config and launch it before or alongside the quantized config. The baseline config should match the quantized config's benchmark versions, task configs, serving args, token limits, dataset setup, credentials, cluster, and container as closely as possible; change only the model/checkpoint and adjust quantization-specific flags to match the baseline checkpoint. Do not treat the quantized score as release-ready until the baseline comparison exists.
+
 Read `references/model-card-research.md` for the full extraction checklist (sampling params, reasoning config, ARM64 compatibility, pre_cmd, etc.). Use WebSearch to research the model card, present findings, and ask the user to confirm.
 
 **Step 4: Fill in remaining missing values**
@@ -171,7 +177,8 @@ Read `references/model-card-research.md` for the full extraction checklist (samp
 Show tasks in the current config. Loop until the user confirms the task list is final:
 
 1. Tell the user: "Run `nel ls tasks` to see all available tasks".
-2. Ask if they want to add/remove tasks or add/remove/modify task-specific parameter overrides.
+2. If the task list includes GPQA Diamond / `gpqa_diamond_aa_v3` / `GPQA` or SciCode / `scicode_aa_v2`, read `references/reference-benchmark-configs.md`, then apply the matching YAML file (`references/gpqa_diamond_aa_v3.yaml` or `references/scicode_aa_v2.yaml`) unless the user explicitly asks for different sampling or prompt settings.
+3. Ask if they want to add/remove tasks or add/remove/modify task-specific parameter overrides.
    To add per-task `nemo_evaluator_config` as specified by the user, e.g.:
 
    ```yaml
@@ -185,8 +192,8 @@ Show tasks in the current config. Loop until the user confirms the task list is
              ...
    ```
 
-3. Apply changes.
-4. Show updated list and ask: "Is the task list final, or do you want to make more changes?"
+4. Apply changes.
+5. Show updated list and ask: "Is the task list final, or do you want to make more changes?"
 
 **Known Issues**
 
diff --git a/.claude/skills/evaluation/references/gpqa_diamond_aa_v3.yaml b/.claude/skills/evaluation/references/gpqa_diamond_aa_v3.yaml
new file mode 100644
index 00000000000..1dc15369258
--- /dev/null
+++ b/.claude/skills/evaluation/references/gpqa_diamond_aa_v3.yaml
@@ -0,0 +1,8 @@
+tasks:
+  - name: nemo_skills.ns_gpqa
+    nemo_evaluator_config:
+      config:
+        params:
+          extra:
+            num_repeats: 8
+            args: ++prompt_config=eval/aai/mcq-4choices ++inference.tokens_to_generate=null
diff --git a/.claude/skills/evaluation/references/reference-benchmark-configs.md b/.claude/skills/evaluation/references/reference-benchmark-configs.md
new file mode 100644
index 00000000000..695890d1cf0
--- /dev/null
+++ b/.claude/skills/evaluation/references/reference-benchmark-configs.md
@@ -0,0 +1,17 @@
+# Reference Benchmark Task Configs
+
+Use these task-level YAML files when the user asks for the named reference benchmarks.
+Keep the task stanza shape intact unless the user explicitly asks to change sampling or
+prompt settings.
+
+## GPQA Diamond AA v3
+
+Aliases: `gpqa_diamond_aa_v3`, `GPQA Diamond`, `GPQA`.
+
+Config file: `references/gpqa_diamond_aa_v3.yaml`
+
+## SciCode AA v2
+
+Aliases: `scicode_aa_v2`, `SciCode`.
+
+Config file: `references/scicode_aa_v2.yaml`
diff --git a/.claude/skills/evaluation/references/scicode_aa_v2.yaml b/.claude/skills/evaluation/references/scicode_aa_v2.yaml
new file mode 100644
index 00000000000..19e25c10c4e
--- /dev/null
+++ b/.claude/skills/evaluation/references/scicode_aa_v2.yaml
@@ -0,0 +1,8 @@
+tasks:
+  - name: nemo_skills.ns_scicode
+    nemo_evaluator_config:
+      config:
+        params:
+          extra:
+            num_repeats: 8
+            args: ++inference.tokens_to_generate=null
diff --git a/.claude/skills/launching-evals/references/analyze-results.md b/.claude/skills/launching-evals/references/analyze-results.md
index fd49d400468..26147954f3e 100644
--- a/.claude/skills/launching-evals/references/analyze-results.md
+++ b/.claude/skills/launching-evals/references/analyze-results.md
@@ -50,8 +50,8 @@ Check logs for silent errors that may invalidate results:
 5. **Max model length**: Verify `max-model-len` = 131072 (leaderboard-recommended). Long context benchmarks (AA LCR, RULER) and agentic benchmarks may require a longer `max-model-len`.
 6. **RULER tasks**: Check thinking disabled, walltime=4h, rope-scaling for Qwen models
 7. **AA baseline comparison**: Compare results against Artificial Analysis published scores. Exact match not expected — flag significant deviations.
+8. **Model baseline comparison**: For quantized runs, compare results against the matching unquantized baseline model run when available. Use the same benchmark version, task config, serving args, token limits, dataset setup, and infrastructure before treating the delta as a quantization effect.
 
 ## Step 4: Report findings
 
 Present key metrics from `results.yml` in a table and summarize the metrics from `eval_factory_metrics.json` in a concise manner (include only the most important metrics or anomalies). If multiple runs, include side-by-side comparison of metrics (e.g. accuracy, latency, tokens count, memory). Summarize any issues found. Recommend improvements if applicable.
-

From 8aad1eb1d05cdc032b699c3d1f336a8f201b9e13 Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Fri, 8 May 2026 16:21:49 -0500
Subject: [PATCH 02/26] Refine agent skill guidance

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md         | 40 ++++++++++++----------
 .claude/skills/evaluation/tests/evals.json |  8 ++---
 .claude/skills/monitor/SKILL.md            | 14 ++++----
 .claude/skills/ptq/SKILL.md                |  4 +++
 4 files changed, 37 insertions(+), 29 deletions(-)

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index a9e6febc2fb..621b256954c 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -30,7 +30,7 @@ Config Generation Progress:
 - [ ] Step 5: Confirm tasks (iterative)
 - [ ] Step 6: Advanced - Multi-node (Data Parallel)
 - [ ] Step 7: Advanced - Interceptors
-- [ ] Step 7.5: Check container registry auth (SLURM only)
+- [ ] Step 7.5: Check container registry auth for private images (SLURM only)
 - [ ] Step 8: Run the evaluation
 ```
 
@@ -114,24 +114,21 @@ Ask for model path. Determine type:
 
 **Auto-detect ModelOpt quantization format** (checkpoint paths only):
 
-Check for `hf_quant_config.json` in the checkpoint directory:
+Check `config.json` first for a `quantization_config` section with `quant_method: "modelopt"`. If absent, check the legacy/backward-compatible `hf_quant_config.json`:
 
 ```bash
+cat <checkpoint_path>/config.json 2>/dev/null
 cat <checkpoint_path>/hf_quant_config.json 2>/dev/null
 ```
 
-If found, read `quantization.quant_algo` and set the correct vLLM/SGLang quantization flag in `deployment.extra_args`:
+If ModelOpt quantization is detected, read the quantization algorithm from `quantization_config.quant_algo` (in `config.json`) or `quantization.quant_algo` (in `hf_quant_config.json`).
 
-| `quant_algo` | Flag to add |
-|-------------|-------------|
-| `FP8` | `--quantization modelopt` |
-| `W4A8_AWQ` | `--quantization modelopt` |
-| `NVFP4`, `NVFP4_AWQ` | `--quantization modelopt_fp4` |
-| Other values | Try `--quantization modelopt`; consult vLLM/SGLang docs if unsure |
+- **vLLM:** Do not add a `--quantization` flag by default. Recent vLLM reads `quantization_config` / `hf_quant_config.json` and selects the ModelOpt backend automatically; adding a stale or mismatched flag can cause a config mismatch. Only add an explicit flag if the model card, vLLM version, or dry-run error requires it.
+- **SGLang:** Use SGLang-specific docs/model-card guidance. For offline ModelOpt checkpoints, recent SGLang can parse the config in many cases; if an explicit flag is required, common values are `--quantization modelopt_fp8` for FP8 and `--quantization modelopt_fp4` for NVFP4. Some exported ModelOpt flows document `--quantization modelopt`; verify against the installed SGLang version.
 
-If no `hf_quant_config.json`, also check `config.json` for a `quantization_config` section with `quant_method: "modelopt"`. If neither is found, the checkpoint is unquantized — no flag needed.
+If neither file contains a ModelOpt quantization config, treat the checkpoint as unquantized — no quantization flag needed.
 
-> **Note:** Some models require additional env vars for deployment (e.g., `VLLM_NVFP4_GEMM_BACKEND=marlin` for Nemotron Super). These are not in `hf_quant_config.json` — they are discovered during model card research below.
+> **Note:** Some models require additional env vars for deployment (e.g., `VLLM_NVFP4_GEMM_BACKEND=marlin` for Nemotron Super). These may not be in the quantization config files — they are discovered during model card research below.
 
 **Auto-detect deployment settings from checkpoint:**
 
@@ -166,6 +163,8 @@ When a quantized checkpoint is detected, identify the matching baseline before l
 
 Read `references/model-card-research.md` for the full extraction checklist (sampling params, reasoning config, ARM64 compatibility, pre_cmd, etc.). Use WebSearch to research the model card, present findings, and ask the user to confirm.
 
+For reasoning-capable models, prefer reasoning mode for evaluation because it usually produces the highest task scores; configure the model-card-specific on/off control and any reasoning budget or effort setting. If the user wants lower variance/noise, lower latency/cost, or an apples-to-apples comparison against non-reasoning baselines, also consider a non-reasoning companion run.
+
 **Step 4: Fill in remaining missing values**
 
 - Find all remaining `???` missing values in the config.
@@ -222,7 +221,7 @@ If the user needs multi-node evaluation (model >120B, or more throughput), read
 
 - The docs may show incorrect parameter names for logging. Use `max_logged_requests` and `max_logged_responses` (NOT `max_saved_*` or `max_*`).
 
-**Step 7.5: Check container registry authentication (SLURM only)**
+**Step 7.5: Check container registry authentication for private images (SLURM only)**
 
 NEL's default deployment images by framework:
 
@@ -233,24 +232,27 @@ NEL's default deployment images by framework:
 | TRT-LLM | `nvcr.io/nvidia/tensorrt-llm/release:...` | NGC |
 | Evaluation tasks | `nvcr.io/nvidia/eval-factory/*:26.03` | NGC |
 
-Before submitting, verify the cluster has credentials for the deployment image. See `skills/common/slurm-setup.md` section 6 for the full procedure.
+Before submitting, identify the exact deployment and evaluation-task images that will be pulled. If the images are public, skip the registry-authentication preflight; pyxis/enroot can pull public images without stored credentials. Do not require credentials just because the registry is DockerHub or NGC.
+
+Only verify cluster credentials when an image is private or access-restricted (private DockerHub repo, private NGC repo, internal registry, or user-provided image that is not known to be public). See `skills/common/slurm-setup.md` section 6 for the credential setup procedure.
 
 ```bash
 ssh <host> "grep -E '^\s*machine\s+' ~/.config/enroot/.credentials 2>/dev/null"
 ```
 
 **Decision flow (check before submitting):**
-1. Check if the cluster has credentials for the default DockerHub image (see command above)
-2. If DockerHub credentials exist → use the default image and submit
-3. If DockerHub credentials are missing but can be added → add them (see `slurm-setup.md` section 6), then submit
-4. If DockerHub credentials cannot be added → override `deployment.image` to the NGC alternative and submit:
+1. If the selected images are public → submit without an auth preflight
+2. If any selected image is private or access-restricted → check for credentials for that image's registry (see command above)
+3. If credentials exist → use the selected image and submit
+4. If credentials are missing but can be added → add them (see `slurm-setup.md` section 6), then submit
+5. If credentials cannot be added → switch to a public image when a compatible one exists, for example:
 
    ```yaml
    deployment:
      image: nvcr.io/nvidia/vllm:<YY.MM>-py3  # check https://catalog.ngc.nvidia.com/orgs/nvidia/containers/vllm for latest tag
    ```
 
-5. **Do not retry more than once** without fixing the auth issue
+6. **Do not retry more than once** after an auth failure without fixing credentials or switching images
 
 **Step 8: Run the evaluation**
 
@@ -340,6 +342,6 @@ Config Generation Progress:
 - [ ] Step 5: Confirm tasks (iterative)
 - [ ] Step 6: Advanced - Multi-node (Data Parallel)
 - [ ] Step 7: Advanced - Interceptors
-- [ ] Step 7.5: Check container registry auth (SLURM only)
+- [ ] Step 7.5: Check container registry auth for private images (SLURM only)
 - [ ] Step 8: Run the evaluation
 ```
diff --git a/.claude/skills/evaluation/tests/evals.json b/.claude/skills/evaluation/tests/evals.json
index 0f35dacd7a8..9fc6712a53d 100644
--- a/.claude/skills/evaluation/tests/evals.json
+++ b/.claude/skills/evaluation/tests/evals.json
@@ -7,7 +7,7 @@
     "expected_behavior": [
       "Verifies nel is installed by running 'nel --version'",
       "Asks all 5 base config questions (execution, deployment, auto-export, model type, benchmarks) before generating the config",
-      "Runs 'nel skills build-config' with correct flags matching user answers: --execution slurm --deployment vllm --model-type reasoning --benchmarks standard code math_reasoning --export mlflow",
+      "Runs 'nel skills build-config' with correct flags matching user answers, using NEL's current model_type terminology for the chat/reasoning template family",
       "Searches the web for the model card on HuggingFace and extracts model-specific settings",
       "Sets correct HF handle: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
       "Sets reasoning sampling params from model card: temperature=1.0, top_p=1.0",
@@ -36,8 +36,8 @@
       "Asks all 5 base config questions (execution, deployment, auto-export, model type, benchmarks)",
       "Runs nel skills build-config with correct flags matching user answers",
       "Sets deployment.checkpoint_path to ./llama-3.1-8b-fp8 and deployment.hf_model_handle to null",
-      "Auto-detects quantization format by reading ./llama-3.1-8b-fp8/hf_quant_config.json",
-      "Finds quant_algo=FP8 and adds --quantization modelopt to deployment.extra_args",
+      "Auto-detects quantization format by reading ./llama-3.1-8b-fp8/config.json first, then ./llama-3.1-8b-fp8/hf_quant_config.json if needed",
+      "Finds quant_algo=FP8 and does not add a vLLM --quantization flag unless the vLLM version, model card, or dry-run error requires one",
       "Recommends accuracy-sensitive benchmarks from references/quantization-benchmarks.md",
       "Searches web for Llama-3.1-8B model card and extracts sampling params, context length, TP settings",
       "Fills in remaining missing values by asking user",
@@ -54,7 +54,7 @@
       "Verifies nel is installed by running nel --version",
       "Asks 5 base config questions with execution=slurm pre-selected based on user request",
       "Runs nel skills build-config with --execution slurm --deployment vllm --benchmarks standard",
-      "Detects FP8 quantization from hf_quant_config.json and sets deployment.extra_args with --quantization modelopt",
+      "Detects FP8 quantization from config.json or hf_quant_config.json and does not add a vLLM --quantization flag unless the vLLM version, model card, or dry-run error requires one",
       "Reads references/quantization-benchmarks.md and recommends accuracy-sensitive benchmarks",
       "Uses WebSearch to research model card for sampling params and context length",
       "Fills in SLURM-specific values: hostname, account, partition from user input",
diff --git a/.claude/skills/monitor/SKILL.md b/.claude/skills/monitor/SKILL.md
index cd896c347e9..52b47ec55ed 100644
--- a/.claude/skills/monitor/SKILL.md
+++ b/.claude/skills/monitor/SKILL.md
@@ -41,22 +41,24 @@ All active jobs are tracked in `.claude/active_jobs.json`. This file is the sing
 Every time a job is submitted (by any skill or manually):
 
 1. **Add an entry** to `.claude/active_jobs.json`. Create the file if it doesn't exist.
-2. **Set up a durable recurring cron** (if one isn't already running) that polls all registered jobs every 15 minutes. The cron prompt should: read the registry, check each job, report state changes to the user, remove completed jobs, and delete itself when the registry is empty.
+2. **Start a durable monitor** (if one isn't already watching the registry) that polls all registered jobs until they reach terminal status. Prefer the Claude Code `Monitor` tool when it is available: write a small watcher that reads the registry on each poll, checks every job with the appropriate method below, prints state-change events, updates `last_status`, removes terminal jobs, and exits when the registry is empty.
+
+The monitor should terminate naturally when every registered job has reached a terminal state. If the `Monitor` tool is not available in the current harness, run an equivalent background process that implements the same loop and lets the agent resume/restart when the process exits.
 
 Always do both steps. Don't try to predict job duration.
 
 ---
 
-## On Cron Fire / Status Check
+## On Monitor Event / Status Check
 
-Whether triggered by the cron or by the user asking "check status":
+Whether triggered by monitor output or by the user asking "check status":
 
 1. **Read the registry** from `.claude/active_jobs.json`
 2. **Check each job** using the appropriate method (see below)
 3. **Report only state changes** — compare against `last_status` in registry
 4. **Update `last_status`** in the registry
-5. **Remove completed jobs** — any job in a terminal state (COMPLETED, FAILED, CANCELLED, KILLED)
-6. **If registry is empty** — delete the recurring cron
+5. **Remove completed jobs** — any job in a terminal state (COMPLETED, FAILED, CANCELLED, KILLED, TIMEOUT, NODE_FAIL, OUT_OF_MEMORY, PREEMPTED, BOOT_FAIL, DEADLINE)
+6. **If registry is empty** — let the monitor exit
 
 ---
 
@@ -76,7 +78,7 @@ Whether triggered by the cron or by the user asking "check status":
 
 ### Raw SLURM jobs (`type: slurm`)
 
-- **Check:** `ssh <host> "squeue -j <id> -h -o '%T %M %R'"` — if empty, job left the queue
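+- **Check:** `ssh <host> "sacct -j <id> --format=JobID%12,JobName%25,State%20,Elapsed%10 -n"` and filter out the `<id>.extern`, `<id>.batch`, and numbered step rows (`<id>.<step>`) so only the top-level job row remains. `State%20` keeps long states such as `OUT_OF_MEMORY` from being truncated with a trailing `+`. Use `sacct` for the termination check; `squeue` can lag in `COMPLETING` after `sacct` reports a terminal state.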
 - **On completion:** `ssh <host> "sacct -j <id> --format=State,ExitCode,Elapsed -n"`
 - **On failure:** Check the job's output log file
 
diff --git a/.claude/skills/ptq/SKILL.md b/.claude/skills/ptq/SKILL.md
index e9e73254bd9..a73530c2242 100644
--- a/.claude/skills/ptq/SKILL.md
+++ b/.claude/skills/ptq/SKILL.md
@@ -59,6 +59,10 @@ If a model-specific recipe exists, use `--recipe <path>` — it may contain tune
 
 Use `--qformat <name>` (e.g., `--qformat nvfp4`). Format definitions: `modelopt/torch/quantization/config.py`. General PTQ recipes in `modelopt_recipes/general/ptq/` correspond to the same formats — `--qformat` is the simpler way to use them.
 
+Before running PTQ, sanity-check the selected qformat/recipe against the model structure. Inspect the recipe's include/exclude patterns and summarize which layer groups (attention projections, MLP projections, experts, etc.) will be quantized and approximately how many modules/layers match. If the match count is 0, or far smaller than expected for the model, stop and fix the recipe or ask the user before launching calibration.
+
+If the source checkpoint is already quantized and the requested recipe/config reduces quantization coverage, confirm that intent with the user before running. For example, if an FP8 checkpoint is used as input and the recipe excludes some layers so they would fall back to BF16 instead of staying quantized, call out the affected layer groups and ask whether that FP8-to-BF16 fallback is intended.
+
 > NVFP4 can be calibrated on Hopper but requires Blackwell for inference.
 
 ## Step 4 — Run PTQ

From 02ec0b2aafbdf6280e15bfe4e7c889900e35f948 Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Fri, 8 May 2026 16:28:10 -0500
Subject: [PATCH 03/26] Clarify quantized eval baseline comparison

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/launching-evals/references/analyze-results.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.claude/skills/launching-evals/references/analyze-results.md b/.claude/skills/launching-evals/references/analyze-results.md
index 26147954f3e..48e0a87bb59 100644
--- a/.claude/skills/launching-evals/references/analyze-results.md
+++ b/.claude/skills/launching-evals/references/analyze-results.md
@@ -50,7 +50,7 @@ Check logs for silent errors that may invalidate results:
 5. **Max model length**: Verify `max-model-len` = 131072 (leaderboard-recommended). Long context benchmarks (AA LCR, RULER) and agentic benchmarks may require a longer `max-model-len`.
 6. **RULER tasks**: Check thinking disabled, walltime=4h, rope-scaling for Qwen models
 7. **AA baseline comparison**: Compare results against Artificial Analysis published scores. Exact match not expected — flag significant deviations.
-8. **Model baseline comparison**: For quantized runs, compare results against the matching unquantized baseline model run when available. Use the same benchmark version, task config, serving args, token limits, dataset setup, and infrastructure before treating the delta as a quantization effect.
+8. **Model baseline comparison**: For quantized runs, compare results against the matching baseline model run when available. The baseline may be unquantized or simply less quantized (for example, FP8 as the baseline for NVFP4). Use the same benchmark version, task config, serving args, token limits, dataset setup, and infrastructure before treating the delta as a quantization effect.
 
 ## Step 4: Report findings
 

From 267c19e634c8ae64750adebd12591e8e05aced70 Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Mon, 11 May 2026 09:28:09 -0500
Subject: [PATCH 04/26] Document repeat guidance for reasoning evals

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .../evaluation/references/reference-benchmark-configs.md      | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.claude/skills/evaluation/references/reference-benchmark-configs.md b/.claude/skills/evaluation/references/reference-benchmark-configs.md
index 695890d1cf0..91484b7d35c 100644
--- a/.claude/skills/evaluation/references/reference-benchmark-configs.md
+++ b/.claude/skills/evaluation/references/reference-benchmark-configs.md
@@ -4,6 +4,10 @@ Use these task-level YAML files when the user asks for the named reference bench
 Keep the task stanza shape intact unless the user explicitly asks to change sampling or
 prompt settings.
 
+For reasoning-mode comparisons, use `num_repeats >= 3` when the benchmark supports
+repeats. Single-trial noise can hide or mimic low-single-digit percentage-point
+effects, so do not rely on a single-run comparison when judging small deltas.
+
 ## GPQA Diamond AA v3
 
 Aliases: `gpqa_diamond_aa_v3`, `GPQA Diamond`, `GPQA`.

From de7cd395b05e1167d8d8b183f2fa27108eb0e7ec Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Mon, 11 May 2026 10:14:53 -0500
Subject: [PATCH 05/26] Add PTQ and evaluation verification guidance

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md                 | 14 ++++++++++++++
 .claude/skills/ptq/SKILL.md                        |  2 +-
 .../skills/ptq/references/checkpoint-validation.md | 12 +++++++++++-
 3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index 621b256954c..c5e345fae16 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -32,6 +32,7 @@ Config Generation Progress:
 - [ ] Step 7: Advanced - Interceptors
 - [ ] Step 7.5: Check container registry auth for private images (SLURM only)
 - [ ] Step 8: Run the evaluation
+- [ ] Step 9: Verify evaluation comparability
 ```
 
 **Step 1: Check prerequisites**
@@ -306,6 +307,18 @@ After the dry-run, check the output from `nel` for any problems with the config.
 
 After job submission, register the job per the **monitor skill** for durable cross-session tracking. For one-off queries (live status, debugging a failed run, analyzing results) use the **launching-evals skill**; for querying past runs in MLflow use **accessing-mlflow**.
 
+**Step 9: Verify evaluation comparability**
+
+Before treating a baseline-vs-quantized delta as a model quality result, verify the two runs are comparable:
+
+1. Confirm the prompt text and chat template/rendered messages match between the baseline and quantized evaluations.
+2. Confirm generation settings match, including temperature, top_p, top_k, max tokens, stop strings, reasoning mode/budget, and any task-specific overrides.
+3. If reasoning traces are present, confirm they are stripped or ignored before scoring, consistently for both runs.
+4. Confirm the number of evaluated samples matches for each task and split.
+5. Confirm the same accuracy metric/score field is used for the baseline and quantized comparison.
+
+If any item differs, either rerun with matched settings or label the result as not an apples-to-apples quantization comparison.
+
 **NEL-specific diagnostics** (for debugging failures):
 
 ```bash
@@ -344,4 +357,5 @@ Config Generation Progress:
 - [ ] Step 7: Advanced - Interceptors
 - [ ] Step 7.5: Check container registry auth for private images (SLURM only)
 - [ ] Step 8: Run the evaluation
+- [ ] Step 9: Verify evaluation comparability
 ```
diff --git a/.claude/skills/ptq/SKILL.md b/.claude/skills/ptq/SKILL.md
index a73530c2242..723d3aae07c 100644
--- a/.claude/skills/ptq/SKILL.md
+++ b/.claude/skills/ptq/SKILL.md
@@ -135,7 +135,7 @@ Report the path and size to the user.
 
 ### Post-quantization validation
 
-Validate the exported checkpoint's quantization pattern matches the recipe. Quantization config patterns can silently miss layers if the model uses non-standard naming (e.g., Gemma4 `experts.*` missed by `*mlp*` patterns) — this only surfaces later as deployment failures. Read `references/checkpoint-validation.md` for the validation script, expected patterns per recipe, and common pattern gaps.
+Read `references/checkpoint-validation.md` and perform the checkpoint size/bits, quantized-weight coverage, and metadata consistency checks before using the checkpoint for deployment/evaluation.
 
 **Next steps**: If the user wants to deploy or evaluate the quantized checkpoint, use the **deployment** or **evaluation** skill. The checkpoint workspace carries over. If the model required patches during PTQ (e.g., transformers upgrade), the same fixes will likely be needed at deployment and evaluation time.
 
diff --git a/.claude/skills/ptq/references/checkpoint-validation.md b/.claude/skills/ptq/references/checkpoint-validation.md
index 68d1ddd075c..972f321703c 100644
--- a/.claude/skills/ptq/references/checkpoint-validation.md
+++ b/.claude/skills/ptq/references/checkpoint-validation.md
@@ -1,6 +1,16 @@
 # Post-Quantization Checkpoint Validation
 
-Verify the exported checkpoint's quantization pattern matches the recipe used. Quantization config patterns may silently miss layers if the model uses non-standard naming — this only surfaces later as deployment failures when the serving framework tries to load unquantized weights as quantized.
+Before treating an exported checkpoint as ready for deployment/evaluation, verify checkpoint size/bits, quantized-weight coverage, and metadata consistency. If any check fails, stop and fix the checkpoint or ask the user before using it for release-quality deployment/evaluation.
+
+## Required checks
+
+1. The quantized checkpoint is smaller on disk than the baseline/source checkpoint and has lower estimated bits per weight. If the size is similar, explain why (for example, only part of the model was quantized) before proceeding.
+2. The weights that were actually quantized match what the requested qformat/recipe/config targeted. Quantization config patterns may silently miss layers if the model uses non-standard naming — this only surfaces later as deployment failures when the serving framework tries to load unquantized weights as quantized.
+3. Metadata that should not change still matches the baseline/source model. Compare generation settings, tokenizer files, chat template, model architecture fields, max positions/context length, and special tokens; quantization should affect weights and quantization metadata, not silently change prompting or generation behavior.
+
+## Size / bits check
+
+Compare checkpoint size against the baseline/source checkpoint and estimate bits per weight from tensor dtypes and file sizes. A partial-quantization recipe may not shrink every tensor, but it should still match the intended quantization coverage. Call out any small or missing size reduction before deployment/evaluation.
 
 ## Expected quantization patterns by recipe
 

From 874581c54d4d2c17efd23f115b6cd1cf9b0b2ebd Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Mon, 11 May 2026 10:24:11 -0500
Subject: [PATCH 06/26] Deduplicate PTQ checkpoint size guidance

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/ptq/references/checkpoint-validation.md | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/.claude/skills/ptq/references/checkpoint-validation.md b/.claude/skills/ptq/references/checkpoint-validation.md
index 972f321703c..c946b778ae3 100644
--- a/.claude/skills/ptq/references/checkpoint-validation.md
+++ b/.claude/skills/ptq/references/checkpoint-validation.md
@@ -4,14 +4,10 @@ Before treating an exported checkpoint as ready for deployment/evaluation, verif
 
 ## Required checks
 
-1. The quantized checkpoint is smaller on disk than the baseline/source checkpoint and has lower estimated bits per weight. If the size is similar, explain why (for example, only part of the model was quantized) before proceeding.
+1. The quantized checkpoint is smaller on disk than the baseline/source checkpoint and has lower estimated bits per weight. A partial-quantization recipe may not shrink every tensor, but it should still match the intended quantization coverage. If the size reduction is small or missing, explain why before proceeding.
 2. The weights that were actually quantized match what the requested qformat/recipe/config targeted. Quantization config patterns may silently miss layers if the model uses non-standard naming — this only surfaces later as deployment failures when the serving framework tries to load unquantized weights as quantized.
 3. Metadata that should not change still matches the baseline/source model. Compare generation settings, tokenizer files, chat template, model architecture fields, max positions/context length, and special tokens; quantization should affect weights and quantization metadata, not silently change prompting or generation behavior.
 
-## Size / bits check
-
-Compare checkpoint size against the baseline/source checkpoint and estimate bits per weight from tensor dtypes and file sizes. A partial-quantization recipe may not shrink every tensor, but it should still match the intended quantization coverage. Call out any small or missing size reduction before deployment/evaluation.
-
 ## Expected quantization patterns by recipe
 
 | Recipe (`--qformat`) | What should be quantized | What should be excluded |

From 187ca1ed211ab0c9fbd4c8ace45bf0400e200014 Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Thu, 14 May 2026 11:27:50 -0500
Subject: [PATCH 07/26] Deduplicate evaluation recipe guidance

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md            |  6 +++++-
 .../references/gpqa_diamond_aa_v3.yaml        |  8 -------
 .../references/reference-benchmark-configs.md | 21 -------------------
 .../evaluation/references/scicode_aa_v2.yaml  |  8 -------
 4 files changed, 5 insertions(+), 38 deletions(-)
 delete mode 100644 .claude/skills/evaluation/references/gpqa_diamond_aa_v3.yaml
 delete mode 100644 .claude/skills/evaluation/references/reference-benchmark-configs.md
 delete mode 100644 .claude/skills/evaluation/references/scicode_aa_v2.yaml

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index c5e345fae16..325b18b1bd2 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -177,7 +177,11 @@ For reasoning-capable models, prefer reasoning mode for evaluation because it us
 Show tasks in the current config. Loop until the user confirms the task list is final:
 
 1. Tell the user: "Run `nel ls tasks` to see all available tasks".
-2. If the task list includes GPQA Diamond / `gpqa_diamond_aa_v3` / `GPQA` or SciCode / `scicode_aa_v2`, read `references/reference-benchmark-configs.md`, then apply the matching YAML file (`references/gpqa_diamond_aa_v3.yaml` or `references/scicode_aa_v2.yaml`) unless the user explicitly asks for different sampling or prompt settings.
+2. If the task list includes a benchmark with a pre-built snippet in `recipes/tasks/`,
+   prefer that snippet over hand-written task overrides unless the user explicitly asks
+   for different sampling or prompt settings. For reasoning-mode comparisons, keep the
+   recipe repeat counts; for tasks without a recipe, use `num_repeats >= 3` when the
+   benchmark supports repeats.
 3. Ask if they want to add/remove tasks or add/remove/modify task-specific parameter overrides.
    To add per-task `nemo_evaluator_config` as specified by the user, e.g.:
 
diff --git a/.claude/skills/evaluation/references/gpqa_diamond_aa_v3.yaml b/.claude/skills/evaluation/references/gpqa_diamond_aa_v3.yaml
deleted file mode 100644
index 1dc15369258..00000000000
--- a/.claude/skills/evaluation/references/gpqa_diamond_aa_v3.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-tasks:
-  - name: nemo_skills.ns_gpqa
-    nemo_evaluator_config:
-      config:
-        params:
-          extra:
-            num_repeats: 8
-            args: ++prompt_config=eval/aai/mcq-4choices ++inference.tokens_to_generate=null
diff --git a/.claude/skills/evaluation/references/reference-benchmark-configs.md b/.claude/skills/evaluation/references/reference-benchmark-configs.md
deleted file mode 100644
index 91484b7d35c..00000000000
--- a/.claude/skills/evaluation/references/reference-benchmark-configs.md
+++ /dev/null
@@ -1,21 +0,0 @@
-# Reference Benchmark Task Configs
-
-Use these task-level YAML files when the user asks for the named reference benchmarks.
-Keep the task stanza shape intact unless the user explicitly asks to change sampling or
-prompt settings.
-
-For reasoning-mode comparisons, use `num_repeats >= 3` when the benchmark supports
-repeats. Single-trial noise can hide or mimic low-single-digit percentage-point
-effects, so do not rely on a one-shot comparison when judging small deltas.
-
-## GPQA Diamond AA v3
-
-Aliases: `gpqa_diamond_aa_v3`, `GPQA Diamond`, `GPQA`.
-
-Config file: `references/gpqa_diamond_aa_v3.yaml`
-
-## SciCode AA v2
-
-Aliases: `scicode_aa_v2`, `SciCode`.
-
-Config file: `references/scicode_aa_v2.yaml`
diff --git a/.claude/skills/evaluation/references/scicode_aa_v2.yaml b/.claude/skills/evaluation/references/scicode_aa_v2.yaml
deleted file mode 100644
index 19e25c10c4e..00000000000
--- a/.claude/skills/evaluation/references/scicode_aa_v2.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-tasks:
-  - name: nemo_skills.ns_scicode
-    nemo_evaluator_config:
-      config:
-        params:
-          extra:
-            num_repeats: 8
-            args: ++inference.tokens_to_generate=null

From 947074b4739adf9d131af912eb9c3aac123ab90a Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Thu, 14 May 2026 11:32:28 -0500
Subject: [PATCH 08/26] Add SLURM QoS launcher option

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 tools/launcher/core.py         | 1 +
 tools/launcher/slurm_config.py | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/tools/launcher/core.py b/tools/launcher/core.py
index 8fd4e25ee79..2316236c10c 100644
--- a/tools/launcher/core.py
+++ b/tools/launcher/core.py
@@ -263,6 +263,7 @@ def build_slurm_executor(
     executor = run.SlurmExecutor(
         account=slurm_config.account,
         partition=slurm_config.partition,
+        qos=slurm_config.qos,
         ntasks_per_node=slurm_config.ntasks_per_node,
         gpus_per_node=slurm_config.gpus_per_node,
         nodes=slurm_config.nodes,
diff --git a/tools/launcher/slurm_config.py b/tools/launcher/slurm_config.py
index d2a8cd48d11..fc2ab987850 100644
--- a/tools/launcher/slurm_config.py
+++ b/tools/launcher/slurm_config.py
@@ -33,6 +33,7 @@ class SlurmConfig:
     port: int = 22
     account: str = None
     partition: str = "batch"
+    qos: str = None
     container: str = None
     modelopt_install_path: str = "/usr/local/lib/python3.12/dist-packages/modelopt"
     container_mounts: list[str] = None
@@ -51,6 +52,7 @@ def slurm_factory(
     host: str = os.environ.get("SLURM_HOST", ""),
     account: str = os.environ.get("SLURM_ACCOUNT", ""),
     partition: str = os.environ.get("SLURM_PARTITION", "batch"),
+    qos: str = os.environ.get("SLURM_QOS", None),
     nodes: int = 1,
     ntasks_per_node: int = 1,
     gpus_per_node: int = 1,
@@ -68,6 +70,7 @@ def slurm_factory(
         host=host,
         account=account,
         partition=partition,
+        qos=qos,
         nodes=nodes,
         ntasks_per_node=ntasks_per_node,
         gpus_per_node=gpus_per_node,

From d885ad64036751cf9bab0db587a0e073e8b2251f Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Thu, 14 May 2026 13:10:35 -0500
Subject: [PATCH 09/26] Make PTQ checkpoint validation a required gate

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/ptq/SKILL.md                   |  12 +-
 .../ptq/references/checkpoint-validation.md   | 127 ++++++++++++++----
 .claude/skills/ptq/tests.json                 |  13 ++
 3 files changed, 124 insertions(+), 28 deletions(-)

diff --git a/.claude/skills/ptq/SKILL.md b/.claude/skills/ptq/SKILL.md
index 723d3aae07c..753917882e8 100644
--- a/.claude/skills/ptq/SKILL.md
+++ b/.claude/skills/ptq/SKILL.md
@@ -135,7 +135,15 @@ Report the path and size to the user.
 
 ### Post-quantization validation
 
-Read `references/checkpoint-validation.md` and perform the checkpoint size/bits, quantized-weight coverage, and metadata consistency checks before using the checkpoint for deployment/evaluation.
+This is a required gate before any deployment or evaluation submission. Do not submit an eval, start a serving job, or hand off the checkpoint as ready until the gate has passed.
+
+Read `references/checkpoint-validation.md` and perform all three validation groups on the exact checkpoint path that will be deployed/evaluated:
+
+1. Check output size and estimated bits per weight against the baseline/source checkpoint.
+2. Check quantized-weight coverage against the requested qformat/recipe/config.
+3. Check metadata consistency against the baseline/source model.
+
+Report the gate result before moving on. The report must include source size, output size, output/source size ratio, layer precision counts (for example NVFP4, FP8, INT4, BF16/unquantized excluded, unexpected unquantized, declaration mismatches), and metadata diffs. If the output/source ratio is >= 1.0 for a compression recipe, if any intended layer group is missing quantization, or if metadata changed unexpectedly, stop and fix the checkpoint or ask the user before proceeding.
 
 **Next steps**: If the user wants to deploy or evaluate the quantized checkpoint, use the **deployment** or **evaluation** skill. The checkpoint workspace carries over. If the model required patches during PTQ (e.g., transformers upgrade), the same fixes will likely be needed at deployment and evaluation time.
 
@@ -164,7 +172,7 @@ Read `references/checkpoint-validation.md` and perform the checkpoint size/bits,
 | `references/launcher-guide.md` | Step 4B only (launcher path) |
 | `tools/launcher/CLAUDE.md` | Step 4B only, if you need more launcher detail |
 | `references/unsupported-models.md` | Step 4C only (unlisted model) |
-| `references/checkpoint-validation.md` | Step 5: validate quantization pattern matches recipe |
+| `references/checkpoint-validation.md` | Step 5: mandatory post-PTQ gate before deployment/evaluation |
 | `skills/common/remote-execution.md` | Step 4A/4C only, if target is remote |
 | `skills/common/slurm-setup.md` | Step 4A/4C only, if using SLURM manually (not launcher) |
 | `references/slurm-setup-ptq.md` | Step 4A/4C only, PTQ-specific SLURM (container, GPU sizing, FSDP2) |
diff --git a/.claude/skills/ptq/references/checkpoint-validation.md b/.claude/skills/ptq/references/checkpoint-validation.md
index c946b778ae3..f62ecbe0cc2 100644
--- a/.claude/skills/ptq/references/checkpoint-validation.md
+++ b/.claude/skills/ptq/references/checkpoint-validation.md
@@ -1,12 +1,29 @@
 # Post-Quantization Checkpoint Validation
 
-Before treating an exported checkpoint as ready for deployment/evaluation, verify checkpoint size/bits, quantized-weight coverage, and metadata consistency. If any check fails, stop and fix the checkpoint or ask the user before using it for release-quality deployment/evaluation.
+Before treating an exported checkpoint as ready for deployment/evaluation, verify checkpoint size/bits, quantized-weight coverage, and metadata consistency. This is a gate, not a guideline: do not submit evals, start serving jobs, or mark the checkpoint ready until all required checks pass and the validation report is recorded.
 
 ## Required checks
 
-1. The quantized checkpoint is smaller on disk than the baseline/source checkpoint and has lower estimated bits per weight. A partial-quantization recipe may not shrink every tensor, but it should still match the intended quantization coverage. If the size reduction is small or missing, explain why before proceeding.
-2. The weights that were actually quantized match what the requested qformat/recipe/config targeted. Quantization config patterns may silently miss layers if the model uses non-standard naming — this only surfaces later as deployment failures when the serving framework tries to load unquantized weights as quantized.
-3. Metadata that should not change still matches the baseline/source model. Compare generation settings, tokenizer files, chat template, model architecture fields, max positions/context length, and special tokens; quantization should affect weights and quantization metadata, not silently change prompting or generation behavior.
+1. The quantized checkpoint is smaller on disk than the baseline/source checkpoint and has lower estimated bits per weight. Record source size, output size, and output/source ratio. A partial-quantization recipe may not shrink every tensor, but it should still match the intended quantization coverage. If the size reduction is small or missing, explain why before proceeding.
+2. The weights that were actually quantized match what the requested qformat/recipe/config targeted. Record layer precision counts grouped by actual/declared precision, such as NVFP4, FP8, INT4, BF16/unquantized excluded, unexpected unquantized, and declaration mismatches. Quantization config patterns may silently miss layers if the model uses non-standard naming — this only surfaces later as deployment failures when the serving framework tries to load unquantized weights as quantized.
+3. Metadata that should not change still matches the baseline/source model. Compare generation settings, tokenizer files, chat template, model architecture fields, max positions/context length, and special tokens; quantization should affect weights and quantization metadata, not silently change prompting or generation behavior. Record every diff and classify it as expected or blocking.
+
+## Gate report
+
+Before moving to deployment/evaluation, report a table in this shape:
+
+| Check | Result |
+| --- | --- |
+| Size vs source | `<output> GB / <source> GB = <ratio>x`; PASS only if the ratio matches the recipe's compression intent |
+| Layer precision counts | `<count> NVFP4 / <count> FP8 / <count> INT4 / <count> BF16-or-excluded / <count> unexpected / <count> declaration mismatches` |
+| Metadata | `no unexpected diffs` or list exact diffs |
+
+Stop instead of proceeding if:
+
+- Output/source ratio is `>= 1.0` for a compression recipe, unless the user explicitly accepts the explanation.
+- Any layer group intended to be quantized has zero or unexpectedly low coverage.
+- Any layer has quantization metadata inconsistent with its declared precision.
+- Prompting, tokenizer, generation, architecture, context-length, or special-token metadata changed unexpectedly.
 
 ## Expected quantization patterns by recipe
 
@@ -19,18 +36,49 @@ Before treating an exported checkpoint as ready for deployment/evaluation, verif
 | `fp8` | All linear layers | lm_head, norms, embeddings |
 | `int4_awq` | All linear layers | lm_head, norms, embeddings |
 
-## Validation script
+## Size check
+
+Compare only checkpoint weight files, not cache directories or eval artifacts:
+
+```bash
+python3 -c "
+from pathlib import Path
+
+source = Path('<source_checkpoint_path>')
+output = Path('<output_path>')
+
+def safetensor_bytes(path):
+    files = list(path.glob('*.safetensors')) if path.is_dir() else [path]
+    return sum(p.stat().st_size for p in files)
+
+src = safetensor_bytes(source)
+dst = safetensor_bytes(output)
+ratio = dst / src if src else float('nan')
+print(f'Source safetensors: {src / 1e9:.2f} GB')
+print(f'Output safetensors: {dst / 1e9:.2f} GB')
+print(f'Output/source ratio: {ratio:.2f}x')
+"
+```
 
-Run against the exported checkpoint to check every linear layer is either quantized (has scale params) or explicitly excluded:
+Treat the ratio as a first-order proxy for the change in bits per weight relative to the source checkpoint, unless you separately load tensors and compute exact parameter bit counts. For compression recipes, a ratio at or above `1.0x` is blocking unless the user explicitly accepts the explanation.
+
+## Layer coverage and precision script
+
+Run against the exported checkpoint to check every linear layer is either quantized with the expected precision or explicitly excluded. This handles both uniform `quant_algo` exports and mixed-precision `quantized_layers` exports:
 
 ```bash
 python3 -c "
-import json, fnmatch
+import collections, fnmatch, json, os
 
 output = '<output_path>'
-idx = json.load(open(f'{output}/model.safetensors.index.json'))
-cfg = json.load(open(f'{output}/hf_quant_config.json'))
-excludes = cfg['quantization']['exclude_modules']
+idx = json.load(open(os.path.join(output, 'model.safetensors.index.json')))
+cfg = json.load(open(os.path.join(output, 'hf_quant_config.json')))
+q = cfg.get('quantization', {})
+excludes = q.get('exclude_modules', []) or q.get('ignore', [])
+declared_layers = q.get('quantized_layers') or {}
+uniform_algo = q.get('quant_algo')
+if uniform_algo == 'MIXED_PRECISION':
+    uniform_algo = None
 
 all_keys = set(idx['weight_map'].keys())
 # Identify linear weight params (skip norms, embeddings, scalars, scales)
@@ -38,27 +86,48 @@ skip_suffixes = ('_scale', '_scale_2', 'layernorm', 'layer_norm', 'norm.weight',
 linear_weights = sorted(k for k in all_keys
     if k.endswith('.weight') and not any(s in k.lower() for s in skip_suffixes))
 
-# Check which have quantization scales
-quantized, excluded, unexpected = [], [], []
+def is_excluded(base, weight):
+    return any(fnmatch.fnmatch(weight, p) or fnmatch.fnmatch(base, p) for p in excludes)
+
+def declared_algo(base):
+    if base in declared_layers:
+        return declared_layers[base].get('quant_algo', 'DECLARED_UNKNOWN')
+    if is_excluded(base, base + '.weight'):
+        return 'BF16/EXCLUDED'
+    if uniform_algo:
+        return uniform_algo
+    return 'UNDECLARED'
+
+precision_counts = collections.Counter()
+unexpected = []
+mismatches = []
 for w in linear_weights:
     base = w.rsplit('.weight', 1)[0]
-    has_scales = any(f'{base}.{s}' in all_keys for s in ['weight_scale', 'input_scale'])
-    is_excluded = any(fnmatch.fnmatch(w, p) or fnmatch.fnmatch(base, p) for p in excludes)
-
-    if has_scales:
-        quantized.append(w)
-    elif is_excluded:
-        excluded.append(w)
+    algo = declared_algo(base)
+    has_scales = any(f'{base}.{s}' in all_keys for s in
+                     ['weight_scale', 'weight_scale_2', 'input_scale', 'activation_scale', 'weight_scale_inv'])
+
+    if has_scales and algo not in ('BF16/EXCLUDED', 'UNDECLARED'):
+        precision_counts[algo] += 1
+    elif has_scales and algo in ('BF16/EXCLUDED', 'UNDECLARED'):
+        precision_counts['QUANTIZED_BUT_' + algo.replace('/', '_')] += 1
+        mismatches.append((w, algo, 'has quantization scales'))
+    elif not has_scales and algo == 'BF16/EXCLUDED':
+        precision_counts['BF16/EXCLUDED'] += 1
     else:
-        unexpected.append(w)
-
-print(f'Quantized layers: {len(quantized)}')
-print(f'Excluded layers (in exclude_modules): {len(excluded)}')
+        precision_counts['UNEXPECTED_UNQUANTIZED'] += 1
+        unexpected.append((w, algo, 'no quantization scales'))
+
+print('Layer precision counts:')
+for name, count in sorted(precision_counts.items()):
+    print(f'  {name}: {count}')
+print(f'Unexpected unquantized layers: {len(unexpected)}')
+print(f'Declaration mismatches: {len(mismatches)}')
 if unexpected:
     print(f'\nWARNING: {len(unexpected)} layers have NO scales and are NOT in exclude list:')
     # Group by module type for readability
     groups = {}
-    for w in unexpected:
+    for w, algo, reason in unexpected:
         parts = w.split('.')
         module_type = next((p for p in parts if p in
             ('self_attn', 'mlp', 'experts', 'router', 'lm_head', 'embed_tokens', 'vision_tower')), 'other')
@@ -70,8 +139,14 @@ if unexpected:
     print('Likely cause: quantization config patterns did not match these module names.')
     print('This WILL cause deployment failures (framework loads them as quantized but they are BF16).')
     print('Fix: add missing patterns to the config, or add to exclude_modules if intentionally unquantized.')
-else:
-    print('\nAll layers are either quantized or explicitly excluded. Checkpoint is consistent.')
+if mismatches:
+    print(f'\nWARNING: {len(mismatches)} layers have declaration/metadata mismatches:')
+    for w, algo, reason in mismatches[:20]:
+        print(f'  {w}: declared {algo}, {reason}')
+    if len(mismatches) > 20:
+        print(f'  ... {len(mismatches) - 20} more')
+if not unexpected and not mismatches:
+    print('\nAll layers are quantized at the declared precision or explicitly excluded.')
 "
 ```
 
diff --git a/.claude/skills/ptq/tests.json b/.claude/skills/ptq/tests.json
index 706da3693b8..307a95d52fb 100644
--- a/.claude/skills/ptq/tests.json
+++ b/.claude/skills/ptq/tests.json
@@ -72,6 +72,19 @@
         "Applies manual dequantize_fp8_params for fused expert tensors",
         "Runs smoke test first, then full calibration"
       ]
+    },
+    {
+      "id": 6,
+      "prompt": "Quantize an FP8 source checkpoint with a partial NVFP4 recipe, then evaluate the quantized checkpoint",
+      "expected_output": "Agent treats post-PTQ checkpoint validation as a required gate before submitting any eval",
+      "files": [],
+      "expectations": [
+        "After PTQ completes, reads checkpoint-validation.md before creating or submitting eval jobs",
+        "Reports source safetensors size, output safetensors size, and output/source ratio",
+        "Reports layer precision counts, including NVFP4/FP8/intended quantized layers, BF16 or excluded layers, unexpected unquantized layers, and declaration mismatches",
+        "Checks metadata consistency against the source checkpoint and records any diffs",
+        "Stops before eval submission if the size ratio is >= 1.0 for a compression recipe, if intended layer coverage is missing, or if metadata changes unexpectedly"
+      ]
     }
   ]
 }

From be795554c5bc43acbadf9c915f7d51c339d4fe3f Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Thu, 14 May 2026 13:25:22 -0500
Subject: [PATCH 10/26] Refine evaluation run gating guidance

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md            | 73 ++++++++++++++-----
 .../recipes/examples/example_eval.yaml        |  2 +-
 .claude/skills/evaluation/tests/evals.json    | 10 ++-
 3 files changed, 62 insertions(+), 23 deletions(-)

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index 325b18b1bd2..d5d61e85b2b 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -32,6 +32,9 @@ Config Generation Progress:
 - [ ] Step 7: Advanced - Interceptors
 - [ ] Step 7.5: Check container registry auth for private images (SLURM only)
 - [ ] Step 8: Run the evaluation
+  - [ ] Step 8.1: Dry-run / NEL CLI config validation
+  - [ ] Step 8.2: Limited-samples canary
+  - [ ] Step 8.3: Full evaluation
 - [ ] Step 9: Verify evaluation comparability
 ```
 
@@ -261,7 +264,9 @@ ssh <host> "grep -E '^\s*machine\s+' ~/.config/enroot/.credentials 2>/dev/null"
 
 **Step 8: Run the evaluation**
 
-Print the following commands to the user. Propose to execute them in order to confirm the config works as expected before the full run.
+Use a gated `dry-run -> canary -> full-run` sequence. Run the commands directly
+when the user has asked you to launch evals; otherwise, ask before submitting jobs.
+Do not submit the full run until the dry-run and limited-samples canary both pass.
 
 **Important**: Export required environment variables based on your config. If any tokens or keys are missing, point the user to `recipes/env.example` — it lists all possible keys with notes on which tasks need them. Ask the user to copy it, fill in their keys, and source it:
 
@@ -279,33 +284,60 @@ export NEMO_EVALUATOR_TRUST_PRE_CMD=1
 export DUMMY_API_KEY=dummy
 ```
 
-1. **Dry-run** (validates config without running):
+**Step 8.1: Dry-run / NEL CLI config validation** (validates config without running):
 
-   ```bash
-   nel run --config <config_path> --dry-run
-   ```
+```bash
+nel run --config <config_path> --dry-run
+```
 
-2. **Test with limited samples** (quick validation run):
+Check the NEL output before launching anything. Fix unresolved `???` values,
+bad Hydra overrides, missing env var references, invalid mounts, image/container
+problems, sbatch issues, and obvious deployment argument errors before moving on.
 
-   ```bash
-   nel run --config <config_path> -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=10
-   ```
+**Step 8.2: Limited-samples canary** (operational validation before production):
 
-3. **Re-run a single task** (useful for debugging or re-testing after config changes):
+```bash
+nel run --config <config_path> -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=10
+```
 
-   ```bash
-   nel run --config <config_path> -t <task_name>
-   ```
+Use the canary to tune parallelism and catch runtime failures that the dry-run
+cannot catch: judge API auth/rate-limit errors, evaluation container failures,
+code-execution sandbox/container errors, vLLM health/OOM issues, bad request
+formatting, log path problems, and unexpectedly low evaluated-sample counts.
+Inspect logs before accepting the canary, not just result files:
+
+```bash
+nel status <canary_invocation_id>
+nel info <canary_invocation_id> --logs
+ssh <user>@<host> "grep -i 'error\|failed\|exception\|timeout\|unauthorized' <log_path>/*.log"
+```
 
-   Combine with `-o` for limited samples: `nel run --config <config_path> -t <task_name> -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=10`
+If the benchmark set mixes different dependency profiles, canary each risky
+class or task: LLM-judge tasks, code-execution tasks, and ordinary model-only
+tasks can fail for different reasons. For evals that depend on inference judges
+or code execution containers, start with conservative `parallelism` and raise it
+only after the canary logs show those dependencies are healthy. Do not over-raise
+parallelism just to saturate the model server; judge services and code containers
+often become the bottleneck or failure point first.
 
-4. **Full evaluation** (production run):
+**Single-task rerun** (useful for canary debugging or re-testing after config changes):
 
-   ```bash
-   nel run --config <config_path>
-   ```
+```bash
+nel run --config <config_path> -t <task_name>
+```
+
+Combine with `-o` for limited samples: `nel run --config <config_path> -t <task_name> -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=10`
+
+**Step 8.3: Full evaluation** (production run after the canary passes):
+
+```bash
+nel run --config <config_path>
+```
 
-After the dry-run, check the output from `nel` for any problems with the config. If there are no problems, propose to first execute the test run with limited samples and then execute the full evaluation. If there are problems, resolve them before executing the full evaluation.
+Before the full run, remove the `limit_samples` override and keep only the
+parallelism/settings that the canary validated. If the canary fails, fix the
+config, credentials, image/container, judge setup, code-execution environment, or
+parallelism, then rerun the canary before launching the full evaluation.
 
 **Monitoring Progress**
 
@@ -361,5 +393,8 @@ Config Generation Progress:
 - [ ] Step 7: Advanced - Interceptors
 - [ ] Step 7.5: Check container registry auth for private images (SLURM only)
 - [ ] Step 8: Run the evaluation
+  - [ ] Step 8.1: Dry-run / NEL CLI config validation
+  - [ ] Step 8.2: Limited-samples canary
+  - [ ] Step 8.3: Full evaluation
 - [ ] Step 9: Verify evaluation comparability
 ```
diff --git a/.claude/skills/evaluation/recipes/examples/example_eval.yaml b/.claude/skills/evaluation/recipes/examples/example_eval.yaml
index 77887b3f8c3..944d3b61302 100644
--- a/.claude/skills/evaluation/recipes/examples/example_eval.yaml
+++ b/.claude/skills/evaluation/recipes/examples/example_eval.yaml
@@ -25,7 +25,7 @@
 # Run a single task:
 #   nel run --config ... -t ns_gpqa
 #
-# Smoke test (2 samples):
+# Canary (2 samples): use this before a full run to validate logs and tune parallelism.
 #   nel run --config ... -o ++evaluation.nemo_evaluator_config.config.params.limit_samples=2
 defaults:
   - execution: slurm/default
diff --git a/.claude/skills/evaluation/tests/evals.json b/.claude/skills/evaluation/tests/evals.json
index 9fc6712a53d..d3ea82c35fd 100644
--- a/.claude/skills/evaluation/tests/evals.json
+++ b/.claude/skills/evaluation/tests/evals.json
@@ -22,7 +22,9 @@
       "Presents task list and waits for user confirmation before proceeding",
       "Configures request and response logging interceptors under evaluation.nemo_evaluator_config.config.target.api_endpoint.adapter_config using correct field names (max_logged_requests/max_logged_responses, not max_saved_*)",
       "Handles dry-run failure for missing HF_TOKEN_FOR_GPQA_DIAMOND by offering to fix the config",
-      "Successfully submits test run with limit_samples=10 after dry-run passes",
+      "Successfully submits a limited-samples canary with limit_samples=10 after dry-run passes",
+      "Inspects canary logs for judge, evaluation container, deployment, and code-execution errors before allowing the full run",
+      "Tunes parallelism from canary results and avoids over-raising it for judge-backed or code-execution tasks",
       "Provides monitoring commands (nel status, nel info --logs) and inspects server logs via SSH when asked"
     ]
   },
@@ -41,7 +43,8 @@
       "Recommends accuracy-sensitive benchmarks from references/quantization-benchmarks.md",
       "Searches web for Llama-3.1-8B model card and extracts sampling params, context length, TP settings",
       "Fills in remaining missing values by asking user",
-      "Runs dry-run, then test with limit_samples=10, then full evaluation",
+      "Runs dry-run, then a limited-samples canary with limit_samples=10, then full evaluation only after canary log checks pass",
+      "Tunes parallelism from the canary and avoids over-raising it when benchmarks depend on inference judges or code execution containers",
       "Reports accuracy results per benchmark"
     ]
   },
@@ -58,7 +61,8 @@
       "Reads references/quantization-benchmarks.md and recommends accuracy-sensitive benchmarks",
       "Uses WebSearch to research model card for sampling params and context length",
       "Fills in SLURM-specific values: hostname, account, partition from user input",
-      "Runs dry-run validation before full evaluation",
+      "Runs dry-run validation and a limited-samples canary before full evaluation",
+      "Checks canary logs for judge, container, deployment, and code-execution failures before treating the run as ready",
       "Provides SSH-based log monitoring commands for SLURM execution"
     ]
   }

From 8b6cc5fff65f4892eb1ffeea26669a34a1478bf9 Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Thu, 14 May 2026 13:33:01 -0500
Subject: [PATCH 11/26] Document NEL timeout resume behavior

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index d5d61e85b2b..2af35db96b1 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -18,6 +18,17 @@ If `MODELOPT_WORKSPACE_ROOT` is set, read `skills/common/workspace-management.md
 
 This skill is often the final stage of the PTQ → Deploy → Eval pipeline. If the model required runtime patches during deployment (transformers upgrade, framework source fixes), carry those patches into the NEL config via `deployment.command`.
 
+### NEL Timeout and Resume Behavior
+
+NEL submissions commonly create a dependency chain of SLURM jobs. The first job
+runs the evaluation and writes response/result caches. A dependent follow-on job
+resumes from those caches if the first job times out, then queues another follow-on
+job so long-running evals can continue across walltime windows.
+
+Do not assume a timeout means the evaluation failed or produced invalid results.
+Treat timeouts as expected resume events until `nel status`/`nel info`, artifacts,
+or logs show a terminal failure or an invalid run.
+
 ### Workflow
 
 ```text

From 717507f199694913a1d0b894c82f89d46907a79e Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Thu, 14 May 2026 13:35:59 -0500
Subject: [PATCH 12/26] Split evaluation validation and comparability steps

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md         | 35 ++++++++++++++++------
 .claude/skills/evaluation/tests/evals.json |  8 ++++-
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index 2af35db96b1..3949af887b2 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -46,7 +46,8 @@ Config Generation Progress:
   - [ ] Step 8.1: Dry-run / NEL CLI config validation
   - [ ] Step 8.2: Limited-samples canary
   - [ ] Step 8.3: Full evaluation
-- [ ] Step 9: Verify evaluation comparability
+- [ ] Step 9: Verify completed evaluation run
+- [ ] Step 10: Verify baseline-vs-quantized comparability
 ```
 
 **Step 1: Check prerequisites**
@@ -320,7 +321,7 @@ Inspect logs before accepting the canary, not just result files:
 ```bash
 nel status <canary_invocation_id>
 nel info <canary_invocation_id> --logs
-ssh <user>@<host> "grep -i 'error\|failed\|exception\|timeout\|unauthorized' <log_path>/*.log"
+ssh <user>@<host> "grep -i 'traceback\|exception\|error\|failed\|oom\|killed\|timeout\|unauthorized\|rate limit\|sandbox\|container\|judge\|parse\|scoring' <log_path>/*.log"
 ```
 
 If the benchmark set mixes different dependency profiles, canary each risky
@@ -354,17 +355,32 @@ parallelism, then rerun the canary before launching the full evaluation.
 
 After job submission, register the job per the **monitor skill** for durable cross-session tracking. For one-off queries (live status, debugging a failed run, analyzing results) use the **launching-evals skill**; for querying past runs in MLflow use **accessing-mlflow**.
 
-**Step 9: Verify evaluation comparability**
+**Step 9: Verify completed evaluation run**
 
-Before treating a baseline-vs-quantized delta as a model quality result, verify the two runs are comparable:
+Before pulling/reporting scores, validate the completed run itself. Do not accept a run as complete just because `results.yml` or a summary file exists.
+
+For each completed invocation/run directory, whether baseline, quantized, or a single-model run:
+
+1. Inspect client, server/deployment, SLURM, judge, and task-specific/code-execution logs as applicable. Search for `Traceback`, `Exception`, `ERROR`, `FAILED`, `OOM`, `Killed`, `timeout`, `rate limit`, `unauthorized`, `connection refused/reset`, `health check`, `sandbox`, `container`, `judge`, `parse`, `scoring`, and task-specific failure strings.
+2. Confirm the inference server loaded the intended checkpoint/model and stayed healthy through the run: no startup failure, mid-run crash/restart, OOM, request validation failure, max-context truncation, quantization load error, or repeated 4xx/5xx responses.
+3. For judge-backed tasks, confirm judge calls succeeded and were parsed/scored correctly: no auth/rate-limit failures, malformed judge responses, invalid JSON, missing scores, or fallback/default scores.
+4. For code-execution tasks, inspect executor/sandbox/container logs for setup failures, package install failures, timeouts, thread/process exhaustion, permission errors, harness crashes, or skipped tests that would make scores non-comparable.
+5. Confirm sample accounting: the expected number of samples and repeats matches the number completed and scored, with no unexpected dropped/skipped/failed samples, `unknown_agent_error`, `failed_samples_policy` aborts, empty outputs, or partial result files.
+6. If reasoning traces are present, confirm they are consistently parsed, stripped, or ignored before scoring. Check for parser errors, unmatched reasoning delimiters, `finish_reason: length`, reasoning text leaked into answers, answers stripped with the reasoning, or reasoning disabled when the config intended it to be active.
+
+Report the run-validation summary before any score: log scan status, sample accounting, reasoning/answer parsing status, and any errors or warnings found. If any validation item fails, either rerun/fix it or label the result as incomplete or invalid.
+
+**Step 10: Verify baseline-vs-quantized comparability**
+
+Before treating a baseline-vs-quantized delta as a model quality result, verify the validated runs are comparable:
 
 1. Confirm the prompt text and chat template/rendered messages match between the baseline and quantized evaluations.
 2. Confirm generation settings match, including temperature, top_p, top_k, max tokens, stop strings, reasoning mode/budget, and any task-specific overrides.
-3. If reasoning traces are present, confirm they are stripped or ignored before scoring, consistently for both runs.
-4. Confirm the number of evaluated samples matches for each task and split.
+3. Confirm reasoning-trace handling is consistent between runs.
+4. Confirm the number of evaluated/scored samples matches for each task and split.
 5. Confirm the same accuracy metric/score field is used for the baseline and quantized comparison.
 
-If any item differs, either rerun with matched settings or label the result as not an apples-to-apples quantization comparison.
+Report the comparability summary alongside the score: prompt/template status, generation-setting status, sample-count status, reasoning-handling status, and the exact score field used. If any item differs, either rerun with matched settings or label the result as not an apples-to-apples quantization comparison.
 
 **NEL-specific diagnostics** (for debugging failures):
 
@@ -380,7 +396,7 @@ nel info <invocation_id> --logs
 ssh <user>@<host> "tail -100 <log_path>/server-<slurm_job_id>-*.log"   # deployment errors
 ssh <user>@<host> "tail -100 <log_path>/client-<slurm_job_id>.log"     # evaluation errors
 ssh <user>@<host> "tail -100 <log_path>/slurm-<slurm_job_id>.log"      # scheduling/walltime
-ssh <user>@<host> "grep -i 'error\|failed' <log_path>/*.log"           # search all logs
+ssh <user>@<host> "grep -i 'traceback\|exception\|error\|failed\|oom\|killed\|timeout\|unauthorized\|rate limit\|sandbox\|container\|judge\|parse\|scoring' <log_path>/*.log"  # search all logs
 ```
 
 ---
@@ -407,5 +423,6 @@ Config Generation Progress:
   - [ ] Step 8.1: Dry-run / NEL CLI config validation
   - [ ] Step 8.2: Limited-samples canary
   - [ ] Step 8.3: Full evaluation
-- [ ] Step 9: Verify evaluation comparability
+- [ ] Step 9: Verify completed evaluation run
+- [ ] Step 10: Verify baseline-vs-quantized comparability
 ```
diff --git a/.claude/skills/evaluation/tests/evals.json b/.claude/skills/evaluation/tests/evals.json
index d3ea82c35fd..823d5bc6790 100644
--- a/.claude/skills/evaluation/tests/evals.json
+++ b/.claude/skills/evaluation/tests/evals.json
@@ -25,6 +25,8 @@
       "Successfully submits a limited-samples canary with limit_samples=10 after dry-run passes",
       "Inspects canary logs for judge, evaluation container, deployment, and code-execution errors before allowing the full run",
       "Tunes parallelism from canary results and avoids over-raising it for judge-backed or code-execution tasks",
+      "Before reporting final scores, inspects full-run logs for judge, inference server, code-execution, reasoning parsing, dropped-sample, and scoring errors",
+      "Keeps final-run validity checks separate from baseline-vs-quantized comparability checks",
       "Provides monitoring commands (nel status, nel info --logs) and inspects server logs via SSH when asked"
     ]
   },
@@ -45,7 +47,9 @@
       "Fills in remaining missing values by asking user",
       "Runs dry-run, then a limited-samples canary with limit_samples=10, then full evaluation only after canary log checks pass",
       "Tunes parallelism from the canary and avoids over-raising it when benchmarks depend on inference judges or code execution containers",
-      "Reports accuracy results per benchmark"
+      "Before reporting accuracy, scans final run logs and sample accounting for runtime errors, dropped/skipped samples, reasoning parsing problems, and wrong score-field usage",
+      "Runs comparability verification as a separate step after final-run validation when comparing baseline and quantized results",
+      "Reports accuracy results per benchmark only after log validation passes"
     ]
   },
   {
@@ -63,6 +67,8 @@
       "Fills in SLURM-specific values: hostname, account, partition from user input",
       "Runs dry-run validation and a limited-samples canary before full evaluation",
       "Checks canary logs for judge, container, deployment, and code-execution failures before treating the run as ready",
+      "Checks final run logs and sample accounting before accepting result files as complete and valid",
+      "Separately verifies baseline-vs-quantized comparability before presenting an accuracy delta",
       "Provides SSH-based log monitoring commands for SLURM execution"
     ]
   }

From 9092bc47d279dbb6d808b7caa6175ee42ad2d24c Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Thu, 14 May 2026 14:14:53 -0500
Subject: [PATCH 13/26] Convert evaluation task snippets to references

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md            |  24 +--
 .../recipes/examples/example_eval.yaml        |   4 +-
 .../evaluation/recipes/tasks/aime2025.md      |  37 +++++
 .../evaluation/recipes/tasks/aime2025.yaml    |  19 ---
 .../skills/evaluation/recipes/tasks/gpqa.md   |  98 ++++++++++++
 .../skills/evaluation/recipes/tasks/gpqa.yaml |  16 --
 .../evaluation/recipes/tasks/ifbench.md       |  32 ++++
 .../evaluation/recipes/tasks/ifbench.yaml     |  15 --
 .../evaluation/recipes/tasks/livecodebench.md |  34 +++++
 .../recipes/tasks/livecodebench.yaml          |  17 ---
 .../evaluation/recipes/tasks/mmlu_pro.md      |  33 +++++
 .../evaluation/recipes/tasks/mmlu_pro.yaml    |  16 --
 .../evaluation/recipes/tasks/scicode.md       | 139 ++++++++++++++++++
 .../evaluation/recipes/tasks/scicode.yaml     |  16 --
 14 files changed, 389 insertions(+), 111 deletions(-)
 create mode 100644 .claude/skills/evaluation/recipes/tasks/aime2025.md
 delete mode 100644 .claude/skills/evaluation/recipes/tasks/aime2025.yaml
 create mode 100644 .claude/skills/evaluation/recipes/tasks/gpqa.md
 delete mode 100644 .claude/skills/evaluation/recipes/tasks/gpqa.yaml
 create mode 100644 .claude/skills/evaluation/recipes/tasks/ifbench.md
 delete mode 100644 .claude/skills/evaluation/recipes/tasks/ifbench.yaml
 create mode 100644 .claude/skills/evaluation/recipes/tasks/livecodebench.md
 delete mode 100644 .claude/skills/evaluation/recipes/tasks/livecodebench.yaml
 create mode 100644 .claude/skills/evaluation/recipes/tasks/mmlu_pro.md
 delete mode 100644 .claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml
 create mode 100644 .claude/skills/evaluation/recipes/tasks/scicode.md
 delete mode 100644 .claude/skills/evaluation/recipes/tasks/scicode.yaml

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index 3949af887b2..e41723089a2 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -56,13 +56,15 @@ Test that `nel` is installed with `nel --version`. If not, instruct the user to
 
 If the user already has a config file (e.g., "run this config", "evaluate with my-config.yaml"), skip to Step 8. Optionally review it for common issues (missing `???` values, quantization flags) before running.
 
-**Shortcut: use pre-built task snippets.** If the user asks for a specific benchmark (e.g., "run MMLU-Pro", "evaluate with AIME"), check `recipes/tasks/` (relative to this skill's directory) for a matching task snippet. Available: mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode. Task snippets contain only the task-specific config (name, params, repeats) — not the full NEL config. To use them:
+**Shortcut: use task references.** For named benchmarks, read the matching
+`recipes/tasks/<name>.md` before creating or editing the config. Available:
+mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode.
 
-1. Read the task snippet(s) the user wants
+1. Read the task reference(s) the user wants.
 2. Use `recipes/examples/example_eval.yaml` as the base config template
-3. Replace the `tasks:` section with the selected snippet(s)
-4. Do Step 3 (auto-detect model settings from checkpoint) and Step 4 (fill in `???` values)
-5. Proceed to Step 7.5/8
+3. Copy the selected YAML fragment(s) into `evaluation.tasks`.
+4. Apply any notes from the reference.
+5. Do Step 3, Step 4, then Step 7.5/8.
 
 **Step 2: Build the base config file**
 
@@ -192,11 +194,9 @@ For reasoning-capable models, prefer reasoning mode for evaluation because it us
 Show tasks in the current config. Loop until the user confirms the task list is final:
 
 1. Tell the user: "Run `nel ls tasks` to see all available tasks".
-2. If the task list includes a benchmark with a pre-built snippet in `recipes/tasks/`,
-   prefer that snippet over hand-written task overrides unless the user explicitly asks
-   for different sampling or prompt settings. For reasoning-mode comparisons, keep the
-   recipe repeat counts; for tasks without a recipe, use `num_repeats >= 3` when the
-   benchmark supports repeats.
+2. If the task list includes a benchmark with a reference in `recipes/tasks/`,
+   read it before editing the config and prefer its YAML fragment unless the user
+   asks for different settings. Keep the reference repeat counts.
 3. Ask if they want to add/remove tasks or add/remove/modify task-specific parameter overrides.
    To add per-task `nemo_evaluator_config` as specified by the user, e.g.:
 
@@ -370,6 +370,10 @@ For each completed invocation/run directory, whether baseline, quantized, or a s
 
 Report the run-validation summary before any score: log scan status, sample accounting, reasoning/answer parsing status, and any errors or warnings found. If any validation item fails, either rerun/fix it or label the result as incomplete or invalid.
 
+For score harvesting, use the `Score Extraction` section (and its Python snippet,
+when provided) of the matching task reference in `recipes/tasks/<name>.md`. Do not
+rely on ad hoc `results.yml` greps when it defines the canonical score and stderr fields.
+
 **Step 10: Verify baseline-vs-quantized comparability**
 
 Before treating a baseline-vs-quantized delta as a model quality result, verify the validated runs are comparable:
diff --git a/.claude/skills/evaluation/recipes/examples/example_eval.yaml b/.claude/skills/evaluation/recipes/examples/example_eval.yaml
index 944d3b61302..22bcb167fdc 100644
--- a/.claude/skills/evaluation/recipes/examples/example_eval.yaml
+++ b/.claude/skills/evaluation/recipes/examples/example_eval.yaml
@@ -2,8 +2,8 @@
 #
 # A balanced set of benchmarks for validating quantized model quality.
 # Copy this file and customize for your needs.
-# Task snippets in recipes/tasks/ define per-task configs — the agent
-# composes them into a runnable config like this one.
+# Task references in recipes/tasks/ define benchmark requirements and YAML
+# fragments — the agent composes them into a runnable config like this one.
 #
 # Includes:
 #   - MMLU-Pro (knowledge, completions)
diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.md b/.claude/skills/evaluation/recipes/tasks/aime2025.md
new file mode 100644
index 00000000000..9196e2ec557
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/aime2025.md
@@ -0,0 +1,37 @@
+# AIME 2025
+
+## Task Details
+
+- Task: `ns_aime2025`
+- Harness: NeMo Skills, chat
+- Primary metric: `pass@1[avg-of-16] symbolic_correct`
+- Run time: Long for reasoning models with lengthy thinking traces
+- Repeats: 16
+- The AA variant, `simple_evals.AIME_2025`, requires `JUDGE_API_KEY`.
+  This NeMo Skills variant uses symbolic scoring and does not require an
+  external judge API key.
+
+## Params
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_aime2025
+  nemo_evaluator_config:
+    config:
+      params:
+        request_timeout: 100000
+        max_retries: 10
+        extra:
+          num_repeats: 16
+    target:
+      api_endpoint:
+        adapter_config:
+          params_to_remove:
+            - max_new_tokens
+            - max_completion_tokens
+```
+
+## Score Extraction
diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.yaml b/.claude/skills/evaluation/recipes/tasks/aime2025.yaml
deleted file mode 100644
index 1cf5643f481..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/aime2025.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-# AIME 2025 (NeMo Skills, chat)
-# Primary metric: pass@1[avg-of-16] symbolic_correct
-# Run time: Long (reasoning models generate lengthy thinking traces) | Repeats: 16
-# Note: The AA variant (simple_evals.AIME_2025) requires JUDGE_API_KEY.
-#       This NeMo Skills variant uses symbolic scoring — no external API keys needed.
-  - name: ns_aime2025
-    nemo_evaluator_config:
-      config:
-        params:
-          request_timeout: 100000
-          max_retries: 10
-          extra:
-            num_repeats: 16
-      target:
-        api_endpoint:
-          adapter_config:
-            params_to_remove:
-              - max_new_tokens
-              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/gpqa.md b/.claude/skills/evaluation/recipes/tasks/gpqa.md
new file mode 100644
index 00000000000..d6e5b2b6198
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/gpqa.md
@@ -0,0 +1,98 @@
+# GPQA Diamond
+
+## Task Details
+
+- Task: `ns_gpqa`
+- Harness: NeMo Skills, chat
+- Primary metric: `pass@1[avg-of-16] symbolic_correct`
+- Run time: Short
+- Samples: 16
+
+## Params
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_gpqa
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          args: ++prompt_config=eval/aai/mcq-4choices
+          n_samples: 16
+    target:
+      api_endpoint:
+        adapter_config:
+          params_to_remove:
+            - max_new_tokens
+            - max_completion_tokens
+```
+
+## Score Extraction
+
+GPQA accuracy comes from:
+
+```text
+results.groups.gpqa.metrics."pass@1[avg-of-N]".scores.symbolic_correct.value
+```
+
+For repeated runs, report stderr as percentage points:
+
+```text
+symbolic_correct_statistics_std_err_across_runs.value * 100
+```
+
+Prefer the `pass@1[avg-of-N]` metric matching the configured repeat count. If it is
+unknown or missing from the results, use the highest `avg-of-N`, else plain `pass@1`.
+
+```python
+import re
+import sys
+import yaml
+
+
+def avg_of(metric_name):  # N from a "pass@1[avg-of-N]" metric name, else None
+    match = re.fullmatch(r"pass@1\[avg-of-(\d+)\]", metric_name)  # whole name must match
+    return int(match.group(1)) if match else None
+
+
+def select_metric(metrics, repeats=None):  # choose which metric key to report
+    if repeats is not None:
+        expected = f"pass@1[avg-of-{repeats}]"
+        if expected in metrics:  # exact match for the requested repeat count
+            return expected
+
+    repeated = [name for name in metrics if avg_of(name) is not None]
+    if repeated:
+        return max(repeated, key=avg_of)  # otherwise the largest avg-of-N present
+    return "pass@1"  # no avg-of-N key found; returned without a presence check
+
+
+def extract_gpqa_score(path, repeats=None):
+    data = yaml.safe_load(open(path))
+    metrics = data["results"]["groups"]["gpqa"]["metrics"]
+    metric_name = select_metric(metrics, repeats)
+    scores = metrics[metric_name]["scores"]
+
+    accuracy = scores["symbolic_correct"]["value"]
+    stderr_value = scores.get(
+        "symbolic_correct_statistics_std_err_across_runs", {}
+    ).get("value")
+    stderr = stderr_value * 100 if stderr_value is not None else None
+
+    return {
+        "group": "gpqa",
+        "metric": metric_name,
+        "score_key": "symbolic_correct",
+        "accuracy": accuracy,
+        "stderr": stderr,
+    }
+
+
+if __name__ == "__main__":  # CLI: <script> <results file path> [repeats]
+    path = sys.argv[1]
+    repeats = int(sys.argv[2]) if len(sys.argv) > 2 else None  # optional second arg
+    print(extract_gpqa_score(path, repeats))
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/gpqa.yaml b/.claude/skills/evaluation/recipes/tasks/gpqa.yaml
deleted file mode 100644
index 3692175d987..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/gpqa.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-# GPQA Diamond (NeMo Skills, chat)
-# Primary metric: pass@1[avg-of-5] symbolic_correct
-# Run time: Short | Repeats: 5
-  - name: ns_gpqa
-    nemo_evaluator_config:
-      config:
-        params:
-          extra:
-            args: ++prompt_config=eval/aai/mcq-4choices
-            num_repeats: 5
-      target:
-        api_endpoint:
-          adapter_config:
-            params_to_remove:
-              - max_new_tokens
-              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/ifbench.md b/.claude/skills/evaluation/recipes/tasks/ifbench.md
new file mode 100644
index 00000000000..6008a9391ca
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/ifbench.md
@@ -0,0 +1,32 @@
+# IFBench
+
+## Task Details
+
+- Task: `ns_ifbench`
+- Harness: NeMo Skills, chat
+- Primary metric: `pass@1[avg-of-8] prompt_strict_accuracy`
+- Run time: Super short
+- Repeats: 8
+
+## Params
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_ifbench
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          num_repeats: 8
+    target:
+      api_endpoint:
+        adapter_config:
+          params_to_remove:
+            - max_new_tokens
+            - max_completion_tokens
+```
+
+## Score Extraction
diff --git a/.claude/skills/evaluation/recipes/tasks/ifbench.yaml b/.claude/skills/evaluation/recipes/tasks/ifbench.yaml
deleted file mode 100644
index 46cbc2db085..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/ifbench.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-# IFBench (NeMo Skills, chat)
-# Primary metric: pass@1[avg-of-8] prompt_strict_accuracy
-# Run time: Super Short | Repeats: 8
-  - name: ns_ifbench
-    nemo_evaluator_config:
-      config:
-        params:
-          extra:
-            num_repeats: 8
-      target:
-        api_endpoint:
-          adapter_config:
-            params_to_remove:
-              - max_new_tokens
-              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/livecodebench.md b/.claude/skills/evaluation/recipes/tasks/livecodebench.md
new file mode 100644
index 00000000000..15323b18124
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/livecodebench.md
@@ -0,0 +1,34 @@
+# LiveCodeBench v6
+
+## Task Details
+
+- Task: `ns_livecodebench`
+- Harness: NeMo Skills, chat
+- Primary metric: `pass@1[avg-of-3] accuracy`
+- Run time: Medium
+- Repeats: 3
+
+## Params
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_livecodebench
+  nemo_evaluator_config:
+    config:
+      params:
+        max_retries: 10
+        extra:
+          dataset_split: test_v6_2408_2505
+          num_repeats: 3
+    target:
+      api_endpoint:
+        adapter_config:
+          params_to_remove:
+            - max_new_tokens
+            - max_completion_tokens
+```
+
+## Score Extraction
diff --git a/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml b/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml
deleted file mode 100644
index 202387a1eb6..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/livecodebench.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-# LiveCodeBench v6 (NeMo Skills, chat)
-# Primary metric: pass@1[avg-of-3] accuracy
-# Run time: Medium | Repeats: 3
-  - name: ns_livecodebench
-    nemo_evaluator_config:
-      config:
-        params:
-          max_retries: 10
-          extra:
-            dataset_split: test_v6_2408_2505
-            num_repeats: 3
-      target:
-        api_endpoint:
-          adapter_config:
-            params_to_remove:
-              - max_new_tokens
-              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
new file mode 100644
index 00000000000..3b2bec8c92f
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
@@ -0,0 +1,33 @@
+# MMLU-Pro
+
+## Task Details
+
+- Task: `ns_mmlu_pro`
+- Harness: NeMo Skills, chat
+- Primary metric: `symbolic_correct`
+- Run time: Short
+- Repeats: 1
+
+## Params
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_mmlu_pro
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          num_repeats: 1
+          args: ++prompt_config=eval/aai/mcq-10choices-boxed ++inference.tokens_to_generate=null
+    target:
+      api_endpoint:
+        adapter_config:
+          params_to_remove:
+            - max_new_tokens
+            - max_completion_tokens
+```
+
+## Score Extraction
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml
deleted file mode 100644
index be16a546a39..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-# MMLU-Pro (NeMo Skills, chat)
-# Primary metric: symbolic_correct
-# Run time: Short | Repeats: 1
-  - name: ns_mmlu_pro
-    nemo_evaluator_config:
-      config:
-        params:
-          extra:
-            num_repeats: 1
-            args: ++prompt_config=eval/aai/mcq-10choices-boxed ++inference.tokens_to_generate=null
-      target:
-        api_endpoint:
-          adapter_config:
-            params_to_remove:
-              - max_new_tokens
-              - max_completion_tokens
diff --git a/.claude/skills/evaluation/recipes/tasks/scicode.md b/.claude/skills/evaluation/recipes/tasks/scicode.md
new file mode 100644
index 00000000000..fba67c15640
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/scicode.md
@@ -0,0 +1,139 @@
+# SciCode
+
+SciCode is a NeMo Skills code/reasoning benchmark with multi-step prompts and a
+code-execution sandbox. Check this reference before creating or modifying NEL
+configs for SciCode; the benchmark has deployment, parallelism, and score
+harvesting requirements beyond the task YAML fragment.
+
+## Config Requirements
+
+- Use `--max-model-len 65536` for the deployment. Do not leave the generic
+  `32768` fallback in place; SciCode multi-step prompts can exceed 32K tokens.
+- Keep `parallelism: 4` unless a canary proves a different value is safe. Higher
+  parallelism can flood the code-execution sandbox and produce resource/thread
+  failures even when the SLURM job completes.
+- Generate enough answer tokens for multi-step solutions:
+  `++inference.tokens_to_generate=32768`.
+- For reasoning-capable endpoints that support OpenAI-style effort controls, set
+  `reasoning_effort: high` through `params_to_add`, not prompt text.
+- Use repeats when runtime permits so the result file contains uncertainty
+  estimates. The intended full-run plan is `num_repeats: 3`; if using a variant
+  that expects `n_repeats`, keep it aligned at `3`. Lower repeat counts are fine
+  for canaries, but do not report stderr from a run that did not produce repeat
+  statistics.
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_scicode
+  nemo_evaluator_config:
+    config:
+      params:
+        max_retries: 10
+        parallelism: 4
+        extra:
+          args: ++inference.tokens_to_generate=32768
+          num_repeats: 3
+    target:
+      api_endpoint:
+        adapter_config:
+          params_to_remove:
+            - max_new_tokens
+            - max_completion_tokens
+          params_to_add:
+            reasoning_effort: high
+```
+
+Also make sure the deployment-level args include `--max-model-len 65536`,
+preserving any other required model-card or quantization args:
+
+```yaml
+deployment:
+  extra_args: --max-model-len 65536
+```
+
+## Score Extraction
+
+SciCode accuracy comes from:
+
+```text
+results.groups.scicode.metrics."pass@1[avg-of-N]".scores.subtask_accuracy.value
+```
+
+For repeated runs, report stderr as:
+
+```text
+subtask_accuracy_statistics_std_err_across_runs.value * 100 * num_problems / num_subtasks
+```
+
+The helper below also handles GPQA, whose results use the same layout: accuracy
+comes from `symbolic_correct.value` and stderr is
+`symbolic_correct_statistics_std_err_across_runs.value * 100`.
+
+```python
+import re
+import sys
+import yaml
+
+
+TASKS = {
+    "scicode": {
+        "score_key": "subtask_accuracy",
+        "stderr_scale": "subtasks",
+    },
+    "gpqa": {
+        "score_key": "symbolic_correct",
+        "stderr_scale": "percent",
+    },
+}
+
+
+def avg_of(metric_name):
+    match = re.fullmatch(r"pass@1\[avg-of-(\d+)\]", metric_name)
+    return int(match.group(1)) if match else None
+
+
+def select_pass1_metric(metrics):
+    repeated = [name for name in metrics if avg_of(name) is not None]
+    if repeated:
+        return max(repeated, key=avg_of)
+    return "pass@1"
+
+
+def extract_score(path, group="scicode"):
+    spec = TASKS[group]
+    data = yaml.safe_load(open(path))
+    metrics = data["results"]["groups"][group]["metrics"]
+    metric_name = select_pass1_metric(metrics)
+    scores = metrics[metric_name]["scores"]
+
+    score_key = spec["score_key"]
+    accuracy = scores[score_key]["value"]
+
+    stderr_key = f"{score_key}_statistics_std_err_across_runs"
+    stderr_value = scores.get(stderr_key, {}).get("value")
+    stderr = None
+    if stderr_value is not None:
+        if spec["stderr_scale"] == "subtasks":
+            num_problems = scores["num_problems"]["value"]
+            num_subtasks = scores["num_subtasks"]["value"]
+            stderr = stderr_value * 100 * num_problems / num_subtasks
+        else:
+            stderr = stderr_value * 100
+
+    return {
+        "group": group,
+        "metric": metric_name,
+        "score_key": score_key,
+        "accuracy": accuracy,
+        "stderr": stderr,
+    }
+
+
+if __name__ == "__main__":
+    path = sys.argv[1]
+    group = sys.argv[2] if len(sys.argv) > 2 else "scicode"
+    print(extract_score(path, group))
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/scicode.yaml b/.claude/skills/evaluation/recipes/tasks/scicode.yaml
deleted file mode 100644
index 724b6935759..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/scicode.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-# SciCode (NeMo Skills, chat)
-# Primary metric: pass@1[avg-of-3] subtask_accuracy
-# Run time: Long | Repeats: 3
-  - name: ns_scicode
-    nemo_evaluator_config:
-      config:
-        params:
-          max_retries: 10
-          extra:
-            num_repeats: 3
-      target:
-        api_endpoint:
-          adapter_config:
-            params_to_remove:
-              - max_new_tokens
-              - max_completion_tokens

From 1b5e0318e52d28769bc37d21f5ac7a4b9e716431 Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Fri, 15 May 2026 16:58:25 -0500
Subject: [PATCH 14/26] Add evaluation task references

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md            |  3 +-
 .claude/skills/evaluation/recipes/env.example |  7 ++-
 .../skills/evaluation/recipes/tasks/aa_lcr.md | 52 +++++++++++++++++++
 .../evaluation/recipes/tasks/aime2025.md      |  3 ++
 .../skills/evaluation/recipes/tasks/hle_aa.md | 41 +++++++++++++++
 .../evaluation/recipes/tasks/ifbench.md       |  8 ++-
 .../recipes/tasks/mmlu_pro_aa_v3.md           | 31 +++++++++++
 7 files changed, 141 insertions(+), 4 deletions(-)
 create mode 100644 .claude/skills/evaluation/recipes/tasks/aa_lcr.md
 create mode 100644 .claude/skills/evaluation/recipes/tasks/hle_aa.md
 create mode 100644 .claude/skills/evaluation/recipes/tasks/mmlu_pro_aa_v3.md

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index e41723089a2..7ddeb6f91a3 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -58,7 +58,8 @@ If the user already has a config file (e.g., "run this config", "evaluate with m
 
 **Shortcut: use task references.** For named benchmarks, read the matching
 `recipes/tasks/<name>.md` before creating or editing the config. Available:
-mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode.
+mmlu_pro, mmlu_pro_aa_v3, gpqa, aime2025, livecodebench, ifbench,
+scicode, aa_lcr, hle_aa.
 
 1. Read the task reference(s) the user wants.
 2. Use `recipes/examples/example_eval.yaml` as the base config template
diff --git a/.claude/skills/evaluation/recipes/env.example b/.claude/skills/evaluation/recipes/env.example
index 8d9b9bfa6d9..f84f06eb157 100644
--- a/.claude/skills/evaluation/recipes/env.example
+++ b/.claude/skills/evaluation/recipes/env.example
@@ -18,11 +18,14 @@ NEMO_EVALUATOR_TRUST_PRE_CMD=1
 
 # --- Optional: task-specific keys ---
 
-# AIME 2025 (simple_evals variant only, not ns_aime2025)
+# AIME 2025 simple_evals and judge-backed tasks
 # JUDGE_API_KEY=
 
+# HLE AA judge API key
+# INFERENCE_API_KEY=
+
 # tau2_bench_telecom (LLM judge)
-# JUDGE_API_KEY_NVDEV_QWEN235B=
+# TAU2_JUDGE_API_KEY=
 
 # terminal-bench-hard (AWS sandbox)
 # AWS_ACCESS_KEY_ID=
diff --git a/.claude/skills/evaluation/recipes/tasks/aa_lcr.md b/.claude/skills/evaluation/recipes/tasks/aa_lcr.md
new file mode 100644
index 00000000000..c809d04f362
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/aa_lcr.md
@@ -0,0 +1,52 @@
+# AA-LCR
+
+## Task Details
+
+- Task: `nemo_skills.ns_aa_lcr`
+- Harness: NeMo Skills, chat
+- Primary metric: `pass@1 judge_correct`
+- Run time: Long
+- Repeats: 3
+- Requires `HF_TOKEN` and `JUDGE_API_KEY`
+
+Recommended judge: use Qwen3 235B A22B 2507 Non-Reasoning as an
+OpenAI-compatible equality-checker judge, and keep the same judge across
+comparable runs.
+
+AA-LCR is long-context sensitive. For 128K-context models, avoid capping
+generation tokens for this task unless the deployment needs the cap for
+stability.
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: nemo_skills.ns_aa_lcr
+  container: nvcr.io/nvidia/eval-factory/nemo-skills:26.03
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+    JUDGE_API_KEY: host:JUDGE_API_KEY
+  nemo_evaluator_config:
+    target:
+      api_endpoint:
+        adapter_config:
+          params_to_remove:
+            - max_tokens
+    config:
+      params:
+        extra:
+          num_repeats: 3
+          judge:
+            model_id: <qwen3_235b_a22b_2507_non_reasoning_judge_model_id>
+            url: <openai_compatible_judge_url>
+            api_key: JUDGE_API_KEY
+```
+
+## Score Extraction
+
+AA-LCR accuracy comes from:
+
+```text
+results.groups.aalcr.metrics.pass@1.scores.judge_correct.value
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.md b/.claude/skills/evaluation/recipes/tasks/aime2025.md
index 9196e2ec557..4dc89742b9c 100644
--- a/.claude/skills/evaluation/recipes/tasks/aime2025.md
+++ b/.claude/skills/evaluation/recipes/tasks/aime2025.md
@@ -35,3 +35,6 @@ Use this inside the top-level `evaluation.tasks` list:
 ```
 
 ## Score Extraction
+
+Prefer the `pass@1[avg-of-N]` metric matching the configured sample/repeat
+count.
diff --git a/.claude/skills/evaluation/recipes/tasks/hle_aa.md b/.claude/skills/evaluation/recipes/tasks/hle_aa.md
new file mode 100644
index 00000000000..9fbc5eeb5ce
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/hle_aa.md
@@ -0,0 +1,41 @@
+# HLE AA
+
+## Task Details
+
+- Task: `nemo_skills.ns_hle_aa`
+- Harness: NeMo Skills, chat
+- Primary metric: `pass@1 judge_correct`
+- Run time: Long
+- Repeats: 1
+- Requires `HF_TOKEN` and `INFERENCE_API_KEY`
+
+Recommended judge: use OpenAI GPT-4o as the OpenAI-compatible equality-checker
+judge, matching the original HLE paper setup, and keep the same judge across
+comparable runs.
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: nemo_skills.ns_hle_aa
+  container: nvcr.io/nvidia/eval-factory/nemo-skills:26.03
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+    INFERENCE_API_KEY: host:INFERENCE_API_KEY
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          judge:
+            model_id: <gpt_4o_judge_model_id>
+            api_key: INFERENCE_API_KEY
+```
+
+## Score Extraction
+
+HLE AA accuracy comes from:
+
+```text
+results.groups.hle.metrics.pass@1.scores.judge_correct.value
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/ifbench.md b/.claude/skills/evaluation/recipes/tasks/ifbench.md
index 6008a9391ca..34892ad0b33 100644
--- a/.claude/skills/evaluation/recipes/tasks/ifbench.md
+++ b/.claude/skills/evaluation/recipes/tasks/ifbench.md
@@ -4,7 +4,7 @@
 
 - Task: `ns_ifbench`
 - Harness: NeMo Skills, chat
-- Primary metric: `pass@1[avg-of-8] prompt_strict_accuracy`
+- Primary metric: `pass@1[avg-of-8] prompt_loose_accuracy`
 - Run time: Super short
 - Repeats: 8
 
@@ -30,3 +30,9 @@ Use this inside the top-level `evaluation.tasks` list:
 ```
 
 ## Score Extraction
+
+IFBench accuracy comes from:
+
+```text
+results.groups.ifbench.metrics."pass@1[avg-of-N]".scores.prompt_loose_accuracy.value
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro_aa_v3.md b/.claude/skills/evaluation/recipes/tasks/mmlu_pro_aa_v3.md
new file mode 100644
index 00000000000..eec12ff5f3c
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/mmlu_pro_aa_v3.md
@@ -0,0 +1,31 @@
+# MMLU-Pro AA v3
+
+## Task Details
+
+- Task: `simple_evals.mmlu_pro_aa_v3`
+- Harness: simple-evals, chat
+- Primary metric: task accuracy
+- Run time: Medium
+- Samples: 1
+- Requires `HF_TOKEN`
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: simple_evals.mmlu_pro_aa_v3
+  container: nvcr.io/nvidia/eval-factory/simple-evals:26.03
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          n_samples: 1
+```
+
+## Score Extraction
+
+Inspect the generated `results.yml` for the exact simple-evals group and score
+key.

From cce42cc3abf0b5bba1c0b5469ed39924942c07f4 Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Mon, 18 May 2026 10:08:57 -0500
Subject: [PATCH 15/26] Use NeMo Skills MMLU-Pro recipe

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md            |  3 +-
 .../recipes/examples/example_eval.yaml        |  2 +-
 .../evaluation/recipes/tasks/mmlu_pro.md      |  4 +--
 .../recipes/tasks/mmlu_pro_aa_v3.md           | 31 -------------------
 4 files changed, 4 insertions(+), 36 deletions(-)
 delete mode 100644 .claude/skills/evaluation/recipes/tasks/mmlu_pro_aa_v3.md

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index 7ddeb6f91a3..a52fe04cec0 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -58,8 +58,7 @@ If the user already has a config file (e.g., "run this config", "evaluate with m
 
 **Shortcut: use task references.** For named benchmarks, read the matching
 `recipes/tasks/<name>.md` before creating or editing the config. Available:
-mmlu_pro, mmlu_pro_aa_v3, gpqa, aime2025, livecodebench, ifbench,
-scicode, aa_lcr, hle_aa.
+mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode, aa_lcr, hle_aa.
 
 1. Read the task reference(s) the user wants.
 2. Use `recipes/examples/example_eval.yaml` as the base config template
diff --git a/.claude/skills/evaluation/recipes/examples/example_eval.yaml b/.claude/skills/evaluation/recipes/examples/example_eval.yaml
index 22bcb167fdc..c8afd1a2b43 100644
--- a/.claude/skills/evaluation/recipes/examples/example_eval.yaml
+++ b/.claude/skills/evaluation/recipes/examples/example_eval.yaml
@@ -63,7 +63,7 @@ evaluation:
         api_key_name: DUMMY_API_KEY
   tasks:
   # Knowledge (chat endpoint, short)
-    - name: ns_mmlu_pro
+    - name: nemo_skills.ns_mmlu_pro
       nemo_evaluator_config:
         config:
           params:
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
index 3b2bec8c92f..48c767de25c 100644
--- a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
+++ b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
@@ -2,7 +2,7 @@
 
 ## Task Details
 
-- Task: `ns_mmlu_pro`
+- Task: `nemo_skills.ns_mmlu_pro`
 - Harness: NeMo Skills, chat
 - Primary metric: `symbolic_correct`
 - Run time: Short
@@ -15,7 +15,7 @@
 Use this inside the top-level `evaluation.tasks` list:
 
 ```yaml
-- name: ns_mmlu_pro
+- name: nemo_skills.ns_mmlu_pro
   nemo_evaluator_config:
     config:
       params:
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro_aa_v3.md b/.claude/skills/evaluation/recipes/tasks/mmlu_pro_aa_v3.md
deleted file mode 100644
index eec12ff5f3c..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/mmlu_pro_aa_v3.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# MMLU-Pro AA v3
-
-## Task Details
-
-- Task: `simple_evals.mmlu_pro_aa_v3`
-- Harness: simple-evals, chat
-- Primary metric: task accuracy
-- Run time: Medium
-- Samples: 1
-- Requires `HF_TOKEN`
-
-## YAML Fragment
-
-Use this inside the top-level `evaluation.tasks` list:
-
-```yaml
-- name: simple_evals.mmlu_pro_aa_v3
-  container: nvcr.io/nvidia/eval-factory/simple-evals:26.03
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-  nemo_evaluator_config:
-    config:
-      params:
-        extra:
-          n_samples: 1
-```
-
-## Score Extraction
-
-Inspect the generated `results.yml` for the exact simple-evals group and score
-key.

From f40898d37267056246371d717cc51e646ee96f31 Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Mon, 18 May 2026 11:15:09 -0500
Subject: [PATCH 16/26] Update evaluation task recipes

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md            |  3 +-
 .claude/skills/evaluation/recipes/env.example |  9 ++--
 .../skills/evaluation/recipes/tasks/aa_lcr.md | 33 +++++++--------
 .../evaluation/recipes/tasks/aime2025.md      |  7 +---
 .../skills/evaluation/recipes/tasks/gpqa.md   |  2 +
 .../skills/evaluation/recipes/tasks/hle_aa.md | 41 -------------------
 .../evaluation/recipes/tasks/hle_aa_v2.md     | 36 ++++++++++++++++
 .../evaluation/recipes/tasks/ifbench.md       |  2 +
 .../evaluation/recipes/tasks/livecodebench.md |  2 +
 .../evaluation/recipes/tasks/mmlu_pro.md      |  4 +-
 .../evaluation/recipes/tasks/mmmu_pro.md      | 37 +++++++++++++++++
 .../evaluation/recipes/tasks/scicode.md       | 12 ++++++
 .../recipes/tasks/tau2_bench_telecom.md       | 38 +++++++++++++++++
 13 files changed, 154 insertions(+), 72 deletions(-)
 delete mode 100644 .claude/skills/evaluation/recipes/tasks/hle_aa.md
 create mode 100644 .claude/skills/evaluation/recipes/tasks/hle_aa_v2.md
 create mode 100644 .claude/skills/evaluation/recipes/tasks/mmmu_pro.md
 create mode 100644 .claude/skills/evaluation/recipes/tasks/tau2_bench_telecom.md

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index a52fe04cec0..02916f7c70a 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -58,7 +58,8 @@ If the user already has a config file (e.g., "run this config", "evaluate with m
 
 **Shortcut: use task references.** For named benchmarks, read the matching
 `recipes/tasks/<name>.md` before creating or editing the config. Available:
-mmlu_pro, gpqa, aime2025, livecodebench, ifbench, scicode, aa_lcr, hle_aa.
+mmlu_pro, mmmu_pro, gpqa, aime2025, livecodebench, ifbench, scicode,
+aa_lcr, hle_aa_v2, tau2_bench_telecom.
 
 1. Read the task reference(s) the user wants.
 2. Use `recipes/examples/example_eval.yaml` as the base config template
diff --git a/.claude/skills/evaluation/recipes/env.example b/.claude/skills/evaluation/recipes/env.example
index f84f06eb157..6cb1728e58a 100644
--- a/.claude/skills/evaluation/recipes/env.example
+++ b/.claude/skills/evaluation/recipes/env.example
@@ -18,14 +18,11 @@ NEMO_EVALUATOR_TRUST_PRE_CMD=1
 
 # --- Optional: task-specific keys ---
 
-# AIME 2025 simple_evals and judge-backed tasks
+# AIME 2025, HLE, AA-LCR, and other judge-backed tasks
 # JUDGE_API_KEY=
 
-# HLE AA judge API key
-# INFERENCE_API_KEY=
-
-# tau2_bench_telecom (LLM judge)
-# TAU2_JUDGE_API_KEY=
+# tau2_bench_telecom user simulator endpoint
+# USER_API_KEY=
 
 # terminal-bench-hard (AWS sandbox)
 # AWS_ACCESS_KEY_ID=
diff --git a/.claude/skills/evaluation/recipes/tasks/aa_lcr.md b/.claude/skills/evaluation/recipes/tasks/aa_lcr.md
index c809d04f362..8dab59cea57 100644
--- a/.claude/skills/evaluation/recipes/tasks/aa_lcr.md
+++ b/.claude/skills/evaluation/recipes/tasks/aa_lcr.md
@@ -2,16 +2,18 @@
 
 ## Task Details
 
-- Task: `nemo_skills.ns_aa_lcr`
-- Harness: NeMo Skills, chat
+- Task: `aa_lcr`
+- Harness: AA-LCR, chat
 - Primary metric: `pass@1 judge_correct`
 - Run time: Long
-- Repeats: 3
-- Requires `HF_TOKEN` and `JUDGE_API_KEY`
+- Samples: 3
+- Requires: `HF_TOKEN`, `JUDGE_API_KEY`
+- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/AA-LCR.html
 
-Recommended judge: use Qwen3 235B A22B 2507 Non-Reasoning as an
-OpenAI-compatible equality-checker judge, and keep the same judge across
-comparable runs.
+## Params
+
+Recommended judge: use Qwen3 235B as an OpenAI-compatible equality-checker
+judge, and keep the same judge across comparable runs.
 
 AA-LCR is long-context sensitive. For 128K-context models, avoid capping
 generation tokens for this task unless the deployment needs the cap for
@@ -22,24 +24,19 @@ stability.
 Use this inside the top-level `evaluation.tasks` list:
 
 ```yaml
-- name: nemo_skills.ns_aa_lcr
-  container: nvcr.io/nvidia/eval-factory/nemo-skills:26.03
+- name: aa_lcr
+  container: nvcr.io/nvidia/eval-factory/aa-lcr:26.03
   env_vars:
     HF_TOKEN: host:HF_TOKEN
     JUDGE_API_KEY: host:JUDGE_API_KEY
   nemo_evaluator_config:
-    target:
-      api_endpoint:
-        adapter_config:
-          params_to_remove:
-            - max_tokens
     config:
       params:
         extra:
-          num_repeats: 3
+          n_samples: 3
           judge:
-            model_id: <qwen3_235b_a22b_2507_non_reasoning_judge_model_id>
-            url: <openai_compatible_judge_url>
+            model_id: <qwen3_235b_judge_model_id>
+            url: <openai_compatible_judge_chat_completions_url>
             api_key: JUDGE_API_KEY
 ```
 
@@ -48,5 +45,5 @@ Use this inside the top-level `evaluation.tasks` list:
 AA-LCR accuracy comes from:
 
 ```text
-results.groups.aalcr.metrics.pass@1.scores.judge_correct.value
+results.groups.aa_lcr.metrics.pass@1.scores.judge_correct.value
 ```
diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.md b/.claude/skills/evaluation/recipes/tasks/aime2025.md
index 4dc89742b9c..f68ee5ed21d 100644
--- a/.claude/skills/evaluation/recipes/tasks/aime2025.md
+++ b/.claude/skills/evaluation/recipes/tasks/aime2025.md
@@ -7,11 +7,8 @@
 - Primary metric: `pass@1[avg-of-16] symbolic_correct`
 - Run time: Long for reasoning models with lengthy thinking traces
 - Repeats: 16
-- The AA variant, `simple_evals.AIME_2025`, requires `JUDGE_API_KEY`.
-  This NeMo Skills variant uses symbolic scoring and does not require an
-  external judge API key.
-
-## Params
+- Requires: None
+- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
 
 ## YAML Fragment
 
diff --git a/.claude/skills/evaluation/recipes/tasks/gpqa.md b/.claude/skills/evaluation/recipes/tasks/gpqa.md
index d6e5b2b6198..0e481a5ff0d 100644
--- a/.claude/skills/evaluation/recipes/tasks/gpqa.md
+++ b/.claude/skills/evaluation/recipes/tasks/gpqa.md
@@ -7,6 +7,8 @@
 - Primary metric: `pass@1[avg-of-16] symbolic_correct`
 - Run time: Short
 - Samples: 16
+- Requires: None
+- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
 
 ## Params
 
diff --git a/.claude/skills/evaluation/recipes/tasks/hle_aa.md b/.claude/skills/evaluation/recipes/tasks/hle_aa.md
deleted file mode 100644
index 9fbc5eeb5ce..00000000000
--- a/.claude/skills/evaluation/recipes/tasks/hle_aa.md
+++ /dev/null
@@ -1,41 +0,0 @@
-# HLE AA
-
-## Task Details
-
-- Task: `nemo_skills.ns_hle_aa`
-- Harness: NeMo Skills, chat
-- Primary metric: `pass@1 judge_correct`
-- Run time: Long
-- Repeats: 1
-- Requires `HF_TOKEN` and `INFERENCE_API_KEY`
-
-Recommended judge: use OpenAI GPT-4o as the OpenAI-compatible equality-checker
-judge, matching the original HLE paper setup, and keep the same judge across
-comparable runs.
-
-## YAML Fragment
-
-Use this inside the top-level `evaluation.tasks` list:
-
-```yaml
-- name: nemo_skills.ns_hle_aa
-  container: nvcr.io/nvidia/eval-factory/nemo-skills:26.03
-  env_vars:
-    HF_TOKEN: host:HF_TOKEN
-    INFERENCE_API_KEY: host:INFERENCE_API_KEY
-  nemo_evaluator_config:
-    config:
-      params:
-        extra:
-          judge:
-            model_id: <gpt_4o_judge_model_id>
-            api_key: INFERENCE_API_KEY
-```
-
-## Score Extraction
-
-HLE AA accuracy comes from:
-
-```text
-results.groups.hle.metrics.pass@1.scores.judge_correct.value
-```
diff --git a/.claude/skills/evaluation/recipes/tasks/hle_aa_v2.md b/.claude/skills/evaluation/recipes/tasks/hle_aa_v2.md
new file mode 100644
index 00000000000..cba1bcc8b46
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/hle_aa_v2.md
@@ -0,0 +1,36 @@
+# HLE AA v2
+
+## Task Details
+
+- Task: `hle_aa_v2`
+- Harness: HLE, chat
+- Primary metric: `pass@1 judge_correct`
+- Run time: Long
+- Repeats: 1
+- Requires: `HF_TOKEN`, `JUDGE_API_KEY`
+- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/hle.html
+
+## Params
+
+This is the text-only HLE task with params aligned to Artificial Analysis Index
+v2. HLE is judge-scored and requires judge credentials.
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: hle_aa_v2
+  container: nvcr.io/nvidia/eval-factory/hle:26.03
+  env_vars:
+    HF_TOKEN: host:HF_TOKEN
+    JUDGE_API_KEY: host:JUDGE_API_KEY
+```
+
+## Score Extraction
+
+HLE AA v2 accuracy comes from:
+
+```text
+results.groups.hle.metrics.pass@1.scores.judge_correct.value
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/ifbench.md b/.claude/skills/evaluation/recipes/tasks/ifbench.md
index 34892ad0b33..c914fb18aa9 100644
--- a/.claude/skills/evaluation/recipes/tasks/ifbench.md
+++ b/.claude/skills/evaluation/recipes/tasks/ifbench.md
@@ -7,6 +7,8 @@
 - Primary metric: `pass@1[avg-of-8] prompt_loose_accuracy`
 - Run time: Super short
 - Repeats: 8
+- Requires: None
+- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
 
 ## Params
 
diff --git a/.claude/skills/evaluation/recipes/tasks/livecodebench.md b/.claude/skills/evaluation/recipes/tasks/livecodebench.md
index 15323b18124..cb6d6afc56a 100644
--- a/.claude/skills/evaluation/recipes/tasks/livecodebench.md
+++ b/.claude/skills/evaluation/recipes/tasks/livecodebench.md
@@ -7,6 +7,8 @@
 - Primary metric: `pass@1[avg-of-3] accuracy`
 - Run time: Medium
 - Repeats: 3
+- Requires: None
+- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
 
 ## Params
 
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
index 48c767de25c..9757838b3b3 100644
--- a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
+++ b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
@@ -4,9 +4,11 @@
 
 - Task: `nemo_skills.ns_mmlu_pro`
 - Harness: NeMo Skills, chat
-- Primary metric: `symbolic_correct`
+- Primary metric: `pass@1 symbolic_correct`
 - Run time: Short
 - Repeats: 1
+- Requires: None
+- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
 
 ## Params
 
diff --git a/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md b/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md
new file mode 100644
index 00000000000..90d67e28b62
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md
@@ -0,0 +1,37 @@
+# MMMU-Pro
+
+## Task Details
+
+- Task: `ns_mmmu_pro`
+- Harness: NeMo Skills, multimodal chat
+- Primary metric: `pass@1 symbolic_correct`
+- Run time: Medium
+- Repeats: 1
+- Requires: `HF_TOKEN`
+- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
+
+## Params
+
+MMMU-Pro is a multimodal task. Use a multimodal-capable endpoint.
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_mmmu_pro
+  container: nvcr.io/nvidia/eval-factory/nemo-skills:26.03
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          num_repeats: 1
+```
+
+## Score Extraction
+
+MMMU-Pro accuracy comes from:
+
+```text
+results.groups."mmmu-pro".metrics.pass@1.scores.symbolic_correct.value
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/scicode.md b/.claude/skills/evaluation/recipes/tasks/scicode.md
index fba67c15640..677af7771d9 100644
--- a/.claude/skills/evaluation/recipes/tasks/scicode.md
+++ b/.claude/skills/evaluation/recipes/tasks/scicode.md
@@ -1,5 +1,17 @@
 # SciCode
 
+## Task Details
+
+- Task: `ns_scicode`
+- Harness: NeMo Skills, chat
+- Primary metric: `pass@1[avg-of-3] subtask_accuracy`
+- Run time: Long
+- Repeats: 3
+- Requires: None
+- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
+
+## Params
+
 SciCode is a NeMo Skills code/reasoning benchmark with multi-step prompts and a
 code-execution sandbox. Check this reference before creating or modifying NEL
 configs for SciCode; the benchmark has deployment, parallelism, and score
diff --git a/.claude/skills/evaluation/recipes/tasks/tau2_bench_telecom.md b/.claude/skills/evaluation/recipes/tasks/tau2_bench_telecom.md
new file mode 100644
index 00000000000..4006e978a8f
--- /dev/null
+++ b/.claude/skills/evaluation/recipes/tasks/tau2_bench_telecom.md
@@ -0,0 +1,38 @@
+# Tau2 Bench Telecom
+
+## Task Details
+
+- Task: `tau2_bench_telecom`
+- Harness: tau2_bench, chat
+- Primary metric: `pass_1`
+- Run time: Long
+- Samples: 3
+- Requires: `USER_API_KEY`
+- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/tau2_bench.html#tau2-bench-tau2-bench-telecom
+
+## Params
+
+Tau2 Bench uses the evaluated model as the agent and a separate LLM endpoint as
+the user simulator. Configure the user simulator explicitly and keep it fixed
+across comparable runs.
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: tau2_bench_telecom
+  container: nvcr.io/nvidia/eval-factory/tau2-bench:26.03
+  env_vars:
+    USER_API_KEY: host:USER_API_KEY
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          user:
+            model_id: <user_simulator_model_id>
+            url: <openai_compatible_user_simulator_chat_completions_url>
+            api_key: USER_API_KEY
+```
+
+## Score Extraction

From b793ea5d358ad56260f21cfb370a661aead69a09 Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Tue, 19 May 2026 11:28:36 -0500
Subject: [PATCH 17/26] Add debugging playbooks skill

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/debugging-playbooks/SKILL.md   |  22 +++
 .../references/vllm-aot-cache-poisoning.md    | 139 ++++++++++++++++++
 2 files changed, 161 insertions(+)
 create mode 100644 .claude/skills/debugging-playbooks/SKILL.md
 create mode 100644 .claude/skills/debugging-playbooks/references/vllm-aot-cache-poisoning.md

diff --git a/.claude/skills/debugging-playbooks/SKILL.md b/.claude/skills/debugging-playbooks/SKILL.md
new file mode 100644
index 00000000000..533cb0f0c1d
--- /dev/null
+++ b/.claude/skills/debugging-playbooks/SKILL.md
@@ -0,0 +1,22 @@
+---
+name: debugging-playbooks
+description: Diagnostic playbooks for tricky failures — failures where the traceback misdirects and the first 2-3 reasonable hypotheses turn out wrong. Use when a run fails with a framework-internal-looking error (cryptic torch.compile / dynamo / NCCL / vLLM / transformers / CUDA / pyxis / enroot / NEL / SLURM / container runtime), the top frame appears to blame the wrong layer (e.g. the user's code, ModelOpt, the quantized linear, the wrapper class) but fixing that layer doesn't help, or the symptom recurs across unrelated changes. Use this skill when you've eliminated the obvious suspects and the bug hasn't budged. Don't reach for this on the first guess; reach for it when the obvious answers don't pan out. Each playbook is keyed by a literal symptom string from logs so future agents can grep for it.
+---
+
+# Debugging playbooks
+
+When a failure surfaces a symptom that doesn't clearly map to the code under change, check whether one of the documented playbooks below already describes it. Each playbook is keyed by the literal symptom string so future agents can match by grep.
+
+| Symptom (literal string from logs) | Playbook |
+| --- | --- |
+| `AttributeError: 'NoneType' object has no attribute 'size'` during vLLM `profile_run` / `_dummy_run` / CUDA-graph capture | [vllm-aot-cache-poisoning.md](references/vllm-aot-cache-poisoning.md) |
+
+## When to add a new playbook
+
+Add an entry when **all three** are true:
+
+1. The root cause was non-obvious from the traceback — the immediate frame was misleading (e.g. blames ModelOpt when the bug is in vLLM).
+2. The symptom is likely to recur across runs (different models, different containers).
+3. There is a concrete fix (config change, env var, cache invalidation) that future agents should reach for before deeper debugging.
+
+Each playbook should include: the literal symptom string, the actual mechanism, how to confirm the diagnosis, and the minimal fix.
diff --git a/.claude/skills/debugging-playbooks/references/vllm-aot-cache-poisoning.md b/.claude/skills/debugging-playbooks/references/vllm-aot-cache-poisoning.md
new file mode 100644
index 00000000000..c43d18a391f
--- /dev/null
+++ b/.claude/skills/debugging-playbooks/references/vllm-aot-cache-poisoning.md
@@ -0,0 +1,139 @@
+# vLLM AOT compile-cache poisoning across multimodal-on / multimodal-off runs
+
+Applies to **any** model whose vLLM architecture supports multimodal input —
+this is modality-agnostic, covering image, video, audio, or any other
+modality (`vllm/multimodal/registry.py: supports_multimodal_inputs` iterates
+the model's `supported_mm_limits`, which can be `{"image": N}`,
+`{"video": N}`, `{"audio": N}`, `{"image": N, "video": N}`, etc.). The hazard
+appears when multiple vLLM runs against the **same checkpoint** share a
+`VLLM_CACHE_ROOT` and differ in whether **all** of the model's modalities
+are zeroed out via `--limit-mm-per-prompt`.
+
+## Symptom
+
+vLLM startup crashes during `profile_run` / `_dummy_run` / CUDA-graph capture
+with:
+
+```
+AttributeError: 'NoneType' object has no attribute 'size'
+```
+
+The traceback ends inside `torch/_dynamo/utils.py call_size → x.size(i)`,
+after passing through `vllm/compilation/decorators.py: aot_compiled_fn`.
+**There is no model-layer frame** in the failing stack — no attention op,
+no MLP, no quantized linear. The compiled function is loaded from disk and
+crashes in dynamo's prologue, before any decoder layer runs. The log line
+just above the traceback is the smoking gun:
+
+```
+INFO ... [decorators.py:...] Directly load AOT compilation from path
+  /vllm-cache/torch_compile_cache/torch_aot_compile/<hash>/rank_*/model
+```
+
+## Mechanism
+
+vLLM's `@support_torch_compile` decorator caches one compiled `forward` per
+`(aot_compile_hash_factors(vllm_config), _model_hash_key(forward))` key
+(`vllm/compilation/decorators.py`). That key includes the model config and
+quantization, but **does not include** `--limit-mm-per-prompt` or the
+derived `supports_mm_inputs` flag.
+
+`vllm/v1/worker/gpu_model_runner.py: _dummy_run` branches on
+`supports_mm_inputs`:
+
+```python
+if self.supports_mm_inputs and not self.model_config.is_encoder_decoder:
+    input_ids, inputs_embeds = self._prepare_mm_inputs(...)   # (None, Tensor)
+else:
+    input_ids = self.input_ids.gpu[:num_tokens_padded]        # (Tensor, None)
+    inputs_embeds = None
+```
+
+`supports_mm_inputs` (`vllm/multimodal/registry.py: supports_multimodal_inputs`)
+returns `False` when **every** supported modality has
+`--limit-mm-per-prompt = 0`. So:
+
+| Run config | `supports_mm_inputs` | Pattern compiled / loaded |
+| --- | --- | --- |
+| `--limit-mm-per-prompt '{"image":0}'` (and `"video":0` etc.) | False | `input_ids=Tensor, inputs_embeds=None` |
+| default, or any modality non-zero | True | `input_ids=None, inputs_embeds=Tensor` |
+
+The `@support_torch_compile` docstring explicitly forbids the same argument
+slot from being `None` on one invocation and a Tensor on another — Dynamo
+specializes on None-vs-Tensor identity per argument, so one cached graph
+cannot serve both patterns. When run A populates the cache slot and run B
+shares the slot but uses the opposite pattern, the prologue calls
+`.size()` on what is now `None` and dies.
+
+This is symmetric: a multimodal-first run followed by a text-only-via-image:0
+run fails the same way, just with the None/Tensor roles swapped.
+
+## How to confirm
+
+1. **Cache hit before the crash.** Look in the server log for
+   `Directly load AOT compilation from path ...` shortly before the
+   traceback. A cache *hit* immediately before a `NoneType.size()` is the
+   diagnostic. (A cold compile would print `Dynamo bytecode transform
+   time` and `Inductor compile took ...` instead.)
+2. **Config delta on `--limit-mm-per-prompt`.** Compare the failing run's
+   serving args against the most recent successful runs that share
+   `$VLLM_CACHE_ROOT`. If they disagree on whether any modality is
+   zero-limited (or one side omits the flag while the other passes
+   `{"image":0}`), the cache slot is colliding.
+3. **Positive control.** Relaunch the failing config with
+   `VLLM_DISABLE_COMPILE_CACHE=1` and change nothing else. If `profile_run`
+   passes, the cache was the cause.
+
+## Fix
+
+Two parts — stop the poisoning, then heal what's already poisoned.
+
+### Stop poisoning
+
+For multimodal-architecture models, do **not** zero out a modality with
+`--limit-mm-per-prompt '{"image":0}'` (or `"video":0`, …) on runs intended
+to share a cache root with multimodal runs. The vision tower weights are
+loaded from the checkpoint regardless of this flag; zeroing only flips
+`supports_mm_inputs` and creates the cache hazard. Text-only inference
+still works without the flag because vLLM's `_preprocess` routes both
+text and multimodal prompts through the same `inputs_embeds` path when
+`supports_mm_inputs=True`:
+
+```python
+# vllm/v1/worker/gpu_model_runner.py: _preprocess
+# NOTE(woosuk): To unify token ids and soft tokens (vision embeddings),
+# we always use embeddings (rather than token ids) as input to the
+# multimodal model, even when the input is text.
+inputs_embeds_scheduled = self.model.embed_input_ids(
+    self.input_ids.gpu[:num_scheduled_tokens],
+    multimodal_embeddings=mm_embeds,
+    is_multimodal=is_mm_embed,
+)
+```
+
+A text-only prompt simply has `mm_embeds=[]` / `is_multimodal=False`; the
+call signature into the language model is unchanged. The small cost of
+keeping multimodal inputs enabled is that vLLM allocates an encoder cache
+budget at startup (e.g. a few hundred MB) and prints a vision warmup line.
+
+### Heal existing cache
+
+Either fully wipe and let the next run repopulate:
+
+```bash
+rm -rf "$VLLM_CACHE_ROOT/torch_compile_cache/torch_aot_compile/"
+```
+
+…or sidestep by separating cache roots per multimodal-ness (set a different
+`VLLM_CACHE_ROOT` for the runs that need a different pattern), or just set
+`VLLM_DISABLE_COMPILE_CACHE=1` on the affected runs and accept a full
+recompile (~20-30 s) at every startup.
+
+## See also
+
+- `vllm/compilation/decorators.py` — `support_torch_compile` decorator and
+  its docstring on the None-vs-Tensor invariant.
+- `vllm/v1/worker/gpu_model_runner.py` — the input-construction branch in
+  `_dummy_run` and the unified-`inputs_embeds` comment in `_preprocess`.
+- `vllm/multimodal/registry.py` — how `supports_multimodal_inputs` is
+  computed from `--limit-mm-per-prompt`.

From a662e435d53c5a5130ef378c87aa77527178fdc8 Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Tue, 19 May 2026 11:48:17 -0500
Subject: [PATCH 18/26] Clarify monitor status handling

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/monitor/SKILL.md | 39 ++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/.claude/skills/monitor/SKILL.md b/.claude/skills/monitor/SKILL.md
index 52b47ec55ed..01af60e2e6f 100644
--- a/.claude/skills/monitor/SKILL.md
+++ b/.claude/skills/monitor/SKILL.md
@@ -64,23 +64,46 @@ Whether triggered by monitor output or by the user asking "check status":
 
 ## How to Check Each Job Type
 
+Each check method has its **own** status vocabulary. A watcher that mixes them
+(e.g. matches SLURM's `COMPLETED` terminal-state regex against `nel status`
+output) never detects a terminal transition and fails silently. Always match
+against the vocabulary of the source you're polling.
+
 ### NEL jobs (`type: nel`)
 
-- **Check:** `nel status <id>`
-- **On completion:** `nel info <id>` to fetch results
-- **On failure:** `nel info <id> --logs` then inspect server/client/SLURM logs via SSH
+- **Check:** `nel status <id>` — second pipe-delimited column carries the state with a Unicode prefix (e.g. `▶ RUNNING`, `✓ SUCCESS`).
+- **States** (from `nemo_evaluator_launcher.executors.base.ExecutionState` + the CLI status formatter):
+
+  | State (uppercase, as printed) | Terminal? | Indicator |
+  | --- | --- | --- |
+  | `PENDING` | no | `⧗` |
+  | `RUNNING` | no | `▶` |
+  | `SUCCESS` | **yes** | `✓` |
+  | `FAILED` | **yes** | `✗` |
+  | `KILLED` | **yes** | `✗` |
+  | `ERROR` | **yes** | `✗` (synthetic, CLI error path) |
+  | `NOT FOUND` | **yes** | `?` (synthetic, CLI: invocation id unknown) |
+
+  Watcher terminal regex: `^(SUCCESS|FAILED|KILLED|ERROR|NOT FOUND)$`.
+  Strip the Unicode indicator (`▶✓✗⧗?`) and surrounding whitespace before
+  matching.
+- **On completion:** `nel info <id>` to fetch results.
+- **On failure:** `nel info <id> --logs` then inspect server/client/SLURM logs via SSH.
 
 ### Launcher jobs (`type: launcher`)
 
-- **Check:** Tail the launcher's background output file for key events
-- **Key events:** experiment ID, SLURM job ID, container import, calibration progress, export path, final status
-- **On failure:** Look for `Traceback`, `Error`, or `FAILED` in the output
+- **Check:** Tail the launcher's background output file for key events.
+- **Key events:** experiment ID, SLURM job ID, container import, calibration progress, export path, final status.
+- **On failure:** Look for `Traceback`, `Error`, or `FAILED` in the output.
 
 ### Raw SLURM jobs (`type: slurm`)
 
 - **Check:** `ssh <host> "sacct -j <id> --format=JobID%12,JobName%25,State%12,Elapsed%10 -n"` and filter out `extern`, `batch`, and step rows like `.<step>`. Use `sacct` for the termination check; `squeue` can lag in `COMPLETING` after `sacct` reports a terminal state.
-- **On completion:** `ssh <host> "sacct -j <id> --format=State,ExitCode,Elapsed -n"`
-- **On failure:** Check the job's output log file
+- **States (terminal):** `COMPLETED`, `FAILED`, `CANCELLED` (also appears as `CANCELLED by <uid>`), `TIMEOUT`, `NODE_FAIL`, `OUT_OF_MEMORY`, `PREEMPTED`, `BOOT_FAIL`, `DEADLINE`.
+- **States (non-terminal):** `PENDING`, `RUNNING`, `CONFIGURING`, `COMPLETING`, `RESIZING`, `SUSPENDED`, `REQUEUED`.
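+  Watcher terminal regex: `^(COMPLETED|FAILED|CANCELLED( by .*)?|TIMEOUT|NODE_FAIL|OUT_OF_MEMORY|PREEMPTED|BOOT_FAIL|DEADLINE)$`. This assumes untruncated state names: with the `State%12` width in the check command above, `sacct` cuts longer values to 12 characters ending in `+` (`CANCELLED b+`, `OUT_OF_MEMO+`), so widen the column (e.g. `State%30`) or also match the truncated forms.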
+- **On completion:** `ssh <host> "sacct -j <id> --format=State,ExitCode,Elapsed -n"`.
+- **On failure:** Check the job's output log file.
 
 ---
 

From f3607529a93177b9a805d7efce893540efe379bb Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Tue, 19 May 2026 11:53:03 -0500
Subject: [PATCH 19/26] Use ns_hle_aa for HLE AA evaluations

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/evaluation/SKILL.md            |  2 +-
 .../tasks/{hle_aa_v2.md => ns_hle_aa.md}      | 22 +++++++++++++------
 2 files changed, 16 insertions(+), 8 deletions(-)
 rename .claude/skills/evaluation/recipes/tasks/{hle_aa_v2.md => ns_hle_aa.md} (51%)

diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index 02916f7c70a..b089b4fddfa 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -59,7 +59,7 @@ If the user already has a config file (e.g., "run this config", "evaluate with m
 **Shortcut: use task references.** For named benchmarks, read the matching
 `recipes/tasks/<name>.md` before creating or editing the config. Available:
 mmlu_pro, mmmu_pro, gpqa, aime2025, livecodebench, ifbench, scicode,
-aa_lcr, hle_aa_v2, tau2_bench_telecom.
+aa_lcr, ns_hle_aa, tau2_bench_telecom.
 
 1. Read the task reference(s) the user wants.
 2. Use `recipes/examples/example_eval.yaml` as the base config template
diff --git a/.claude/skills/evaluation/recipes/tasks/hle_aa_v2.md b/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md
similarity index 51%
rename from .claude/skills/evaluation/recipes/tasks/hle_aa_v2.md
rename to .claude/skills/evaluation/recipes/tasks/ns_hle_aa.md
index cba1bcc8b46..36c8979b14e 100644
--- a/.claude/skills/evaluation/recipes/tasks/hle_aa_v2.md
+++ b/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md
@@ -1,14 +1,14 @@
-# HLE AA v2
+# HLE AA
 
 ## Task Details
 
-- Task: `hle_aa_v2`
-- Harness: HLE, chat
+- Task: `ns_hle_aa`
+- Harness: nemo-skills, chat
 - Primary metric: `pass@1 judge_correct`
 - Run time: Long
 - Repeats: 1
 - Requires: `HF_TOKEN`, `JUDGE_API_KEY`
-- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/hle.html
+- Reference: https://docs.nvidia.com/nemo/evaluator/nightly/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
 
 ## Params
 
@@ -20,16 +20,24 @@ v2. HLE is judge-scored and requires judge credentials.
 Use this inside the top-level `evaluation.tasks` list:
 
 ```yaml
-- name: hle_aa_v2
-  container: nvcr.io/nvidia/eval-factory/hle:26.03
+- name: ns_hle_aa
+  container: nvcr.io/nvidia/eval-factory/nemo-skills:26.03
   env_vars:
     HF_TOKEN: host:HF_TOKEN
     JUDGE_API_KEY: host:JUDGE_API_KEY
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          judge:
+            model_id: <hle_aa_judge_model_id>
+            url: <openai_compatible_judge_chat_completions_url>
+            api_key: JUDGE_API_KEY
 ```
 
 ## Score Extraction
 
-HLE AA v2 accuracy comes from:
+HLE AA accuracy comes from:
 
 ```text
 results.groups.hle.metrics.pass@1.scores.judge_correct.value

From ffa7558aa516f05be91ffe8cb5b034f93d1d401c Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Tue, 19 May 2026 12:49:46 -0500
Subject: [PATCH 20/26] Scope agent state by session

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/common/remote-execution.md     |   8 +-
 .claude/skills/common/workspace-management.md | 116 ++++++++++--------
 .claude/skills/deployment/SKILL.md            |   6 +-
 .claude/skills/evaluation/SKILL.md            |   2 +-
 .claude/skills/monitor/SKILL.md               |  40 ++++--
 .../ptq/references/unsupported-models.md      |   2 +-
 .gitignore                                    |   1 +
 7 files changed, 108 insertions(+), 67 deletions(-)

diff --git a/.claude/skills/common/remote-execution.md b/.claude/skills/common/remote-execution.md
index be770aef936..caaf0ce4db1 100644
--- a/.claude/skills/common/remote-execution.md
+++ b/.claude/skills/common/remote-execution.md
@@ -33,10 +33,10 @@ default_cluster: my-cluster
 Workstation filesystems (`/home/scratch.*`, local NFS) are **not** mounted on the cluster. If a checkpoint was produced on your workstation, copy it to the cluster's own storage before submitting any job that references it — NEL and SLURM do NOT sync checkpoints automatically.
 
 ```bash
-rsync -av /path/to/local/checkpoint <cluster-login>:<cluster-workspace>/checkpoints/
+rsync -av /path/to/local/checkpoint <cluster-login>:<cluster-workspace>/<session_id>/<model>/checkpoints/
 ```
 
-Use the `workspace` path from your cluster config as the destination. Compute nodes on a given cluster share the same storage as its login node, so once staged, the path works everywhere on that cluster.
+Use the `workspace` path from your cluster config as the destination root, and keep staged checkpoints under the session/model directory. Compute nodes on a given cluster share the same storage as its login node, so once staged, the path works everywhere on that cluster.
 
 See `.claude/clusters.yaml.example` for a fully annotated example with multiple cluster types.
 
@@ -118,8 +118,8 @@ When submitting SLURM jobs remotely, write **two files** locally to avoid shell
 Then upload both and submit:
 
 ```bash
-remote_sync_to /local/scripts/ scripts/
-JOBID=$(remote_run "sbatch /remote/path/scripts/job_slurm.sh" | grep -o '[0-9]\+' | tail -1)
+remote_sync_to /local/scripts/ <session_id>/<model>/scripts/
+JOBID=$(remote_run "sbatch <remote_workspace>/<session_id>/<model>/scripts/job_slurm.sh" | grep -o '[0-9]\+' | tail -1)
 ```
 
 ---
diff --git a/.claude/skills/common/workspace-management.md b/.claude/skills/common/workspace-management.md
index f797e7870ee..29356bca101 100644
--- a/.claude/skills/common/workspace-management.md
+++ b/.claude/skills/common/workspace-management.md
@@ -1,77 +1,95 @@
 # Workspace Management
 
-Organize work by model name so outputs (checkpoints, logs) are easy to find and reuse across PTQ → deploy → eval pipelines.
+Organize work by session id and model name so concurrent agents do not
+clobber each other, while outputs (checkpoints, logs) stay easy to find and
+reuse across PTQ → deploy → eval pipelines within the same session.
 
-## Single-user (default)
+## Session Workspaces
 
-Create a work directory named after the model in the current project:
+Use the same `<session_id>` convention as the monitor skill:
 
-```bash
-mkdir -p ./workspaces/<model-name>
-```
-
-Use descriptive names, not timestamps:
-
-```bash
-# Good
-workspaces/qwen3-0.6b-nvfp4/
-workspaces/llama-3.1-8b-fp8/
-
-# Bad
-workspaces/ptq-20260318-143022/
-workspaces/job-001/
-```
-
-Store outputs (checkpoints, logs) inside the workspace:
-
-```bash
-workspaces/qwen3-0.6b-nvfp4/
-  output/          # quantized checkpoint
-  logs/            # job logs
-  scripts/         # custom PTQ scripts (if unsupported model)
-```
+- Claude Code: `$CLAUDE_CODE_SESSION_ID`, or the `session_id` field from hook input
+- Codex: `$CODEX_THREAD_ID`
+- If no session id is available, create a stable id for the current terminal session and reuse it for every local and remote path created by that agent
 
 ## When to Reuse vs Create
 
-**Before starting any task**, check for an existing workspace:
+**Before starting any task**, check for an existing workspace in the current
+session:
 
 ```bash
-ls ./workspaces/ 2>/dev/null
+ls ./workspaces/<session_id>/ 2>/dev/null
 ```
 
 **Reuse** when:
 
-- Same model (e.g., deploying a model you just quantized)
+- The matching model workspace already exists under `./workspaces/<session_id>/`
 - Task requires output from a previous step (e.g., eval requires the PTQ checkpoint)
 - User says "deploy the model I just quantized"
 
 **Create new** when:
 
-- New model not seen before
+- No matching model workspace exists under `./workspaces/<session_id>/`
 - User explicitly asks for a fresh start
-- Different quantization format for same model (e.g., `qwen3-0.6b-fp8` vs `qwen3-0.6b-nvfp4`)
+
+## Model Workspace Names
+
+Within `./workspaces/<session_id>/`, create one model workspace per model or
+model variant. Include meaningful variant details in the model workspace name,
+for example quantization format or checkpoint role:
+
+```bash
+mkdir -p ./workspaces/<session_id>/<model-name>
+```
+
+Use descriptive model workspace names, not timestamps:
+
+```text
+# Good
+workspaces/<session_id>/qwen3-0.6b-nvfp4/
+workspaces/<session_id>/qwen3-0.6b-fp8/
+workspaces/<session_id>/qwen3-0.6b-baseline/
+
+# Bad
+workspaces/<session_id>/ptq-20260318-143022/
+workspaces/<session_id>/job-001/
+```
+
+Store outputs (checkpoints, logs) inside the model workspace:
+
+```text
+workspaces/<session_id>/qwen3-0.6b-nvfp4/
+  output/          # quantized checkpoint
+  logs/            # job logs
+  scripts/         # custom PTQ scripts (if unsupported model)
+```
 
 ## Remote execution
 
 When using a remote machine (clusters.yaml configured), create matching workspaces on **both** local and remote:
 
-- **Local** `./workspaces/<model>/` — write and edit scripts here
-- **Remote** `<remote_workspace>/workspaces/<model>/` — model downloads, execution, outputs
+- **Local** `./workspaces/<session_id>/<model>/` — write and edit scripts here
+- **Remote** `<remote_workspace>/<session_id>/<model>/` — model downloads, execution, outputs
+
+Place newly created remote run directories, logs, response caches, temporary
+configs, and output artifacts under the session directory. Shared read-only or
+concurrency-safe caches, such as Hugging Face model caches and prebuilt
+container image caches, can remain outside the session directory.
 
 Before running, sync the local ModelOpt source and scripts to the remote workspace:
 
 ```bash
 # Sync ModelOpt source (first time or after local changes)
-remote_sync_to ./ workspaces/<model>/Model-Optimizer/
+remote_sync_to ./ <session_id>/<model>/Model-Optimizer/
 
 # Sync custom scripts
-remote_sync_to ./workspaces/<model>/scripts/ workspaces/<model>/scripts/
+remote_sync_to ./workspaces/<session_id>/<model>/scripts/ <session_id>/<model>/scripts/
 ```
 
 Download the model on the **remote** machine (avoids transferring large model files):
 
 ```bash
-remote_run "python -c \"from huggingface_hub import snapshot_download; snapshot_download('<model_id>', local_dir='<remote_workspace>/workspaces/<model>/model')\""
+remote_run "python -c \"from huggingface_hub import snapshot_download; snapshot_download('<model_id>', local_dir='<remote_workspace>/<session_id>/<model>/model')\""
 ```
 
 Inspect remote files with `remote_run "cat ..."` — read README, config.json, tokenizer_config.json to understand requirements before writing scripts locally.
@@ -80,7 +98,7 @@ Inspect remote files with `remote_run "cat ..."` — read README, config.json, t
 
 When `MODELOPT_WORKSPACE_ROOT` is set, use it instead of `./workspaces/`:
 
-- `MODELOPT_WORKSPACE_ROOT` — user's workspace root (set by the bot)
+- `MODELOPT_WORKSPACE_ROOT` — user's workspace root (set by the bot); use `$MODELOPT_WORKSPACE_ROOT/<session_id>/<name>/`
 - `MODELOPT_REPO_DIR` — shared upstream repo (read-only, use for fresh copies)
 
 To create a workspace, copy the upstream repo (without `.git`):
@@ -89,7 +107,7 @@ To create a workspace, copy the upstream repo (without `.git`):
 rsync -a --quiet \
     --exclude .git --exclude __pycache__ --exclude '*.pyc' \
     --exclude node_modules --exclude '*.egg-info' --exclude '*.sqsh' \
-    "$MODELOPT_REPO_DIR/" "$MODELOPT_WORKSPACE_ROOT/<name>/"
+    "$MODELOPT_REPO_DIR/" "$MODELOPT_WORKSPACE_ROOT/<session_id>/<name>/"
 ```
 
 ## Cross-Skill Workspace Flow
@@ -97,7 +115,7 @@ rsync -a --quiet \
 Workspaces carry over across the PTQ → Deploy → Eval pipeline. Each stage adds to the same directory:
 
 ```text
-workspaces/model-name-format/
+workspaces/<session_id>/model-name-format/
   output/              ← PTQ: quantized checkpoint
   eval_results/        ← Evaluation: NEL artifacts (results.yml per task)
   eval_config.yaml     ← Evaluation: NEL config
@@ -109,19 +127,19 @@ workspaces/model-name-format/
 
 ```text
 User: "quantize Qwen3-0.6B with nvfp4"
-Agent: ls workspaces/ → no "qwen3-0.6b-nvfp4"
-       → mkdir workspaces/qwen3-0.6b-nvfp4
-       → run PTQ, output to workspaces/qwen3-0.6b-nvfp4/output/
+Agent: ls workspaces/<session_id>/ → no "qwen3-0.6b-nvfp4"
+       → mkdir workspaces/<session_id>/qwen3-0.6b-nvfp4
+       → run PTQ, output to workspaces/<session_id>/qwen3-0.6b-nvfp4/output/
 
 User: "deploy the model I just quantized"
-Agent: ls workspaces/ → sees "qwen3-0.6b-nvfp4"
-       → reuse, find checkpoint at workspaces/qwen3-0.6b-nvfp4/output/
+Agent: ls workspaces/<session_id>/ → sees "qwen3-0.6b-nvfp4"
+       → reuse, find checkpoint at workspaces/<session_id>/qwen3-0.6b-nvfp4/output/
 
 User: "evaluate the quantized model on MMLU and GSM8K"
-Agent: ls workspaces/ → sees "qwen3-0.6b-nvfp4"
-       → reuse, write eval_config.yaml, results to workspaces/qwen3-0.6b-nvfp4/eval_results/
+Agent: ls workspaces/<session_id>/ → sees "qwen3-0.6b-nvfp4"
+       → reuse, write eval_config.yaml, results to workspaces/<session_id>/qwen3-0.6b-nvfp4/eval_results/
 
 User: "now quantize Llama-3.1-8B with fp8"
-Agent: ls workspaces/ → no llama
-       → mkdir workspaces/llama-3.1-8b-fp8
+Agent: ls workspaces/<session_id>/ → no llama
+       → mkdir workspaces/<session_id>/llama-3.1-8b-fp8
 ```
diff --git a/.claude/skills/deployment/SKILL.md b/.claude/skills/deployment/SKILL.md
index 5210eae6c3c..f14cc0b9822 100644
--- a/.claude/skills/deployment/SKILL.md
+++ b/.claude/skills/deployment/SKILL.md
@@ -38,10 +38,10 @@ The script handles: GPU detection, quantization flag auto-detection (FP8 vs FP4)
 
 ### 0. Check workspace (multi-user / Slack bot)
 
-If `MODELOPT_WORKSPACE_ROOT` is set, read `skills/common/workspace-management.md`. Before creating a new workspace, check for existing ones — especially if deploying a checkpoint from a prior PTQ run:
+If `MODELOPT_WORKSPACE_ROOT` is set, read `skills/common/workspace-management.md`. Before creating a new workspace, check the current session for existing model workspaces — especially if deploying a checkpoint from a prior PTQ run:
 
 ```bash
-ls "$MODELOPT_WORKSPACE_ROOT/" 2>/dev/null
+ls "$MODELOPT_WORKSPACE_ROOT/<session_id>/" 2>/dev/null
 ```
 
 If the user says "deploy the model I just quantized" or references a previous PTQ, find the matching workspace and `cd` into it. The checkpoint should be in that workspace's output directory.
@@ -190,7 +190,7 @@ If a cluster config exists (`~/.config/modelopt/clusters.yaml` or `.claude/clust
    If the checkpoint path is a remote/absolute path (e.g., from a prior PTQ run on the cluster), skip sync — it's already there. Verify with `remote_run "ls <checkpoint_path>/config.json"`. Only sync if the checkpoint is local:
 
    ```bash
-   remote_sync_to <local_checkpoint_path> checkpoints/
+   remote_sync_to <local_checkpoint_path> <session_id>/<model>/checkpoints/
    ```
 
 3. **Deploy based on remote environment:**
diff --git a/.claude/skills/evaluation/SKILL.md b/.claude/skills/evaluation/SKILL.md
index b089b4fddfa..e224299e81c 100644
--- a/.claude/skills/evaluation/SKILL.md
+++ b/.claude/skills/evaluation/SKILL.md
@@ -14,7 +14,7 @@ You're an expert in NeMo Evaluator Launcher! Guide the user through creating pro
 
 ### Workspace and Pipeline Integration
 
-If `MODELOPT_WORKSPACE_ROOT` is set, read `skills/common/workspace-management.md`. Check for existing workspaces — especially if evaluating a model from a prior PTQ or deployment step. Reuse the existing workspace so you have access to the quantized checkpoint and any code modifications.
+If `MODELOPT_WORKSPACE_ROOT` is set, read `skills/common/workspace-management.md`. Check for existing workspaces in the current session — especially if evaluating a model from a prior PTQ or deployment step. Reuse the existing workspace so you have access to the quantized checkpoint and any code modifications.
 
 This skill is often the final stage of the PTQ → Deploy → Eval pipeline. If the model required runtime patches during deployment (transformers upgrade, framework source fixes), carry those patches into the NEL config via `deployment.command`.
 
diff --git a/.claude/skills/monitor/SKILL.md b/.claude/skills/monitor/SKILL.md
index 01af60e2e6f..e153c0f5885 100644
--- a/.claude/skills/monitor/SKILL.md
+++ b/.claude/skills/monitor/SKILL.md
@@ -10,13 +10,31 @@ Monitor jobs submitted to SLURM clusters — PTQ quantization, NEL evaluation, m
 ## When to use
 
 1. **Auto-monitor** — another skill (PTQ, evaluation, deployment) just submitted a job. Register the job and set up monitoring immediately.
-2. **User-initiated** — user asks about a job status, possibly in a new conversation. Check the registry, identify the job, and report.
+2. **User-initiated** — user asks about a job status. Check the current session registry first; if the job is not registered there, use the discovery steps below.
 
 ---
 
 ## Job Registry
 
-All active jobs are tracked in `.claude/active_jobs.json`. This file is the single source of truth for what's being monitored.
+Active jobs are tracked in per-session registries under `.claude/agents/`.
+This avoids multiple agents clobbering one shared registry when they run at
+the same time.
+
+Use the current agent session id as `<session_id>`:
+
+- Claude Code: `$CLAUDE_CODE_SESSION_ID`, or the `session_id` field from hook input
+- Codex: `$CODEX_THREAD_ID`
+- If no session id is available, create a stable id for the current terminal session and reuse it for every job registered by that agent
+
+Registry layout:
+
+```text
+.claude/agents/
+  <session_id>/
+    active_jobs.json
+```
+
+Each session's `active_jobs.json` is a JSON array:
 
 ```json
 [
@@ -27,7 +45,11 @@ All active jobs are tracked in `.claude/active_jobs.json`. This file is the sing
     "user": "<ssh_user>",
     "submitted": "YYYY-MM-DD HH:MM",
     "description": "<what this job does>",
-    "last_status": "<last known status>"
+    "last_status": "<last known status>",
+    "owner": {
+      "agent": "claude-code|codex|manual",
+      "session_id": "<session_id>"
+    }
   }
 ]
 ```
@@ -40,8 +62,8 @@ All active jobs are tracked in `.claude/active_jobs.json`. This file is the sing
 
 Every time a job is submitted (by any skill or manually):
 
-1. **Add an entry** to `.claude/active_jobs.json`. Create the file if it doesn't exist.
-2. **Start a durable monitor** (if one isn't already watching the registry) that polls all registered jobs until they reach terminal status. Prefer the Claude Code `Monitor` tool when it is available: write a small watcher that reads the registry on each poll, checks every job with the appropriate method below, prints state-change events, updates `last_status`, removes terminal jobs, and exits when the registry is empty.
+1. **Add an entry** to `.claude/agents/<session_id>/active_jobs.json`. Create the session directory and file if they don't exist.
+2. **Start a durable monitor** (if one isn't already watching the registry) that polls this session's registered jobs until they reach terminal status. Prefer the Claude Code `Monitor` tool when it is available: write a small watcher that reads `.claude/agents/<session_id>/active_jobs.json`, checks every job with the appropriate method below, prints state-change events, updates `last_status`, removes terminal jobs from the session registry, and exits when no active jobs remain for this session.
 
 The monitor should terminate naturally when every registered job has reached a terminal state. If the `Monitor` tool is not available in the current harness, run an equivalent background process that implements the same loop and lets the agent resume/restart when the process exits.
 
@@ -53,12 +75,12 @@ Always do both steps. Don't try to predict job duration.
 
 Whether triggered by monitor output or by the user asking "check status":
 
-1. **Read the registry** from `.claude/active_jobs.json`
+1. **Read the registry** from `.claude/agents/<session_id>/active_jobs.json`
 2. **Check each job** using the appropriate method (see below)
 3. **Report only state changes** — compare against `last_status` in registry
-4. **Update `last_status`** in the registry
+4. **Update `last_status`** in the session registry
 5. **Remove completed jobs** — any job in a terminal state (COMPLETED, FAILED, CANCELLED, KILLED, TIMEOUT, NODE_FAIL, OUT_OF_MEMORY, PREEMPTED, BOOT_FAIL, DEADLINE)
-6. **If registry is empty** — let the monitor exit
+6. **If no active jobs remain** — let the monitor exit
 
 ---
 
@@ -111,7 +133,7 @@ vocabulary of the source you're polling.
 
 When the user asks about a job without specifying an ID, check in order:
 
-1. `.claude/active_jobs.json` — most reliable, has context
+1. `.claude/agents/<current_session_id>/active_jobs.json` — current agent's jobs
 2. `nel ls runs --since 1d` — recent NEL runs
 3. `ssh <host> "squeue -u <user>"` — active SLURM jobs
 4. `ls -lt tools/launcher/experiments/cicd/ | head -10` — recent launcher experiments
diff --git a/.claude/skills/ptq/references/unsupported-models.md b/.claude/skills/ptq/references/unsupported-models.md
index 1a198f3e886..a2fa036362e 100644
--- a/.claude/skills/ptq/references/unsupported-models.md
+++ b/.claude/skills/ptq/references/unsupported-models.md
@@ -13,7 +13,7 @@ After download, inspect the model files on the target machine (use `remote_run`
 1. **Read `README.md`** — often lists required transformers versions, dependencies, or `trust_remote_code` requirements
 2. **Check for `modeling_*.py` or `tokenization_*.py`** — custom code shipped with the model. If found, **always use `--trust_remote_code`** with `hf_ptq.py`, and `trust_remote_code=True` in any custom scripts. Without it, `AutoConfig`, `AutoTokenizer`, and `AutoModel` will fail to resolve custom classes.
 
-Write custom scripts locally (in `./workspaces/<model>/scripts/`), then sync to remote before running.
+Write custom scripts locally (in `./workspaces/<session_id>/<model>/scripts/`), then sync to remote before running.
 
 **Check transformers compatibility** (on the target machine):
 
diff --git a/.gitignore b/.gitignore
index 66ce5568ee0..8303a3a28bd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,6 +61,7 @@ venv/
 
 # Ignore claude local settings
 .claude/settings.local.json
+.claude/agents/
 CLAUDE.local.md
 AGENTS.override.md
 

From 343fe71925da8ec87c16c45fffc05a45673f6655 Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Wed, 20 May 2026 09:19:28 -0500
Subject: [PATCH 21/26] Add evaluation score extraction helpers

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .../skills/evaluation/recipes/tasks/aa_lcr.md | 27 ++++++++-
 .../evaluation/recipes/tasks/aime2025.md      | 60 ++++++++++++++++++-
 .../skills/evaluation/recipes/tasks/gpqa.md   |  6 --
 .../evaluation/recipes/tasks/ifbench.md       | 53 +++++++++++++++-
 .../evaluation/recipes/tasks/mmlu_pro.md      | 29 +++++++++
 .../evaluation/recipes/tasks/mmmu_pro.md      | 28 ++++++++-
 .../evaluation/recipes/tasks/ns_hle_aa.md     | 23 +++++++
 .../evaluation/recipes/tasks/scicode.md       |  6 --
 8 files changed, 214 insertions(+), 18 deletions(-)

diff --git a/.claude/skills/evaluation/recipes/tasks/aa_lcr.md b/.claude/skills/evaluation/recipes/tasks/aa_lcr.md
index 8dab59cea57..fe8107c30eb 100644
--- a/.claude/skills/evaluation/recipes/tasks/aa_lcr.md
+++ b/.claude/skills/evaluation/recipes/tasks/aa_lcr.md
@@ -4,7 +4,7 @@
 
 - Task: `aa_lcr`
 - Harness: AA-LCR, chat
-- Primary metric: `pass@1 judge_correct`
+- Primary metric: `accuracy.accuracy`
 - Run time: Long
 - Samples: 3
 - Requires: `HF_TOKEN`, `JUDGE_API_KEY`
@@ -45,5 +45,28 @@ Use this inside the top-level `evaluation.tasks` list:
 AA-LCR accuracy comes from:
 
 ```text
-results.groups.aa_lcr.metrics.pass@1.scores.judge_correct.value
+results.groups.aa_lcr.metrics.accuracy.scores.accuracy.value
+results.groups.aa_lcr.metrics.accuracy.scores.accuracy.stats.stderr
+```
+
+```python
+import yaml
+
+
+def extract_aa_lcr_score(path):
+    """Return the AA-LCR accuracy and stderr from ``path``, both scaled by 100."""
+    with open(path, encoding="utf-8") as f:  # closes the handle; bare open() leaked it
+        groups = yaml.safe_load(f)["results"]["groups"]
+    entry = groups["aa_lcr"]["metrics"]["accuracy"]["scores"]["accuracy"]
+    stderr = entry.get("stats", {}).get("stderr")
+    stderr_pp = stderr * 100 if stderr is not None else None
+
+    return {
+        "group": "aa_lcr",
+        "metric": "accuracy",
+        "score_key": "accuracy",
+        "accuracy": entry["value"] * 100,
+        "stderr": stderr_pp,
+    }
+
 ```
diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.md b/.claude/skills/evaluation/recipes/tasks/aime2025.md
index f68ee5ed21d..18d5e0ce1e1 100644
--- a/.claude/skills/evaluation/recipes/tasks/aime2025.md
+++ b/.claude/skills/evaluation/recipes/tasks/aime2025.md
@@ -33,5 +33,61 @@ Use this inside the top-level `evaluation.tasks` list:
 
 ## Score Extraction
 
-Prefer the `pass@1[avg-of-N]` metric matching the configured sample/repeat
-count.
+AIME accuracy (in percentage points) comes from:
+
+```text
+results.groups.aime25.metrics."pass@1[avg-of-N]".scores.symbolic_correct.value
+```
+
+For repeated runs, report stderr as percentage points:
+
+```text
+results.groups.aime25.metrics."pass@1[avg-of-N]".scores.symbolic_correct_statistics_std_err_across_runs.value * 100
+```
+
+Prefer the `pass@1[avg-of-N]` metric matching the configured repeat count.
+If the repeat count is unknown, use the highest available `avg-of-N`.
+
+```python
+import re
+import yaml
+
+
+def avg_of(metric_name):
+    hit = re.fullmatch(r"pass@1\[avg-of-(\d+)\]", metric_name)  # whole name must match
+    return None if hit is None else int(hit.group(1))
+
+
+def select_metric(metrics, repeats=None):
+    """Pick the metric key: requested avg-of-N, else the largest N, else ``pass@1``."""
+    wanted = f"pass@1[avg-of-{repeats}]"
+    if repeats is not None and wanted in metrics:
+        return wanted
+
+    # Fall back to the repeated metric that averages over the most runs.
+    runs_by_name = {name: avg_of(name) for name in metrics}
+    candidates = [name for name, runs in runs_by_name.items() if runs is not None]
+    return max(candidates, key=runs_by_name.get) if candidates else "pass@1"
+
+
+def extract_aime2025_score(path, repeats=None):
+    """Return the AIME 2025 score dict read from the ``results.yml`` at ``path``."""
+    with open(path, encoding="utf-8") as f:  # closes the handle; bare open() leaked it
+        metrics = yaml.safe_load(f)["results"]["groups"]["aime25"]["metrics"]
+    metric_name = select_metric(metrics, repeats)
+    scores = metrics[metric_name]["scores"]
+
+    accuracy = scores["symbolic_correct"]["value"]
+    stderr_key = "symbolic_correct_statistics_std_err_across_runs"
+    stderr_value = scores.get(stderr_key, {}).get("value")
+    stderr = stderr_value * 100 if stderr_value is not None else None
+
+    return {
+        "group": "aime25",
+        "metric": metric_name,
+        "score_key": "symbolic_correct",
+        "accuracy": accuracy,
+        "stderr": stderr,
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/gpqa.md b/.claude/skills/evaluation/recipes/tasks/gpqa.md
index 0e481a5ff0d..9befb2a4527 100644
--- a/.claude/skills/evaluation/recipes/tasks/gpqa.md
+++ b/.claude/skills/evaluation/recipes/tasks/gpqa.md
@@ -51,7 +51,6 @@ repeat count is unknown, use the highest available `avg-of-N`.
 
 ```python
 import re
-import sys
 import yaml
 
 
@@ -92,9 +91,4 @@ def extract_gpqa_score(path, repeats=None):
         "stderr": stderr,
     }
 
-
-if __name__ == "__main__":
-    path = sys.argv[1]
-    repeats = int(sys.argv[2]) if len(sys.argv) > 2 else None
-    print(extract_gpqa_score(path, repeats))
 ```
diff --git a/.claude/skills/evaluation/recipes/tasks/ifbench.md b/.claude/skills/evaluation/recipes/tasks/ifbench.md
index c914fb18aa9..f3775fcceac 100644
--- a/.claude/skills/evaluation/recipes/tasks/ifbench.md
+++ b/.claude/skills/evaluation/recipes/tasks/ifbench.md
@@ -33,8 +33,59 @@ Use this inside the top-level `evaluation.tasks` list:
 
 ## Score Extraction
 
-IFBench accuracy comes from:
+IFBench primary AA-aligned accuracy (in percentage points) comes from:
 
 ```text
 results.groups.ifbench.metrics."pass@1[avg-of-N]".scores.prompt_loose_accuracy.value
 ```
+
+`results.yml` does **not** include a direct
+`prompt_loose_accuracy_statistics_std_err_across_runs`; the closest available
+across-run stderr is `prompt_statistics_std_err_across_runs`. It is computed
+over the strict + loose prompt-level average rather than
+`prompt_loose_accuracy` alone, so report it as an approximate uncertainty.
+
+```python
+import re
+import yaml
+
+
+def avg_of(metric_name):
+    hit = re.fullmatch(r"pass@1\[avg-of-(\d+)\]", metric_name)  # whole name must match
+    return None if hit is None else int(hit.group(1))
+
+
+def select_metric(metrics, repeats=None):
+    """Pick the metric key: requested avg-of-N, else the largest N, else ``pass@1``."""
+    wanted = f"pass@1[avg-of-{repeats}]"
+    if repeats is not None and wanted in metrics:
+        return wanted
+
+    # Fall back to the repeated metric that averages over the most runs.
+    runs_by_name = {name: avg_of(name) for name in metrics}
+    candidates = [name for name, runs in runs_by_name.items() if runs is not None]
+    return max(candidates, key=runs_by_name.get) if candidates else "pass@1"
+
+
+def extract_ifbench_score(path, repeats=None):
+    """Return the IFBench score dict read from the ``results.yml`` at ``path``."""
+    with open(path, encoding="utf-8") as f:  # closes the handle; bare open() leaked it
+        metrics = yaml.safe_load(f)["results"]["groups"]["ifbench"]["metrics"]
+    metric_name = select_metric(metrics, repeats)
+    scores = metrics[metric_name]["scores"]
+
+    accuracy = scores["prompt_loose_accuracy"]["value"]
+    proxy_key = "prompt_statistics_std_err_across_runs"  # proxy: no loose-only stderr
+    proxy_stderr_value = scores.get(proxy_key, {}).get("value")
+    stderr = proxy_stderr_value * 100 if proxy_stderr_value is not None else None
+
+    return {
+        "group": "ifbench",
+        "metric": metric_name,
+        "score_key": "prompt_loose_accuracy",
+        "accuracy": accuracy,
+        "stderr": stderr,
+        "stderr_source": "prompt_statistics_std_err_across_runs (proxy)",
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
index 9757838b3b3..c57068b5e1e 100644
--- a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
+++ b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
@@ -33,3 +33,32 @@ Use this inside the top-level `evaluation.tasks` list:
 ```
 
 ## Score Extraction
+
+```text
+results.groups.mmlu_pro.metrics.pass@1.scores.symbolic_correct.value
+```
+
+`num_repeats: 1` is the standard setting, so `results.yml` does not include
+an across-run stderr. The score is computed over a single pass of the
+dataset (`stats.count` equals `num_problems`).
+
+```python
+import yaml
+
+
+def extract_mmlu_pro_score(path):
+    """Return the MMLU-Pro ``pass@1`` score dict read from the ``results.yml`` at ``path``."""
+    with open(path, encoding="utf-8") as f:  # closes the handle; bare open() leaked it
+        groups = yaml.safe_load(f)["results"]["groups"]
+    entry = groups["mmlu_pro"]["metrics"]["pass@1"]["scores"]["symbolic_correct"]
+    n = entry.get("stats", {}).get("count")
+
+    return {
+        "group": "mmlu_pro",
+        "metric": "pass@1",
+        "score_key": "symbolic_correct",
+        "accuracy": entry["value"],
+        "stderr": None,
+        "n": n,
+    }
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md b/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md
index 90d67e28b62..1e7542727de 100644
--- a/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md
+++ b/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md
@@ -30,8 +30,34 @@ Use this inside the top-level `evaluation.tasks` list:
 
 ## Score Extraction
 
-MMMU-Pro accuracy comes from:
+MMMU-Pro accuracy (already in percentage points) comes from:
 
 ```text
 results.groups."mmmu-pro".metrics.pass@1.scores.symbolic_correct.value
 ```
+
+`num_repeats: 1` is the standard setting, so `results.yml` does not include
+an across-run stderr. The score is computed over a single pass of the
+dataset (`stats.count` equals `num_problems`).
+
+```python
+import yaml
+
+
+def extract_mmmu_pro_score(path):
+    """Return the MMMU-Pro ``pass@1`` score dict read from the ``results.yml`` at ``path``."""
+    with open(path, encoding="utf-8") as f:  # closes the handle; bare open() leaked it
+        groups = yaml.safe_load(f)["results"]["groups"]
+    entry = groups["mmmu-pro"]["metrics"]["pass@1"]["scores"]["symbolic_correct"]
+    n = entry.get("stats", {}).get("count")
+
+    return {
+        "group": "mmmu-pro",
+        "metric": "pass@1",
+        "score_key": "symbolic_correct",
+        "accuracy": entry["value"],
+        "stderr": None,
+        "n": n,
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md b/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md
index 36c8979b14e..b7b36d9346c 100644
--- a/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md
+++ b/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md
@@ -42,3 +42,26 @@ HLE AA accuracy comes from:
 ```text
 results.groups.hle.metrics.pass@1.scores.judge_correct.value
 ```
+
+```python
+import yaml
+
+
+def extract_ns_hle_aa_score(path):
+    """Return the HLE AA ``pass@1`` score dict read from the ``results.yml`` at ``path``."""
+    with open(path, encoding="utf-8") as f:  # closes the handle; bare open() leaked it
+        scores = yaml.safe_load(f)["results"]["groups"]["hle"]["metrics"]["pass@1"]["scores"]
+    symbolic = scores.get("symbolic_correct", {}).get("value")
+    n = scores["judge_correct"].get("stats", {}).get("count")
+
+    return {
+        "group": "hle",
+        "metric": "pass@1",
+        "score_key": "judge_correct",
+        "accuracy": scores["judge_correct"]["value"],
+        "symbolic_correct": symbolic,
+        "stderr": None,
+        "n": n,
+    }
+
+```
diff --git a/.claude/skills/evaluation/recipes/tasks/scicode.md b/.claude/skills/evaluation/recipes/tasks/scicode.md
index 677af7771d9..a12e0390f57 100644
--- a/.claude/skills/evaluation/recipes/tasks/scicode.md
+++ b/.claude/skills/evaluation/recipes/tasks/scicode.md
@@ -86,7 +86,6 @@ The helper below also supports GPQA's matching layout, where accuracy comes from
 
 ```python
 import re
-import sys
 import yaml
 
 
@@ -143,9 +142,4 @@ def extract_score(path, group="scicode"):
         "stderr": stderr,
     }
 
-
-if __name__ == "__main__":
-    path = sys.argv[1]
-    group = sys.argv[2] if len(sys.argv) > 2 else "scicode"
-    print(extract_score(path, group))
 ```

From 32c2072f2fabe3a6e0389a25916b1f53b1f7f802 Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Wed, 20 May 2026 09:20:18 -0500
Subject: [PATCH 22/26] Add robust monitor status parsing

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .claude/skills/monitor/SKILL.md | 62 ++++++++++++++++++++++-----------
 1 file changed, 42 insertions(+), 20 deletions(-)

diff --git a/.claude/skills/monitor/SKILL.md b/.claude/skills/monitor/SKILL.md
index e153c0f5885..14ce4e14c32 100644
--- a/.claude/skills/monitor/SKILL.md
+++ b/.claude/skills/monitor/SKILL.md
@@ -93,22 +93,29 @@ vocabulary of the source you're polling.
 
 ### NEL jobs (`type: nel`)
 
-- **Check:** `nel status <id>` — second pipe-delimited column carries the state with a Unicode prefix (e.g. `▶ RUNNING`, `✓ SUCCESS`).
-- **States** (from `nemo_evaluator_launcher.executors.base.ExecutionState` + the CLI status formatter):
-
-  | State (uppercase, as printed) | Terminal? | Indicator |
-  | --- | --- | --- |
-  | `PENDING` | no | `⧗` |
-  | `RUNNING` | no | `▶` |
-  | `SUCCESS` | **yes** | `✓` |
-  | `FAILED` | **yes** | `✗` |
-  | `KILLED` | **yes** | `✗` |
-  | `ERROR` | **yes** | `✗` (synthetic, CLI error path) |
-  | `NOT FOUND` | **yes** | `?` (synthetic, CLI: invocation id unknown) |
-
-  Watcher terminal regex: `^(SUCCESS|FAILED|KILLED|ERROR|NOT FOUND)$`.
-  Strip the Unicode indicator (`▶✓✗⧗?`) and surrounding whitespace before
-  matching.
+- **Check:** `nel status <id>`.
+
+```bash
+extract_nel_state() {
+  # Print the upper-cased state word for NEL invocation $1; nothing if none is found.
+  local jid="$1" nel_bin="${NEL:-nel}" output state_col
+  output=$("$nel_bin" status "$jid" 2>&1)
+  state_col=$(printf '%s\n' "$output" \
+    | awk -F'|' -v prefix="$jid." 'index($1, prefix) == 1 { print $2; exit }')
+  printf '%s\n' "${state_col:-$output}" \
+    | LC_ALL=C tr '[:lower:]' '[:upper:]' \
+    | awk 'match($0, /(PENDING|RUNNING|SUCCESS|FAILED|KILLED|ERROR|NOT[[:space:]]+FOUND)/) { print substr($0, RSTART, RLENGTH); exit }' \
+    | sed 's/[[:space:]][[:space:]]*/ /g'
+}
+
+is_nel_terminal() {  # exit 0 only when the extracted NEL state is a terminal one
+  case "$(extract_nel_state "$1")" in
+    SUCCESS|FAILED|KILLED|ERROR|"NOT FOUND") return 0 ;;
+  esac
+  return 1
+}
+```
+
 - **On completion:** `nel info <id>` to fetch results.
 - **On failure:** `nel info <id> --logs` then inspect server/client/SLURM logs via SSH.
 
@@ -120,10 +127,25 @@ vocabulary of the source you're polling.
 
 ### Raw SLURM jobs (`type: slurm`)
 
-- **Check:** `ssh <host> "sacct -j <id> --format=JobID%12,JobName%25,State%12,Elapsed%10 -n"` and filter out `extern`, `batch`, and step rows like `.<step>`. Use `sacct` for the termination check; `squeue` can lag in `COMPLETING` after `sacct` reports a terminal state.
-- **States (terminal):** `COMPLETED`, `FAILED`, `CANCELLED` (also appears as `CANCELLED by <uid>`), `TIMEOUT`, `NODE_FAIL`, `OUT_OF_MEMORY`, `PREEMPTED`, `BOOT_FAIL`, `DEADLINE`.
-- **States (non-terminal):** `PENDING`, `RUNNING`, `CONFIGURING`, `COMPLETING`, `RESIZING`, `SUSPENDED`, `REQUEUED`.
-  Watcher terminal regex: `^(COMPLETED|FAILED|CANCELLED( by .*)?|TIMEOUT|NODE_FAIL|OUT_OF_MEMORY|PREEMPTED|BOOT_FAIL|DEADLINE)$`.
+- **Check:** `sacct`; use `sacct` for the termination check because `squeue`
+  can lag in `COMPLETING` after `sacct` reports a terminal state.
+
+```bash
+extract_slurm_state() {
+  # Print the first sacct State row for job $1 on host $2, trimmed; "CANCELLED by <uid>" becomes CANCELLED.
+  local jid="$1" host="$2"
+  ssh "$host" "sacct -j $jid -X --format=State --noheader -P 2>/dev/null | head -1" \
+    | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e 's/^CANCELLED by .*/CANCELLED/'
+}
+
+is_slurm_terminal() {  # exit 0 only when the extracted SLURM state is a terminal one
+  case "$(extract_slurm_state "$1" "$2")" in
+    COMPLETED|FAILED|CANCELLED|TIMEOUT|NODE_FAIL|OUT_OF_MEMORY|PREEMPTED|BOOT_FAIL|DEADLINE) return 0 ;;
+  esac
+  return 1
+}
+```
+
 - **On completion:** `ssh <host> "sacct -j <id> --format=State,ExitCode,Elapsed -n"`.
 - **On failure:** Check the job's output log file.
 

From 74635c94da459a0b6cd8d117f05ab64661041da0 Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Wed, 20 May 2026 09:54:16 -0500
Subject: [PATCH 23/26] Increase GPQA evaluation repeats

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .../skills/evaluation/recipes/examples/example_eval.yaml    | 6 +++---
 .claude/skills/evaluation/recipes/tasks/gpqa.md             | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.claude/skills/evaluation/recipes/examples/example_eval.yaml b/.claude/skills/evaluation/recipes/examples/example_eval.yaml
index c8afd1a2b43..ad9f40b9124 100644
--- a/.claude/skills/evaluation/recipes/examples/example_eval.yaml
+++ b/.claude/skills/evaluation/recipes/examples/example_eval.yaml
@@ -7,7 +7,7 @@
 #
 # Includes:
 #   - MMLU-Pro (knowledge, completions)
-#   - GPQA Diamond (reasoning, chat, 5 repeats)
+#   - GPQA Diamond (reasoning, chat, 32 repeats)
 #   - LiveCodeBench v6 (code, chat, 3 repeats)
 #   - IFBench (instruction following, chat, 8 repeats)
 #
@@ -77,14 +77,14 @@ evaluation:
                 - max_new_tokens
                 - max_completion_tokens
 
-  # Reasoning (chat endpoint, 5 repeats, short)
+  # Reasoning (chat endpoint, 32 repeats, short)
     - name: ns_gpqa
       nemo_evaluator_config:
         config:
           params:
             extra:
               args: ++prompt_config=eval/aai/mcq-4choices
-              num_repeats: 5
+              num_repeats: 32
         target:
           api_endpoint:
             adapter_config:
diff --git a/.claude/skills/evaluation/recipes/tasks/gpqa.md b/.claude/skills/evaluation/recipes/tasks/gpqa.md
index 9befb2a4527..4e79ef90ce7 100644
--- a/.claude/skills/evaluation/recipes/tasks/gpqa.md
+++ b/.claude/skills/evaluation/recipes/tasks/gpqa.md
@@ -4,9 +4,9 @@
 
 - Task: `ns_gpqa`
 - Harness: NeMo Skills, chat
-- Primary metric: `pass@1[avg-of-16] symbolic_correct`
+- Primary metric: `pass@1[avg-of-32] symbolic_correct`
 - Run time: Short
-- Samples: 16
+- Samples: 32
 - Requires: None
 - Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
 
@@ -23,7 +23,7 @@ Use this inside the top-level `evaluation.tasks` list:
       params:
         extra:
           args: ++prompt_config=eval/aai/mcq-4choices
-          n_samples: 16
+          n_samples: 32
     target:
       api_endpoint:
         adapter_config:

From 44499bae349dd616a01604e07ec36c4a29e6063c Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Wed, 20 May 2026 10:06:17 -0500
Subject: [PATCH 24/26] Fix markdownlint formatting in skill docs

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 .../references/vllm-aot-cache-poisoning.md                    | 4 ++--
 .claude/skills/evaluation/recipes/tasks/aa_lcr.md             | 2 +-
 .claude/skills/evaluation/recipes/tasks/aime2025.md           | 2 +-
 .claude/skills/evaluation/recipes/tasks/gpqa.md               | 2 +-
 .claude/skills/evaluation/recipes/tasks/ifbench.md            | 2 +-
 .claude/skills/evaluation/recipes/tasks/livecodebench.md      | 2 +-
 .claude/skills/evaluation/recipes/tasks/mmlu_pro.md           | 2 +-
 .claude/skills/evaluation/recipes/tasks/mmmu_pro.md           | 2 +-
 .claude/skills/evaluation/recipes/tasks/ns_hle_aa.md          | 2 +-
 .claude/skills/evaluation/recipes/tasks/scicode.md            | 2 +-
 .claude/skills/evaluation/recipes/tasks/tau2_bench_telecom.md | 2 +-
 11 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.claude/skills/debugging-playbooks/references/vllm-aot-cache-poisoning.md b/.claude/skills/debugging-playbooks/references/vllm-aot-cache-poisoning.md
index c43d18a391f..6723b1f3f92 100644
--- a/.claude/skills/debugging-playbooks/references/vllm-aot-cache-poisoning.md
+++ b/.claude/skills/debugging-playbooks/references/vllm-aot-cache-poisoning.md
@@ -14,7 +14,7 @@ are zeroed out via `--limit-mm-per-prompt`.
 vLLM startup crashes during `profile_run` / `_dummy_run` / CUDA-graph capture
 with:
 
-```
+```text
 AttributeError: 'NoneType' object has no attribute 'size'
 ```
 
@@ -25,7 +25,7 @@ no MLP, no quantized linear. The compiled function is loaded from disk and
 crashes in dynamo's prologue, before any decoder layer runs. The log line
 just above the traceback is the smoking gun:
 
-```
+```text
 INFO ... [decorators.py:...] Directly load AOT compilation from path
   /vllm-cache/torch_compile_cache/torch_aot_compile/<hash>/rank_*/model
 ```
diff --git a/.claude/skills/evaluation/recipes/tasks/aa_lcr.md b/.claude/skills/evaluation/recipes/tasks/aa_lcr.md
index fe8107c30eb..93d5f4db1f9 100644
--- a/.claude/skills/evaluation/recipes/tasks/aa_lcr.md
+++ b/.claude/skills/evaluation/recipes/tasks/aa_lcr.md
@@ -8,7 +8,7 @@
 - Run time: Long
 - Samples: 3
 - Requires: `HF_TOKEN`, `JUDGE_API_KEY`
-- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/AA-LCR.html
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/AA-LCR.html>
 
 ## Params
 
diff --git a/.claude/skills/evaluation/recipes/tasks/aime2025.md b/.claude/skills/evaluation/recipes/tasks/aime2025.md
index 18d5e0ce1e1..ed11a8b05a1 100644
--- a/.claude/skills/evaluation/recipes/tasks/aime2025.md
+++ b/.claude/skills/evaluation/recipes/tasks/aime2025.md
@@ -8,7 +8,7 @@
 - Run time: Long for reasoning models with lengthy thinking traces
 - Repeats: 16
 - Requires: None
-- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
 
 ## YAML Fragment
 
diff --git a/.claude/skills/evaluation/recipes/tasks/gpqa.md b/.claude/skills/evaluation/recipes/tasks/gpqa.md
index 4e79ef90ce7..f9393a04118 100644
--- a/.claude/skills/evaluation/recipes/tasks/gpqa.md
+++ b/.claude/skills/evaluation/recipes/tasks/gpqa.md
@@ -8,7 +8,7 @@
 - Run time: Short
 - Samples: 32
 - Requires: None
-- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
 
 ## Params
 
diff --git a/.claude/skills/evaluation/recipes/tasks/ifbench.md b/.claude/skills/evaluation/recipes/tasks/ifbench.md
index f3775fcceac..35fcf3950c0 100644
--- a/.claude/skills/evaluation/recipes/tasks/ifbench.md
+++ b/.claude/skills/evaluation/recipes/tasks/ifbench.md
@@ -8,7 +8,7 @@
 - Run time: Super short
 - Repeats: 8
 - Requires: None
-- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
 
 ## Params
 
diff --git a/.claude/skills/evaluation/recipes/tasks/livecodebench.md b/.claude/skills/evaluation/recipes/tasks/livecodebench.md
index cb6d6afc56a..f61b04ba562 100644
--- a/.claude/skills/evaluation/recipes/tasks/livecodebench.md
+++ b/.claude/skills/evaluation/recipes/tasks/livecodebench.md
@@ -8,7 +8,7 @@
 - Run time: Medium
 - Repeats: 3
 - Requires: None
-- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
 
 ## Params
 
diff --git a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
index c57068b5e1e..4579e824889 100644
--- a/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
+++ b/.claude/skills/evaluation/recipes/tasks/mmlu_pro.md
@@ -8,7 +8,7 @@
 - Run time: Short
 - Repeats: 1
 - Requires: None
-- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
 
 ## Params
 
diff --git a/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md b/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md
index 1e7542727de..f3490e2b046 100644
--- a/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md
+++ b/.claude/skills/evaluation/recipes/tasks/mmmu_pro.md
@@ -8,7 +8,7 @@
 - Run time: Medium
 - Repeats: 1
 - Requires: `HF_TOKEN`
-- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
 
 ## Params
 
diff --git a/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md b/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md
index b7b36d9346c..4c952bc3b0a 100644
--- a/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md
+++ b/.claude/skills/evaluation/recipes/tasks/ns_hle_aa.md
@@ -8,7 +8,7 @@
 - Run time: Long
 - Repeats: 1
 - Requires: `HF_TOKEN`, `JUDGE_API_KEY`
-- Reference: https://docs.nvidia.com/nemo/evaluator/nightly/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
+- Reference: <https://docs.nvidia.com/nemo/evaluator/nightly/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
 
 ## Params
 
diff --git a/.claude/skills/evaluation/recipes/tasks/scicode.md b/.claude/skills/evaluation/recipes/tasks/scicode.md
index a12e0390f57..46e21074ba3 100644
--- a/.claude/skills/evaluation/recipes/tasks/scicode.md
+++ b/.claude/skills/evaluation/recipes/tasks/scicode.md
@@ -8,7 +8,7 @@
 - Run time: Long
 - Repeats: 3
 - Requires: None
-- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
 
 ## Params
 
diff --git a/.claude/skills/evaluation/recipes/tasks/tau2_bench_telecom.md b/.claude/skills/evaluation/recipes/tasks/tau2_bench_telecom.md
index 4006e978a8f..ea96cbbf17c 100644
--- a/.claude/skills/evaluation/recipes/tasks/tau2_bench_telecom.md
+++ b/.claude/skills/evaluation/recipes/tasks/tau2_bench_telecom.md
@@ -8,7 +8,7 @@
 - Run time: Long
 - Samples: 3
 - Requires: `USER_API_KEY`
-- Reference: https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/tau2_bench.html#tau2-bench-tau2-bench-telecom
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/tau2_bench.html#tau2-bench-tau2-bench-telecom>
 
 ## Params
 

From fd07d917aeed4b4c35dc0961633e29a8683867ca Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Wed, 20 May 2026 10:20:59 -0500
Subject: [PATCH 25/26] Fix launcher Slurm config typing

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 tools/launcher/slurm_config.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/launcher/slurm_config.py b/tools/launcher/slurm_config.py
index fc2ab987850..2e69d56e976 100644
--- a/tools/launcher/slurm_config.py
+++ b/tools/launcher/slurm_config.py
@@ -29,16 +29,16 @@ class SlurmConfig:
     No internal cluster defaults are embedded here.
     """
 
-    host: str = None
+    host: str | None = None
     port: int = 22
-    account: str = None
+    account: str | None = None
     partition: str = "batch"
-    qos: str = None
-    container: str = None
+    qos: str | None = None
+    container: str | None = None
     modelopt_install_path: str = "/usr/local/lib/python3.12/dist-packages/modelopt"
-    container_mounts: list[str] = None
-    srun_args: list[str] = None
-    array: str = None
+    container_mounts: list[str] | None = None
+    srun_args: list[str] | None = None
+    array: str | None = None
     nodes: int = 1
     ntasks_per_node: int = 1
     gpus_per_node: int = 1
@@ -52,7 +52,7 @@ def slurm_factory(
     host: str = os.environ.get("SLURM_HOST", ""),
     account: str = os.environ.get("SLURM_ACCOUNT", ""),
     partition: str = os.environ.get("SLURM_PARTITION", "batch"),
-    qos: str = os.environ.get("SLURM_QOS", None),
+    qos: str | None = os.environ.get("SLURM_QOS"),
     nodes: int = 1,
     ntasks_per_node: int = 1,
     gpus_per_node: int = 1,
@@ -62,7 +62,7 @@ def slurm_factory(
         "{}:/hf-local".format(os.environ.get("SLURM_HF_LOCAL", "/hf-local")),
     ],
     srun_args: list[str] = ["--no-container-mount-home"],
-    array: str = None,  # noqa: RUF013
+    array: str | None = None,
     time: str = "04:00:00",
 ) -> SlurmConfig:
     """Generic Slurm factory — configure via environment variables or CLI overrides."""

From 2595c72d9c021ec030418668094eefb2f0ef46f8 Mon Sep 17 00:00:00 2001
From: Chad Voegele <cvoegele@nvidia.com>
Date: Wed, 20 May 2026 10:50:12 -0500
Subject: [PATCH 26/26] Use launcher-compatible optional type hints

Signed-off-by: Chad Voegele <cvoegele@nvidia.com>
---
 tools/launcher/slurm_config.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/tools/launcher/slurm_config.py b/tools/launcher/slurm_config.py
index 2e69d56e976..66fa11dc471 100644
--- a/tools/launcher/slurm_config.py
+++ b/tools/launcher/slurm_config.py
@@ -15,8 +15,11 @@
 
 """Slurm configuration and factory for the ModelOpt Launcher."""
 
+# ruff: noqa: UP045
+
 import os
 from dataclasses import dataclass
+from typing import Optional
 
 import nemo_run as run
 
@@ -29,16 +32,16 @@ class SlurmConfig:
     No internal cluster defaults are embedded here.
     """
 
-    host: str | None = None
+    host: Optional[str] = None
     port: int = 22
-    account: str | None = None
+    account: Optional[str] = None
     partition: str = "batch"
-    qos: str | None = None
-    container: str | None = None
+    qos: Optional[str] = None
+    container: Optional[str] = None
     modelopt_install_path: str = "/usr/local/lib/python3.12/dist-packages/modelopt"
-    container_mounts: list[str] | None = None
-    srun_args: list[str] | None = None
-    array: str | None = None
+    container_mounts: Optional[list[str]] = None
+    srun_args: Optional[list[str]] = None
+    array: Optional[str] = None
     nodes: int = 1
     ntasks_per_node: int = 1
     gpus_per_node: int = 1
@@ -52,7 +55,7 @@ def slurm_factory(
     host: str = os.environ.get("SLURM_HOST", ""),
     account: str = os.environ.get("SLURM_ACCOUNT", ""),
     partition: str = os.environ.get("SLURM_PARTITION", "batch"),
-    qos: str | None = os.environ.get("SLURM_QOS"),
+    qos: Optional[str] = os.environ.get("SLURM_QOS"),
     nodes: int = 1,
     ntasks_per_node: int = 1,
     gpus_per_node: int = 1,
@@ -62,7 +65,7 @@ def slurm_factory(
         "{}:/hf-local".format(os.environ.get("SLURM_HF_LOCAL", "/hf-local")),
     ],
     srun_args: list[str] = ["--no-container-mount-home"],
-    array: str | None = None,
+    array: Optional[str] = None,
     time: str = "04:00:00",
 ) -> SlurmConfig:
     """Generic Slurm factory — configure via environment variables or CLI overrides."""