NVIDIA
diff --git a/‎.claude/skills/evaluation/SKILL.md‎
Lines changed: 190 additions & 239 deletions b/‎.claude/skills/evaluation/SKILL.md‎
Lines changed: 190 additions & 239 deletions
diff --git a/‎.claude/skills/evaluation/recipes/env.example‎
Lines changed: 12 additions & 4 deletions b/‎.claude/skills/evaluation/recipes/env.example‎
Lines changed: 12 additions & 4 deletions
diff --git a/‎.claude/skills/evaluation/recipes/examples/example_eval.yaml‎
Lines changed: 57 additions & 70 deletions b/‎.claude/skills/evaluation/recipes/examples/example_eval.yaml‎
Lines changed: 57 additions & 70 deletions
diff --git a/‎.claude/skills/evaluation/recipes/tasks/aa/gpqa_diamond.md‎
Lines changed: 27 additions & 0 deletions b/‎.claude/skills/evaluation/recipes/tasks/aa/gpqa_diamond.md‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎.claude/skills/evaluation/recipes/tasks/aa/hle.md‎
Lines changed: 33 additions & 0 deletions b/‎.claude/skills/evaluation/recipes/tasks/aa/hle.md‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎.claude/skills/evaluation/recipes/tasks/aa/ifbench.md‎
Lines changed: 27 additions & 0 deletions b/‎.claude/skills/evaluation/recipes/tasks/aa/ifbench.md‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎.claude/skills/evaluation/recipes/tasks/aa/lcr.md‎
Lines changed: 50 additions & 0 deletions b/‎.claude/skills/evaluation/recipes/tasks/aa/lcr.md‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎.claude/skills/evaluation/recipes/tasks/aa/mmmu_pro.md‎
Lines changed: 27 additions & 0 deletions b/‎.claude/skills/evaluation/recipes/tasks/aa/mmmu_pro.md‎
Lines changed: 27 additions & 0 deletions
@@ -18,11 +18,19 @@ NEMO_EVALUATOR_TRUST_PRE_CMD=1
 
 # --- Optional: task-specific keys ---
 
-# HLE, AA-LCR, and other judge-backed tasks
+# Judge / inference endpoints — two separate env vars by harness:
+#
+#   JUDGE_API_KEY     — used by simple-evals harness tasks (e.g. AIME 2025).
+#                       Typically the API key from build.nvidia.com.
+#   INFERENCE_API_KEY — used by nemo-skills and tau2-bench harnesses for
+#                       judge / user-simulator endpoints (HLE, AA-LCR,
+#                       Tau2-Bench Telecom, etc.).
+#
+# The two keys can point to the same provider/credential — they're separate
+# env vars only because different eval harnesses look up different names.
+# Set both if you run tasks from both harness families.
 # JUDGE_API_KEY=
-
-# tau2_bench_telecom user simulator endpoint
-# USER_API_KEY=
+# INFERENCE_API_KEY=
 
 # terminal-bench-hard (AWS sandbox)
 # AWS_ACCESS_KEY_ID=
 
@@ -1,15 +1,13 @@
-# Example: Quantization Validation Suite
+# Example: AA-aligned eval template (single-task starting point)
 #
-# A balanced set of benchmarks for validating quantized model quality.
-# Copy this file and customize for your needs.
-# Task references in recipes/tasks/ define benchmark requirements and YAML
-# fragments — the agent composes them into a runnable config like this one.
+# Minimal config to validate a quantized model end-to-end against one AA-style
+# benchmark. The agent extends this template by copying additional task
+# fragments from recipes/tasks/aa/*.md into the `evaluation.tasks` list. Task
+# references define benchmark requirements and YAML fragments; this file
+# defines the deployment and operational conventions.
 #
-# Includes:
-#   - MMLU-Pro (knowledge, completions)
-#   - GPQA Diamond (reasoning, chat, 32 repeats)
-#   - LiveCodeBench v6 (code, chat, 3 repeats)
-#   - IFBench (instruction following, chat, 8 repeats)
+# Includes (default): gpqa_diamond_aa_v3 (simple-evals harness, n_samples=16).
+# Add more AA tasks per recipes/tasks/aa/ and the AA suite rule in SKILL.md.
 #
 # Usage:
 #   nel run --config recipes/examples/example_eval.yaml \
@@ -22,10 +20,15 @@
 # For quantized checkpoints, do not add a vLLM quantization flag by default.
 # Recent vLLM reads ModelOpt quantization metadata from the checkpoint. Only add
 # an explicit flag if the model card, vLLM version, or dry-run error requires it.
-#   -o 'deployment.extra_args=--max-model-len 32768 --trust-remote-code'
+#
+# Deployment uses a single `command:` field instead of separate
+# `tensor_parallel_size` / `data_parallel_size` / `extra_args` fields — the full
+# `vllm serve` invocation lives in the command string. NEL mounts the resolved
+# model (from checkpoint_path or hf_model_handle) at /checkpoint inside the
+# container, and Hydra interpolates ${deployment.port} at run time.
 #
 # Run a single task:
-#   nel run --config ... -t ns_gpqa
+#   nel run --config ... -t gpqa_diamond_aa_v3
 #
 # Canary (2 samples): use this before a full run to validate logs and tune
 # parallelism.
@@ -42,84 +45,68 @@ execution:
   walltime: "04:00:00"
   mounts:
     mount_home: false
+  auto_export:
+    destinations:
+      - mlflow
 deployment:
   env_vars:
     HF_TOKEN: host:HF_TOKEN
   checkpoint_path: ???
   hf_model_handle:
   served_model_name: ???
-  tensor_parallel_size: 1
-  data_parallel_size: 1
-  # For models with custom code, add: --trust-remote-code
-  extra_args: --max-model-len 32768
+  image: vllm/vllm-openai:v0.19.1
+  # For MoE models, add `--enable-expert-parallel` to the command.
+  # For models with custom code, add `--trust-remote-code` to the command.
+  # After filling in evaluation `parallelism` values (top-level + per-task),
+  # append `--max-num-seqs N` to the command, where max_parallelism is the
+  # largest of those values and N = ceil(max_parallelism / data_parallel_size).
+  command: >-
+    vllm serve /checkpoint
+    --host 0.0.0.0
+    --port ${deployment.port}
+    --tensor-parallel-size 1
+    --data-parallel-size 1
+    --max-model-len 131072
+    --max-num-batched-tokens 8192
+    --enable-chunked-prefill
 evaluation:
   env_vars:
     HF_TOKEN: host:HF_TOKEN
   nemo_evaluator_config:
     config:
       params:
+        parallelism: ???  # Number of concurrent requests per benchmark
         request_timeout: 3600
         max_retries: 10
-        parallelism: 16
+        max_new_tokens: 65536  # 64K for reasoning models; use 16384 (16K) for non-reasoning; prefer model card value
+        temperature: 1.0       # from model card (reasoning mode); adjust per card
+        top_p: 0.95            # from model card (reasoning mode); adjust per card
     target:
       api_endpoint:
         api_key_name: DUMMY_API_KEY
   tasks:
-  # Knowledge (chat endpoint, short)
-    - name: nemo_skills.ns_mmlu_pro
-      nemo_evaluator_config:
-        config:
-          params:
-            extra:
-              num_repeats: 1
-              args: ++prompt_config=eval/aai/mcq-10choices-boxed ++inference.tokens_to_generate=null
-        target:
-          api_endpoint:
-            adapter_config:
-              params_to_remove:
-                - max_new_tokens
-                - max_completion_tokens
-
-  # Reasoning (chat endpoint, 32 repeats, short)
-    - name: ns_gpqa
+  # Reasoning (chat endpoint, 16 repeats, short)
+    - name: gpqa_diamond_aa_v3
+      container: nvcr.io/nvidia/eval-factory/simple-evals:26.03
       nemo_evaluator_config:
         config:
           params:
             extra:
-              args: ++prompt_config=eval/aai/mcq-4choices
-              num_repeats: 32
-        target:
-          api_endpoint:
-            adapter_config:
-              params_to_remove:
-                - max_new_tokens
-                - max_completion_tokens
+              n_samples: 16
 
-  # Code (chat endpoint, 3 repeats, medium)
-    - name: ns_livecodebench
-      nemo_evaluator_config:
-        config:
-          params:
-            extra:
-              dataset_split: test_v6_2408_2505
-              num_repeats: 3
-        target:
-          api_endpoint:
-            adapter_config:
-              params_to_remove:
-                - max_new_tokens
-                - max_completion_tokens
-
-  # Instruction following (chat endpoint, 8 repeats, super short)
-    - name: ns_ifbench
-      nemo_evaluator_config:
-        config:
-          params:
-            extra:
-              num_repeats: 8
-        target:
-          api_endpoint:
-            adapter_config:
-              params_to_remove:
-                - max_new_tokens
-                - max_completion_tokens
+export:
+  mlflow:
+    tracking_uri: ???
+    experiment_name: ${oc.env:USER}/${deployment.served_model_name}
+    description: '${oc.env:USER}/${deployment.served_model_name} | T=${evaluation.nemo_evaluator_config.config.params.temperature}, top_p=${evaluation.nemo_evaluator_config.config.params.top_p},
+      max_new_tokens=${evaluation.nemo_evaluator_config.config.params.max_new_tokens}'
+    log_logs: true
+    log_artifacts: true
+    only_required: false
+    skip_existing: false
+    tags:
+      framework: vllm
+      model: ${deployment.served_model_name}
+      temperature: '${evaluation.nemo_evaluator_config.config.params.temperature}'
+      top_p: '${evaluation.nemo_evaluator_config.config.params.top_p}'
+      max_new_tokens: '${evaluation.nemo_evaluator_config.config.params.max_new_tokens}'
@@ -0,0 +1,27 @@
+# GPQA Diamond
+
+## Task Details
+
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/simple_evals.html#simple-evals-gpqa-diamond-aa-v3>
+
+## Params
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: gpqa_diamond_aa_v3
+  container: nvcr.io/nvidia/eval-factory/simple-evals:26.03
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          n_samples: 16
+```
+
+## Score Extraction from mlflow
+
+Result (0-100): `gpqa_diamond_score_micro_avg_of_N`
+
+N is the repeat count. If the repeat count is unknown, use the `avg_of_N` metric with the largest available N.
@@ -0,0 +1,33 @@
+# HLE
+
+## Task Details
+
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html#nemo-skills-ns-hle-aa>
+
+## Params
+
+This is the text-only HLE task with params aligned to Artificial Analysis Index
+v2. HLE is judge-scored and requires judge credentials.
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_hle_aa
+  container: nvcr.io/nvidia/eval-factory/nemo-skills:26.03
+  env_vars:
+    INFERENCE_API_KEY: host:INFERENCE_API_KEY
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          judge:
+            model_id: <hle_aa_judge_model_id>
+            url: <openai_compatible_judge_chat_completions_url>
+            api_key: INFERENCE_API_KEY
+```
+
+## Score Extraction from mlflow
+
+Result (0-100): `hle_pass_at_1_judge_correct`
@@ -0,0 +1,27 @@
+# IFBench
+
+## Task Details
+
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html#nemo-skills-ns-ifbench>
+
+## Params
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_ifbench
+  container: nvcr.io/nvidia/eval-factory/nemo-skills:26.03
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          num_repeats: 5
+```
+
+## Score Extraction from mlflow
+
+Result (0-100): `ifbench_pass_at_1_avg-of-N_prompt_loose_accuracy`
+
+N is the repeat count. If the repeat count is unknown, use the `avg-of-N` metric with the largest available N.
@@ -0,0 +1,50 @@
+# LCR
+
+## Task Details
+
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html#nemo-skills-ns-aa-lcr>
+
+## Params
+
+Recommended judge: use Qwen3 235B as an OpenAI-compatible equality-checker
+judge, and keep the same judge across comparable runs.
+
+AA-LCR needs long context: plan for roughly 120K input tokens plus 16K
+generation tokens. Set deployment `--max-model-len` to at least `131072`, and
+use a larger value when the model supports it.
+
+## YAML Fragment
+
+LCR has a deployment-side requirement (`--max-model-len 131072`) and a task
+block. Per SKILL.md Step 3, the deployment flag must live inside
+`deployment.command:` — not in the deprecated `extra_args` field.
+
+**Deployment requirement:** ensure the `vllm serve ...` invocation in
+`deployment.command` includes `--max-model-len 131072` (or higher).
+
+```yaml
+- name: ns_aa_lcr
+  container: nvcr.io/nvidia/eval-factory/nemo-skills:26.03
+  env_vars:
+    INFERENCE_API_KEY: host:INFERENCE_API_KEY
+  nemo_evaluator_config:
+    target:
+      api_endpoint:
+        adapter_config:
+          use_request_logging: false
+          use_response_logging: false
+    config:
+      params:
+        extra:
+          num_repeats: 16
+          judge:
+            model_id: <qwen3_235b_judge_model_id>
+            url: <openai_compatible_judge_chat_completions_url>
+            api_key: INFERENCE_API_KEY
+```
+
+## Score Extraction from mlflow
+
+Result (0-100): `aalcr_pass_at_1_avg-of-N_judge_correct`
+
+N is the repeat count. If the repeat count is unknown, use the `avg-of-N` metric with the largest available N.
@@ -0,0 +1,27 @@
+# MMMU-Pro
+
+## Task Details
+
+- Reference: <https://docs.nvidia.com/nemo/evaluator/latest/evaluation/benchmarks/catalog/all/harnesses/nemo_skills.html>
+
+## Params
+
+MMMU-Pro is a multimodal task. Use a multimodal-capable endpoint.
+
+## YAML Fragment
+
+Use this inside the top-level `evaluation.tasks` list:
+
+```yaml
+- name: ns_mmmu_pro
+  container: nvcr.io/nvidia/eval-factory/nemo-skills:26.03
+  nemo_evaluator_config:
+    config:
+      params:
+        extra:
+          num_repeats: 1
+```
+
+## Score Extraction from mlflow
+
+Result (0-100): `mmmu-pro_pass_at_1_symbolic_correct`