5 changes: 4 additions & 1 deletion README.md
@@ -187,13 +187,16 @@ The Dataset column links to publicly available datasets (e.g., on HuggingFace).
| Multichallenge | knowledge | Targets inference memory, instruction retention, version editing, and self-coherence. | Improve complex multi-turn conversational capability | ✓ | - | Creative Commons Attribution 4.0 International | <a href='resources_servers/multichallenge/configs/multichallenge_nrl.yaml'>multichallenge_nrl.yaml</a> | <a href='https://huggingface.co/datasets/nvidia/Nemotron-RL-Instruction-Following-MultiTurnChat-v1'>Nemotron-RL-Instruction-Following-MultiTurnChat-v1</a> |
| Newton Bench | math | Scientific law discovery tasks through agentic experimentation across 12 physics domains | Improve science, reasoning, and tool use capabilities | ✓ | - | Apache 2.0 | <a href='resources_servers/newton_bench/configs/newton_bench.yaml'>newton_bench.yaml</a> | - |
| Ns Tools | agent | NeMo Skills tool execution with math verification | - | - | - | - | <a href='resources_servers/ns_tools/configs/ns_tools.yaml'>ns_tools.yaml</a> | - |
+| Nvarc | knowledge | ARC-AGI inductive mode: model outputs Python code with transform() | Improve ARC-AGI puzzle-solving by inducing executable transformation programs | ✓ | ✓ | Apache 2.0 | <a href='resources_servers/nvarc/configs/inductive.yaml'>inductive.yaml</a> | - |
+| Nvarc | knowledge | ARC-AGI transductive mode: model outputs grid directly | Improve ARC-AGI puzzle-solving by directly predicting transformed grids | ✓ | ✓ | Apache 2.0 | <a href='resources_servers/nvarc/configs/transductive.yaml'>transductive.yaml</a> | - |
| Openenv | agent | Echo environment via OpenEnv (MCP). Echoes messages back with length-based rewards. | - | - | - | - | <a href='resources_servers/openenv/configs/openenv_echo.yaml'>openenv_echo.yaml</a> | - |
| Openenv | coding | Python code execution environment via OpenEnv. Executes code and returns stdout/stderr. | - | - | - | - | <a href='resources_servers/openenv/configs/openenv_coding.yaml'>openenv_coding.yaml</a> | - |
| Openenv | games | Maze navigation environment via OpenEnv. Agent navigates an 8x8 grid to find the exit. | - | - | - | - | <a href='resources_servers/openenv/configs/openenv_maze.yaml'>openenv_maze.yaml</a> | - |
-| Over Refusal Detection | | - | - | ✓ | - | - | <a href='resources_servers/over_refusal_detection/configs/over_refusal_detection.yaml'>over_refusal_detection.yaml</a> | - |
+| Over Refusal Detection | | - | - | ✓ | - | TBD | <a href='resources_servers/over_refusal_detection/configs/over_refusal_detection.yaml'>over_refusal_detection.yaml</a> | - |
| Proof Genselect | math | Pairwise proof selection with binary correctness reward | - | - | - | - | <a href='resources_servers/proof_genselect/configs/proof_genselect.yaml'>proof_genselect.yaml</a> | - |
| Proof Judge | math | Theorem proving with verifier + meta-verifier judge (combined env) | - | - | - | - | <a href='resources_servers/proof_judge/configs/proof_judge.yaml'>proof_judge.yaml</a> | - |
| Proof Verification | math | Proof verification scored against ground truth and meta-verifier agreement | - | - | - | - | <a href='resources_servers/proof_verification/configs/proof_verification.yaml'>proof_verification.yaml</a> | - |
+| Rdkit Chemistry | knowledge | Molecular chemistry question answering: calculate properties of SMILES. Includes a mix of tool-use (python + rdkit) and no-tool-use questions. | Improve molecular reasoning and SMILES parsing. | ✓ | - | TBD | <a href='resources_servers/rdkit_chemistry/configs/rdkit_chemistry.yaml'>rdkit_chemistry.yaml</a> | - |
| Reasoning Gym | knowledge | LangGraph orchestrator agent compatible with resource servers that do not use tools; enables diverse agent training data and test time scaling vs a simple agent, extensible to use tools or other agent architectures | Iterative test time scaling for improved performance in reasoning tasks | ✓ | - | Apache 2.0 | <a href='resources_servers/reasoning_gym/configs/orchestrator_agent.yaml'>orchestrator_agent.yaml</a> | - |
| Reasoning Gym | knowledge | LangGraph parallel thinking agent compatible with resource servers that do not use tools; enables diverse agent training data and test time scaling vs a simple agent, extensible to use tools or other agent architectures | Iterative test time scaling for improved performance in reasoning tasks | ✓ | - | Apache 2.0 | <a href='resources_servers/reasoning_gym/configs/parallel_thinking_agent.yaml'>parallel_thinking_agent.yaml</a> | - |
| Reasoning Gym | knowledge | LangGraph reflection agent compatible with resource servers that do not use tools; provides iterative reflection for diverse agent training data and test time scaling, extensible to use tools or other agent architectures | Iterative test time scaling for improved performance in reasoning tasks | ✓ | - | Apache 2.0 | <a href='resources_servers/reasoning_gym/configs/reflection_agent.yaml'>reflection_agent.yaml</a> | - |
2 changes: 1 addition & 1 deletion benchmarks/aalcr/config.yaml
@@ -35,4 +35,4 @@ aalcr_benchmark_simple_agent:
jsonl_fpath: benchmarks/aalcr/data/aalcr_benchmark.jsonl
prompt_config: null
prepare_script: benchmarks/aalcr/prepare.py
-num_repeats: 16
+num_repeats: 3
6 changes: 4 additions & 2 deletions benchmarks/livecodebench/prepare_utils.py
@@ -33,6 +33,8 @@
from pathlib import Path
from typing import Callable, Optional

+import orjson


# From LiveCodeBench lcb_runner/prompts/code_generation.py — tells the model which code style to use
_FORMATTING_WITH_STARTER_CODE = (
@@ -139,8 +141,8 @@ def prepare_from_hf_raw(

def _write_rows(rows: list, output_path: Path) -> Path:
    output_path.parent.mkdir(parents=True, exist_ok=True)
-   with open(output_path, "w") as f:
+   with open(output_path, "wb") as f:
        for row in rows:
-           f.write(json.dumps(row) + "\n")
+           f.write(orjson.dumps(row) + b"\n")
    print(f"Wrote {len(rows)} problems to {output_path}")
    return output_path
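For context on this change: `orjson.dumps` returns `bytes` rather than `str`, which is why the file mode switches from `"w"` to `"wb"` alongside the serializer swap. A minimal stdlib-only sketch of the same JSONL round-trip (using `json` here, since `orjson` is a third-party dependency; the helper name and sample rows are illustrative):

```python
import json
from pathlib import Path
from tempfile import TemporaryDirectory


def write_rows_stdlib(rows: list, output_path: Path) -> Path:
    """Text-mode JSONL writer; with orjson the bytes output would need mode 'wb'."""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        for row in rows:
            f.write(json.dumps(row) + "\n")
    return output_path


with TemporaryDirectory() as tmp:
    path = write_rows_stdlib([{"task_id": 1}, {"task_id": 2}], Path(tmp) / "out.jsonl")
    # Read the file back, one JSON object per line
    rows = [json.loads(line) for line in path.read_text().splitlines()]

print(rows)  # → [{'task_id': 1}, {'task_id': 2}]
```

Either serializer produces one JSON object per line; only the text/bytes distinction differs.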
25 changes: 25 additions & 0 deletions benchmarks/mmlu_pro/README.md
@@ -0,0 +1,25 @@
# MMLU-Pro

[MMLU-Pro](https://arxiv.org/abs/2406.01574) is a challenging multiple-choice question answering benchmark with 10 answer choices (A–J) across 14 disciplines including math, science, law, business, and more. It extends the original MMLU benchmark with harder questions and more distractor options.

## Configuration

This benchmark uses the `mcqa` resource server with the `mcqa_simple_agent`.

- **Grading mode**: `lenient_answer_colon_md` (markdown-aware `Answer: X` extraction, matching NeMo-Skills evaluator behavior)
- **Prompt**: `Answer the following multiple choice question. The last line of your response should be in the following format: 'Answer: $LETTER' where LETTER is one of A, B, C, D, E, F, G, H, I, J. ...`

## Usage

```bash
# Prepare data
ng_prepare_benchmark "+config_paths=[benchmarks/mmlu_pro/config.yaml]"

# Start servers
ng_run "+config_paths=[benchmarks/mmlu_pro/config.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml]"

# Collect rollouts
ng_collect_rollouts \
"+config_paths=[benchmarks/mmlu_pro/config.yaml,responses_api_models/vllm_model/configs/vllm_model.yaml]" \
+output_jsonl_fpath=results/mmlu_pro.jsonl
```
14 changes: 14 additions & 0 deletions benchmarks/mmlu_pro/__init__.py
@@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
25 changes: 25 additions & 0 deletions benchmarks/mmlu_pro/config.yaml
@@ -0,0 +1,25 @@
# Chain to existing resource server + agent config
config_paths:
  - resources_servers/mcqa/configs/mcqa.yaml

# We inherit via `_inherit_from` directives rather than using the generic config above directly, so this benchmark config stays isolated.
mmlu_pro_mcqa_resources_server:
  _inherit_from: mcqa
  resources_servers:
    mcqa:
      # Override the mcqa server grading mode to match the prompt format (Answer: X)
      grading_mode: lenient_answer_colon_md

mmlu_pro_mcqa_simple_agent:
  _inherit_from: mcqa_simple_agent
  responses_api_agents:
    simple_agent:
      resources_server:
        name: mmlu_pro_mcqa_resources_server
  datasets:
    - name: mmlu_pro
      type: benchmark
      jsonl_fpath: benchmarks/mmlu_pro/data/mmlu_pro_benchmark.jsonl
      prompt_config: benchmarks/mmlu_pro/prompts/default.yaml
      prepare_script: benchmarks/mmlu_pro/prepare.py
      num_repeats: 1
1 change: 1 addition & 0 deletions benchmarks/mmlu_pro/data/.gitignore
@@ -0,0 +1 @@
*.jsonl
37 changes: 37 additions & 0 deletions benchmarks/mmlu_pro/data/mmlu_pro_benchmark_metrics.json
@@ -0,0 +1,37 @@
{
  "name": "mmlu_pro",
  "type": "benchmark",
  "jsonl_fpath": "benchmarks/mmlu_pro/data/mmlu_pro_benchmark.jsonl",
  "prepare_script": "benchmarks/mmlu_pro/prepare.py",
  "prompt_config": "benchmarks/mmlu_pro/prompts/default.yaml",
  "num_repeats": 8,
  "Number of examples": 0,
  "Number of tools": {
    "Total # non-null values": 0,
    "Average": 0.0,
    "Min": 0.0,
    "Max": 0.0,
    "Standard deviation": 0.0
  },
  "Json-dumped number of words (proxy for token count)": {
    "Total # non-null values": 0,
    "Average": 0.0,
    "Min": 0.0,
    "Max": 0.0,
    "Standard deviation": 0.0
  },
  "Number of turns": {
    "Total # non-null values": 0,
    "Average": 0.0,
    "Min": 0.0,
    "Max": 0.0,
    "Standard deviation": 0.0
  },
  "Temperature": {
    "Total # non-null values": 0,
    "Average": 0.0,
    "Min": 0.0,
    "Max": 0.0,
    "Standard deviation": 0.0
  }
}