Skip to content

Commit f5fa7da

Browse files
committed
feat: emit per-task validation accuracy in GRPO and Distillation
Multi-validation (data.validation as a list of datasets) currently runs correctly but the validation aggregator collapses everything into a single sample-weighted accuracy. Per-task progress (e.g. gsm8k vs math500) is silently lost. task_name is already on every sample (DatumSpec.task_name preserved through rl_collate_fn into val_batch["task_name"]); validate() simply did not read it. This commit teaches both validate() functions to track rewards per task during the loop, then emit accuracy_<task> and num_samples_<task> keys alongside the existing aggregated accuracy. logger.log_metrics plots each as its own metric automatically. The aggregated accuracy key is preserved unchanged for dashboard backwards compatibility. Datasets without task_name are skipped, so single-task and legacy recipes behave identically. DPO already does per-dataset metrics via its dict-of-dataloaders architecture (see dpo.validate at nemo_rl/algorithms/dpo.py:332-377), so it is not touched here. Tests: * test_grpo.py adds test_validate_emits_per_task_accuracy_keys. * test_distillation.py adds the same plus a check that the aggregated accuracy key matches the sample-weighted mean across tasks. Signed-off-by: Minho Ryu <ryumin93@gmail.com>
1 parent 870c987 commit f5fa7da

4 files changed

Lines changed: 212 additions & 2 deletions

File tree

nemo_rl/algorithms/distillation.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -976,6 +976,10 @@ def validate(
976976
total_rewards = []  # Can be any metric. Set to 'accuracy' by default.
977977
total_lengths = []
978978
all_message_logs = [] # Collect all message logs
979+
# Per-task accuracy breakdown. Multi-validation concatenates several
980+
# datasets into one dataloader; without this split the aggregated
981+
# `accuracy` hides per-task progress (e.g. gsm8k vs math500).
982+
per_task_rewards: dict[str, list[float]] = {}
979983

980984
max_val_samples = master_config.distillation.get("max_val_samples")
981985
if max_val_samples is None:
@@ -1011,10 +1015,21 @@ def validate(
10111015
greedy=False,
10121016
)
10131017
rewards = val_batch["total_reward"]
1018+
rewards_list = rewards.tolist()
10141019

1015-
total_rewards.extend(rewards.tolist())
1020+
total_rewards.extend(rewards_list)
10161021
total_lengths.append(gen_metrics["mean_gen_tokens_per_sample"])
10171022

1023+
# task_name is set per sample by rl_collate_fn from
1024+
# DatumSpec.task_name. Skip entries that lack it for backwards
1025+
# compatibility with single-task or legacy datasets.
1026+
batch_task_names = val_batch.get("task_name")
1027+
if batch_task_names is not None:
1028+
for r, t in zip(rewards_list, batch_task_names):
1029+
if t is None:
1030+
continue
1031+
per_task_rewards.setdefault(t, []).append(r)
1032+
10181033
# Collect message logs for later display
10191034
to_env = [
10201035
get_keys_from_message_log(
@@ -1037,6 +1052,11 @@ def validate(
10371052
"accuracy": accuracy,
10381053
"avg_length": avg_length,
10391054
}
1055+
for task_name, task_rewards in per_task_rewards.items():
1056+
val_metrics[f"accuracy_{task_name}"] = sum(task_rewards) / len(
1057+
task_rewards
1058+
)
1059+
val_metrics[f"num_samples_{task_name}"] = len(task_rewards)
10401060

10411061
# Print sample conversations only once at the end of validation
10421062
try:
@@ -1062,6 +1082,14 @@ def validate(
10621082
print(f" • Accuracy: {accuracy:.4f}")
10631083
print(f" • Average response length: {avg_length:.1f} tokens")
10641084
print(f" • Samples processed: {len(total_rewards)}", flush=True)
1085+
if per_task_rewards:
1086+
print(" • Per-task accuracy:")
1087+
for task_name in sorted(per_task_rewards.keys()):
1088+
tr = per_task_rewards[task_name]
1089+
print(
1090+
f" - {task_name}: {sum(tr) / len(tr):.4f} (n={len(tr)})",
1091+
flush=True,
1092+
)
10651093

10661094
# Print timing information
10671095
print("\n ⏱️ Validation Timing:")

nemo_rl/algorithms/grpo.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2287,6 +2287,10 @@ def validate(
22872287
total_rewards = []
22882288
total_lengths = []
22892289
all_message_logs = [] # Collect all message logs
2290+
# Per-task accuracy breakdown. Multi-validation concatenates several
2291+
# datasets into one dataloader; without this split the aggregated
2292+
# `accuracy` hides per-task progress (e.g. gsm8k vs math500).
2293+
per_task_rewards: dict[str, list[float]] = {}
22902294

22912295
max_val_samples = master_config.grpo.get("max_val_samples")
22922296
if max_val_samples is None:
@@ -2337,9 +2341,20 @@ def validate(
23372341
greedy=False,
23382342
)
23392343

2340-
total_rewards.extend(val_batch["total_reward"].tolist())
2344+
rewards_list = val_batch["total_reward"].tolist()
2345+
total_rewards.extend(rewards_list)
23412346
total_lengths.append(gen_metrics["mean_gen_tokens_per_sample"])
23422347

2348+
# task_name is set per sample by rl_collate_fn from
2349+
# DatumSpec.task_name. Skip entries that lack it for backwards
2350+
# compatibility with single-task or legacy datasets.
2351+
batch_task_names = val_batch.get("task_name")
2352+
if batch_task_names is not None:
2353+
for r, t in zip(rewards_list, batch_task_names):
2354+
if t is None:
2355+
continue
2356+
per_task_rewards.setdefault(t, []).append(r)
2357+
23432358
# Collect message logs for later display
23442359
to_env = [
23452360
get_keys_from_message_log(
@@ -2367,6 +2382,11 @@ def validate(
23672382
"avg_length": avg_length,
23682383
**additional_metrics_to_report,
23692384
}
2385+
for task_name, task_rewards in per_task_rewards.items():
2386+
val_metrics[f"accuracy_{task_name}"] = sum(task_rewards) / len(
2387+
task_rewards
2388+
)
2389+
val_metrics[f"num_samples_{task_name}"] = len(task_rewards)
23702390

23712391
# Print sample conversations only once at the end of validation
23722392
try:
@@ -2392,6 +2412,14 @@ def validate(
23922412
print(f" • Accuracy: {accuracy:.4f}")
23932413
print(f" • Average response length: {avg_length:.1f} tokens")
23942414
print(f" • Samples processed: {len(total_rewards)}", flush=True)
2415+
if per_task_rewards:
2416+
print(" • Per-task accuracy:")
2417+
for task_name in sorted(per_task_rewards.keys()):
2418+
tr = per_task_rewards[task_name]
2419+
print(
2420+
f" - {task_name}: {sum(tr) / len(tr):.4f} (n={len(tr)})",
2421+
flush=True,
2422+
)
23952423

23962424
# Print timing information
23972425
print("\n ⏱️ Validation Timing:")

tests/unit/algorithms/test_distillation.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,58 @@ def test_validate_iterates_full_dataloader_when_max_val_samples_is_none(
329329
assert mock_rollout.call_count == expected_batches
330330

331331

332+
def test_validate_emits_per_task_accuracy_keys(mock_components):
333+
"""Multi-task validation produces accuracy_<task> and num_samples_<task> entries."""
334+
config = mock_components["master_config"]
335+
config.distillation["max_val_samples"] = 4
336+
config.distillation["val_batch_size"] = 1
337+
338+
# Two batches per task; reward 1.0 for gsm8k, 0.0 for math500.
339+
task_sequence = ["gsm8k", "math500", "gsm8k", "math500"]
340+
reward_sequence = [1.0, 0.0, 1.0, 0.0]
341+
342+
def make_batch(task, reward):
343+
batch = BatchedDataDict[DatumSpec](
344+
{
345+
"message_log": [
346+
[
347+
{"token_ids": torch.tensor([1, 2]), "role": "user", "content": "q"},
348+
{"token_ids": torch.tensor([3, 4]), "role": "assistant", "content": "a"},
349+
]
350+
],
351+
"loss_multiplier": torch.tensor([1.0]),
352+
"task_name": [task],
353+
"extra_env_info": [{}],
354+
"length": torch.tensor([4]),
355+
"idx": torch.tensor([0]),
356+
"total_reward": torch.tensor([reward]),
357+
}
358+
)
359+
return batch
360+
361+
rollout_batches = [make_batch(t, r) for t, r in zip(task_sequence, reward_sequence)]
362+
363+
with patch.object(distil_mod, "run_multi_turn_rollout") as mock_rollout:
364+
mock_rollout.side_effect = [
365+
(b, {"mean_gen_tokens_per_sample": 4.0}) for b in rollout_batches
366+
]
367+
val_metrics, _ = validate(
368+
mock_components["student_generation"],
369+
mock_components["val_dataloader"],
370+
mock_components["tokenizer"],
371+
mock_components["val_task_to_env"],
372+
step=0,
373+
master_config=config,
374+
)
375+
376+
assert val_metrics["accuracy_gsm8k"] == 1.0
377+
assert val_metrics["accuracy_math500"] == 0.0
378+
assert val_metrics["num_samples_gsm8k"] == 2
379+
assert val_metrics["num_samples_math500"] == 2
380+
# Aggregated key is preserved with the same definition (sample-weighted mean).
381+
assert val_metrics["accuracy"] == pytest.approx(0.5)
382+
383+
332384
def test_validate_floor_divides_max_val_samples_by_val_batch_size(mock_components):
333385
"""When max_val_samples is set, validate truncates with floor division (matches GRPO)."""
334386
config = mock_components["master_config"]

tests/unit/algorithms/test_grpo.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2433,6 +2433,108 @@ def test_validate_iterates_full_dataloader_when_max_val_samples_is_none(self):
24332433

24342434
assert mock_rollout.call_count == num_batches
24352435

2436+
def test_validate_emits_per_task_accuracy_keys(self):
2437+
"""Multi-task validation produces accuracy_<task> and num_samples_<task> keys."""
2438+
mock_policy_gen = MagicMock()
2439+
mock_tokenizer = MagicMock()
2440+
mock_tokenizer.pad_token_id = 0
2441+
2442+
def make_batch(task, reward):
2443+
return BatchedDataDict[DatumSpec](
2444+
{
2445+
"message_log": [
2446+
[
2447+
{
2448+
"role": "user",
2449+
"content": "q",
2450+
"token_ids": torch.tensor([1, 2]),
2451+
},
2452+
{
2453+
"role": "assistant",
2454+
"content": "a",
2455+
"token_ids": torch.tensor([3, 4]),
2456+
},
2457+
],
2458+
],
2459+
"task_name": [task],
2460+
"extra_env_info": [{}],
2461+
"loss_multiplier": torch.tensor([1.0]),
2462+
"idx": torch.tensor([0]),
2463+
"length": torch.tensor([4]),
2464+
"total_reward": torch.tensor([reward]),
2465+
}
2466+
)
2467+
2468+
# Two batches per task; gsm8k all correct, math500 all wrong.
2469+
task_sequence = ["gsm8k", "math500", "gsm8k", "math500"]
2470+
reward_sequence = [1.0, 0.0, 1.0, 0.0]
2471+
rollout_batches = [
2472+
make_batch(t, r) for t, r in zip(task_sequence, reward_sequence)
2473+
]
2474+
2475+
mock_dataloader = MagicMock(spec=StatefulDataLoader)
2476+
mock_dataloader.__iter__ = MagicMock(
2477+
return_value=iter([make_batch("placeholder", 0.0)] * len(rollout_batches))
2478+
)
2479+
mock_dataloader.__len__ = MagicMock(return_value=len(rollout_batches))
2480+
2481+
mock_env = MagicMock(spec=EnvironmentInterface)
2482+
mock_env.global_post_process_and_metrics.return_value = (
2483+
rollout_batches[0],
2484+
{},
2485+
)
2486+
2487+
mock_config = MasterConfig.model_construct(
2488+
**{
2489+
"grpo": {
2490+
"max_val_samples": 4,
2491+
"val_batch_size": 1,
2492+
"max_rollout_turns": 1,
2493+
},
2494+
"policy": {
2495+
"max_total_sequence_length": 2048,
2496+
"generation": {
2497+
"temperature": 1.0,
2498+
"top_p": 1.0,
2499+
"top_k": None,
2500+
"backend": "vllm",
2501+
"colocated": {"enabled": True},
2502+
"vllm_cfg": {"async_engine": False},
2503+
},
2504+
},
2505+
"logger": {"num_val_samples_to_print": 1},
2506+
}
2507+
)
2508+
2509+
with patch("nemo_rl.algorithms.grpo.run_multi_turn_rollout") as mock_rollout:
2510+
mock_rollout.side_effect = [
2511+
(b, {"mean_gen_tokens_per_sample": 4.0}) for b in rollout_batches
2512+
]
2513+
with patch(
2514+
"nemo_rl.algorithms.grpo._should_use_nemo_gym", return_value=False
2515+
):
2516+
with patch(
2517+
"nemo_rl.algorithms.grpo._should_use_async_rollouts",
2518+
return_value=False,
2519+
):
2520+
with patch("nemo_rl.algorithms.grpo.print_message_log_samples"):
2521+
val_metrics, _ = validate(
2522+
mock_policy_gen,
2523+
mock_dataloader,
2524+
mock_tokenizer,
2525+
{"math": mock_env},
2526+
step=0,
2527+
master_config=mock_config,
2528+
logger=None,
2529+
)
2530+
2531+
assert val_metrics["accuracy_gsm8k"] == 1.0
2532+
assert val_metrics["accuracy_math500"] == 0.0
2533+
assert val_metrics["num_samples_gsm8k"] == 2
2534+
assert val_metrics["num_samples_math500"] == 2
2535+
# Aggregated key is preserved with the same definition.
2536+
assert val_metrics["accuracy"] == pytest.approx(0.5)
2537+
24362538

24372539
# ============================================================================
24382540
# Tests for compute_and_apply_seq_logprob_error_masking function

0 commit comments

Comments
 (0)