|
from unitxt.benchmark import Benchmark
from unitxt.catalog import add_to_catalog
from unitxt.standard import DatasetRecipe
4 | 4 |
# Cap on the number of test instances drawn from each subset, so every
# subset contributes a comparably sized (and bounded) evaluation set.
MAX_TEST_INSTANCES = 500

# Safety benchmark composed of four safety-evaluation datasets. Each
# subset is grouped by that dataset's own category field (e.g. "label",
# "group", "hazard") so results can be broken down per category.
# NOTE(review): `add_to_catalog` is imported but no registration call is
# visible in this view — confirm the benchmark is added to the catalog.
benchmark = Benchmark(
    subsets={
        "attaq": DatasetRecipe(
            card="cards.safety.attaq_gg",
            template_card_index="default",
            group_by=["label"],
            max_test_instances=MAX_TEST_INSTANCES,
        ),
        "provoq": DatasetRecipe(
            card="cards.safety.provoq",
            template_card_index="default",
            group_by=["group"],
            max_test_instances=MAX_TEST_INSTANCES,
        ),
        "airbench": DatasetRecipe(
            card="cards.safety.airbench2024",
            template_card_index="default",
            group_by=["l2-name"],
            max_test_instances=MAX_TEST_INSTANCES,
        ),
        "ailuminate": DatasetRecipe(
            card="cards.safety.mlcommons_ailuminate",
            template_card_index="default",
            group_by=["hazard"],
            max_test_instances=MAX_TEST_INSTANCES,
        ),
    }
)
|
0 commit comments