55#
66# repo_id — HuggingFace repo ID or local path.
77# splits — List of splits to load. All splits are concatenated before capping.
8- # cap_per_split — Maximum rows taken from each split (null = no cap).
8+ # cap_per_split — Maximum rows taken from each split (omit the key or leave the value empty, i.e. YAML null, for no cap).
99# augment — Whether to include this dataset in the augmented copy.
1010# Set to false for already-multilingual splits so language-redirect
1111# augmentations are not applied on top of existing non-English content.
@@ -53,14 +53,14 @@ datasets:
5353 - repo_id : nvidia/Nemotron-SWE-v1
5454 splits :
5555 - r2e_gym
56- cap_per_split : null
56+ cap_per_split :
5757 augment : true
5858
5959 - repo_id : nvidia/Nemotron-SFT-SWE-v2
6060 splits :
6161 - agentless
6262 - openhands_swe
63- cap_per_split : null
63+ cap_per_split :
6464 augment : true
6565
6666 # Competitive-Programming-v1 has ~3.93M rows across 6 splits (cpp ×2, python ×2,
@@ -94,7 +94,7 @@ datasets:
9494 - question_tool
9595 - agent_skills
9696 - agent_skills_question_tool
97- cap_per_split : null
97+ cap_per_split :
9898 augment : true
9999
100100 # ---------------------------------------------------------------------------
@@ -105,7 +105,7 @@ datasets:
105105 splits :
106106 - MCQ
107107 - RQA
108- cap_per_split : null # ~226 K total
108+ cap_per_split : # ~226 K total
109109 augment : true
110110
111111 # ---------------------------------------------------------------------------
@@ -116,14 +116,14 @@ datasets:
116116 splits :
117117 - chat_if
118118 - structured_outputs
119- cap_per_split : null # ~288 K total
119+ cap_per_split : # ~288 K total
120120 augment : true
121121
122122 - repo_id : nvidia/Nemotron-SFT-Instruction-Following-Chat-v2
123123 splits :
124124 - reasoning_off
125125 - reasoning_on
126- cap_per_split : null
126+ cap_per_split :
127127 augment : true
128128
129129 # ---------------------------------------------------------------------------
@@ -134,15 +134,15 @@ datasets:
134134 splits :
135135 - interactive_agent
136136 - tool_calling
137- cap_per_split : null
137+ cap_per_split :
138138 augment : true
139139
140140 - repo_id : nvidia/Nemotron-SFT-Agentic-v2
141141 splits :
142142 - interactive_agent
143143 - search
144144 - tool_calling
145- cap_per_split : null
145+ cap_per_split :
146146 augment : true
147147
148148 # ---------------------------------------------------------------------------
@@ -152,7 +152,7 @@ datasets:
152152 - repo_id : nvidia/Nemotron-SFT-Safety-v1
153153 splits :
154154 - train
155- cap_per_split : null # ~45 K
155+ cap_per_split : # ~45 K
156156 augment : true
157157
158158 # ---------------------------------------------------------------------------
0 commit comments