@@ -26,7 +26,7 @@ speculative decoding draft-model training, etc.
2626```bash
2727pip install datasets huggingface_hub pyyaml
2828huggingface-cli login # required for gated datasets
29- ```
29+ ```text
3030
3131### Build a Nemotron PT v3 dataset
3232
@@ -39,20 +39,20 @@ python make_nemotron_ptv3_dataset.py --mode train --output-dir /tmp/ptv3_train
3939
4040# Use a custom dataset mix
4141python make_nemotron_ptv3_dataset.py --config my_mix.yaml --output-dir /tmp/ptv3_custom
42- ```
42+ ```text
4343
4444### Build a Nemotron PT v2 dataset
4545
4646```bash
4747python make_nemotron_ptv2_dataset.py --output-dir /tmp/ptv2_gen
4848python make_nemotron_ptv2_dataset.py --mode train --output-dir /tmp/ptv2_train
49- ```
49+ ```text
5050
5151### Build a general-purpose mixed dataset
5252
5353```bash
5454python make_dataset.py --config example_data_config.yaml --output-dir /tmp/mixed
55- ```
55+ ```text
5656
5757## Dataset Modes
5858
@@ -69,13 +69,13 @@ The `generate` mode produces conversation skeletons that are fed to a target mod
6969via `tools/launcher/common/query.py` (vLLM or TRT-LLM). The output becomes training
7070data for a draft model (e.g. EAGLE3 speculative decoding) or a distilled student:
7171
72- ```
72+ ```text
7373make_nemotron_ptv3_dataset.py --mode generate → skeleton.jsonl
7474 ↓
7575query.py (target model generates responses turn-by-turn)
7676 ↓
7777training data for draft model / student
78- ```
78+ ```text
7979
8080## Augmentations
8181
@@ -95,7 +95,7 @@ augmentations:
9595 - type: system_prompt
9696 content: "You are a helpful assistant."
9797 enabled: false # disable without deleting
98- ```
98+ ```text
9999
100100## Dataset Mix Config (`nemotron_ptv3_datasets.yaml`)
101101
@@ -111,7 +111,7 @@ datasets:
111111 - repo_id: nvidia/OpenMathReasoning-mini
112112 splits: [train]
113113 augment: false # multilingual — skip language-redirect augmentation
114- ```
114+ ```text
115115
116116## Output Format
117117
@@ -123,6 +123,6 @@ Every output row is a JSONL object with a single `messages` key:
123123 {"role": "user", "content": "What is 2+2?"},
124124 {"role": "assistant", "content": "4"}
125125]}
126- ```
126+ ```text
127127
128128In `generate` mode, assistant turns are stripped so the row ends with a user turn.
0 commit comments