Commit 7612abe

committed
squash: refactor
Signed-off-by: h-guo18 <67671475+h-guo18@users.noreply.github.com>
1 parent 5e43b2a commit 7612abe

File tree

17 files changed: +581 -581 lines changed


examples/speculative_decoding/README.md

Lines changed: 12 additions & 9 deletions

@@ -242,6 +242,17 @@ To add a system prompt, use the `--system_prompt <system_prompt_text>` argument.
 
 For large scale data generation, please see [SLURM prepare data](SLURM_prepare_data.md) for SLURM support.
 
+### Configuring Draft Model
+
+For EAGLE-1 and EAGLE-3 we provide a [default model architecture config](https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt/torch/speculative/config.py#L37) in ModelOpt. You can override the default settings by providing an additional JSON dict. For example, to use a 2-layer EAGLE draft model with an MLP intermediate size of 8192, set `eagle_config.json` to:
+
+```json
+{
+    "num_hidden_layers": 2,
+    "intermediate_size": 8192
+}
+```
+
 ### Draft Vocabulary Compression
 
 We can optionally use a smaller vocab size for the draft model for faster training and inference. For example, Llama3.2-1B has a vocab size of 128256. In this example, we construct a draft vocab mapping of size 32k by finding the most frequently occurring tokens in our training set:

@@ -252,15 +263,7 @@ python scripts/calibrate_draft_vocab.py --model meta-llama/Llama-3.2-1B-Instruct
 
 This will produce a `d2t.pt` file in `save_dir`, which is the mapping from draft tokens to target tokens. During inference, draft tokens can be mapped back to target tokens by `target_token = draft_token + d2t[draft_token]`.
 
-### Configuring Draft Model
-
-For EAGLE-1 and EAGLE-3 we provide a [default model architecture config](https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt/torch/speculative/config.py#L37) in ModelOpt. You can override default settings by providing an additional JSON dict. In this example, we override `draft_vocab_size` in `eagle_config.json`:
-
-```json
-{
-    "draft_vocab_size": 32000
-}
-```
+Then, simply include the `--draft_vocab_cache <path_to_d2t.pt>` argument when starting training with `./launch_train.sh`. The draft model will use the provided vocab table during training and export.
 
 ### Interact with `modelopt.torch.speculative`
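The draft-to-target mapping described in the README hunk above follows `target_token = draft_token + d2t[draft_token]`. A minimal sketch of that lookup; the `d2t` list here is an illustrative stand-in for the tensor stored in `d2t.pt` (the real file would be loaded with `torch.load("d2t.pt")`), and the offset values are made up:

```python
# Hypothetical offsets for a 4-token draft vocab; in practice d2t is a
# tensor of length draft_vocab_size produced by calibrate_draft_vocab.py.
d2t = [0, 5, 12, 100]

def draft_to_target(draft_token: int) -> int:
    # target_token = draft_token + d2t[draft_token]
    return draft_token + d2t[draft_token]

print(draft_to_target(2))  # 2 + d2t[2] = 14
```

Because each entry stores an offset rather than an absolute id, a draft vocab that is a subset of the target vocab compresses to a single small lookup table.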

examples/speculative_decoding/collect_hidden_states/run_hf_compute_hiddens.sh

Lines changed: 1 addition & 1 deletion

@@ -19,5 +19,5 @@
 
 python3 collect_hidden_states/compute_hidden_states_hf.py \
     --model meta-llama/Llama-3.2-1B-Instruct \
-    --input-file synthetic_conversations/daring-anteater.jsonl \
+    --input-data synthetic_conversations/daring-anteater.jsonl \
     --output-dir /mnt/md0/eagle-hidden-states/llama1b/daring_anteater/

examples/speculative_decoding/collect_hidden_states/run_hf_compute_hiddens_dp.sh

Lines changed: 1 addition & 1 deletion

@@ -30,7 +30,7 @@ split -n l/$DP_SIZE --numeric-suffixes=0 -d --additional-suffix=.jsonl $INPUT_FI
 
 for i in $(seq 0 $((DP_SIZE-1)))
 do
-CUDA_VISIBLE_DEVICES=$i python3 collect_hidden_states/compute_hidden_states_hf.py --model meta-llama/Llama-3.2-1B-Instruct --input-file /tmp/part-0${i}.jsonl --output-dir $OUTPUT_DIR &
+CUDA_VISIBLE_DEVICES=$i python3 collect_hidden_states/compute_hidden_states_hf.py --model meta-llama/Llama-3.2-1B-Instruct --input-data /tmp/part-0${i}.jsonl --output-dir $OUTPUT_DIR &
 done
 wait
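The `split -n l/$DP_SIZE` plus background-launch pattern in this script shards the input file into one line-aligned chunk per GPU rank. A rough Python sketch of the equivalent sharding logic (an approximation: `split -n l/N` balances chunks by bytes without breaking lines, while this version balances by line count; the function name is illustrative, not part of the scripts):

```python
def shard_lines(lines: list[str], dp_size: int) -> list[list[str]]:
    """Split lines into dp_size contiguous chunks, sizes differing by at most 1."""
    base, rem = divmod(len(lines), dp_size)
    shards, start = [], 0
    for rank in range(dp_size):
        end = start + base + (1 if rank < rem else 0)  # early ranks absorb the remainder
        shards.append(lines[start:end])
        start = end
    return shards

shards = shard_lines([f"line{i}" for i in range(10)], 4)
print([len(s) for s in shards])  # [3, 3, 2, 2]
```

Each chunk is then processed by an independent process pinned to one GPU via `CUDA_VISIBLE_DEVICES`, and `wait` blocks until all ranks finish.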

examples/speculative_decoding/collect_hidden_states/run_trtllm_compute_hiddens.sh

Lines changed: 1 addition & 1 deletion

@@ -20,6 +20,6 @@
 export TLLM_LOG_LEVEL="error";
 python3 collect_hidden_states/compute_hidden_states_trtllm.py \
     --model meta-llama/Llama-3.2-1B-Instruct \
-    --input-file synthetic_conversations/daring-anteater.jsonl \
+    --input-data synthetic_conversations/daring-anteater.jsonl \
     --output-dir /mnt/md0/eagle-hidden-states/llama1b/daring_anteater/

examples/speculative_decoding/collect_hidden_states/run_trtllm_compute_hiddens_dp.sh

Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ split -n l/$DP_SIZE --numeric-suffixes=0 -d --additional-suffix=.jsonl $INPUT_FI
 for i in $(seq 0 $((DP_SIZE-1)))
 do
 
-export CUDA_VISIBLE_DEVICES=$i; python3 collect_hidden_states/compute_hidden_states_trtllm.py --model $MODEL --input-file /tmp/part-0${i}.jsonl --output-dir $OUTPUT_DIR --dp-rank $i &
+export CUDA_VISIBLE_DEVICES=$i; python3 collect_hidden_states/compute_hidden_states_trtllm.py --model $MODEL --input-data /tmp/part-0${i}.jsonl --output-dir $OUTPUT_DIR --dp-rank $i &
 
 done
 wait
