
Commit 355c6b7

ChenhanYu and claude authored
fix: PTQ 1GPU, export PP divisibility, hidden states conversations key (#1293)
## Summary

- **megatron_lm_ptq.yaml**: Qwen3-8B PTQ on a single GPU for L40 clusters (TP=1, all tasks)
- **quantize.sh**: Auto-find the largest PP dividing the model's `num_hidden_layers` for the export step. Qwen3-8B has 36 layers, which isn't divisible by 8, causing an `AssertionError` on 8-GPU nodes
- **compute_hidden_states_trtllm.py**: Use `messages` with a `conversations` fallback, matching the HF version. Fixes `KeyError: 'conversations'` when data uses the OpenAI `messages` format

## Test plan

- [x] Qwen3-8B PTQ runs on a single L40 GPU
- [x] Export PP auto-selects a valid divisor (36 layers → PP=6 on 8 GPUs, PP=4 on 4 GPUs, PP=1 on 1 GPU)
- [x] EAGLE3 offline pipeline reads data with the `messages` field

🤖 Generated with [Claude Code](https://claude.com/claude-code)

## Summary by CodeRabbit

* **New Features**
  * Dataset input handling now supports multiple field formats for enhanced compatibility.
* **Bug Fixes**
  * Optimized GPU resource allocation during model quantization with improved pipeline parallelism computation.
  * Updated quantization configuration for more efficient resource utilization.

Signed-off-by: Chenhan Yu <chenhany@nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 289a239 commit 355c6b7

3 files changed

Lines changed: 23 additions & 12 deletions

File tree

examples/speculative_decoding/collect_hidden_states/compute_hidden_states_trtllm.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -256,7 +256,7 @@ async def submit_generates():
     for entry in dataset:
         conversation_id = entry.get("conversation_id", entry.get("uuid"))

-        conversations = entry["conversations"]
+        conversations = entry.get("messages") or entry.get("conversations")
         if not conversations or not isinstance(conversations, list):
             num_invalid += 1
             continue
```
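The one-line fix above is easy to check in isolation. A minimal sketch of the lookup order (the entry dicts are illustrative, not taken from a real dataset); note that `or` also routes an empty `messages` list to the fallback, which is then caught by the `not conversations` validity check:

```python
def get_conversations(entry: dict):
    """Prefer OpenAI-style 'messages', fall back to ShareGPT-style 'conversations'."""
    return entry.get("messages") or entry.get("conversations")

# OpenAI messages format
openai_entry = {"messages": [{"role": "user", "content": "hi"}]}
# ShareGPT-style conversations format
sharegpt_entry = {"conversations": [{"from": "human", "value": "hi"}]}
# Neither key present: returns None, counted as invalid upstream
bad_entry = {"uuid": "123"}

print(get_conversations(openai_entry))    # [{'role': 'user', 'content': 'hi'}]
print(get_conversations(sharegpt_entry))  # [{'from': 'human', 'value': 'hi'}]
print(get_conversations(bad_entry))       # None
```

The previous `entry["conversations"]` raised `KeyError` on the first example; the `.get()` chain handles all three.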

tools/launcher/common/megatron_lm/quantize/quantize.sh

Lines changed: 14 additions & 3 deletions
```diff
@@ -41,11 +41,22 @@ TP=${TP:-1} PP=${PP:-1} EP=${EP:-1} ETP=${ETP:-1} ${QUANTIZE_EXE} ${MLM_MODEL_CF
 export MLM_EXTRA_ARGS="--mmlu-dataset ${MMLU_DATASET:-/hf-local/cais/mmlu} --fraction 0.01 --lower-bound ${MMLU_LOWER_BOUND:-0.38} --disable-tqdm"
 TP=${TP:-1} PP=${PP:-1} EP=${EP:-1} ETP=${ETP:-1} MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${MMLU_EXE} ${MLM_MODEL_CFG}

-# Export quantized checkpoint to HF format (PP=all GPUs)
+# Export quantized checkpoint to HF format
+# Use largest PP <= total GPUs that divides the model's num_hidden_layers
 TOTAL_GPUS=$(python3 -c "import torch; print(torch.cuda.device_count())" 2>/dev/null || echo ${NUM_GPUS:-1})
-echo "=== Exporting ${MLM_MODEL_CFG} ${QUANT_CFG} (PP=${TOTAL_GPUS}) ==="
+EXPORT_PP=$(python3 -c "
+import json, os
+cfg = os.path.join('${HF_MODEL_CKPT}', 'config.json')
+n_layers = json.load(open(cfg)).get('num_hidden_layers', 1) if os.path.exists(cfg) else 1
+gpus = ${TOTAL_GPUS}
+pp = gpus
+while pp > 1 and n_layers % pp != 0:
+    pp -= 1
+print(pp)
+" 2>/dev/null || echo ${TOTAL_GPUS})
+echo "=== Exporting ${MLM_MODEL_CFG} ${QUANT_CFG} (PP=${EXPORT_PP}, ${TOTAL_GPUS} GPUs) ==="
 export MLM_EXTRA_ARGS=
-TP=1 PP=${TOTAL_GPUS} EP=1 ETP=1 MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${EXPORT_EXE} ${MLM_MODEL_CFG}
+TP=1 PP=${EXPORT_PP} EP=1 ETP=1 MLM_MODEL_CKPT=${MLM_MODEL_SAVE} ${EXPORT_EXE} ${MLM_MODEL_CFG}
 ls ${EXPORT_DIR}
 cat ${EXPORT_DIR}/hf_quant_config.json
```
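The inline `python3 -c` snippet added above can be reproduced as a standalone function. This sketch mirrors its loop and reproduces the divisor values listed in the test plan (36 layers on 8, 4, and 1 GPUs):

```python
def largest_valid_pp(n_layers: int, gpus: int) -> int:
    """Largest PP <= gpus that evenly divides the layer count.

    Mirrors the inline python3 snippet in quantize.sh: start from the
    full GPU count and count down until a divisor is found. PP=1 always
    divides n_layers, so the loop always terminates with a valid value.
    """
    pp = gpus
    while pp > 1 and n_layers % pp != 0:
        pp -= 1
    return pp

# Qwen3-8B has 36 hidden layers; 36 is not divisible by 8
print(largest_valid_pp(36, 8))  # 6
print(largest_valid_pp(36, 4))  # 4
print(largest_valid_pp(36, 1))  # 1
```

With the old `PP=${TOTAL_GPUS}` behavior, the 8-GPU case attempted PP=8 against 36 layers and hit the divisibility `AssertionError`; the search above lands on PP=6 instead.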

tools/launcher/examples/Qwen/Qwen3-8B/megatron_lm_ptq.yaml

Lines changed: 8 additions & 8 deletions
```diff
@@ -24,7 +24,7 @@ pipeline:
     config:
       model: Qwen/Qwen3-8B
       quant_cfg: NVFP4_DEFAULT_CFG
-      tp: 8
+      tp: 1
       calib_dataset: abisee/cnn_dailymail
       calib_size: 32
       mmlu_dataset: cais/mmlu
@@ -33,15 +33,15 @@ pipeline:
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 1
-      ntasks_per_node: 8
-      gpus_per_node: 8
+      ntasks_per_node: 1
+      gpus_per_node: 1

   task_1:
     _target_: common.megatron_lm.quantize.task.MegatronLMQuantizeTask
     config:
       model: Qwen/Qwen3-8B
       quant_cfg: FP8_DEFAULT_CFG
-      tp: 8
+      tp: 1
       calib_dataset: abisee/cnn_dailymail
       calib_size: 32
       mmlu_dataset: cais/mmlu
@@ -50,18 +50,18 @@ pipeline:
     slurm_config:
      _factory_: "slurm_factory"
       nodes: 1
-      ntasks_per_node: 8
-      gpus_per_node: 8
+      ntasks_per_node: 1
+      gpus_per_node: 1

   # Step 3: TRT-LLM eval MMLU on all exported checkpoints
   task_2:
     script: common/tensorrt_llm/eval.sh
     environment:
       - HF_MODEL_CKPT: /scratchspace/export
-      - TP: "8"
+      - TP: "1"
       - EP: "1"
     slurm_config:
       _factory_: "slurm_factory"
       nodes: 1
       ntasks_per_node: 1
-      gpus_per_node: 8
+      gpus_per_node: 1
```
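The YAML change switches every task from 8-way to single-GPU execution, so `tp`, `ntasks_per_node`, and `gpus_per_node` must stay mutually consistent. A hedged sanity-check sketch over dicts mirroring the per-task resource fields (the check itself is hypothetical, not part of the launcher):

```python
# Dicts mirroring the resource fields of the three tasks after this commit
tasks = {
    "task_0": {"tp": 1, "ntasks_per_node": 1, "gpus_per_node": 1},
    "task_1": {"tp": 1, "ntasks_per_node": 1, "gpus_per_node": 1},
    "task_2": {"tp": 1, "ntasks_per_node": 1, "gpus_per_node": 1},
}

for name, t in tasks.items():
    # TP cannot exceed the GPUs allocated to the task on its node
    assert t["tp"] <= t["gpus_per_node"], name
    # one rank per GPU for the Megatron tasks
    assert t["ntasks_per_node"] <= t["gpus_per_node"], name

print("single-GPU resource settings consistent")  # single-GPU resource settings consistent
```

On an L40 node this keeps each PTQ task on one GPU; raising `tp` without also raising `gpus_per_node` would trip the first assertion.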
