All commands expect the following environment variable for common arguments:
export ARGS="--train-file=data/tiny-stories-qwen/train-*.bin --eval-file=data/tiny-stories-qwen/eval.bin \
--ckpt-interval=10000 --from-scratch --seq-length=1024 --model-dtype=bf16 --opt-m-dtype=bf16 \
--opt-v-dtype=bf16 --gpus=1 --use-cuda-graphs"

./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=16
# [T] step 0 [ 0.1%] | time: 3558 ms | norm 14.196782 | loss 12.118944 | tps 147k | sol 30.1%
# [T] step 1 [ 0.2%] | time: 3185 ms | norm 16.201508 | loss 11.496161 | tps 164k | sol 33.6%
# [T] step 2 [ 0.3%] | time: 3178 ms | norm 12.644269 | loss 10.924606 | tps 164k | sol 33.7%
# [T] step 3 [ 0.5%] | time: 3178 ms | norm 9.579600 | loss 10.513889 | tps 164k | sol 33.7%
# [T] step 4 [ 0.6%] | time: 3196 ms | norm 8.089901 | loss 10.235482 | tps 164k | sol 33.5%
# [T] step 5 [ 0.7%] | time: 3185 ms | norm 6.754247 | loss 10.023415 | tps 164k | sol 33.6%

./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=16
# [T] step 0 [ 0.1%] | time: 3846 ms | norm 14.356340 | loss 12.121202 | tps 136k | sol 42.6%
# [T] step 1 [ 0.2%] | time: 3493 ms | norm 16.444948 | loss 11.495819 | tps 150k | sol 46.9%
# [T] step 2 [ 0.3%] | time: 3492 ms | norm 12.627003 | loss 10.924049 | tps 150k | sol 46.9%
# [T] step 3 [ 0.5%] | time: 3504 ms | norm 9.668566 | loss 10.513186 | tps 149k | sol 46.8%
# [T] step 4 [ 0.6%] | time: 3508 ms | norm 8.125627 | loss 10.232485 | tps 149k | sol 46.7%
# [T] step 5 [ 0.7%] | time: 3508 ms | norm 6.793734 | loss 10.017923 | tps 149k | sol 46.7%

./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=16
# [T] step 0 [ 0.1%] | time: 7601 ms | norm 15.986144 | loss 12.242810 | tps 68976 | sol 39.0%
# [T] step 1 [ 0.2%] | time: 7188 ms | norm 20.733370 | loss 11.048758 | tps 72939 | sol 41.2%
# [T] step 2 [ 0.3%] | time: 7182 ms | norm 12.358917 | loss 9.907642 | tps 73000 | sol 41.3%
# [T] step 3 [ 0.5%] | time: 7177 ms | norm 11.513764 | loss 9.301802 | tps 73051 | sol 41.3%
# [T] step 4 [ 0.6%] | time: 7183 ms | norm 12.643443 | loss 8.985088 | tps 72990 | sol 41.3%
# [T] step 5 [ 0.7%] | time: 7204 ms | norm 10.341788 | loss 8.699650 | tps 72777 | sol 41.1%

./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=16 --lmhead-chunks=4
# [T] step 0 [ 0.1%] | time: 9400 ms | norm 16.093142 | loss 12.243517 | tps 55775 | sol 53.7%
# [T] step 1 [ 0.2%] | time: 8898 ms | norm 20.712574 | loss 11.035850 | tps 58922 | sol 56.7%
# [T] step 2 [ 0.3%] | time: 8906 ms | norm 12.315584 | loss 9.896231 | tps 58869 | sol 56.7%
# [T] step 3 [ 0.5%] | time: 8905 ms | norm 11.510052 | loss 9.289831 | tps 58875 | sol 56.7%
# [T] step 4 [ 0.6%] | time: 8914 ms | norm 12.428946 | loss 8.962239 | tps 58816 | sol 56.6%
# [T] step 5 [ 0.7%] | time: 8930 ms | norm 11.770356 | loss 8.688566 | tps 58710 | sol 56.5%
./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=32
# [T] step 0 [ 0.1%] | time: 13444 ms | norm 17.093300 | loss 12.364218 | tps 38997 | sol 41.9%
# [T] step 1 [ 0.2%] | time: 13035 ms | norm 29.931534 | loss 10.841572 | tps 40221 | sol 43.3%
# [T] step 2 [ 0.3%] | time: 13032 ms | norm 150.115555 | loss 10.673299 | tps 40230 | sol 43.3%
# [T] step 3 [ 0.5%] | time: 12984 ms | norm 70.059677 | loss 9.603341 | tps 40379 | sol 43.4%
# [T] step 4 [ 0.6%] | time: 12980 ms | norm 38.186043 | loss 9.463447 | tps 40391 | sol 43.4%
# [T] step 5 [ 0.7%] | time: 12954 ms | norm 22.843899 | loss 9.419279 | tps 40473 | sol 43.5%

./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=bf16 --batch-size=16 --grad-accumulation=32
# [T] step 0 [ 0.1%] | time: 17367 ms | norm 17.293119 | loss 12.366402 | tps 30188 | sol 57.9%
# [T] step 1 [ 0.2%] | time: 16981 ms | norm 29.885237 | loss 10.834126 | tps 30874 | sol 59.2%
# [T] step 2 [ 0.3%] | time: 16978 ms | norm 148.822845 | loss 10.651226 | tps 30880 | sol 59.2%
# [T] step 3 [ 0.5%] | time: 16958 ms | norm 70.211777 | loss 9.592414 | tps 30916 | sol 59.3%
# [T] step 4 [ 0.6%] | time: 16959 ms | norm 38.043625 | loss 9.457190 | tps 30915 | sol 59.3%
# [T] step 5 [ 0.7%] | time: 16935 ms | norm 22.941721 | loss 9.407406 | tps 30958 | sol 59.3%

./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=32 --lmhead-chunks=4 --attn-bwd-chunks=4 --recompute-ffn --recompute-norm --offload-residual
# [T] step 0 [ 0.1%] | time: 27740 ms | norm 12.232619 | loss 12.650684 | tps 18900 | sol 44.8%
# [T] step 1 [ 0.2%] | time: 27419 ms | norm 23.970106 | loss 10.778204 | tps 19121 | sol 45.3%
# [T] step 2 [ 0.3%] | time: 27426 ms | norm 196.771927 | loss 15.770700 | tps 19116 | sol 45.3%
# [T] step 3 [ 0.5%] | time: 27380 ms | norm 147.629486 | loss 15.106022 | tps 19148 | sol 45.4%
# [T] step 4 [ 0.6%] | time: 27340 ms | norm 130.252472 | loss 15.071719 | tps 19176 | sol 45.5%
# [T] step 5 [ 0.7%] | time: 27314 ms | norm 106.251984 | loss 13.023262 | tps 19194 | sol 45.5%

./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=bf16 --batch-size=4 --grad-accumulation=128 --lmhead-chunks=2 --attn-bwd-chunks=2
# [T] step 0 [ 0.1%] | time: 38381 ms | norm 12.291814 | loss 12.650324 | tps 13660 | sol 59.4%
# [T] step 1 [ 0.2%] | time: 38105 ms | norm 23.904566 | loss 10.767546 | tps 13759 | sol 59.8%
# [T] step 2 [ 0.3%] | time: 37961 ms | norm 198.835052 | loss 15.780600 | tps 13811 | sol 60.1%
# [T] step 3 [ 0.5%] | time: 37942 ms | norm 146.817291 | loss 15.095704 | tps 13818 | sol 60.1%
# [T] step 4 [ 0.6%] | time: 37854 ms | norm 133.388214 | loss 15.087540 | tps 13850 | sol 60.2%
# [T] step 5 [ 0.7%] | time: 37817 ms | norm 114.014503 | loss 13.045321 | tps 13863 | sol 60.3%

On the RTX Pro 6000, it appears that --use-zero-copy is slower than memcpy-based transfers.
./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=32 \
--lmhead-chunks=2 --offload-opt-m --offload-opt-v --offload-master --recompute-ffn --recompute-norm --offload-residual
# [T] step 0 [ 0.1%] | time: 57435 ms | norm 17.859472 | loss 12.969749 | tps 9128 | sol 42.3%
# [T] step 1 [ 0.2%] | time: 55994 ms | norm 95.178490 | loss 11.136398 | tps 9363 | sol 43.4%
# [T] step 2 [ 0.3%] | time: 55993 ms | norm 46.467678 | loss 12.601131 | tps 9363 | sol 43.4%
# [T] step 3 [ 0.5%] | time: 55976 ms | norm 216.350403 | loss 16.645550 | tps 9366 | sol 43.4%
# [T] step 4 [ 0.6%] | time: 55952 ms | norm 174.723923 | loss 17.085625 | tps 9370 | sol 43.4%
# [T] step 5 [ 0.7%] | time: 55986 ms | norm 209.725906 | loss 14.991776 | tps 9364 | sol 43.4%

./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=bf16 --batch-size=8 --grad-accumulation=64 \
--lmhead-chunks=2 --attn-bwd-chunks=2 --offload-opt-m --offload-opt-v --offload-master --recompute-ffn --offload-residual
# [T] step 0 [ 0.1%] | time: 83169 ms | norm 17.844570 | loss 12.968641 | tps 6303 | sol 54.4%
# [T] step 1 [ 0.2%] | time: 82819 ms | norm 96.651398 | loss 11.131461 | tps 6330 | sol 54.7%
# [T] step 2 [ 0.3%] | time: 82835 ms | norm 46.290871 | loss 12.561548 | tps 6329 | sol 54.7%
# [T] step 3 [ 0.5%] | time: 83018 ms | norm 215.638519 | loss 16.763704 | tps 6315 | sol 54.5%
# [T] step 4 [ 0.6%] | time: 82745 ms | norm 177.452179 | loss 17.235472 | tps 6336 | sol 54.7%
# [T] step 5 [ 0.7%] | time: 82939 ms | norm 211.570663 | loss 15.318270 | tps 6321 | sol 54.6%

Here are some useful commands for profiling several kernels in the 0.5B model:
export LAUNCH="--set full --launch-count 1 --import-source yes \"./build/train\" ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=16 --steps=2"
ncu --kernel-name rmsnorm_forward_unified_kernel --launch-skip 49 --export "rmsnorm-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name rope_kernel --launch-skip 48 --export "rope-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name swiglu_forward_persistent_kernel --launch-skip 24 --export "swiglu-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name rmsnorm_forward_unified_kernel --launch-skip 392 --export "rmsnorm-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1640 --export "quantize-rms-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name rope_kernel --launch-skip 48 --export "rope-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name reduce_abs_max_kernel --launch-skip 360 --export "absmax-att-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name swiglu_forward_persistent_kernel --launch-skip 192 --export "swiglu-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1643 --export "quantize-swiglu-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name fused_classifier_kernel5 --launch-skip 8 --export "fused-classifier.ncu-rep" ${LAUNCH}
ncu --kernel-name rmsnorm_backward_kernel10 --launch-skip 392 --export "rmsnorm-bwd.ncu-rep" ${LAUNCH}
ncu --kernel-name transpose_kernel --launch-skip 2113 --export "tp-swiglu-bwd.ncu-rep" ${LAUNCH}
ncu --kernel-name swiglu_backward_kernel1 --launch-skip 192 --export "swiglu-bwd.ncu-rep" ${LAUNCH}
ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1825 --export "quantize-swiglu-bwd.ncu-rep" ${LAUNCH}