All commands expect the following environment variable for common arguments:
export ARGS="--train-file=data/tiny-stories-qwen/train-*.bin --eval-file=data/tiny-stories-qwen/eval.bin \
--ckpt-interval=10000 --from-scratch --seq-length=1024 --model-dtype=bf16 --opt-m-dtype=bf16 \
--opt-v-dtype=bf16 --gpus=4 --use-cuda-graphs"

./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=4
# [T] step 0 [ 0.1%] | time: 1654 ms | norm 15.237247 | loss 12.103670 | tps 316k | sol 31.7%
# [T] step 1 [ 0.2%] | time: 1395 ms | norm 17.229757 | loss 11.432494 | tps 375k | sol 37.6%
# [T] step 2 [ 0.3%] | time: 1380 ms | norm 12.447560 | loss 10.850240 | tps 379k | sol 38.0%
# [T] step 3 [ 0.5%] | time: 1378 ms | norm 9.318286 | loss 10.456961 | tps 380k | sol 38.1%
# [T] step 4 [ 0.6%] | time: 1378 ms | norm 7.623181 | loss 10.193689 | tps 380k | sol 38.1%
# [T] step 5 [ 0.7%] | time: 1378 ms | norm 6.494474 | loss 9.994730 | tps 380k | sol 38.1%

./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=4
# [T] step 0 [ 0.1%] | time: 1805 ms | norm 15.185236 | loss 12.105891 | tps 290k | sol 44.6%
# [T] step 1 [ 0.2%] | time: 1532 ms | norm 17.148014 | loss 11.431538 | tps 342k | sol 52.5%
# [T] step 2 [ 0.3%] | time: 1536 ms | norm 12.489378 | loss 10.849688 | tps 341k | sol 52.4%
# [T] step 3 [ 0.5%] | time: 1536 ms | norm 9.419200 | loss 10.455972 | tps 341k | sol 52.4%
# [T] step 4 [ 0.6%] | time: 1535 ms | norm 7.677828 | loss 10.190819 | tps 341k | sol 52.4%
# [T] step 5 [ 0.7%] | time: 1541 ms | norm 6.511380 | loss 9.990303 | tps 340k | sol 52.2%

./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=4
# [T] step 0 [ 0.1%] | time: 3518 ms | norm 15.668739 | loss 12.246813 | tps 149k | sol 41.4%
# [T] step 1 [ 0.2%] | time: 3269 ms | norm 21.313473 | loss 11.063199 | tps 160k | sol 44.5%
# [T] step 2 [ 0.3%] | time: 3270 ms | norm 13.376834 | loss 9.935160 | tps 160k | sol 44.5%
# [T] step 3 [ 0.5%] | time: 3275 ms | norm 13.404247 | loss 9.339596 | tps 160k | sol 44.4%
# [T] step 4 [ 0.6%] | time: 3280 ms | norm 19.330711 | loss 9.021979 | tps 159k | sol 44.4%
# [T] step 5 [ 0.7%] | time: 3286 ms | norm 12.970963 | loss 8.851159 | tps 159k | sol 44.3%

./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=4
# [T] step 0 [ 0.1%] | time: 4370 ms | norm 15.852186 | loss 12.247669 | tps 119k | sol 56.7%
# [T] step 1 [ 0.2%] | time: 4051 ms | norm 21.191936 | loss 11.055208 | tps 129k | sol 61.1%
# [T] step 2 [ 0.3%] | time: 4059 ms | norm 13.463421 | loss 9.928066 | tps 129k | sol 61.0%
# [T] step 3 [ 0.5%] | time: 4062 ms | norm 13.632689 | loss 9.334002 | tps 129k | sol 61.0%
# [T] step 4 [ 0.6%] | time: 4066 ms | norm 18.258806 | loss 9.002836 | tps 128k | sol 60.9%
# [T] step 5 [ 0.7%] | time: 4069 ms | norm 12.641823 | loss 8.826939 | tps 128k | sol 60.9%

./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=8
# [T] step 0 [ 0.1%] | time: 5952 ms | norm 17.298082 | loss 12.333839 | tps 88086 | sol 46.5%
# [T] step 1 [ 0.2%] | time: 5712 ms | norm 29.288118 | loss 10.681979 | tps 91787 | sol 48.5%
# [T] step 2 [ 0.3%] | time: 5716 ms | norm 183.139511 | loss 11.617577 | tps 91722 | sol 48.4%
# [T] step 3 [ 0.5%] | time: 5721 ms | norm 47.218239 | loss 9.645790 | tps 91642 | sol 48.4%
# [T] step 4 [ 0.6%] | time: 5723 ms | norm 46.932884 | loss 9.655010 | tps 91610 | sol 48.4%
# [T] step 5 [ 0.7%] | time: 5725 ms | norm 22.928219 | loss 9.611375 | tps 91578 | sol 48.3%

./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=bf16 --batch-size=16 --grad-accumulation=8
# [T] step 0 [ 0.1%] | time: 7875 ms | norm 17.444851 | loss 12.334220 | tps 66576 | sol 62.7%
# [T] step 1 [ 0.2%] | time: 7649 ms | norm 29.357367 | loss 10.678251 | tps 68543 | sol 64.5%
# [T] step 2 [ 0.3%] | time: 7661 ms | norm 181.318436 | loss 11.542382 | tps 68435 | sol 64.4%
# [T] step 3 [ 0.5%] | time: 7666 ms | norm 46.621365 | loss 9.616313 | tps 68391 | sol 64.4%
# [T] step 4 [ 0.6%] | time: 7671 ms | norm 45.573971 | loss 9.619636 | tps 68346 | sol 64.3%
# [T] step 5 [ 0.7%] | time: 7678 ms | norm 23.592392 | loss 9.577412 | tps 68284 | sol 64.3%

./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=8 \
--lmhead-chunks=4 --attn-bwd-chunks=4
# [T] step 0 [ 0.1%] | time: 11522 ms | norm 12.745495 | loss 12.640880 | tps 45503 | sol 53.0%
# [T] step 1 [ 0.2%] | time: 11310 ms | norm 24.054037 | loss 10.666334 | tps 46356 | sol 54.0%
# [T] step 2 [ 0.3%] | time: 11324 ms | norm 192.126587 | loss 15.055107 | tps 46298 | sol 53.9%
# [T] step 3 [ 0.5%] | time: 11322 ms | norm 185.878799 | loss 16.300306 | tps 46307 | sol 53.9%
# [T] step 4 [ 0.6%] | time: 11320 ms | norm 133.388321 | loss 16.287636 | tps 46315 | sol 53.9%
# [T] step 5 [ 0.7%] | time: 11321 ms | norm 106.739357 | loss 13.215859 | tps 46311 | sol 53.9%

./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=bf16 --batch-size=8 --grad-accumulation=16
# [T] step 0 [ 0.1%] | time: 16606 ms | norm 12.805418 | loss 12.639004 | tps 31572 | sol 67.4%
# [T] step 1 [ 0.2%] | time: 16378 ms | norm 24.150591 | loss 10.664420 | tps 32011 | sol 68.4%
# [T] step 2 [ 0.3%] | time: 16397 ms | norm 194.400818 | loss 15.176842 | tps 31974 | sol 68.3%
# [T] step 3 [ 0.5%] | time: 16392 ms | norm 188.018234 | loss 16.306389 | tps 31984 | sol 68.3%
# [T] step 4 [ 0.6%] | time: 16391 ms | norm 133.500732 | loss 16.235382 | tps 31986 | sol 68.3%
# [T] step 5 [ 0.7%] | time: 16389 ms | norm 107.259521 | loss 13.117952 | tps 31990 | sol 68.3%

On the RTX Pro 6000, it appears that --use-zero-copy is slower than memcpy-based transfers.

./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=e4m3 --batch-size=8 \
--grad-accumulation=16 --lmhead-chunks=2 --offload-opt-m --offload-opt-v --offload-residual
# [T] step 0 [ 0.1%] | time: 22197 ms | norm 17.410797 | loss 12.957819 | tps 23619 | sol 53.7%
# [T] step 1 [ 0.2%] | time: 22010 ms | norm 41.930729 | loss 10.856384 | tps 23820 | sol 54.1%
# [T] step 2 [ 0.3%] | time: 22046 ms | norm 40.773552 | loss 11.950643 | tps 23781 | sol 54.1%
# [T] step 3 [ 0.5%] | time: 22075 ms | norm 195.725052 | loss 21.508512 | tps 23750 | sol 54.0%
# [T] step 4 [ 0.6%] | time: 22065 ms | norm 163.850677 | loss 22.708485 | tps 23761 | sol 54.0%
# [T] step 5 [ 0.7%] | time: 22056 ms | norm 178.926910 | loss 23.133272 | tps 23770 | sol 54.0%

./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=bf16 --batch-size=4 --grad-accumulation=32 \
--lmhead-chunks=2 --offload-opt-m
# [T] step 0 [ 0.1%] | time: 33136 ms | norm 17.428005 | loss 12.957518 | tps 15822 | sol 67.1%
# [T] step 1 [ 0.2%] | time: 33025 ms | norm 39.298843 | loss 10.838183 | tps 15875 | sol 67.3%
# [T] step 2 [ 0.3%] | time: 33050 ms | norm 39.551338 | loss 11.834833 | tps 15863 | sol 67.3%
# [T] step 3 [ 0.5%] | time: 33078 ms | norm 192.808502 | loss 21.943832 | tps 15850 | sol 67.2%
# [T] step 4 [ 0.6%] | time: 33052 ms | norm 155.747375 | loss 22.839943 | tps 15862 | sol 67.3%
# [T] step 5 [ 0.7%] | time: 33043 ms | norm 170.201584 | loss 23.217522 | tps 15866 | sol 67.3%

./build/train ${ARGS} --model=Qwen2.5-32B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=8 \
--lmhead-chunks=2 --offload-opt-m --offload-opt-v --offload-master --recompute-block --shard-weights --persistent-quants
# [T] step 0 [ 0.1%] | time: 58426 ms | norm 18.507336 | loss 12.957186 | tps 8973 | sol 44.7%
# [T] step 1 [ 0.2%] | time: 58039 ms | norm 23.720108 | loss 10.594598 | tps 9033 | sol 45.0%
# [T] step 2 [ 0.3%] | time: 58059 ms | norm 203.682953 | loss 23.335171 | tps 9030 | sol 44.9%
# [T] step 3 [ 0.5%] | time: 58021 ms | norm 250.889709 | loss 24.865129 | tps 9036 | sol 45.0%
# [T] step 4 [ 0.6%] | time: 58019 ms | norm 212.280212 | loss 24.015837 | tps 9036 | sol 45.0%
# [T] step 5 [ 0.7%] | time: 57979 ms | norm 253.355835 | loss 21.329235 | tps 9042 | sol 45.0%

./build/train ${ARGS} --model=Qwen2.5-32B --matmul-dtype=bf16 --batch-size=16 --grad-accumulation=8 \
--lmhead-chunks=2 --offload-opt-m --offload-opt-v --offload-master --recompute-block --shard-weights
# [T] step 0 [ 0.1%] | time: 89739 ms | norm 18.528318 | loss 12.959810 | tps 5842 | sol 56.2%
# [T] step 1 [ 0.2%] | time: 89446 ms | norm 23.803040 | loss 10.587227 | tps 5861 | sol 56.4%
# [T] step 2 [ 0.3%] | time: 89430 ms | norm 203.019669 | loss 24.327652 | tps 5862 | sol 56.4%
# [T] step 3 [ 0.5%] | time: 89360 ms | norm 267.475616 | loss 26.094778 | tps 5867 | sol 56.5%
# [T] step 4 [ 0.6%] | time: 89314 ms | norm 202.022141 | loss 25.178646 | tps 5870 | sol 56.5%
# [T] step 5 [ 0.7%] | time: 89295 ms | norm 156.916290 | loss 22.510256 | tps 5871 | sol 56.5%