All commands expect the following environment variable for common arguments:
export ARGS="--train-file=data/tiny-stories-qwen/train-*.bin --eval-file=data/tiny-stories-qwen/eval.bin \
--ckpt-interval=10000 --from-scratch --seq-length=1024 --model-dtype=bf16 --opt-m-dtype=bf16 \
--opt-v-dtype=bf16 --gpus=1 --use-cuda-graphs"

./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=16
# [T] step 0 [ 0.1%] | time: 3558 ms | norm 14.196782 | loss 12.118944 | tps 147k | sol 30.1%
# [T] step 1 [ 0.2%] | time: 3185 ms | norm 16.201508 | loss 11.496161 | tps 164k | sol 33.6%
# [T] step 2 [ 0.3%] | time: 3178 ms | norm 12.644269 | loss 10.924606 | tps 164k | sol 33.7%
# [T] step 3 [ 0.5%] | time: 3178 ms | norm 9.579600 | loss 10.513889 | tps 164k | sol 33.7%
# [T] step 4 [ 0.6%] | time: 3196 ms | norm 8.089901 | loss 10.235482 | tps 164k | sol 33.5%
# [T] step 5 [ 0.7%] | time: 3185 ms | norm 6.754247 | loss 10.023415 | tps 164k | sol 33.6%

./build/train ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=16
# [T] step 0 [ 0.1%] | time: 3846 ms | norm 14.356340 | loss 12.121202 | tps 136k | sol 42.6%
# [T] step 1 [ 0.2%] | time: 3493 ms | norm 16.444948 | loss 11.495819 | tps 150k | sol 46.9%
# [T] step 2 [ 0.3%] | time: 3492 ms | norm 12.627003 | loss 10.924049 | tps 150k | sol 46.9%
# [T] step 3 [ 0.5%] | time: 3504 ms | norm 9.668566 | loss 10.513186 | tps 149k | sol 46.8%
# [T] step 4 [ 0.6%] | time: 3508 ms | norm 8.125627 | loss 10.232485 | tps 149k | sol 46.7%
# [T] step 5 [ 0.7%] | time: 3508 ms | norm 6.793734 | loss 10.017923 | tps 149k | sol 46.7%

./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=16
# [T] step 0 [ 0.1%] | time: 7601 ms | norm 15.986144 | loss 12.242810 | tps 68976 | sol 39.0%
# [T] step 1 [ 0.2%] | time: 7188 ms | norm 20.733370 | loss 11.048758 | tps 72939 | sol 41.2%
# [T] step 2 [ 0.3%] | time: 7182 ms | norm 12.358917 | loss 9.907642 | tps 73000 | sol 41.3%
# [T] step 3 [ 0.5%] | time: 7177 ms | norm 11.513764 | loss 9.301802 | tps 73051 | sol 41.3%
# [T] step 4 [ 0.6%] | time: 7183 ms | norm 12.643443 | loss 8.985088 | tps 72990 | sol 41.3%
# [T] step 5 [ 0.7%] | time: 7204 ms | norm 10.341788 | loss 8.699650 | tps 72777 | sol 41.1%

./build/train ${ARGS} --model=Qwen2.5-1.5B --matmul-dtype=bf16 --batch-size=32 --grad-accumulation=16 --lmhead-chunks=4
# [T] step 0 [ 0.1%] | time: 9400 ms | norm 16.093142 | loss 12.243517 | tps 55775 | sol 53.7%
# [T] step 1 [ 0.2%] | time: 8898 ms | norm 20.712574 | loss 11.035850 | tps 58922 | sol 56.7%
# [T] step 2 [ 0.3%] | time: 8906 ms | norm 12.315584 | loss 9.896231 | tps 58869 | sol 56.7%
# [T] step 3 [ 0.5%] | time: 8905 ms | norm 11.510052 | loss 9.289831 | tps 58875 | sol 56.7%
# [T] step 4 [ 0.6%] | time: 8914 ms | norm 12.428946 | loss 8.962239 | tps 58816 | sol 56.6%
# [T] step 5 [ 0.7%] | time: 8930 ms | norm 11.770356 | loss 8.688566 | tps 58710 | sol 56.5%
./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=32
# [T] step 0 [ 0.1%] | time: 13444 ms | norm 17.093300 | loss 12.364218 | tps 38997 | sol 41.9%
# [T] step 1 [ 0.2%] | time: 13035 ms | norm 29.931534 | loss 10.841572 | tps 40221 | sol 43.3%
# [T] step 2 [ 0.3%] | time: 13032 ms | norm 150.115555 | loss 10.673299 | tps 40230 | sol 43.3%
# [T] step 3 [ 0.5%] | time: 12984 ms | norm 70.059677 | loss 9.603341 | tps 40379 | sol 43.4%
# [T] step 4 [ 0.6%] | time: 12980 ms | norm 38.186043 | loss 9.463447 | tps 40391 | sol 43.4%
# [T] step 5 [ 0.7%] | time: 12954 ms | norm 22.843899 | loss 9.419279 | tps 40473 | sol 43.5%

./build/train ${ARGS} --model=Qwen2.5-3B --matmul-dtype=bf16 --batch-size=16 --grad-accumulation=32
# [T] step 0 [ 0.1%] | time: 17367 ms | norm 17.293119 | loss 12.366402 | tps 30188 | sol 57.9%
# [T] step 1 [ 0.2%] | time: 16981 ms | norm 29.885237 | loss 10.834126 | tps 30874 | sol 59.2%
# [T] step 2 [ 0.3%] | time: 16978 ms | norm 148.822845 | loss 10.651226 | tps 30880 | sol 59.2%
# [T] step 3 [ 0.5%] | time: 16958 ms | norm 70.211777 | loss 9.592414 | tps 30916 | sol 59.3%
# [T] step 4 [ 0.6%] | time: 16959 ms | norm 38.043625 | loss 9.457190 | tps 30915 | sol 59.3%
# [T] step 5 [ 0.7%] | time: 16935 ms | norm 22.941721 | loss 9.407406 | tps 30958 | sol 59.3%

./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=32 --lmhead-chunks=4 --attn-bwd-chunks=4 --recompute-ffn --recompute-norm --offload-residual
# [T] step 0 [ 0.1%] | time: 27740 ms | norm 12.232619 | loss 12.650684 | tps 18900 | sol 44.8%
# [T] step 1 [ 0.2%] | time: 27419 ms | norm 23.970106 | loss 10.778204 | tps 19121 | sol 45.3%
# [T] step 2 [ 0.3%] | time: 27426 ms | norm 196.771927 | loss 15.770700 | tps 19116 | sol 45.3%
# [T] step 3 [ 0.5%] | time: 27380 ms | norm 147.629486 | loss 15.106022 | tps 19148 | sol 45.4%
# [T] step 4 [ 0.6%] | time: 27340 ms | norm 130.252472 | loss 15.071719 | tps 19176 | sol 45.5%
# [T] step 5 [ 0.7%] | time: 27314 ms | norm 106.251984 | loss 13.023262 | tps 19194 | sol 45.5%

./build/train ${ARGS} --model=Qwen2.5-7B --matmul-dtype=bf16 --batch-size=4 --grad-accumulation=128 --lmhead-chunks=2 --attn-bwd-chunks=2
# [T] step 0 [ 0.1%] | time: 38381 ms | norm 12.291814 | loss 12.650324 | tps 13660 | sol 59.4%
# [T] step 1 [ 0.2%] | time: 38105 ms | norm 23.904566 | loss 10.767546 | tps 13759 | sol 59.8%
# [T] step 2 [ 0.3%] | time: 37961 ms | norm 198.835052 | loss 15.780600 | tps 13811 | sol 60.1%
# [T] step 3 [ 0.5%] | time: 37942 ms | norm 146.817291 | loss 15.095704 | tps 13818 | sol 60.1%
# [T] step 4 [ 0.6%] | time: 37854 ms | norm 133.388214 | loss 15.087540 | tps 13850 | sol 60.2%
# [T] step 5 [ 0.7%] | time: 37817 ms | norm 114.014503 | loss 13.045321 | tps 13863 | sol 60.3%

On the RTX Pro 6000, it appears that --use-zero-copy is slower than memcpy-based transfers.
./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=e4m3 --batch-size=16 --grad-accumulation=32 \
--lmhead-chunks=2 --offload-opt-m --offload-opt-v --offload-master --recompute-ffn --recompute-norm --offload-residual
# [T] step 0 [ 0.1%] | time: 57435 ms | norm 17.859472 | loss 12.969749 | tps 9128 | sol 42.3%
# [T] step 1 [ 0.2%] | time: 55994 ms | norm 95.178490 | loss 11.136398 | tps 9363 | sol 43.4%
# [T] step 2 [ 0.3%] | time: 55993 ms | norm 46.467678 | loss 12.601131 | tps 9363 | sol 43.4%
# [T] step 3 [ 0.5%] | time: 55976 ms | norm 216.350403 | loss 16.645550 | tps 9366 | sol 43.4%
# [T] step 4 [ 0.6%] | time: 55952 ms | norm 174.723923 | loss 17.085625 | tps 9370 | sol 43.4%
# [T] step 5 [ 0.7%] | time: 55986 ms | norm 209.725906 | loss 14.991776 | tps 9364 | sol 43.4%

./build/train ${ARGS} --model=Qwen2.5-14B --matmul-dtype=bf16 --batch-size=8 --grad-accumulation=64 \
--lmhead-chunks=2 --attn-bwd-chunks=2 --offload-opt-m --offload-opt-v --offload-master --recompute-ffn --offload-residual
# [T] step 0 [ 0.1%] | time: 83169 ms | norm 17.844570 | loss 12.968641 | tps 6303 | sol 54.4%
# [T] step 1 [ 0.2%] | time: 82819 ms | norm 96.651398 | loss 11.131461 | tps 6330 | sol 54.7%
# [T] step 2 [ 0.3%] | time: 82835 ms | norm 46.290871 | loss 12.561548 | tps 6329 | sol 54.7%
# [T] step 3 [ 0.5%] | time: 83018 ms | norm 215.638519 | loss 16.763704 | tps 6315 | sol 54.5%
# [T] step 4 [ 0.6%] | time: 82745 ms | norm 177.452179 | loss 17.235472 | tps 6336 | sol 54.7%
# [T] step 5 [ 0.7%] | time: 82939 ms | norm 211.570663 | loss 15.318270 | tps 6321 | sol 54.6%

Here are some useful commands for profiling several kernels in the 0.5B model:
export LAUNCH="--set full --launch-count 1 --import-source yes \"./build/train\" ${ARGS} --model=Qwen2.5-0.5B --matmul-dtype=e4m3 --batch-size=32 --grad-accumulation=16 --steps=2"
ncu --kernel-name rmsnorm_forward_unified_kernel --launch-skip 49 --export "rmsnorm-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name rope_kernel --launch-skip 48 --export "rope-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name swiglu_forward_persistent_kernel --launch-skip 24 --export "swiglu-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name rmsnorm_forward_unified_kernel --launch-skip 392 --export "rmsnorm-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1640 --export "quantize-rms-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name rope_kernel --launch-skip 48 --export "rope-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name reduce_abs_max_kernel --launch-skip 360 --export "absmax-att-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name swiglu_forward_persistent_kernel --launch-skip 192 --export "swiglu-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1643 --export "quantize-swiglu-fwd.ncu-rep" ${LAUNCH}
ncu --kernel-name fused_classifier_kernel5 --launch-skip 8 --export "fused-classifier.ncu-rep" ${LAUNCH}
ncu --kernel-name rmsnorm_backward_kernel10 --launch-skip 392 --export "rmsnorm-bwd.ncu-rep" ${LAUNCH}
ncu --kernel-name transpose_kernel --launch-skip 2113 --export "tp-swiglu-bwd.ncu-rep" ${LAUNCH}
ncu --kernel-name swiglu_backward_kernel1 --launch-skip 192 --export "swiglu-bwd.ncu-rep" ${LAUNCH}
ncu --kernel-name quantize_with_abs_max_kernel --launch-skip 1825 --export "quantize-swiglu-bwd.ncu-rep" ${LAUNCH}