11epoch , framework_config , gradient_accumulation_steps , mem_nvidia_mem_reserved , model_name_or_path , num_gpus , per_device_train_batch_size , torch_dtype , train_loss , train_runtime , train_samples_per_second , train_steps_per_second , train_tokens_per_second
2- 0.25 , none , 16 , 72072 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 0.938093501 , 1986.7714 , 6.443 , 0.05 , 1797.489
3- 0.25 , none , 8 , 49689 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.937983845 , 1082.5484 , 11.824 , 0.092 , 1649.441
4- 0.25 , none , 4 , 41754.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.93852025 , 569.5617 , 22.473 , 0.176 , 1567.521
5- 0.25 , moe-scattermoe-granite-ep1 , 16 , 72068 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 0.938054211 , 660.687 , 19.374 , 0.151 , 5405.283
6- 0.25 , moe-scattermoe-granite-ep1 , 8 , 53917 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.944801819 , 362.751 , 35.286 , 0.276 , 4922.385
7- 0.25 , moe-scattermoe-granite-ep1 , 4 , 53070 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.95192752 , 202.3782 , 63.248 , 0.494 , 4411.543
8- 0.25 , moe-scattermoe-granite-ep2 , 8 , 41880 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.938050581 , 441.5269 , 28.99 , 0.226 , 4044.147
9- 0.25 , moe-scattermoe-granite-ep2 , 4 , 43092 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.945302382 , 235.4383 , 54.367 , 0.425 , 3792.076
10- 0.25 , moe-scattermoe-granite-ep4 , 4 , 33673.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.938171822 , 259.2932 , 49.365 , 0.386 , 3443.207
11- 0.25 , moe-scattermoe-granite-ep1 -padding-free , 16 , 49580 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 0.937993399 , 505.6847 , 25.312 , 0.198 , 4904.241
12- 0.25 , moe-scattermoe-granite-ep1 -padding-free , 8 , 43821 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.944808855 , 311.785 , 41.054 , 0.321 , 3977.099
13- 0.25 , moe-scattermoe-granite-ep1 -padding-free , 4 , 40070.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.951866873 , 169.9554 , 75.314 , 0.588 , 3648.016
14- 0.25 , moe-scattermoe-granite-ep1 -padding-free-foak , 16 , 49114 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 0.938123143 , 476.8099 , 26.845 , 0.21 , 5201.235
15- 0.25 , moe-scattermoe-granite-ep1 -padding-free-foak , 8 , 43865 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.944894351 , 296.5204 , 43.167 , 0.337 , 4181.837
16- 0.25 , moe-scattermoe-granite-ep1 -padding-free-foak , 4 , 40070.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.951975068 , 163.756 , 78.165 , 0.611 , 3786.12
17- 0.25 , moe-scattermoe-granite-ep2-padding-free , 8 , 32276 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.937930156 , 356.1296 , 35.942 , 0.281 , 3481.878
18- 0.25 , moe-scattermoe-granite-ep2-padding-free , 4 , 29787 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.945339936 , 192.7168 , 66.419 , 0.519 , 3217.156
19- 0.25 , moe-scattermoe-granite-ep2-padding-free-foak , 8 , 32376 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.938017525 , 342.9327 , 37.325 , 0.292 , 3615.87
20- 0.25 , moe-scattermoe-granite-ep2-padding-free-foak , 4 , 29734.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.945357794 , 184.554 , 69.356 , 0.542 , 3359.451
21- 0.25 , moe-scattermoe-granite-ep4-padding-free , 4 , 23386.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.938359724 , 191.205 , 66.944 , 0.523 , 3242.593
22- 0.25 , moe-scattermoe-granite-ep4-padding-free-foak , 4 , 23359.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.938333818 , 183.9191 , 69.596 , 0.544 , 3371.048
23- 0.25 , none , 16 , 81018 , ibm-research/moe-7b-1b-active-shared-experts , 1 , 8 , bfloat16 , 0.878051637 , 4223.9158 , 3.03 , 0.024 , 839.411
24- 0.25 , none , 8 , 74462 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.877874975 , 2247.4716 , 5.695 , 0.044 , 788.798
25- 0.25 , none , 4 , 63033 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.878253661 , 1155.5903 , 11.077 , 0.087 , 767.054
26- 0.25 , moe-scattermoe-granite-ep1 , 16 , 81018 , ibm-research/moe-7b-1b-active-shared-experts , 1 , 8 , bfloat16 , 0.878006854 , 907.8407 , 14.099 , 0.11 , 3905.531
27- 0.25 , moe-scattermoe-granite-ep1 , 8 , 73870 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.879557709 , 492.5063 , 25.99 , 0.203 , 3599.548
28- 0.25 , moe-scattermoe-granite-ep1 , 4 , 74108.5 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.881521969 , 277.8191 , 46.073 , 0.36 , 3190.565
29- 0.25 , moe-scattermoe-granite-ep2 , 8 , 54168 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.877982622 , 563.0434 , 22.734 , 0.178 , 3148.603
30- 0.25 , moe-scattermoe-granite-ep2 , 4 , 54582 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.880103117 , 299.2522 , 42.773 , 0.334 , 2962.05
31- 0.25 , moe-scattermoe-granite-ep1-padding-free , 16 , 77632 , ibm-research/moe-7b-1b-active-shared-experts , 1 , 8 , bfloat16 , 0.878018975 , 726.1255 , 17.628 , 0.138 , 3410.98
32- 0.25 , moe-scattermoe-granite-ep1-padding-free , 8 , 68019 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.879643369 , 429.5618 , 29.798 , 0.233 , 2882.938
33- 0.25 , moe-scattermoe-granite-ep1-padding-free , 4 , 63879 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.88148216 , 239.3677 , 53.474 , 0.418 , 2586.815
34- 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 16 , 72666 , ibm-research/moe-7b-1b-active-shared-experts , 1 , 8 , bfloat16 , 0.878073001 , 688.38 , 18.594 , 0.145 , 3598.013
35- 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 8 , 63074 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.879622684 , 419.7876 , 30.492 , 0.238 , 2950.063
36- 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 4 , 60126.5 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.881447418 , 231.7976 , 55.221 , 0.431 , 2671.296
37- 0.25 , moe-scattermoe-granite-ep2-padding-free , 8 , 45093 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.8779908 , 471.1344 , 27.168 , 0.212 , 2628.549
38- 0.25 , moe-scattermoe-granite-ep2-padding-free , 4 , 42590 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.879999972 , 250.48 , 51.102 , 0.399 , 2472.054
39- 0.25 , moe-scattermoe-granite-ep2-padding-free-foak , 8 , 40281 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.878110015 , 461.6668 , 27.726 , 0.217 , 2682.454
40- 0.25 , moe-scattermoe-granite-ep2-padding-free-foak , 4 , 38934.5 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.880085612 , 250.2941 , 51.14 , 0.4 , 2473.889
41- 0.25 , moe-scattermoe-granite-ep8 , 16 , 56845 , mistralai/Mixtral-8x7B-Instruct-v0.1 , 8 , 1 , bfloat16 , 0.86557077 , 779.9315 , 16.412 , 0.128 , 430.807
42- 0.25 , moe-scattermoe-granite-ep8-foak , 16 , 56769 .25, mistralai/Mixtral-8x7B-Instruct-v0.1 , 8 , 1 , bfloat16 , 0.86551428 , 734.0756 , 17.437 , 0.136 , 457.719
2+ 0.25 , none , 16 , 77748 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 0.93802941 , 1830.0024 , 6.995 , 0.055 , 1951.473
3+ 0.25 , none , 8 , 56837 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.937978864 , 970.492 , 13.189 , 0.103 , 1839.891
4+ 0.25 , none , 4 , 47395 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.938433378 , 508.0143 , 25.196 , 0.197 , 1757.431
5+ 0.25 , moe-scattermoe-granite-ep1 , 16 , 78376 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 0.938084066 , 656.3588 , 19.502 , 0.152 , 5440.927
6+ 0.25 , moe-scattermoe-granite-ep2 , 8 , 45422 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 0.938047224 , 439.6446 , 29.114 , 0.227 , 4061.462
7+ 0.25 , moe-scattermoe-granite-ep2 , 4 , 46506 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.945220579 , 234.7146 , 54.534 , 0.426 , 3803.769
8+ 0.25 , moe-scattermoe-granite-ep4 , 4 , 37025.5 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 0.938337043 , 255.5461 , 50.089 , 0.391 , 3493.694
9+ 0.25 , moe-scattermoe-granite-ep1-padding-free , 16 , 49462 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 1.196784412 , 431.9774 , 29.631 , 0.231 , 5741.041
10+ 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 16 , 49060 , ibm-granite/granite-3.0-3b-a800m-instruct , 1 , 8 , bfloat16 , 1.200383433 , 398.976 , 32.082 , 0.251 , 6215.913
11+ 0.25 , moe-scattermoe-granite-ep2 -padding-free , 8 , 32265 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 1.198062455 , 335.7106 , 38.128 , 0.298 , 3693.657
12+ 0.25 , moe-scattermoe-granite-ep2 -padding-free , 4 , 29720 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 1.210442821 , 180.924 , 70.748 , 0.553 , 3426.854
13+ 0.25 , moe-scattermoe-granite-ep2 -padding-free-foak , 8 , 32285 , ibm-granite/granite-3.0-3b-a800m-instruct , 2 , 8 , bfloat16 , 1.199450992 , 320.9043 , 39.887 , 0.312 , 3864.08
14+ 0.25 , moe-scattermoe-granite-ep2 -padding-free-foak , 4 , 29771 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 1.21032447 , 175.5856 , 72.899 , 0.57 , 3531.042
15+ 0.25 , moe-scattermoe-granite-ep4 -padding-free , 4 , 23248 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 1.200576434 , 175.5905 , 72.897 , 0.57 , 3530.942
16+ 0.25 , moe-scattermoe-granite-ep4 -padding-free-foak , 4 , 23422 , ibm-granite/granite-3.0-3b-a800m-instruct , 4 , 8 , bfloat16 , 1.199994416 , 173.1652 , 73.918 , 0.577 , 3580.397
17+ 0.25 , none , 16 , 78704 , ibm-research/moe-7b-1b-active-shared-experts , 1 , 8 , bfloat16 , 0.878016037 , 3924.9586 , 3.261 , 0.025 , 903.347
18+ 0.25 , none , 8 , 79299 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.877915607 , 2059.5193 , 6.215 , 0.049 , 860.783
19+ 0.25 , none , 4 , 67966.5 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.878266089 , 1054.1087 , 12.143 , 0.095 , 840.9
20+ 0.25 , moe-scattermoe-granite-ep1 , 16 , 80638 , ibm-research/moe-7b-1b-active-shared-experts , 1 , 8 , bfloat16 , 0.878047609 , 899.6248 , 14.228 , 0.111 , 3941.198
21+ 0.25 , moe-scattermoe-granite-ep2 , 8 , 58769 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 0.877957979 , 550.4483 , 23.254 , 0.182 , 3220.647
22+ 0.25 , moe-scattermoe-granite-ep2 , 4 , 58932 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 0.880045412 , 300.1744 , 42.642 , 0.333 , 2952.95
23+ 0.25 , moe-scattermoe-granite-ep1-padding-free , 16 , 77512 , ibm-research/moe-7b-1b-active-shared-experts , 1 , 8 , bfloat16 , 1.256636418 , 630.1126 , 20.314 , 0.159 , 3930.726
24+ 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 16 , 72604 , ibm-research/moe-7b-1b-active-shared-experts , 1 , 8 , bfloat16 , 1.261311768 , 598.0884 , 21.402 , 0.167 , 4141.194
25+ 0.25 , moe-scattermoe-granite-ep2-padding-free , 8 , 45237 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 1.259768015 , 436.0593 , 29.354 , 0.229 , 2839.981
26+ 0.25 , moe-scattermoe-granite-ep2-padding-free , 4 , 42449 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 1.267803932 , 236.4495 , 54.134 , 0.423 , 2618.741
27+ 0.25 , moe-scattermoe-granite-ep2-padding-free-foak , 8 , 40279 , ibm-research/moe-7b-1b-active-shared-experts , 2 , 8 , bfloat16 , 1.262602715 , 434.2257 , 29.478 , 0.23 , 2851.973
28+ 0.25 , moe-scattermoe-granite-ep2-padding-free-foak , 4 , 38827 , ibm-research/moe-7b-1b-active-shared-experts , 4 , 8 , bfloat16 , 1.268255376 , 231.5911 , 55.27 , 0.432 , 2673.678
29+ 0.25 , none , 16 , 78670 , ibm-granite/granite-4.0-tiny-preview , 1 , 8 , bfloat16 , 0.855069562 , 4106.7072 , 3.117 , 0.024 , 863.368
30+ 0.25 , none , 8 , 77928 , ibm-granite/granite-4.0-tiny-preview , 2 , 8 , bfloat16 , 0.854871556 , 2107.9397 , 6.072 , 0.047 , 841.011
31+ 0.25 , none , 4 , 70000 , ibm-granite/granite-4.0-tiny-preview , 4 , 8 , bfloat16 , 0.855348825 , 1117.3656 , 11.456 , 0.089 , 793.295
32+ 0.25 , moe-scattermoe-granite-ep1 , 16 , 78634 , ibm-granite/granite-4.0-tiny-preview , 1 , 8 , bfloat16 , 0.855006663 , 968.4797 , 13.217 , 0.103 , 3660.996
33+ 0.25 , moe-scattermoe-granite-ep2 , 8 , 61692 , ibm-granite/granite-4.0-tiny-preview , 2 , 8 , bfloat16 , 0.854951358 , 611.0101 , 20.949 , 0.164 , 2901.425
34+ 0.25 , moe-scattermoe-granite-ep2 , 4 , 61213 , ibm-granite/granite-4.0-tiny-preview , 4 , 8 , bfloat16 , 0.856631212 , 337.9265 , 37.878 , 0.296 , 2623.055
35+ 0.25 , moe-scattermoe-granite-ep1-padding-free , 16 , 79842 , ibm-granite/granite-4.0-tiny-preview , 1 , 8 , bfloat16 , 0.852907363 , 823.4639 , 15.544 , 0.121 , 3007.782
36+ 0.25 , moe-scattermoe-granite-ep1-padding-free-foak , 16 , 76916 , ibm-granite/granite-4.0-tiny-preview , 1 , 8 , bfloat16 , 0.852861792 , 734.3252 , 17.431 , 0.136 , 3372.893
37+ 0.25 , moe-scattermoe-granite-ep2-padding-free , 8 , 48068 , ibm-granite/granite-4.0-tiny-preview , 2 , 8 , bfloat16 , 0.852783817 , 554.2306 , 23.095 , 0.18 , 2234.449
38+ 0.25 , moe-scattermoe-granite-ep2-padding-free , 4 , 44790 , ibm-granite/granite-4.0-tiny-preview , 4 , 8 , bfloat16 , 0.854414411 , 308.2351 , 41.527 , 0.324 , 2008.856
39+ 0.25 , moe-scattermoe-granite-ep2-padding-free-foak , 8 , 43180 , ibm-granite/granite-4.0-tiny-preview , 2 , 8 , bfloat16 , 0.85276741 , 541.444 , 23.64 , 0.185 , 2287.217
40+ 0.25 , moe-scattermoe-granite-ep2-padding-free-foak , 4 , 41128 , ibm-granite/granite-4.0-tiny-preview , 4 , 8 , bfloat16 , 0.854435267 , 308.1642 , 41.536 , 0.325 , 2009.318
41+ 0.25 , moe-scattermoe-granite-ep8 , 16 , 56687.5 , mistralai/Mixtral-8x7B-Instruct-v0.1 , 8 , 1 , bfloat16 , 0.8654898 , 810.9653 , 15.784 , 0.123 , 414.321
42+ 0.25 , moe-scattermoe-granite-ep8-foak , 16 , 56710 .25, mistralai/Mixtral-8x7B-Instruct-v0.1 , 8 , 1 , bfloat16 , 0.86548216 , 775.5419 , 16.505 , 0.129 , 433.245
0 commit comments