|
304 | 304 | } |
305 | 305 | ] |
306 | 306 | }, |
| 307 | + { |
| 308 | + "tag": "lr_scheduler", |
| 309 | + "tests": [ |
| 310 | + { |
| 311 | + "id": "3_none_distopt", |
| 312 | + "args": { |
| 313 | + "dtype": "float32", |
| 314 | + "nthread_per_process": 8, |
| 315 | + "num_iteration": 10, |
| 316 | + "batch_size": 10, |
| 317 | + "total_batch_size": 5120, |
| 318 | + "use_distributed_optimizer": true, |
| 319 | + "learning_rate": 0.00001, |
| 320 | + "lr_decay_style": "none" |
| 321 | + } |
| 322 | + }, |
| 323 | + { |
| 324 | + "id": "4_constant_tp4", |
| 325 | + "args": { |
| 326 | + "dtype": "float32", |
| 327 | + "nthread_per_process": 8, |
| 328 | + "num_iteration": 10, |
| 329 | + "batch_size": 40, |
| 330 | + "total_batch_size": 5120, |
| 331 | + "tensor_parallel": 4, |
| 332 | + "learning_rate": 0.00001, |
| 333 | + "min_lr": 0.000001, |
| 334 | + "lr_decay_style": "constant", |
| 335 | + "lr_warmup_iters": 0, |
| 336 | + "lr_decay_iters": 0 |
| 337 | + } |
| 338 | + }, |
| 339 | + { |
| 340 | + "id": "5_linear_tp4_sp_distopt", |
| 341 | + "args": { |
| 342 | + "dtype": "float32", |
| 343 | + "nthread_per_process": 8, |
| 344 | + "num_iteration": 10, |
| 345 | + "batch_size": 40, |
| 346 | + "total_batch_size": 5120, |
| 347 | + "tensor_parallel": 4, |
| 348 | + "sequence_parallel": true, |
| 349 | + "use_distributed_optimizer": true, |
| 350 | + "learning_rate": 0.00001, |
| 351 | + "min_lr": 0.000001, |
| 352 | + "lr_decay_style": "linear", |
| 353 | + "lr_warmup_iters": 2, |
| 354 | + "lr_warmup_init": 0.0, |
| 355 | + "lr_decay_iters": 10 |
| 356 | + } |
| 357 | + }, |
| 358 | + { |
| 359 | + "id": "6_cosine_pp8", |
| 360 | + "args": { |
| 361 | + "dtype": "float32", |
| 362 | + "nthread_per_process": 8, |
| 363 | + "num_iteration": 10, |
| 364 | + "batch_size": 10, |
| 365 | + "total_batch_size": 5120, |
| 366 | + "pipeline_parallel": 8, |
| 367 | + "learning_rate": 0.00001, |
| 368 | + "min_lr": 0.000001, |
| 369 | + "lr_decay_style": "cosine", |
| 370 | + "lr_warmup_iters": 2, |
| 371 | + "lr_warmup_init": 0.0, |
| 372 | + "lr_decay_iters": 10 |
| 373 | + } |
| 374 | + }, |
| 375 | + { |
| 376 | + "id": "7_inverse_sqrt_pp4_vpp2", |
| 377 | + "args": { |
| 378 | + "dtype": "float32", |
| 379 | + "nthread_per_process": 4, |
| 380 | + "num_iteration": 10, |
| 381 | + "batch_size": 10, |
| 382 | + "total_batch_size": 5120, |
| 383 | + "pipeline_parallel": 4, |
| 384 | + "virtual_pipeline_parallel": 2, |
| 385 | + "learning_rate": 0.00001, |
| 386 | + "min_lr": 0.000001, |
| 387 | + "lr_decay_style": "inverse-square-root", |
| 388 | + "lr_warmup_iters": 2, |
| 389 | + "lr_warmup_init": 0.0, |
| 390 | + "lr_decay_iters": 10 |
| 391 | + } |
| 392 | + }, |
| 393 | + { |
| 394 | + "id": "8_cosine_all_parallel_distopt", |
| 395 | + "args": { |
| 396 | + "dtype": "float32", |
| 397 | + "nthread_per_process": 8, |
| 398 | + "num_iteration": 10, |
| 399 | + "batch_size": 40, |
| 400 | + "total_batch_size": 5120, |
| 401 | + "tensor_parallel": 2, |
| 402 | + "sequence_parallel": true, |
| 403 | + "pipeline_parallel": 2, |
| 404 | + "virtual_pipeline_parallel": 2, |
| 405 | + "use_distributed_optimizer": true, |
| 406 | + "learning_rate": 0.00001, |
| 407 | + "min_lr": 0.000001, |
| 408 | + "lr_decay_style": "cosine", |
| 409 | + "lr_warmup_iters": 2, |
| 410 | + "lr_warmup_init": 0.0, |
| 411 | + "lr_decay_iters": 10 |
| 412 | + } |
| 413 | + }, |
| 414 | + { |
| 415 | + "id": "3_bfloat16_linear", |
| 416 | + "args": { |
| 417 | + "dtype": "bfloat16", |
| 418 | + "nthread_per_process": 8, |
| 419 | + "num_iteration": 10, |
| 420 | + "batch_size": 10, |
| 421 | + "total_batch_size": 5120, |
| 422 | + "learning_rate": 0.00001, |
| 423 | + "min_lr": 0.000001, |
| 424 | + "lr_decay_style": "linear", |
| 425 | + "lr_warmup_iters": 2, |
| 426 | + "lr_warmup_init": 0.0, |
| 427 | + "lr_decay_iters": 0 |
| 428 | + } |
| 429 | + }, |
| 430 | + { |
| 431 | + "id": "4_bfloat16_inverse_sqrt_tp4_distopt", |
| 432 | + "args": { |
| 433 | + "dtype": "bfloat16", |
| 434 | + "nthread_per_process": 8, |
| 435 | + "num_iteration": 10, |
| 436 | + "batch_size": 40, |
| 437 | + "total_batch_size": 5120, |
| 438 | + "tensor_parallel": 4, |
| 439 | + "use_distributed_optimizer": true, |
| 440 | + "learning_rate": 0.00001, |
| 441 | + "min_lr": 0.000001, |
| 442 | + "lr_decay_style": "inverse-square-root", |
| 443 | + "lr_warmup_iters": 2, |
| 444 | + "lr_warmup_init": 0.0, |
| 445 | + "lr_decay_iters": 10 |
| 446 | + } |
| 447 | + }, |
| 448 | + { |
| 449 | + "id": "5_bfloat16_constant_tp4_sp", |
| 450 | + "args": { |
| 451 | + "dtype": "bfloat16", |
| 452 | + "nthread_per_process": 8, |
| 453 | + "num_iteration": 10, |
| 454 | + "batch_size": 40, |
| 455 | + "total_batch_size": 5120, |
| 456 | + "tensor_parallel": 4, |
| 457 | + "sequence_parallel": true, |
| 458 | + "learning_rate": 0.00001, |
| 459 | + "min_lr": 0.000001, |
| 460 | + "lr_decay_style": "constant", |
| 461 | + "lr_warmup_iters": 0, |
| 462 | + "lr_decay_iters": 10 |
| 463 | + } |
| 464 | + }, |
| 465 | + { |
| 466 | + "id": "8_bfloat16_none_all_parallel", |
| 467 | + "args": { |
| 468 | + "dtype": "bfloat16", |
| 469 | + "nthread_per_process": 8, |
| 470 | + "num_iteration": 10, |
| 471 | + "batch_size": 40, |
| 472 | + "total_batch_size": 5120, |
| 473 | + "tensor_parallel": 2, |
| 474 | + "sequence_parallel": true, |
| 475 | + "pipeline_parallel": 2, |
| 476 | + "virtual_pipeline_parallel": 2, |
| 477 | + "learning_rate": 0.00001, |
| 478 | + "lr_decay_style": "none" |
| 479 | + } |
| 480 | + } |
| 481 | + ] |
| 482 | + }, |
307 | 483 | { |
308 | 484 | "tag": "lora", |
309 | 485 | "tests": [ |
|
0 commit comments