Skip to content

Commit 63eeb11

Browse files
nathon-leetohtana
and authored
fix: Validate fp16.loss_scale is finite and non-negative (#7889)
Validate fp16.loss_scale is finite and non-negative Add a Pydantic field validator to DeepSpeedFP16Config to reject NaN/inf/-inf and negative values for fp16.loss_scale (while keeping 0 as dynamic loss scaling). This prevents invalid configs from silently initializing and causing NaNs during training. Test: Run pytest -q tests/unit/runtime/test_precision_config_loss_scale.py Result: ``` root@72170d0458e9:/home/DeepSpeed_woo# pytest -q tests/unit/runtime/test_precision_config_loss_scale.py =================================================================== test session starts =================================================================== platform linux -- Python 3.11.10, pytest-8.3.5, pluggy-1.6.0 -- /usr/bin/python cachedir: .pytest_cache Using --randomly-seed=1526199052 rootdir: /home/DeepSpeed_woo/tests configfile: pytest.ini plugins: xdist-3.8.0, randomly-4.0.1, forked-1.6.0, anyio-4.6.0 collected 10 items tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_accepts_valid_values[3] PASSED [ 10%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_accepts_valid_values[0] PASSED [ 20%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_rejects_invalid_values[inf] PASSED [ 30%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_accepts_valid_values[1] PASSED [ 40%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_rejects_invalid_values[nan] PASSED [ 50%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_accepts_valid_values[2.0] PASSED [ 60%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_rejects_invalid_values[True] PASSED [ 70%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_invalid_type_has_clear_error[loss_scale0] PASSED [ 80%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_rejects_invalid_values[-1] PASSED [ 90%] 
tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_invalid_type_has_clear_error[loss_scale1] PASSED [100%] (30 durations < 1s hidden. Use -vv to show these durations.) ============================================================= 10 passed, 16 warnings in 4.18s ============================================================= ``` Fix issue #7852 --------- Signed-off-by: nathon-lee <leejianwoo@gmail.com> Co-authored-by: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com>
1 parent 784cc26 commit 63eeb11

2 files changed

Lines changed: 48 additions & 0 deletions

File tree

deepspeed/runtime/precision_config.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
# DeepSpeed Team
55

6+
import math
7+
from pydantic import field_validator
68
from deepspeed.runtime.config_utils import DeepSpeedConfigModel
79
from .fp16.loss_scaler import (
810
INITIAL_LOSS_SCALE,
@@ -113,6 +115,21 @@ class DeepSpeedFP16Config(DeepSpeedConfigModel):
113115
Loss scaling value. Default value of 0 means dynamic loss scaling instead of static loss scale.
114116
"""
115117

118+
@field_validator("loss_scale", mode="before")
@classmethod
def _validate_loss_scale(cls, v):
    """Coerce and sanity-check ``fp16.loss_scale`` before pydantic assignment.

    Accepts anything convertible to ``float`` (ints, floats, numeric
    strings). Rejects booleans, non-numeric values, non-finite values
    (inf/-inf/nan), and negatives. A value of 0 is kept — it selects
    dynamic loss scaling.

    Returns the value as a ``float``; raises ``ValueError`` (surfaced by
    pydantic as a ``ValidationError``) on any invalid input.
    """
    # bool is a subclass of int, so float(True) would silently succeed;
    # screen it out explicitly first.
    if isinstance(v, bool):
        raise ValueError("fp16.loss_scale must be a number, not bool")
    try:
        value = float(v)
    except (TypeError, ValueError):
        raise ValueError("fp16.loss_scale must be a number")
    # NaN/inf would silently poison training with NaNs later.
    if math.isinf(value) or math.isnan(value):
        raise ValueError("fp16.loss_scale must be a finite number (not inf/-inf/nan)")
    if value < 0:
        raise ValueError("fp16.loss_scale must be >= 0 (0 enables dynamic loss scaling)")
    return value
132+
116133
initial_scale_power: int = 16
117134
"""
118135
For dynamic loss scaling, set initial loss scale to 2^{initial_scale_power}.
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# DeepSpeed Team
5+
6+
import math
7+
8+
import pytest
9+
from pydantic import ValidationError
10+
11+
from deepspeed.runtime.precision_config import DeepSpeedFP16Config
12+
13+
14+
@pytest.mark.parametrize("loss_scale", [-1, float("inf"), float("nan"), True])
def test_fp16_loss_scale_rejects_invalid_values(loss_scale):
    """Negative, non-finite, and boolean inputs must all be rejected."""
    with pytest.raises(ValidationError):
        DeepSpeedFP16Config(loss_scale=loss_scale)
19+
20+
@pytest.mark.parametrize("loss_scale", [0, 1, 2.0, "3"])
def test_fp16_loss_scale_accepts_valid_values(loss_scale):
    """Valid inputs (including numeric strings) parse to a finite, non-negative value."""
    cfg = DeepSpeedFP16Config(loss_scale=loss_scale)
    parsed = cfg.loss_scale
    assert math.isfinite(parsed)
    assert parsed >= 0
26+
27+
@pytest.mark.parametrize("loss_scale", [[], {}])
def test_fp16_loss_scale_invalid_type_has_clear_error(loss_scale):
    """Container inputs fail validation with a message naming the numeric requirement."""
    with pytest.raises(ValidationError) as excinfo:
        DeepSpeedFP16Config(loss_scale=loss_scale)
    message = str(excinfo.value)
    assert "must be a number" in message

0 commit comments

Comments
 (0)