Skip to content

Commit 63eeb11

Browse files
nathon-leetohtana
and authored
fix: Validate fp16.loss_scale is finite and non-negative (#7889)
Validate fp16.loss_scale is finite and non-negative Add a Pydantic field validator to DeepSpeedFP16Config to reject NaN/inf/-inf and negative values for fp16.loss_scale (while keeping 0 as dynamic loss scaling). This prevents invalid configs from silently initializing and causing NaNs during training. Test: Run pytest -q tests/unit/runtime/test_precision_config_loss_scale.py Result: ``` root@72170d0458e9:/home/DeepSpeed_woo# pytest -q tests/unit/runtime/test_precision_config_loss_scale.py =================================================================== test session starts =================================================================== platform linux -- Python 3.11.10, pytest-8.3.5, pluggy-1.6.0 -- /usr/bin/python cachedir: .pytest_cache Using --randomly-seed=1526199052 rootdir: /home/DeepSpeed_woo/tests configfile: pytest.ini plugins: xdist-3.8.0, randomly-4.0.1, forked-1.6.0, anyio-4.6.0 collected 10 items tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_accepts_valid_values[3] PASSED [ 10%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_accepts_valid_values[0] PASSED [ 20%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_rejects_invalid_values[inf] PASSED [ 30%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_accepts_valid_values[1] PASSED [ 40%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_rejects_invalid_values[nan] PASSED [ 50%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_accepts_valid_values[2.0] PASSED [ 60%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_rejects_invalid_values[True] PASSED [ 70%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_invalid_type_has_clear_error[loss_scale0] PASSED [ 80%] tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_rejects_invalid_values[-1] PASSED [ 90%] 
tests/unit/runtime/test_precision_config_loss_scale.py::test_fp16_loss_scale_invalid_type_has_clear_error[loss_scale1] PASSED [100%] (30 durations < 1s hidden. Use -vv to show these durations.) ============================================================= 10 passed, 16 warnings in 4.18s ============================================================= ``` Fix issue #7852 --------- Signed-off-by: nathon-lee <leejianwoo@gmail.com> Co-authored-by: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com>
1 parent 784cc26 commit 63eeb11

2 files changed

Lines changed: 48 additions & 0 deletions

File tree

deepspeed/runtime/precision_config.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
# DeepSpeed Team
55

6+
import math
7+
from pydantic import field_validator
68
from deepspeed.runtime.config_utils import DeepSpeedConfigModel
79
from .fp16.loss_scaler import (
810
INITIAL_LOSS_SCALE,
@@ -113,6 +115,21 @@ class DeepSpeedFP16Config(DeepSpeedConfigModel):
113115
Loss scaling value. Default value of 0 means dynamic loss scaling instead of static loss scale.
114116
"""
115117

118+
@field_validator("loss_scale", mode="before")
@classmethod
def _validate_loss_scale(cls, v):
    """Coerce and sanity-check ``fp16.loss_scale`` before pydantic assignment.

    Accepts anything convertible to ``float`` (ints, floats, numeric
    strings). Rejects booleans, non-numeric values, non-finite values
    (inf/-inf/nan), and negatives. A value of 0 is kept — it selects
    dynamic loss scaling.

    Returns the value as a ``float``; raises ``ValueError`` (surfaced by
    pydantic as a ``ValidationError``) on any invalid input.
    """
    # bool is a subclass of int, so float(True) would silently succeed;
    # screen it out explicitly first.
    if isinstance(v, bool):
        raise ValueError("fp16.loss_scale must be a number, not bool")
    try:
        value = float(v)
    except (TypeError, ValueError):
        raise ValueError("fp16.loss_scale must be a number")
    # NaN/inf would silently poison training with NaNs later.
    if math.isinf(value) or math.isnan(value):
        raise ValueError("fp16.loss_scale must be a finite number (not inf/-inf/nan)")
    if value < 0:
        raise ValueError("fp16.loss_scale must be >= 0 (0 enables dynamic loss scaling)")
    return value
132+
116133
initial_scale_power: int = 16
117134
"""
118135
For dynamic loss scaling, set initial loss scale to 2^{initial_scale_power}.
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
# DeepSpeed Team
5+
6+
import math
7+
8+
import pytest
9+
from pydantic import ValidationError
10+
11+
from deepspeed.runtime.precision_config import DeepSpeedFP16Config
12+
13+
14+
@pytest.mark.parametrize("loss_scale", [-1, float("inf"), float("nan"), True])
def test_fp16_loss_scale_rejects_invalid_values(loss_scale):
    """Negative, non-finite, and boolean inputs must all be rejected."""
    with pytest.raises(ValidationError):
        DeepSpeedFP16Config(loss_scale=loss_scale)
19+
20+
@pytest.mark.parametrize("loss_scale", [0, 1, 2.0, "3"])
def test_fp16_loss_scale_accepts_valid_values(loss_scale):
    """Valid inputs (including numeric strings) parse to a finite, non-negative value."""
    cfg = DeepSpeedFP16Config(loss_scale=loss_scale)
    parsed = cfg.loss_scale
    assert math.isfinite(parsed)
    assert parsed >= 0
26+
27+
@pytest.mark.parametrize("loss_scale", [[], {}])
def test_fp16_loss_scale_invalid_type_has_clear_error(loss_scale):
    """Container inputs fail validation with a message naming the numeric requirement."""
    with pytest.raises(ValidationError) as excinfo:
        DeepSpeedFP16Config(loss_scale=loss_scale)
    message = str(excinfo.value)
    assert "must be a number" in message

0 commit comments

Comments
 (0)