Skip to content

Commit 225ab4e

Browse files
committed
fix: Validate fp16.loss_scale is finite and non-negative
Signed-off-by: nathon-lee <leejianwoo@gmail.com>
1 parent 6c59d54 commit 225ab4e

1 file changed

Lines changed: 22 additions & 1 deletion

File tree

deepspeed/runtime/precision_config.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
# DeepSpeed Team
55

6+
import math
7+
from pydantic import field_validator
68
from deepspeed.runtime.config_utils import DeepSpeedConfigModel
79
from .fp16.loss_scaler import (
810
INITIAL_LOSS_SCALE,
@@ -107,12 +109,31 @@ class DeepSpeedFP16Config(DeepSpeedConfigModel):
107109
"""
108110
Automatically cast inputs to fp16
109111
"""
110-
112+
111113
loss_scale: float = 0
112114
"""
113115
Loss scaling value. Default value of 0 means dynamic loss scaling instead of static loss scale.
114116
"""
115117

118+
@field_validator("loss_scale")
@classmethod
def _validate_loss_scale(cls, value):
    """Ensure ``loss_scale`` is a finite, non-negative number.

    A value of 0 keeps dynamic loss scaling enabled.  Booleans, NaN,
    positive/negative infinity, and negative numbers are all rejected.

    Raises:
        ValueError: if ``value`` is a bool, non-finite, or negative.
    """
    # bool is a subclass of int in Python, so True/False would otherwise
    # silently validate as 1/0 — reject it explicitly first.
    if isinstance(value, bool):
        raise ValueError("fp16.loss_scale must be a number, not bool")

    value = float(value)

    # NaN and +/-inf survive float() conversion; reject them here.
    if not math.isfinite(value):
        raise ValueError("fp16.loss_scale must be a finite number (not inf/-inf/nan)")

    # Negative scales are meaningless; 0 retains its dynamic-scaling meaning.
    if value < 0:
        raise ValueError("fp16.loss_scale must be >= 0 (0 enables dynamic loss scaling)")

    return value
136+
116137
initial_scale_power: int = 16
117138
"""
118139
For dynamic loss scaling, set initial loss scale to 2^{initial_scale_power}.

0 commit comments

Comments
 (0)