Skip to content

Commit 3ead20d

Browse files
committed
Add validation for loss_scale in precision_config
(cherry picked from commit f0059a7) Signed-off-by: nathon-lee <leejianwoo@gmail.com>
1 parent b050c76 commit 3ead20d

1 file changed

Lines changed: 20 additions & 25 deletions

File tree

deepspeed/runtime/precision_config.py

Lines changed: 20 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -109,12 +109,31 @@ class DeepSpeedFP16Config(DeepSpeedConfigModel):
109109
"""
110110
Automatically cast inputs to fp16
111111
"""
112-
112+
113113
loss_scale: float = 0
114114
"""
115115
Loss scaling value. Default value of 0 means dynamic loss scaling instead of static loss scale.
116116
"""
117117

118+
@field_validator("loss_scale")
@classmethod
def _validate_loss_scale(cls, v):
    """Check that ``loss_scale`` is a finite, non-negative number.

    A value of 0 keeps dynamic loss scaling enabled; any positive finite
    value selects a static loss scale. Bools, non-finite values, and
    negative values are rejected with a ValueError.
    """
    # bool is a subclass of int, so float(True)/float(False) would silently
    # become 1.0/0.0 — reject it explicitly before conversion.
    if isinstance(v, bool):
        raise ValueError("fp16.loss_scale must be a number, not bool")

    scale = float(v)

    # float() happily produces inf/-inf/nan from e.g. float("inf"); none of
    # these is a usable loss scale.
    if not math.isfinite(scale):
        raise ValueError("fp16.loss_scale must be a finite number (not inf/-inf/nan)")

    # Negative scales are meaningless; zero is allowed and means "dynamic".
    if scale < 0:
        raise ValueError("fp16.loss_scale must be >= 0 (0 enables dynamic loss scaling)")

    return scale
136+
118137
initial_scale_power: int = 16
119138
"""
120139
For dynamic loss scaling, set initial loss scale to 2^{initial_scale_power}.
@@ -156,27 +175,3 @@ def dynamic_loss_scale_args(self):
156175
CONSECUTIVE_HYSTERESIS: self.consecutive_hysteresis,
157176
MIN_LOSS_SCALE: self.min_loss_scale,
158177
}
159-
160-
loss_scale: float = 0
161-
"""
162-
Loss scaling value. Default value of 0 means dynamic loss scaling instead of static loss scale.
163-
"""
164-
165-
@field_validator("loss_scale")
166-
@classmethod
167-
def _validate_loss_scale(cls, v):
168-
# Prevent True/False from being treated as 1/0
169-
if isinstance(v, bool):
170-
raise ValueError("fp16.loss_scale must be a number, not bool")
171-
172-
v = float(v)
173-
174-
# Reject inf/-inf/nan
175-
if not math.isfinite(v):
176-
raise ValueError("fp16.loss_scale must be a finite number (not inf/-inf/nan)")
177-
178-
# Reject negative values; 0 still means dynamic loss scaling
179-
if v < 0:
180-
raise ValueError("fp16.loss_scale must be >= 0 (0 enables dynamic loss scaling)")
181-
182-
return v

0 commit comments

Comments
 (0)