Fix bf16 dtype mismatch in ZeRO-3 with zero_quantized_weights

juyterman1000 · juyterman1000 · commit 83bd5d1997ee · 2026-01-19T20:08:40.000-08:00
Signed-off-by: juyterman1000 <fastrunner10090@gmail.com>
diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py
@@ -743,8 +743,8 @@ def wait(self, handle_dependency=True) -> None:
             instrument_w_nvtx(self.quantization.quant_handle.wait)()
             # Fix for issue #7775: convert dequantized tensor back to original dtype (e.g., bf16)
             # to prevent dtype mismatch when zero_quantized_weights is used with bf16
-            dequantized = self.quantization.backend.dequantize(
-                self.quantization.quantized_param, self.quantization.scale_buffer)
+            dequantized = self.quantization.backend.dequantize(self.quantization.quantized_param,
+                                                               self.quantization.scale_buffer)
             if self.original_dtype is not None:
                 dequantized = dequantized.to(self.original_dtype)
             flat_tensor = dequantized.to(self.params[0].device)
diff --git a/tests/unit/runtime/zero/test_zero_quant_bf16.py b/tests/unit/runtime/zero/test_zero_quant_bf16.py
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import pytest
+import torch
+import deepspeed
+from unit.common import DistributedTest
+from unit.simple_model import SimpleModel, random_dataloader
+
+
+class TestZeroQuantBF16(DistributedTest):
+    world_size = 2
+
+    @pytest.mark.parametrize("zero_quantized_weights", [True])
+    def test_bf16_quantized_weights(self, zero_quantized_weights):
+        if not deepspeed.get_accelerator().is_bf16_supported():
+            pytest.skip("bf16 is not supported by this accelerator")
+
+        config_dict = {
+            "train_micro_batch_size_per_gpu": 1,
+            "zero_optimization": {
+                "stage": 3,
+                "zero_quantized_weights": zero_quantized_weights,
+            },
+            "bf16": {
+                "enabled": True
+            },
+            "optimizer": {
+                "type": "Adam",
+                "params": {
+                    "lr": 1e-3
+                }
+            }
+        }
+
+        hidden_dim = 128
+        model = SimpleModel(hidden_dim=hidden_dim)
+        model, _, _, _ = deepspeed.initialize(model=model, config=config_dict)
+
+        # Sanity check: with bf16 enabled in config_dict, every engine parameter must report bfloat16
+        for param in model.parameters():
+            assert param.dtype == torch.bfloat16
+
+        data_loader = random_dataloader(model=model,
+                                        total_samples=2,
+                                        hidden_dim=hidden_dim,
+                                        device=model.device,
+                                        dtype=torch.bfloat16)
+
+        for n, batch in enumerate(data_loader):
+            # Forward pass; under ZeRO-3 quantized weights this is expected to trigger all_gather + dequantization
+            loss = model(batch[0], batch[1])
+
+            # Regression check for issue #7775: param.data must still be bfloat16 once the forward pass has run
+            for name, param in model.named_parameters():
+                assert param.data.dtype == torch.bfloat16, f"Parameter {name} data dtype is {param.data.dtype}, expected torch.bfloat16"
+
+            model.backward(loss)
+            model.step()
+            break