
Commit cbf4454

Merge branch 'master' into test-fvq
2 parents: 0db73ad + 7a4e13c

3 files changed: 25 additions & 10 deletions

pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 [project]
 name = "vector-quantize-pytorch"
-version = "1.27.16"
+version = "1.27.21"
 description = "Vector Quantization - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }
```

vector_quantize_pytorch/binary_mapper.py

Lines changed: 11 additions & 3 deletions
```diff
@@ -77,6 +77,7 @@ def forward(
         calc_aux_loss = None,
         deterministic = None,
         return_indices = False,
+        reduce_aux_kl_loss = True
     ):
         deterministic = default(deterministic, self.deterministic_on_eval and not self.training)
@@ -112,7 +113,14 @@ def forward(
         # calculate negative entropy

         kl_div = self.bits * NAT - binary_entropy(logits)
-        aux_kl_loss = F.relu(kl_div - self.kl_loss_threshold).mean()
+        aux_kl_loss = F.relu(kl_div - self.kl_loss_threshold)
+
+        # able to return unreduced kl loss, for use in another project (metacontroller)
+
+        if reduce_aux_kl_loss:
+            aux_kl_loss = aux_kl_loss.mean()
+        else:
+            aux_kl_loss = inverse_pack_lead_dims(aux_kl_loss, '*')

         # maybe straight through
@@ -150,11 +158,11 @@

     logits = torch.randn(3, 4, 8)

-    sparse_one_hot, indices, aux_loss = binary_mapper(logits, return_indices = True)
+    sparse_one_hot, indices, aux_loss = binary_mapper(logits, return_indices = True, reduce_aux_kl_loss = False)

     assert sparse_one_hot.shape == (3, 4, 2 ** 8)
     assert indices.shape == (3, 4)
-    assert aux_loss.numel() == 1
+    assert aux_loss.shape == (3, 4)

     binary_mapper.eval()
     sparse_one_hot1, _ = binary_mapper(logits, deterministic = True)
```
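
The unreduced path keeps one loss value per position instead of collapsing to a scalar, which the in-diff comment attributes to an external project (metacontroller). A minimal sketch of the idea, assuming the `bits = 8` mapper from the test stanza above; `NAT`, `binary_entropy`, and the zero threshold default are reimplemented here for illustration and are not the library's exact internals:

```python
import torch
import torch.nn.functional as F

NAT = 0.6931471805599453  # ln 2: one bit carries ln 2 nats of entropy

def binary_entropy(logits):
    # Bernoulli entropy per bit via BCE of the logits against their own
    # probabilities, summed over the trailing bit dimension
    probs = logits.sigmoid()
    per_bit = F.binary_cross_entropy_with_logits(logits, probs, reduction = 'none')
    return per_bit.sum(dim = -1)

def aux_kl_loss(logits, bits, kl_loss_threshold = 0., reduce = True):
    # KL from the maximum-entropy code (all bits uniform), hinged at the threshold
    kl_div = bits * NAT - binary_entropy(logits)
    loss = F.relu(kl_div - kl_loss_threshold)
    return loss.mean() if reduce else loss

logits = torch.randn(3, 4, 8)

assert aux_kl_loss(logits, bits = 8).numel() == 1                      # reduced, prior behavior
assert aux_kl_loss(logits, bits = 8, reduce = False).shape == (3, 4)   # new unreduced path
```

With `reduce_aux_kl_loss = True` (the default) this collapses to the old scalar loss, so existing callers are unaffected.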

vector_quantize_pytorch/finite_scalar_quantization.py

Lines changed: 13 additions & 6 deletions
```diff
@@ -93,6 +93,7 @@ def __init__(

         self.scale = scale

+        assert not (noise_dropout > 0 and not preserve_symmetry)
         self.preserve_symmetry = preserve_symmetry
         self.noise_dropout = noise_dropout
@@ -158,22 +159,26 @@ def symmetry_preserving_bound(self, z, hard_clamp = False):
     def quantize(self, z):
         """ Quantizes z, returns quantized zhat, same shape as z. """

-        shape, device, noise_dropout, preserve_symmetry = z.shape[0], z.device, self.noise_dropout, self.preserve_symmetry
+        shape, device, preserve_symmetry = z.shape[0], z.device, self.preserve_symmetry
         bound_fn = self.symmetry_preserving_bound if preserve_symmetry else self.bound

-        bounded_z = bound_fn(z, hard_clamp = self.bound_hard_clamp)
+        return bound_fn(z, hard_clamp = self.bound_hard_clamp)

-        # determine where to add a random offset elementwise
-        # if using noise dropout
+    def maybe_apply_noise(self, bounded_z):
+        noise_dropout = self.noise_dropout

         if not self.training or noise_dropout == 0.:
             return bounded_z

-        offset_mask = torch.bernoulli(torch.full_like(bounded_z, noise_dropout)).bool()
+        # determine where to add a random offset elementwise
+        # if using noise dropout
+
+        offset_mask = torch.full_like(bounded_z, noise_dropout).bernoulli_().bool()
         offset = torch.rand_like(bounded_z) - 0.5
+
         bounded_z = torch.where(offset_mask, bounded_z + offset, bounded_z)

-        return bounded_z
+        return bounded_z.clamp(-1., 1.)

     def _scale_and_shift(self, zhat_normalized):
         if self.preserve_symmetry:
@@ -268,6 +273,8 @@ def forward(self, z):
         if self.return_indices:
             indices = self.codes_to_indices(codes)

+        codes = self.maybe_apply_noise(codes)
+
         codes = rearrange(codes, 'b n c d -> b n (c d)')

         codes = codes.to(orig_dtype)
```
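
Pulled out of `quantize`, the noise step now runs in `forward` only after indices are computed, so indices always reflect the clean bounded codes, and the new final clamp keeps perturbed values inside the `[-1, 1]` range that the added constructor assert (noise dropout now requires `preserve_symmetry`) guarantees. A minimal standalone sketch of the relocated step, with `noise_dropout` and `training` passed explicitly for illustration rather than read off the module:

```python
import torch

def maybe_apply_noise(bounded_z, noise_dropout = 0.5, training = True):
    # no-op at eval time or when noise dropout is disabled
    if not training or noise_dropout == 0.:
        return bounded_z

    # choose which elements receive a random offset, elementwise
    offset_mask = torch.full_like(bounded_z, noise_dropout).bernoulli_().bool()

    # uniform offset in [-0.5, 0.5), applied only where the mask is set
    offset = torch.rand_like(bounded_z) - 0.5
    bounded_z = torch.where(offset_mask, bounded_z + offset, bounded_z)

    # clamp back into the symmetric code range
    return bounded_z.clamp(-1., 1.)

codes = torch.empty(3, 4, 1, 8).uniform_(-1., 1.)
noised = maybe_apply_noise(codes)
assert noised.shape == codes.shape and noised.abs().max().item() <= 1.
```

The in-place `bernoulli_()` on the probability-filled tensor is behaviorally identical to the old `torch.bernoulli(...)` call; only the allocation pattern changes.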
