
Commit f4ff5d1

make binary mapper policy optimizable
1 parent d56cbed

11 files changed

Lines changed: 81 additions & 33 deletions
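
The substantive change is in `vector_quantize_pytorch/binary_mapper.py`: the auxiliary KL computation moves out of `forward` into a reusable `calc_aux_loss` method, and a new `log_prob` method returns the joint log-probability of sampled codes under the factorized Bernoulli distribution that the logits define. That log-probability is exactly what a score-function (REINFORCE) estimator needs, hence "policy optimizable". The other files shown carry only a version bump and whitespace cleanup. Below is a minimal sketch of the intended pattern; the import path, the `bits` constructor argument, and the reward are assumptions, while the `log_prob` / `calc_aux_loss` calls and shapes come from the test code in the diff:

```python
import torch
from vector_quantize_pytorch.binary_mapper import BinaryMapper  # assumed import path

binary_mapper = BinaryMapper(bits = 8)  # assumed constructor; `bits` is implied by `self.bits`

# any number of leading dimensions, last dimension = bits
logits = torch.randn(3, 4, 8, requires_grad = True)

# forward appears to return (one_hot, aux_kl_loss); only the sampled codes are needed here
one_hot, _ = binary_mapper(logits)

# joint log-probability of the sampled codes, summed over bits -> shape (3, 4)
log_prob = binary_mapper.log_prob(logits, one_hot = one_hot)

# hypothetical per-sample reward from some downstream objective
reward = torch.randn(3, 4)

# score-function (REINFORCE) estimator: the discrete sample is not differentiable,
# so the policy is improved through the log-probability of the actions taken
policy_loss = -(reward * log_prob).mean()

# new method: reduced hinge-KL penalty keeping the per-bit Bernoullis near uniform
aux_loss = binary_mapper.calc_aux_loss(logits)

(policy_loss + aux_loss).backward()
```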

README.md

Lines changed: 10 additions & 10 deletions
@@ -353,7 +353,7 @@ xhat, indices = quantizer(x)
 assert torch.all(xhat == quantizer.indices_to_codes(indices))
 ```
 
-An improvised Residual FSQ, for an attempt to improve audio encoding.
+An improvised Residual FSQ, for an attempt to improve audio encoding.
 
 Credit goes to [@sekstini](https://github.com/sekstini) for originally incepting the idea [here](https://github.com/lucidrains/vector-quantize-pytorch/pull/74#issuecomment-1742048597)
 
@@ -506,7 +506,7 @@ from vector_quantize_pytorch import LatentQuantize
 quantizer = LatentQuantize(
     levels = [5, 5, 8], # number of levels per codebook dimension
     dim = 16,           # input dim
-    commitment_loss_weight=0.1,
+    commitment_loss_weight=0.1,
     quantization_loss_weight=0.1,
 )
 
@@ -530,7 +530,7 @@ from vector_quantize_pytorch import LatentQuantize
 quantizer = LatentQuantize(
     levels = [5, 5, 8],
     dim = 16,
-    commitment_loss_weight=0.1,
+    commitment_loss_weight=0.1,
     quantization_loss_weight=0.1,
 )
 
@@ -720,7 +720,7 @@ assert loss.item() >= 0
 
 ```bibtex
 @misc{hsu2023disentanglement,
-    title = {Disentanglement via Latent Quantization},
+    title = {Disentanglement via Latent Quantization},
     author = {Kyle Hsu and Will Dorrell and James C. R. Whittington and Jiajun Wu and Chelsea Finn},
     year = {2023},
     eprint = {2305.18378},
@@ -782,36 +782,36 @@ assert loss.item() >= 0
 
 ```bibtex
 @misc{vali2025diveqdifferentiablevectorquantization,
-    title = {DiVeQ: Differentiable Vector Quantization Using the Reparameterization Trick},
+    title = {DiVeQ: Differentiable Vector Quantization Using the Reparameterization Trick},
     author = {Mohammad Hassan Vali and Tom Bäckström and Arno Solin},
     year = {2025},
     eprint = {2509.26469},
     archivePrefix = {arXiv},
     primaryClass = {cs.LG},
-    url = {https://arxiv.org/abs/2509.26469},
+    url = {https://arxiv.org/abs/2509.26469},
 }
 ```
 
 ```bibtex
 @misc{fleuret2025freetransformer,
-    title = {The Free Transformer},
+    title = {The Free Transformer},
     author = {François Fleuret},
     year = {2025},
     eprint = {2510.17558},
     archivePrefix = {arXiv},
     primaryClass = {cs.LG},
-    url = {https://arxiv.org/abs/2510.17558},
+    url = {https://arxiv.org/abs/2510.17558},
 }
 ```
 
 ```bibtex
 @misc{chang2025scalabletrainingvectorquantizednetworks,
-    title = {Scalable Training for Vector-Quantized Networks with 100% Codebook Utilization},
+    title = {Scalable Training for Vector-Quantized Networks with 100% Codebook Utilization},
     author = {Yifan Chang and Jie Qin and Limeng Qiao and Xiaofeng Wang and Zheng Zhu and Lin Ma and Xingang Wang},
     year = {2025},
     eprint = {2509.10140},
     archivePrefix = {arXiv},
     primaryClass = {cs.CV},
-    url = {https://arxiv.org/abs/2509.10140},
+    url = {https://arxiv.org/abs/2509.10140},
 }
 ```

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "vector-quantize-pytorch"
-version = "1.27.21"
+version = "1.28.0"
 description = "Vector Quantization - Pytorch"
 authors = [
     { name = "Phil Wang", email = "lucidrains@gmail.com" }

ruff.toml

Lines changed: 0 additions & 1 deletion
@@ -21,4 +21,3 @@ convention = "numpy"
 [format]
 docstring-code-format = true
 docstring-code-line-length = 20
-

tests/test_beam.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ def test_topk_and_manual_ema_update():
         dim = 256,
         codebook_size = 512
     )
-
+
     vq2.load_state_dict(vq1.state_dict())
 
     x = torch.randn(1, 1024, 256)

tests/test_lfq.py

Lines changed: 1 addition & 1 deletion
@@ -74,4 +74,4 @@ def test_lfq_bruteforce_frac_per_sample_entropy(
         per_sample_losses[i] = loss_breakdown.per_sample_entropy
 
     # 95% confidence interval
-    assert abs(per_sample_losses.mean() - true_per_sample_entropy) < (1.96 * (per_sample_losses.std() / math.sqrt(iters)))
+    assert abs(per_sample_losses.mean() - true_per_sample_entropy) < (1.96 * (per_sample_losses.std() / math.sqrt(iters)))
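
The assertion above is a standard large-sample check: the Monte-Carlo estimates collected in `per_sample_losses` should agree with the brute-force entropy up to a 95% normal confidence interval,

$$
\left|\bar{x} - H_{\text{true}}\right| < 1.96\,\frac{s}{\sqrt{n}},
$$

with $\bar{x}$ and $s$ the mean and standard deviation of the estimates and $n$ = `iters`. By construction, even a correct implementation fails such a check roughly 5% of the time, so occasional flakes in this test are expected.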

vector_quantize_pytorch/binary_mapper.py

Lines changed: 61 additions & 12 deletions
@@ -69,6 +69,58 @@ def __init__(
 
         self.deterministic_on_eval = deterministic_on_eval
 
+    def binary_entropy(self, logits):
+        return binary_entropy(logits)
+
+    def calc_aux_loss(
+        self,
+        logits,
+        reduce_aux_kl_loss = True
+    ):
+        logits, inverse_pack_lead_dims = pack_with_inverse(logits, '* bits')
+        kl_div = self.bits * NAT - self.binary_entropy(logits)
+        aux_kl_loss = F.relu(kl_div - self.kl_loss_threshold)
+
+        if reduce_aux_kl_loss:
+            return aux_kl_loss.mean()
+
+        return inverse_pack_lead_dims(aux_kl_loss, '*')
+
+    def log_prob(
+        self,
+        logits,
+        *,
+        indices = None,
+        one_hot = None,
+        sum_bits = True
+    ):
+        assert exists(indices) ^ exists(one_hot), 'either indices or one_hot must be provided'
+
+        if exists(one_hot):
+            indices = one_hot.argmax(dim=-1)
+
+        # allow for any number of leading dimensions
+
+        logits, inverse_pack_lead_dims = pack_with_inverse(logits, '* bits')
+        indices, _ = pack_with_inverse(indices, '*')
+
+        # sampled bits representation
+
+        sampled_bits = self.codes[indices]
+
+        # calculate log probability
+
+        log_probs_1 = F.logsigmoid(logits)
+        log_probs_0 = F.logsigmoid(-logits)
+
+        log_probs = torch.where(sampled_bits, log_probs_1, log_probs_0)
+
+        if not sum_bits:
+            return inverse_pack_lead_dims(log_probs)
+
+        log_probs = log_probs.sum(dim = -1)
+        return inverse_pack_lead_dims(log_probs, '*')
+
     def forward(
         self,
         logits,
@@ -86,6 +138,7 @@ def forward(
 
         assert logits.shape[-1] == self.bits, f'logits must have a last dimension of {self.bits}'
 
+        orig_logits = logits
         # allow for any number of leading dimensions
 
         logits, inverse_pack_lead_dims = pack_with_inverse(logits, '* bits')
@@ -110,17 +163,7 @@
         aux_kl_loss = self.zero
 
         if calc_aux_loss:
-            # calculate negative entropy
-
-            kl_div = self.bits * NAT - binary_entropy(logits)
-            aux_kl_loss = F.relu(kl_div - self.kl_loss_threshold)
-
-            # able to return unreduced kl loss, for use in another project (metacontroller)
-
-            if reduce_aux_kl_loss:
-                aux_kl_loss = aux_kl_loss.mean()
-            else:
-                aux_kl_loss = inverse_pack_lead_dims(aux_kl_loss, '*')
+            aux_kl_loss = self.calc_aux_loss(orig_logits, reduce_aux_kl_loss = reduce_aux_kl_loss)
 
         # maybe straight through
 
@@ -164,7 +207,13 @@
     assert indices.shape == (3, 4)
     assert aux_loss.shape == (3, 4)
 
+    joint_log_prob = binary_mapper.log_prob(logits, indices = indices)
+    assert joint_log_prob.shape == (3, 4)
+
+    joint_log_prob_one_hot = binary_mapper.log_prob(logits, one_hot = sparse_one_hot)
+    assert torch.allclose(joint_log_prob, joint_log_prob_one_hot)
+
     binary_mapper.eval()
     sparse_one_hot1, _ = binary_mapper(logits, deterministic = True)
     sparse_one_hot2, _ = binary_mapper(logits, deterministic = True)
-    assert torch.allclose(sparse_one_hot1, sparse_one_hot2)
+    assert torch.allclose(sparse_one_hot1, sparse_one_hot2)
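
For reference, the quantity that `calc_aux_loss` computes (previously inline in `forward`) is the KL divergence between the factorized Bernoulli distribution $q$ defined by the logits and the uniform distribution over all $2^B$ codes, assuming `NAT` is $\ln 2$ and `binary_entropy` returns the summed Bernoulli entropies in nats:

$$
\mathrm{KL}\left(q \,\Vert\, \mathcal{U}\right) = \sum_{c} q(c)\,\ln\frac{q(c)}{2^{-B}} = B \ln 2 - H(q), \qquad H(q) = \sum_{i=1}^{B} H_b\left(\sigma(\ell_i)\right),
$$

where $H_b$ is the binary entropy function and $\ell_i$ the logit of the $i$-th bit. The `F.relu(kl_div - self.kl_loss_threshold)` hinge then acts as a free-bits-style allowance: the divergence is only penalized once it exceeds `kl_loss_threshold`, discouraging the sampling distribution from collapsing while still letting it carry information.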

vector_quantize_pytorch/finite_scalar_quantization.py

Lines changed: 2 additions & 2 deletions
@@ -145,7 +145,7 @@ def bound(self, z, eps = 1e-3, hard_clamp = False):
         return round_ste(bounded_z) / half_width
 
     # symmetry-preserving and noise-approximated quantization, section 3.2 in https://arxiv.org/abs/2411.19842
-
+
     def symmetry_preserving_bound(self, z, hard_clamp = False):
         """ QL(x) = 2 / (L - 1) * [(L - 1) * (tanh(x) + 1) / 2 + 0.5] - 1 """
         maybe_tanh = tanh if not hard_clamp else partial(clamp, min = -1., max = 1.)
@@ -186,7 +186,7 @@ def _scale_and_shift(self, zhat_normalized):
 
         half_width = self._levels // 2
         return (zhat_normalized * half_width) + half_width
-
+
     def _scale_and_shift_inverse(self, zhat):
         if self.preserve_symmetry:
             return zhat * (2. / (self._levels - 1)) - 1.
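
Spelling out the docstring formula in the context above: reading the square bracket as a floor (so that adding $0.5$ implements round-to-nearest, presumably realized with a straight-through `round_ste` as in `bound`), the symmetry-preserving quantizer is

$$
Q_L(x) = \frac{2}{L-1}\left\lfloor \frac{(L-1)\left(\tanh(x)+1\right)}{2} + \frac{1}{2} \right\rfloor - 1,
$$

which maps $\tanh(x)\in(-1,1)$ onto $L$ levels spaced symmetrically about $0$ in $[-1,1]$; see section 3.2 of [arXiv:2411.19842](https://arxiv.org/abs/2411.19842).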

vector_quantize_pytorch/latent_quantization.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ def __init__(
                 (default is 1)
             codebook_dim (int): the dimension of the codebook.
                 If levels is a list, codebook_dim is the length of the list.
-                (default to -1)
+                (default to -1)
             keep_num_codebooks_dim (Optional[bool]): Whether to keep the number of codebooks dimension in the output tensor. If not provided, it is set to True if num_codebooks > 1, otherwise False.
             optimize_values (Optional[bool]): Whether to optimize the values of the codebook. If not provided, it is set to True.
         """

vector_quantize_pytorch/residual_sim_vq.py

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ def __init__(
     @property
     def codebook_size(self):
         return first(self.layers).codebook_size
-
+
     @property
     def codebook_dim(self):
         return first(self.layers).codebook_dim

vector_quantize_pytorch/residual_vq.py

Lines changed: 2 additions & 2 deletions
@@ -263,7 +263,7 @@ def __init__(
         self.register_buffer('beam_score_weights', tensor(beam_score_quantizer_weights), persistent = False)
 
         # setting up the MLPs for implicit neural codebooks
-
+
         self.mlps = None
 
         if implicit_neural_codebook:
@@ -285,7 +285,7 @@ def __init__(
 
         for vq in rest_vq:
             vq._codebook = codebook
-
+
     @property
     def codebook_size(self):
         return self.layers[0].codebook_size
