bring back zeroshot disabling code (#15564)

paarthneekhara · web-flow · commit 6a0ecb714a66 · 2026-03-31T19:09:55.000Z
Signed-off-by: Paarth Neekhara &lt;paarth.n@gmail.com&gt;
diff --git a/nemo/collections/tts/models/magpietts.py b/nemo/collections/tts/models/magpietts.py
@@ -619,6 +619,10 @@ def __init__(self, cfg: DictConfig, trainer: 'Trainer' = None):
             self.register_buffer('_baked_embedding_T', None)  # Time dimension
             self.register_buffer('_baked_embedding_D', None)  # Embedding dimension
             self.register_buffer('baked_context_embedding_len', None)  # Per-speaker lengths (N,)
+            # Probability of bypassing the context encoder during training and instead feeding
+            # batch-shuffled raw context embeddings, so the model learns not to clone voices
+            # from untransformed (i.e. not encoded by the context encoder) input.
+            self.train_shuffle_context_embedding_prob = cfg.get('train_shuffle_context_embedding_prob', 0.0)
         else:
             raise ValueError(f"Unsupported model type {self.model_type}")
 
@@ -1773,9 +1777,24 @@ def _prepare_decoder_context(
                 context_input_lens = context_input_lens.to(text.device)
                 context_mask = get_mask_from_lengths(context_input_lens)
             else:
-                context_embeddings = self.context_encoder(
-                    context_input_embedded, context_mask, cond=None, cond_mask=None
-                )['output']
+                # Zero-shot disable: with some probability, bypass the context encoder and feed
+                # batch-shuffled raw embeddings so the model learns to not clone from untransformed input.
+                # Skip when batch_size == 1: rolling a single sample maps it back to itself,
+                # so the context would remain matched to the correct speaker.
+                batch_size = context_input_embedded.size(0)
+                if (
+                    self.training
+                    and batch_size > 1
+                    and self.train_shuffle_context_embedding_prob > 0
+                    and random.random() < self.train_shuffle_context_embedding_prob
+                ):
+                    shift = random.randint(1, batch_size - 1)
+                    context_embeddings = context_input_embedded.roll(shift, dims=0)
+                    context_mask = context_mask.roll(shift, dims=0)
+                else:
+                    context_embeddings = self.context_encoder(
+                        context_input_embedded, context_mask, cond=None, cond_mask=None
+                    )['output']
         else:
             raise ValueError(f"Unsupported model type for decoder context: {self.model_type}")