docstrings

sayakpaul · sayakpaul · commit 811d2e695a27 · 2026-05-21T13:42:34.000+05:30
diff --git a/src/diffusers/models/autoencoders/audio_tokenizer_ace_step.py b/src/diffusers/models/autoencoders/audio_tokenizer_ace_step.py
@@ -341,6 +341,12 @@ def __init__(
         self.gradient_checkpointing = False
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.Tensor`):
+                Input audio tokens of shape `(batch_size, num_tokens, hidden_size)` to be unpooled back to the 25 Hz
+                acoustic-latent rate.
+        """
         batch_size, num_tokens, _ = hidden_states.shape
         hidden_states = self.embed_tokens(hidden_states)
         hidden_states = hidden_states.unsqueeze(2).expand(-1, -1, self.pool_window_size, -1)
@@ -436,6 +442,12 @@ def __init__(
         self.pool_window_size = pool_window_size
 
     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            hidden_states (`torch.Tensor`):
+                Input acoustic latents of shape `(batch_size, latent_length, audio_acoustic_hidden_dim)` to be
+                quantized into ACE-Step 5 Hz audio tokens.
+        """
         input_dtype = hidden_states.dtype
         hidden_states = self.audio_acoustic_proj(hidden_states)
         hidden_states = self.attention_pooler(hidden_states)
diff --git a/src/diffusers/models/autoencoders/latent_upsampler_ltx.py b/src/diffusers/models/autoencoders/latent_upsampler_ltx.py
@@ -144,6 +144,12 @@ def __init__(
         self.final_conv = ConvNd(mid_channels, in_channels, kernel_size=3, padding=1)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.Tensor`):
+                Input latents of shape `(batch_size, num_channels, num_frames, height, width)` to spatially or
+                temporally upsample.
+        """
         batch_size, num_channels, num_frames, height, width = hidden_states.shape
 
         if self.dims == 2:
diff --git a/src/diffusers/models/autoencoders/latent_upsampler_ltx2.py b/src/diffusers/models/autoencoders/latent_upsampler_ltx2.py
@@ -243,6 +243,12 @@ def __init__(
         self.final_conv = ConvNd(mid_channels, in_channels, kernel_size=3, padding=1)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.Tensor`):
+                Input latents of shape `(batch_size, num_channels, num_frames, height, width)` to spatially or
+                temporally upsample.
+        """
         batch_size, num_channels, num_frames, height, width = hidden_states.shape
 
         if self.dims == 2:
diff --git a/src/diffusers/models/autoencoders/vocoder_ltx2.py b/src/diffusers/models/autoencoders/vocoder_ltx2.py
@@ -572,6 +572,11 @@ def __init__(
         )
 
     def forward(self, mel_spec: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            mel_spec (`torch.Tensor`):
+                Input mel spectrogram of shape `(batch_size, num_channels, num_frames, num_mel_bins)`.
+        """
         # 1. Run stage 1 vocoder to get low sampling rate waveform
         x = self.vocoder(mel_spec)
         batch_size, num_channels, num_samples = x.shape
diff --git a/src/diffusers/models/condition_embedders/condition_encoder_ace_step.py b/src/diffusers/models/condition_embedders/condition_encoder_ace_step.py
@@ -178,6 +178,13 @@ def forward(
         inputs_embeds: torch.FloatTensor,
         attention_mask: torch.Tensor,
     ) -> torch.Tensor:
+        """
+        Args:
+            inputs_embeds (`torch.FloatTensor`):
+                Lyric embeddings of shape `(batch_size, sequence_length, embedding_dim)` to project and encode.
+            attention_mask (`torch.Tensor`):
+                Attention mask of shape `(batch_size, sequence_length)` indicating which tokens are valid.
+        """
         inputs_embeds = self.embed_tokens(inputs_embeds)
 
         seq_len = inputs_embeds.shape[1]
@@ -317,6 +324,15 @@ def forward(
         refer_audio_acoustic_hidden_states_packed: torch.FloatTensor,
         refer_audio_order_mask: torch.LongTensor,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            refer_audio_acoustic_hidden_states_packed (`torch.FloatTensor`):
+                Packed reference-audio acoustic hidden states of shape `(num_references, sequence_length,
+                feature_dim)`, stacking the reference clips of all samples in the batch along the first dimension.
+            refer_audio_order_mask (`torch.LongTensor`):
+                Batch-index assignment of shape `(num_references,)` indicating which batch sample each packed
+                reference clip belongs to.
+        """
         inputs_embeds = self.embed_tokens(refer_audio_acoustic_hidden_states_packed)
 
         seq_len = inputs_embeds.shape[1]
@@ -447,6 +463,22 @@ def forward(
         refer_audio_acoustic_hidden_states_packed: torch.FloatTensor,
         refer_audio_order_mask: torch.LongTensor,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            text_hidden_states (`torch.FloatTensor`):
+                Text encoder hidden states of shape `(batch_size, text_sequence_length, text_hidden_dim)`.
+            text_attention_mask (`torch.Tensor`):
+                Attention mask of shape `(batch_size, text_sequence_length)` for the text hidden states.
+            lyric_hidden_states (`torch.FloatTensor`):
+                Lyric embeddings of shape `(batch_size, lyric_sequence_length, embedding_dim)` for the lyric encoder.
+            lyric_attention_mask (`torch.Tensor`):
+                Attention mask of shape `(batch_size, lyric_sequence_length)` for the lyric tokens.
+            refer_audio_acoustic_hidden_states_packed (`torch.FloatTensor`):
+                Packed reference-audio acoustic hidden states of shape `(num_references, sequence_length,
+                feature_dim)`.
+            refer_audio_order_mask (`torch.LongTensor`):
+                Batch index of shape `(num_references,)` assigning each packed reference clip to its batch sample.
+        """
         text_hidden_states = self.text_projector(text_hidden_states)
 
         lyric_hidden_states = self.lyric_encoder(
diff --git a/src/diffusers/models/condition_embedders/image_encoder_redux.py b/src/diffusers/models/condition_embedders/image_encoder_redux.py
@@ -41,6 +41,12 @@ def __init__(
         self.redux_down = nn.Linear(txt_in_features * 3, txt_in_features)
 
     def forward(self, x: torch.Tensor) -> ReduxImageEncoderOutput:
+        """
+        Args:
+            x (`torch.Tensor`):
+                Image embeddings of shape `(batch_size, sequence_length, redux_dim)` produced by the SigLIP image
+                encoder.
+        """
         projected_x = self.redux_down(nn.functional.silu(self.redux_up(x)))
 
         return ReduxImageEncoderOutput(image_embeds=projected_x)
diff --git a/src/diffusers/models/condition_embedders/projection_audioldm2.py b/src/diffusers/models/condition_embedders/projection_audioldm2.py
@@ -109,6 +109,18 @@ def forward(
         attention_mask: torch.LongTensor | None = None,
         attention_mask_1: torch.LongTensor | None = None,
     ):
+        """
+        Args:
+            hidden_states (`torch.Tensor`, *optional*):
+                Hidden states from the first text encoder of shape `(batch_size, sequence_length, text_encoder_dim)`.
+            hidden_states_1 (`torch.Tensor`, *optional*):
+                Hidden states from the second text encoder of shape `(batch_size, sequence_length_1,
+                text_encoder_1_dim)`.
+            attention_mask (`torch.LongTensor`, *optional*):
+                Attention mask of shape `(batch_size, sequence_length)` for `hidden_states`.
+            attention_mask_1 (`torch.LongTensor`, *optional*):
+                Attention mask of shape `(batch_size, sequence_length_1)` for `hidden_states_1`.
+        """
         hidden_states = self.projection(hidden_states)
         hidden_states, attention_mask = add_special_tokens(
             hidden_states, attention_mask, sos_token=self.sos_embed, eos_token=self.eos_embed
diff --git a/src/diffusers/models/condition_embedders/projection_clip_image.py b/src/diffusers/models/condition_embedders/projection_clip_image.py
@@ -26,4 +26,9 @@ def __init__(self, hidden_size: int = 768):
         self.project = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
 
     def forward(self, x):
+        """
+        Args:
+            x (`torch.Tensor`):
+                Input CLIP image embeddings of shape `(batch_size, hidden_size)`.
+        """
         return self.project(x)
diff --git a/src/diffusers/models/condition_embedders/projection_stable_audio.py b/src/diffusers/models/condition_embedders/projection_stable_audio.py
@@ -141,6 +141,15 @@ def forward(
         start_seconds: torch.Tensor | None = None,
         end_seconds: torch.Tensor | None = None,
     ):
+        """
+        Args:
+            text_hidden_states (`torch.Tensor`, *optional*):
+                Hidden states from the text encoder of shape `(batch_size, sequence_length, text_encoder_dim)`.
+            start_seconds (`torch.Tensor`, *optional*):
+                Start-time-in-seconds conditioning values of shape `(batch_size,)`.
+            end_seconds (`torch.Tensor`, *optional*):
+                End-time-in-seconds conditioning values of shape `(batch_size,)`.
+        """
         text_hidden_states = (
             text_hidden_states if text_hidden_states is None else self.text_projection(text_hidden_states)
         )
diff --git a/src/diffusers/models/others/renderer_shap_e.py b/src/diffusers/models/others/renderer_shap_e.py
@@ -659,6 +659,21 @@ def map_indices_to_keys(self, output):
         return mapped_output
 
     def forward(self, *, position, direction, ts, nerf_level="coarse", rendering_mode="nerf"):
+        """
+        Args:
+            position (`torch.Tensor`):
+                3D query positions of shape `(batch_size, ..., 3)` to evaluate the NeRSTF MLP at.
+            direction (`torch.Tensor`):
+                Viewing directions of shape `(batch_size, ..., 3)` used for view-dependent color prediction.
+            ts (`torch.Tensor`):
+                Per-ray sample distances of shape `(batch_size, ..., 1)` passed through to the output for downstream
+                integration.
+            nerf_level (`str`, *optional*, defaults to `"coarse"`):
+                Which density/color head to read from — `"coarse"` or `"fine"`.
+            rendering_mode (`str`, *optional*, defaults to `"nerf"`):
+                Output head to use: `"nerf"` for radiance-field colors or `"stf"` for the signed-distance/texture
+                field.
+        """
         h = encode_position(position)
 
         h_preact = h
@@ -769,6 +784,12 @@ def __init__(
             )
 
     def forward(self, x: torch.Tensor):
+        """
+        Args:
+            x (`torch.Tensor`):
+                Latent representation of a 3D asset of shape `(batch_size, total_vectors, d_latent)`, sliced per
+                `param_name` and projected to each MLP weight tensor.
+        """
         out = {}
         start = 0
         for k, shape in zip(self.config.param_names, self.config.param_shapes):
diff --git a/src/diffusers/models/unets/unet_2d_condition_audioldm2.py b/src/diffusers/models/unets/unet_2d_condition_audioldm2.py
@@ -512,6 +512,16 @@ def forward(
             timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
             encoder_hidden_states (`torch.Tensor`):
                 The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
+            class_labels (`torch.Tensor`, *optional*):
+                Conditional class labels of shape `(batch,)`. Only used when the model is configured with a
+                `class_embed_type`.
+            timestep_cond (`torch.Tensor`, *optional*):
+                Additional timestep conditioning of shape `(batch, time_cond_proj_dim)`, applied after the timestep
+                embedding.
+            attention_mask (`torch.Tensor`, *optional*):
+                A self-attention mask of shape `(batch, sequence_length)`. If `True` the mask is kept, otherwise if
+                `False` it is discarded. The mask is converted to a bias added to the attention scores for "discard"
+                tokens.
             encoder_attention_mask (`torch.Tensor`):
                 A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
                 `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,