[misc] attention hot-path cleanup + denoising loop hoists (hao-ai-lab#1272)

alexzms · mergify[bot] · web-flow · commit 636d3b743e9f · 2026-05-10T11:51:10.000-07:00
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
diff --git a/fastvideo/attention/backends/bsa_attn.py b/fastvideo/attention/backends/bsa_attn.py
@@ -358,12 +358,14 @@ def _flash_attn_single_mask(
     flat_k = torch.cat(k_list, dim=0)
     flat_v = torch.cat(v_list, dim=0)
 
+    # Compute max_seqlen_k from the Python list before moving to GPU to
+    # avoid a `.item()` round-trip that would force a host/device sync.
+    max_seqlen_q = Sq
+    max_seqlen_k = max(b - a for a, b in zip(cu_seqlens_k[:-1], cu_seqlens_k[1:], strict=False))
+
     cu_seqlens_q_t = torch.tensor(cu_seqlens_q, dtype=torch.int32, device=device)
     cu_seqlens_k_t = torch.tensor(cu_seqlens_k, dtype=torch.int32, device=device)
 
-    max_seqlen_q = Sq
-    max_seqlen_k = int((cu_seqlens_k_t[1:] - cu_seqlens_k_t[:-1]).max().item())
-
     orig_dtype = flat_q.dtype
     compute_dtype = orig_dtype
     if compute_dtype not in (torch.float16, torch.bfloat16):
@@ -445,12 +447,14 @@ def _flash_attn_single_head(
     flat_k = torch.cat(k_list, dim=0)
     flat_v = torch.cat(v_list, dim=0)
 
+    # Compute max_seqlen_k from the Python list before moving to GPU to
+    # avoid a `.item()` round-trip that would force a host/device sync.
+    max_seqlen_q = Sq
+    max_seqlen_k = max(b - a for a, b in zip(cu_seqlens_k[:-1], cu_seqlens_k[1:], strict=False))
+
     cu_seqlens_q_t = torch.tensor(cu_seqlens_q, dtype=torch.int32, device=device)
     cu_seqlens_k_t = torch.tensor(cu_seqlens_k, dtype=torch.int32, device=device)
 
-    max_seqlen_q = Sq
-    max_seqlen_k = int((cu_seqlens_k_t[1:] - cu_seqlens_k_t[:-1]).max().item())
-
     orig_dtype = flat_q.dtype
     compute_dtype = orig_dtype
     if compute_dtype not in (torch.float16, torch.bfloat16):
diff --git a/fastvideo/attention/backends/flash_attn.py b/fastvideo/attention/backends/flash_attn.py
@@ -57,6 +57,24 @@ def get_builder_cls() -> type["AttentionMetadataBuilder"]:
         raise NotImplementedError
 
 
+def _key_padding_mask_from_attn_mask(attn_mask: torch.Tensor, key_len: int) -> torch.Tensor:
+    # Normalize attn_mask to [B, key_len] where True means valid token.
+    if attn_mask.dim() == 4:
+        attn_mask = attn_mask[:, 0, 0, :]
+    elif attn_mask.dim() == 3:
+        attn_mask = attn_mask[:, 0, :]
+    elif attn_mask.dim() != 2:
+        raise ValueError(f"Unsupported attn_mask shape for FLASH_ATTN: {attn_mask.shape}")
+
+    # SDPA additive mask convention: valid=0, masked=-inf/large negative.
+    key_padding_mask = attn_mask if attn_mask.dtype == torch.bool else attn_mask >= 0
+
+    if key_padding_mask.shape[-1] != key_len:
+        raise ValueError("Invalid key padding mask length for FLASH_ATTN: "
+                         f"expected {key_len}, got {key_padding_mask.shape[-1]}")
+    return key_padding_mask
+
+
 @dataclass
 class FlashAttnMetadata(AttentionMetadata):
     current_timestep: int
@@ -101,24 +119,6 @@ def forward(
         value: torch.Tensor,
         attn_metadata: FlashAttnMetadata,
     ):
-
-        def _key_padding_mask_from_attn_mask(attn_mask: torch.Tensor, key_len: int) -> torch.Tensor:
-            # Normalize attn_mask to [B, key_len] where True means valid token.
-            if attn_mask.dim() == 4:
-                attn_mask = attn_mask[:, 0, 0, :]
-            elif attn_mask.dim() == 3:
-                attn_mask = attn_mask[:, 0, :]
-            elif attn_mask.dim() != 2:
-                raise ValueError(f"Unsupported attn_mask shape for FLASH_ATTN: {attn_mask.shape}")
-
-            # SDPA additive mask convention: valid=0, masked=-inf/large negative.
-            key_padding_mask = attn_mask if attn_mask.dtype == torch.bool else attn_mask >= 0
-
-            if key_padding_mask.shape[-1] != key_len:
-                raise ValueError("Invalid key padding mask length for FLASH_ATTN: "
-                                 f"expected {key_len}, got {key_padding_mask.shape[-1]}")
-            return key_padding_mask
-
         if (attn_metadata is not None and hasattr(attn_metadata, "attn_mask") and attn_metadata.attn_mask is not None):
             from fastvideo.attention.utils.flash_attn_no_pad import (
                 flash_attn_no_pad,
@@ -136,6 +136,7 @@ def _key_padding_mask_from_attn_mask(attn_mask: torch.Tensor, key_len: int) -> t
                     device=query.device,
                 )
                 key_padding_mask = _key_padding_mask_from_attn_mask(attn_mask, key.shape[1]).to(device=key.device)
+
                 return flash_attn_varlen_qk_no_pad(
                     query,
                     key,
@@ -148,9 +149,8 @@ def _key_padding_mask_from_attn_mask(attn_mask: torch.Tensor, key_len: int) -> t
                 )
 
             qkv = torch.stack([query, key, value], dim=2)
-
-            attn_mask = F.pad(attn_mask, (qkv.shape[1] - attn_mask.shape[1], 0), value=True)
-            output = flash_attn_no_pad(qkv, attn_mask, causal=False, dropout_p=0, softmax_scale=None)
+            attn_mask_padded = F.pad(attn_mask, (qkv.shape[1] - attn_mask.shape[1], 0), value=True)
+            output = flash_attn_no_pad(qkv, attn_mask_padded, causal=False, dropout_p=0, softmax_scale=None)
         else:
             output = flash_attn_func(
                 query,  # type: ignore[no-untyped-call]
diff --git a/fastvideo/attention/backends/sla.py b/fastvideo/attention/backends/sla.py
@@ -307,9 +307,12 @@ def forward(
         # Sparse attention
         o_s = _attention.apply(q, k, v, sparse_map, lut, real_topk, self.BLKQ, self.BLKK)
 
-        # Linear attention with feature maps
-        q_linear = self.feature_map_q(q).contiguous().to(self.dtype)
-        k_linear = self.feature_map_k(k).contiguous().to(self.dtype)
+        # Linear attention with feature maps. Note: softmax / elu / relu
+        # return tensors with the same layout as their input, and q / k are
+        # already contiguous from the transpose-contiguous above — so there
+        # is no need to call .contiguous() again here.
+        q_linear = self.feature_map_q(q).to(self.dtype)
+        k_linear = self.feature_map_k(k).to(self.dtype)
         o_l = self._calc_linear_attention(q_linear, k_linear, v)
 
         # Project linear attention output and combine
@@ -539,9 +542,10 @@ def forward(
                     False, 1, scale, 0)
         # ========== END SPARGE ==========
 
-        # Linear attention with feature maps
-        q_linear = self.feature_map_q(q).contiguous().to(self.dtype)
-        k_linear = self.feature_map_k(k).contiguous().to(self.dtype)
+        # Linear attention with feature maps (see SLAAttentionImpl.forward
+        # for why .contiguous() is unnecessary here).
+        q_linear = self.feature_map_q(q).to(self.dtype)
+        k_linear = self.feature_map_k(k).to(self.dtype)
         o_l = self._calc_linear_attention(q_linear, k_linear, v)
 
         # Project linear attention output and combine
diff --git a/fastvideo/attention/backends/video_sparse_attn.py b/fastvideo/attention/backends/video_sparse_attn.py
@@ -140,6 +140,18 @@ class VideoSparseAttentionMetadata(AttentionMetadata):
     reverse_tile_partition_indices: torch.LongTensor
     variable_block_sizes: torch.LongTensor
     non_pad_index: torch.LongTensor
+    # Precomputed fancy index that fuses ``x[:, non_pad_index][:, reverse_tile_partition_indices]``
+    # in postprocess_output().  Avoids materializing the intermediate
+    # ``[B, len(non_pad_index), H, D]`` tensor on every layer.
+    untile_combined_index: torch.LongTensor
+    # Per-step shared padded buffer used by tile().  Lazily allocated on
+    # the first layer's call and reused by every subsequent VSA layer in
+    # the same denoising step.  Keeping it on the metadata (not on the
+    # class/instance) means requests with separate metadata never share a
+    # buffer, and keeps the "pad positions are zero" invariant simple:
+    # every write goes through this same instance's ``non_pad_index``, so
+    # the zero-initialized pad positions are never touched.
+    tile_buf: torch.Tensor | None = None
 
 
 class VideoSparseAttentionMetadataBuilder(AttentionMetadataBuilder):
@@ -171,6 +183,7 @@ def build(  # type: ignore
         reverse_tile_partition_indices = get_reverse_tile_partition_indices(dit_seq_shape, VSA_TILE_SIZE, device)
         variable_block_sizes = construct_variable_block_sizes(dit_seq_shape, num_tiles, device)
         non_pad_index = get_non_pad_index(variable_block_sizes, math.prod(VSA_TILE_SIZE))
+        untile_combined_index = non_pad_index[reverse_tile_partition_indices]
 
         return VideoSparseAttentionMetadata(
             current_timestep=current_timestep,
@@ -181,7 +194,8 @@ def build(  # type: ignore
             tile_partition_indices=tile_partition_indices,  # type: ignore
             reverse_tile_partition_indices=reverse_tile_partition_indices,
             variable_block_sizes=variable_block_sizes,
-            non_pad_index=non_pad_index)
+            non_pad_index=non_pad_index,
+            untile_combined_index=untile_combined_index)
 
 
 class VideoSparseAttentionImpl(AttentionImpl):
@@ -200,37 +214,59 @@ def __init__(
         sp_group = get_sp_group()
         self.sp_size = sp_group.world_size
 
-    def tile(self, x: torch.Tensor, num_tiles: list[int], tile_partition_indices: torch.LongTensor,
-             non_pad_index: torch.LongTensor) -> torch.Tensor:
+    def tile(self, x: torch.Tensor, attn_metadata: VideoSparseAttentionMetadata) -> torch.Tensor:
+        """Tile ``x`` into ``attn_metadata.tile_buf`` and return it.
+
+        The returned tensor aliases the per-metadata buffer and is only
+        valid until the next ``tile()`` / ``preprocess_qkv`` call on the
+        same ``attn_metadata``.  Callers must consume (or copy) the
+        result before invoking another VSA layer with the same metadata.
+        Today both call sites materialize copies via
+        ``.transpose(...).contiguous()`` inside ``forward()``, so the
+        contract holds; future callers must preserve it.
+        """
+        num_tiles = attn_metadata.num_tiles
         t_padded_size = num_tiles[0] * VSA_TILE_SIZE[0]
         h_padded_size = num_tiles[1] * VSA_TILE_SIZE[1]
         w_padded_size = num_tiles[2] * VSA_TILE_SIZE[2]
-
-        x_padded = torch.zeros((x.shape[0], t_padded_size * h_padded_size * w_padded_size, x.shape[-2], x.shape[-1]),
-                               device=x.device,
-                               dtype=x.dtype)
-        x_padded[:, non_pad_index] = x[:, tile_partition_indices]
-        return x_padded
-
-    def untile(self, x: torch.Tensor, reverse_tile_partition_indices: torch.LongTensor,
-               non_pad_index: torch.LongTensor) -> torch.Tensor:
-        x = x[:, non_pad_index][:, reverse_tile_partition_indices]
-        return x
+        target_shape = (x.shape[0], t_padded_size * h_padded_size * w_padded_size, x.shape[-2], x.shape[-1])
+
+        # Reuse the per-step buffer stashed on metadata (lazily allocated
+        # on the first VSA layer's call within a denoising step).  Pad
+        # positions are zero from the initial torch.zeros and never
+        # written to.  Scoping to metadata makes reuse safe across
+        # concurrent requests and keeps the "pad positions are zero"
+        # invariant trivially true: ``non_pad_index`` is fixed within
+        # a single metadata instance.
+        buf = attn_metadata.tile_buf
+        if (buf is None or buf.shape != target_shape or buf.dtype != x.dtype or buf.device != x.device):
+            buf = torch.zeros(target_shape, device=x.device, dtype=x.dtype)
+            attn_metadata.tile_buf = buf
+
+        buf[:, attn_metadata.non_pad_index] = x[:, attn_metadata.tile_partition_indices]
+        return buf
+
+    def untile(self, x: torch.Tensor, untile_combined_index: torch.LongTensor) -> torch.Tensor:
+        # Single fancy index using precomputed combined indices; avoids
+        # the intermediate ``[B, len(non_pad_index), H, D]`` tensor that
+        # the two-step ``x[:, non_pad_index][:, reverse_tile_partition_indices]``
+        # would allocate on every layer.
+        return x[:, untile_combined_index]
 
     def preprocess_qkv(
         self,
         qkv: torch.Tensor,
         attn_metadata: VideoSparseAttentionMetadata,
     ) -> torch.Tensor:
-        return self.tile(qkv, attn_metadata.num_tiles, attn_metadata.tile_partition_indices,
-                         attn_metadata.non_pad_index)
+        """Tile QKV; aliasing contract: see ``tile()``."""
+        return self.tile(qkv, attn_metadata)
 
     def postprocess_output(
         self,
         output: torch.Tensor,
         attn_metadata: VideoSparseAttentionMetadata,
     ) -> torch.Tensor:
-        return self.untile(output, attn_metadata.reverse_tile_partition_indices, attn_metadata.non_pad_index)
+        return self.untile(output, attn_metadata.untile_combined_index)
 
     def forward(  # type: ignore[override]
         self,
diff --git a/fastvideo/pipelines/stages/conditioning.py b/fastvideo/pipelines/stages/conditioning.py
@@ -6,14 +6,11 @@
 import torch
 
 from fastvideo.fastvideo_args import FastVideoArgs
-from fastvideo.logger import init_logger
 from fastvideo.pipelines.pipeline_batch_info import ForwardBatch
 from fastvideo.pipelines.stages.base import PipelineStage
 from fastvideo.pipelines.stages.validators import StageValidators as V
 from fastvideo.pipelines.stages.validators import VerificationResult
 
-logger = init_logger(__name__)
-
 
 class ConditioningStage(PipelineStage):
     """
@@ -39,31 +36,11 @@ def forward(
         Returns:
             The batch with applied conditioning.
         """
-        # TODO!!
-        if not batch.do_classifier_free_guidance:
-            return batch
-        else:
-            return batch
-
-        logger.info("batch.negative_prompt_embeds: %s", batch.negative_prompt_embeds)
-        logger.info("do_classifier_free_guidance: %s", batch.do_classifier_free_guidance)
-        logger.info("cfg_scale: %s", batch.guidance_scale)
-
-        # Ensure negative prompt embeddings are available
-        assert batch.negative_prompt_embeds is not None, (
-            "Negative prompt embeddings are required for classifier-free guidance")
-
-        # Concatenate primary embeddings and masks
-        batch.prompt_embeds = torch.cat([batch.negative_prompt_embeds, batch.prompt_embeds])
-        if batch.attention_mask is not None:
-            batch.attention_mask = torch.cat([batch.negative_attention_mask, batch.attention_mask])
-
-        # Concatenate secondary embeddings and masks if present
-        if batch.prompt_embeds_2 is not None:
-            batch.prompt_embeds_2 = torch.cat([batch.negative_prompt_embeds_2, batch.prompt_embeds_2])
-        if batch.attention_mask_2 is not None:
-            batch.attention_mask_2 = torch.cat([batch.negative_attention_mask_2, batch.attention_mask_2])
-
+        # Forward is a no-op: CFG is applied via two separate
+        # transformer forward passes inside DenoisingStage (e.g.
+        # denoising.py:364-394, :706, :930). The class is kept because
+        # verify_input / verify_output still validate CFG fields and
+        # disable CFG when prompt_embeds is empty.
         return batch
 
     def verify_input(self, batch: ForwardBatch, fastvideo_args: FastVideoArgs) -> VerificationResult:
diff --git a/fastvideo/pipelines/stages/denoising.py b/fastvideo/pipelines/stages/denoising.py
@@ -212,6 +212,23 @@ def forward(
         trajectory_timesteps: list[torch.Tensor] = []
         trajectory_latents: list[torch.Tensor] = []
 
+        # Hoisted out of the per-step loop: depends only on inputs that
+        # are constant across denoising steps.
+        use_meanflow = getattr(self.transformer.config, "use_meanflow", False)
+        embedded_cfg_scale = fastvideo_args.pipeline_config.embedded_cfg_scale
+        if embedded_cfg_scale is not None:
+            guidance_expand = (torch.tensor(
+                [embedded_cfg_scale] * latents.shape[0],
+                dtype=torch.float32,
+                device=get_local_torch_device(),
+            ).to(target_dtype) * 1000.0)
+        else:
+            guidance_expand = None
+        # V2V padding: zero-filled tensor concatenated with each step's
+        # latent_model_input.  Its shape is fixed by latents and the tensor
+        # is never written to, so we allocate it once.
+        v2v_zero_pad = torch.zeros_like(latents) if batch.video_latent is not None else None
+
         # Run denoising loop
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
@@ -248,8 +265,7 @@ def forward(
                 # Expand latents for V2V/I2V
                 latent_model_input = latents.to(target_dtype)
                 if batch.video_latent is not None:
-                    latent_model_input = torch.cat([latent_model_input, batch.video_latent,
-                                                    torch.zeros_like(latents)],
+                    latent_model_input = torch.cat([latent_model_input, batch.video_latent, v2v_zero_pad],
                                                    dim=1).to(target_dtype)
                 elif batch.image_latent is not None:
                     assert not fastvideo_args.pipeline_config.ti2v_task, "image latents should not be provided for TI2V task"
@@ -266,7 +282,6 @@ def forward(
                     t_expand = t.repeat(latent_model_input.shape[0])
                 t_expand = t_expand.to(get_local_torch_device())
 
-                use_meanflow = getattr(self.transformer.config, "use_meanflow", False)
                 if use_meanflow:
                     if i == len(timesteps) - 1:
                         timesteps_r = torch.tensor([0.0], device=get_local_torch_device())
@@ -285,13 +300,6 @@ def forward(
 
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
 
-                # Prepare inputs for transformer
-                guidance_expand = (torch.tensor(
-                    [fastvideo_args.pipeline_config.embedded_cfg_scale] * latent_model_input.shape[0],
-                    dtype=torch.float32,
-                    device=get_local_torch_device(),
-                ).to(target_dtype) * 1000.0 if fastvideo_args.pipeline_config.embedded_cfg_scale is not None else None)
-
                 # Predict noise residual
                 with torch.autocast(device_type="cuda", dtype=target_dtype, enabled=autocast_enabled):
                     if (vsa_available and self.attn_backend == VideoSparseAttentionBackend):