Skip to content

Commit 4248327

Browse files
committed
Address PR review comments
- CLIP scoring: use CPU to avoid OOM with WAN pipeline on GPU; catch RuntimeError in addition to OSError
- --skip-threshold help: fix description to match actual exp(tile_max - running_max) < lambda criterion
- vLLM worker: reject unsupported sparse presets (non-triton backend or unknown method) with a clear ValueError instead of silently degrading to dense attention
- PYTHONPATH construction: use os.pathsep and skip empty entries to avoid CWD injection when PYTHONPATH is unset
- diffusers_triton backend: raise ValueError when mixed with other backends instead of silently skipping _attn_implementation setup
- _wan_forward_triton: fall back to SDPA when attention_mask is not None to preserve masking semantics

Signed-off-by: Ye Yu <yeyu@nvidia.com>
1 parent 8996ef1 commit 4248327

5 files changed

Lines changed: 37 additions & 7 deletions

File tree

examples/diffusers/quantization/wan2_sage_attention.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -794,8 +794,9 @@ def parse_args() -> argparse.Namespace:
794794
help=(
795795
"Override skip_softmax_threshold for triton-skip / triton-skip-nvfp4 kernels. "
796796
f"Default: {_TRITON_SKIP_DEFAULT_THRESHOLD}. "
797-
"A tile is skipped when its max attention score is less than LAMBDA times the "
798-
"running maximum (BLASST criterion). Lower = better quality, less speedup. "
797+
"A tile is skipped when exp(tile_max - running_max) < LAMBDA "
798+
"(equivalently: tile_max < running_max + log(LAMBDA)). "
799+
"Lower = better quality, less speedup. "
799800
"Typical sweep: 0.1 (aggressive), 0.01 (moderate), 0.001 (conservative)."
800801
),
801802
)
@@ -830,14 +831,14 @@ def main() -> None:
830831
disable_attention_kernel()
831832

832833
# --- CLIP scores (per-video semantic alignment with prompt) ---
833-
device = "cuda" if torch.cuda.is_available() else "cpu"
834+
# Use CPU to avoid OOM: the WAN pipeline already occupies GPU memory.
834835
print("\nComputing CLIP scores (prompt-video semantic alignment)...")
835836
try:
836837
clip_base = compute_clip_score(
837-
frames_base, args.prompt, clip_model_id=args.clip_model, device=device
838+
frames_base, args.prompt, clip_model_id=args.clip_model, device="cpu"
838839
)
839840
clip_quant = compute_clip_score(
840-
frames_quant, args.prompt, clip_model_id=args.clip_model, device=device
841+
frames_quant, args.prompt, clip_model_id=args.clip_model, device="cpu"
841842
)
842843
print(f" baseline CLIP: {clip_base:.4f}")
843844
print(f" {args.kernel} CLIP: {clip_quant:.4f} (delta {clip_quant - clip_base:+.4f})")
@@ -847,7 +848,7 @@ def main() -> None:
847848
print(
848849
" Tip: set HF_TOKEN env var or use --clip-model <local-path> to avoid rate limits"
849850
)
850-
except OSError as e:
851+
except (OSError, RuntimeError) as e:
851852
print(f" WARNING: CLIP scoring failed ({e})")
852853
print(" To fix: set HF_TOKEN env var or pass --clip-model <local-path-to-clip>")
853854

examples/vllm_serve/sparse_attn_worker.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -143,6 +143,16 @@ def _replace_attention_impl(worker, config: dict):
143143
if layer_cfg is None or not layer_cfg.get("enable", True):
144144
continue
145145

146+
method = layer_cfg.get("method", "triton_sparse_softmax")
147+
backend = layer_cfg.get("backend", "triton")
148+
if backend != "triton" or method not in {"triton_sparse_softmax", "triton_skip_softmax"}:
149+
raise ValueError(
150+
f"{name}: unsupported sparse config for vLLM worker "
151+
f"(backend={backend!r}, method={method!r}). "
152+
"Only backend='triton' with method='triton_sparse_softmax' or "
153+
"'triton_skip_softmax' is supported."
154+
)
155+
146156
# Build per-layer sparse kwargs
147157
sparse_kw = {}
148158
sparsity_n = layer_cfg.get("sparsity_n", 0)

examples/vllm_serve/vllm_serve_sparse_attn.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,9 @@ def main():
7272
repo_root = str(Path(__file__).resolve().parent)
7373
if repo_root not in sys.path:
7474
sys.path.insert(0, repo_root)
75-
os.environ["PYTHONPATH"] = os.environ.get("PYTHONPATH", "") + ":" + f"{repo_root}"
75+
existing = os.environ.get("PYTHONPATH")
76+
parts = [p for p in [existing, repo_root] if p]
77+
os.environ["PYTHONPATH"] = os.pathsep.join(parts)
7678

7779
# Select worker based on env vars
7880
has_quant = os.environ.get("QUANT_CFG") or os.environ.get("KV_QUANT_CFG")

modelopt/torch/sparsity/attention_sparsity/conversion.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,11 @@ def _set_attn_implementation(model: nn.Module, config: SparseAttentionConfig) ->
6363
# diffusers_triton: the ModelOptWanAttnProcessor calls triton_fa directly.
6464
# No HF attention-function registration or _attn_implementation patching needed.
6565
if "diffusers_triton" in backends:
66+
if len(backends) > 1:
67+
raise ValueError(
68+
"Mixed backends including 'diffusers_triton' are not supported. "
69+
"All sparse attention layers must use the same backend."
70+
)
6671
return
6772

6873
model_config = getattr(model, "config", None)

modelopt/torch/sparsity/attention_sparsity/plugins/diffusers.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -208,6 +208,18 @@ def _wan_forward_triton(
208208
rotary_emb: tuple[torch.Tensor, torch.Tensor] | None,
209209
) -> torch.Tensor:
210210
"""Triton-backed WAN attention (self-attention and I2V cross-attention)."""
211+
if attention_mask is not None:
212+
from diffusers.models.attention_dispatch import dispatch_attention_fn
213+
214+
return self._wan_forward_sdpa(
215+
attn,
216+
hidden_states,
217+
encoder_hidden_states,
218+
attention_mask,
219+
rotary_emb,
220+
dispatch_fn=dispatch_attention_fn,
221+
)
222+
211223
encoder_hidden_states_img = None
212224
if attn.add_k_proj is not None:
213225
# 512 is the text-encoder context length (WAN hardcoded constant)

0 commit comments

Comments (0)