working code

csgoogle · csgoogle · commit 7026d2017255 · 2026-04-13T17:45:38.000Z
diff --git a/.gitignore b/.gitignore
@@ -180,3 +180,6 @@ wandb
 # Gemini CLI
 .gemini/
 gha-creds-*.json
+
+# JAX cache
+.jax_cache/
diff --git a/benchmark_attention.sh b/benchmark_attention.sh
@@ -0,0 +1,209 @@
+#!/bin/bash
+# =============================================================================
+# Comprehensive Attention Benchmark: flash vs ring vs ulysses
+# Runs across i2v/t2v pipelines, multiple batch sizes.
+# Computes quality metrics (PSNR, SSIM) using flash as reference.
+#
+# Environment overrides (both optional):
+#   MAXDIFFUSION_WORK_DIR  root directory holding the venv, the repo checkout,
+#                          the HF cache and the benchmark results
+#                          (default: /data/maxdiffusion-work)
+#   JAX_CACHE_DIR          JAX cache directory passed to the generator
+#                          (default: ${MAXDIFFUSION_WORK_DIR}/jax-cache)
+# =============================================================================
+set -euo pipefail
+
+# === Environment ===
+# Every path below hangs off one root so the suite can run on a machine with a
+# different layout; the default keeps the original locations.
+MAXDIFFUSION_WORK_DIR="${MAXDIFFUSION_WORK_DIR:-/data/maxdiffusion-work}"
+source "${MAXDIFFUSION_WORK_DIR}/maxdiffusion-venv/bin/activate"
+export PYTHONPATH="${MAXDIFFUSION_WORK_DIR}/maxdiffusion/src"
+export HF_HOME="${MAXDIFFUSION_WORK_DIR}/hf-home"
+export JAX_CACHE_DIR="${JAX_CACHE_DIR:-${MAXDIFFUSION_WORK_DIR}/jax-cache}"
+
+REPO_DIR="${MAXDIFFUSION_WORK_DIR}/maxdiffusion"
+# Timestamped so repeated runs never overwrite each other's results.
+BENCH_DIR="${MAXDIFFUSION_WORK_DIR}/bench_results_$(date +%Y%m%d_%H%M%S)"
+LOG_DIR="${BENCH_DIR}/logs"
+VIDEO_DIR="${BENCH_DIR}/videos"
+mkdir -p "$LOG_DIR" "$VIDEO_DIR"
+
+GEN_SCRIPT="${REPO_DIR}/src/maxdiffusion/generate_wan.py"
+T2V_CONFIG="${REPO_DIR}/src/maxdiffusion/configs/base_wan_14b.yml"
+I2V_CONFIG="${REPO_DIR}/src/maxdiffusion/configs/base_wan_i2v_14b.yml"
+
+# === Benchmark Parameters ===
+ATTENTION_MODES=("flash" "ring" "ulysses")
+# per_device_batch_size -> global batch size on 8 devices:
+# 0.125->1, 0.25->2, 0.5->4, 1->8, 2->16
+BATCH_SIZES=("0.125" "0.25" "0.5" "1" "2")
+
+# Smaller block sizes that fit in VMEM at cp=8
+FLASH_BLOCK_SIZES='{"block_q":2048,"block_kv_compute":1024,"block_kv":4096,"block_q_dkv":2048,"block_kv_dkv":4096,"block_kv_dkv_compute":1024,"use_fused_bwd_kernel":true}'
+
+# Timing CSV: header written once here, one row appended per benchmark run.
+RESULTS_FILE="${BENCH_DIR}/timing_results.csv"
+echo "pipeline,attention,batch_size,compile_time_s,inference_time_s" > "$RESULTS_FILE"
+
+SUMMARY_FILE="${BENCH_DIR}/summary.txt"
+
+echo "============================================================"
+echo "  Attention Benchmark Suite"
+echo "  Started: $(date)"
+echo "  Output:  ${BENCH_DIR}"
+echo "============================================================"
+
+# === Helper: run one benchmark ===
+run_benchmark() {
+  local pipeline=$1 attention=$2 batch_size=$3
+  local config_file run_name log_file
+
+  if [[ "$pipeline" == "t2v" ]]; then
+    config_file="$T2V_CONFIG"
+  else
+    config_file="$I2V_CONFIG"
+  fi
+
+  run_name="${pipeline}_${attention}_bs${batch_size}"
+  log_file="${LOG_DIR}/${run_name}.log"
+
+  echo ""
+  echo "=========================================="
+  echo "  RUN: ${run_name}"
+  echo "  Config: ${config_file}"
+  echo "  Log: ${log_file}"
+  echo "=========================================="
+
+  # Run generation; videos are saved in cwd
+  cd "$VIDEO_DIR"
+  python "$GEN_SCRIPT" "$config_file" \
+    attention="$attention" \
+    num_inference_steps=50 \
+    num_frames=81 \
+    ici_data_parallelism=1 \
+    ici_context_parallelism=8 \
+    ici_tensor_parallelism=1 \
+    allow_split_physical_axes=True \
+    per_device_batch_size="$batch_size" \
+    seed=0 \
+    run_name="$run_name" \
+    jax_cache_dir="$JAX_CACHE_DIR" \
+    flash_block_sizes="$FLASH_BLOCK_SIZES" \
+    2>&1 | tee "$log_file"
+
+  # Rename output videos to include run info
+  for f in wan_output_*.mp4; do
+    if [[ -e "$f" ]]; then
+      mv "$f" "${run_name}_${f}"
+    fi
+  done
+  # Also catch prefixed outputs
+  for f in *wan_output_*.mp4; do
+    if [[ -e "$f" && ! "$f" =~ ^(i2v|t2v)_ ]]; then
+      mv "$f" "${run_name}_${f}" 2>/dev/null || true
+    fi
+  done
+
+  cd "$REPO_DIR"
+
+  # Extract timing from log
+  local compile_time inference_time
+  compile_time=$(grep -oP 'compile_time:\s*\K[0-9.]+' "$log_file" 2>/dev/null | tail -1 || echo "N/A")
+  inference_time=$(grep -oP 'generation_time:\s*\K[0-9.]+' "$log_file" 2>/dev/null | tail -1 || echo "N/A")
+
+  echo "${pipeline},${attention},${batch_size},${compile_time},${inference_time}" >> "$RESULTS_FILE"
+  echo "  >> compile=${compile_time}s  inference=${inference_time}s"
+}
+
+# =============================================================================
+# Phase 1: Run all benchmarks
+# =============================================================================
+# For every batch size: all attention modes on i2v first, then all attention
+# modes on t2v. A failing run is logged and recorded as FAILED in the timing
+# CSV; the sweep then moves on to the next combination.
+printf '\n'
+printf '%s\n' "============================================================"
+printf '%s\n' "  Phase 1: Running inference benchmarks"
+printf '%s\n' "============================================================"
+
+for batch_size in "${BATCH_SIZES[@]}"; do
+  printf '\n--- Batch size: %s ---\n' "$batch_size"
+
+  for pipeline in "i2v" "t2v"; do
+    for attention in "${ATTENTION_MODES[@]}"; do
+      if ! run_benchmark "$pipeline" "$attention" "$batch_size"; then
+        printf 'FAILED: %s %s bs=%s\n' "$pipeline" "$attention" "$batch_size"
+        printf '%s,%s,%s,FAILED,FAILED\n' "$pipeline" "$attention" "$batch_size" >> "$RESULTS_FILE"
+      fi
+    done
+  done
+done
+
+# =============================================================================
+# Phase 2: Quality comparison (PSNR, SSIM)
+# =============================================================================
+echo ""
+echo "============================================================"
+echo "  Phase 2: Quality Metrics (PSNR, SSIM)"
+echo "============================================================"
+
+# With `set -e` and pipefail, a failing quality script would abort the whole
+# run here and the summary of the already collected timings would never be
+# written. The failure is reported instead, and Phase 3 skips the quality
+# table when quality_results.csv does not exist.
+python "$REPO_DIR/benchmark_quality.py" "$VIDEO_DIR" "$RESULTS_FILE" 2>&1 \
+  | tee "${LOG_DIR}/quality.log" \
+  || echo "WARNING: quality metrics step failed; see ${LOG_DIR}/quality.log" >&2
+
+# =============================================================================
+# Phase 3: Final Summary
+# =============================================================================
+# Everything printed inside the group goes to the terminal and, through tee,
+# into $SUMMARY_FILE.
+{
+  echo "============================================================"
+  echo "  ATTENTION BENCHMARK RESULTS"
+  echo "  Date: $(date)"
+  echo "  Branch: $(git -C "$REPO_DIR" branch --show-current)"
+  echo "  Devices: $(python -c 'import jax; print(f"{len(jax.devices())} x {jax.devices()[0].device_kind}")')"
+  echo "============================================================"
+  echo ""
+  echo "--- Timing Results ---"
+  column -t -s',' "$RESULTS_FILE"
+  echo ""
+
+  # Written by benchmark_quality.py; absent when no video pairs were found.
+  QUALITY_CSV="${BENCH_DIR}/quality_results.csv"
+  if [[ -f "$QUALITY_CSV" ]]; then
+    echo "--- Quality Results (flash = reference) ---"
+    column -t -s',' "$QUALITY_CSV"
+    echo ""
+  fi
+
+  echo "--- Speedup vs Flash ---"
+  # The CSV path is handed over as argv[1] instead of being spliced into the
+  # Python source, so quotes or backslashes in the path cannot break the
+  # program. The quoted heredoc delimiter keeps the shell from expanding
+  # anything inside the script. Python errors stay visible on stderr.
+  python3 - "$RESULTS_FILE" <<'PYEOF' || echo "  (could not compute speedups)"
+import csv
+import sys
+
+
+def seconds(row):
+    """Return the row's inference time as a float, or None when unusable."""
+    try:
+        return float(row.get('inference_time_s', ''))
+    except (TypeError, ValueError):
+        # Covers 'N/A', 'FAILED', empty cells and missing columns.
+        return None
+
+
+with open(sys.argv[1], newline='') as f:
+    rows = list(csv.DictReader(f))
+
+# Group by (pipeline, batch_size)
+groups = {}
+for r in rows:
+    key = (r['pipeline'], r['batch_size'])
+    groups.setdefault(key, {})[r['attention']] = r
+
+for key in sorted(groups):
+    pipeline, bs = key
+    flash_inf = seconds(groups[key].get('flash', {}))
+    # No usable flash baseline (missing, failed or zero): nothing to compare.
+    if flash_inf is None or flash_inf <= 0:
+        continue
+    print(f'  {pipeline} bs={bs} (flash baseline: {flash_inf:.1f}s)')
+    for attn in ['ring', 'ulysses']:
+        inf = seconds(groups[key].get(attn, {}))
+        if inf is None:
+            print(f'    {attn}: N/A')
+            continue
+        # Positive value = less time than flash, as a percentage of flash.
+        speedup = ((flash_inf - inf) / flash_inf) * 100
+        sign = '+' if speedup >= 0 else ''
+        print(f'    {attn}: {inf:.1f}s ({sign}{speedup:.1f}%)')
+PYEOF
+  echo ""
+  echo "Results dir: ${BENCH_DIR}"
+  echo "============================================================"
+} | tee "$SUMMARY_FILE"
+
+echo ""
+echo "Benchmark complete at $(date)"
+echo "Full summary: ${SUMMARY_FILE}"
diff --git a/benchmark_quality.py b/benchmark_quality.py
@@ -0,0 +1,134 @@
+"""Compare video quality across attention kernels.
+
+Reads mp4 files, computes per-frame PSNR and SSIM against the flash reference,
+and writes the per-video averages to quality_results.csv next to <results_csv>.
+Usage: python benchmark_quality.py <video_dir> <results_csv>
+"""
+
+import csv
+import glob
+import os
+import re
+import sys
+
+import cv2
+import numpy as np
+from skimage.metrics import structural_similarity as ssim
+
+
+def read_video_frames(path, max_frames=None):
+    """Read video frames as numpy arrays."""
+    cap = cv2.VideoCapture(path)
+    frames = []
+    while cap.isOpened():
+        ret, frame = cap.read()
+        if not ret:
+            break
+        frames.append(frame)
+        if max_frames and len(frames) >= max_frames:
+            break
+    cap.release()
+    return frames
+
+
+def compute_psnr(ref_frame, test_frame):
+    mse = np.mean((ref_frame.astype(np.float64) - test_frame.astype(np.float64)) ** 2)
+    if mse == 0:
+        return float("inf")
+    return 10 * np.log10(255.0**2 / mse)
+
+
+def compute_metrics(ref_path, test_path):
+    """Compute average PSNR and SSIM between two videos."""
+    ref_frames = read_video_frames(ref_path)
+    test_frames = read_video_frames(test_path)
+
+    if not ref_frames or not test_frames:
+        return None, None
+
+    n = min(len(ref_frames), len(test_frames))
+    psnr_vals = []
+    ssim_vals = []
+
+    for i in range(n):
+        ref = ref_frames[i]
+        tst = test_frames[i]
+        # Resize if dimensions differ
+        if ref.shape != tst.shape:
+            tst = cv2.resize(tst, (ref.shape[1], ref.shape[0]))
+
+        psnr_vals.append(compute_psnr(ref, tst))
+        # SSIM on grayscale
+        ref_gray = cv2.cvtColor(ref, cv2.COLOR_BGR2GRAY)
+        tst_gray = cv2.cvtColor(tst, cv2.COLOR_BGR2GRAY)
+        ssim_vals.append(ssim(ref_gray, tst_gray))
+
+    return np.mean(psnr_vals), np.mean(ssim_vals)
+
+
+def find_video_pairs(video_dir):
+    """Find flash (reference) vs ring/ulysses video pairs.
+
+    Expects filenames like: {pipeline}_{attention}_bs{batch}_wan_output_0_0.mp4
+    """
+    videos = glob.glob(os.path.join(video_dir, "*.mp4"))
+    # Group by (pipeline, batch_size)
+    groups = {}
+    for v in videos:
+        base = os.path.basename(v)
+        # Match pattern: {pipeline}_{attention}_bs{batch}_wan_output_{seed}_{idx}.mp4
+        m = re.match(r"^(i2v|t2v)_(flash|ring|ulysses)_(bs[\d.]+)_wan_output_(\d+)_(\d+)\.mp4$", base)
+        if not m:
+            continue
+        pipeline, attention, batch, seed, idx = m.groups()
+        key = (pipeline, batch, seed, idx)
+        groups.setdefault(key, {})[attention] = v
+
+    pairs = []
+    for key, attn_map in sorted(groups.items()):
+        if "flash" not in attn_map:
+            continue
+        ref = attn_map["flash"]
+        for attn in ["ring", "ulysses"]:
+            if attn in attn_map:
+                pairs.append((key, attn, ref, attn_map[attn]))
+    return pairs
+
+
+def main():
+    video_dir = sys.argv[1]
+    results_csv = sys.argv[2]
+
+    pairs = find_video_pairs(video_dir)
+    if not pairs:
+        print("No video pairs found for quality comparison.")
+        return
+
+    quality_csv = os.path.join(os.path.dirname(results_csv), "quality_results.csv")
+    with open(quality_csv, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["pipeline", "test_attention", "batch_size", "avg_psnr_db", "avg_ssim"])
+        for key, attn, ref_path, test_path in pairs:
+            pipeline, batch, seed, idx = key
+            print(f"Comparing: flash vs {attn} | {pipeline} {batch} (video {idx})")
+            avg_psnr, avg_ssim = compute_metrics(ref_path, test_path)
+            if avg_psnr is not None:
+                print(f"  PSNR: {avg_psnr:.2f} dB  |  SSIM: {avg_ssim:.4f}")
+                writer.writerow([pipeline, attn, batch, f"{avg_psnr:.2f}", f"{avg_ssim:.4f}"])
+            else:
+                print("  Could not compute metrics (missing/empty video)")
+                writer.writerow([pipeline, attn, batch, "N/A", "N/A"])
+
+    print(f"\nQuality results saved to: {quality_csv}")
+
+    # Print summary table
+    print("\n" + "=" * 60)
+    print("  QUALITY SUMMARY (flash = reference)")
+    print("=" * 60)
+    with open(quality_csv) as f:
+        for line in f:
+            print(f"  {line.strip()}")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/maxdiffusion/common_types.py b/src/maxdiffusion/common_types.py
@@ -84,3 +84,13 @@
     [CROSS_ATTN_Q_LENGTH, CONTEXT],
     [CROSS_ATTN_KV_LENGTH, None],
 ]
+
+### Common axis rules for ulysses attention ###
+# Each entry pairs an attention axis name with the value it is mapped to.
+# Both head axes map to None; every sequence-length axis (Q and KV, for self-
+# and cross-attention alike) maps to CONTEXT.
+# NOTE(review): presumably None leaves an axis unsharded and CONTEXT names the
+# context-parallel mesh axis — confirm against the code that consumes these rules.
+ULYSSES_ATTENTION_AXIS_RULES = [
+    [SELF_ATTN_HEAD, None],
+    [SELF_ATTN_Q_LENGTH, CONTEXT],
+    [SELF_ATTN_KV_LENGTH, CONTEXT],
+    [CROSS_ATTN_HEAD, None],
+    [CROSS_ATTN_Q_LENGTH, CONTEXT],
+    [CROSS_ATTN_KV_LENGTH, CONTEXT],
+]
diff --git a/src/maxdiffusion/models/attention_flax.py b/src/maxdiffusion/models/attention_flax.py
diff --git a/src/maxdiffusion/pyconfig.py b/src/maxdiffusion/pyconfig.py