Fix video tensor permutation order for correct [B, C, T, H, W] shape

Copilot · anxiangsir · Copilot · commit e7cddaa015b5 · 2025-12-24T17:51:50.000Z
Co-authored-by: anxiangsir <31175974+anxiangsir@users.noreply.github.com>
diff --git a/README.md b/README.md
@@ -209,7 +209,7 @@ num_frames, frame_tokens, target_frames = 16, 256, 64
 frames = [Image.open(f"path/to/frame_{i}.jpg") for i in range(num_frames)]
 video_pixel_values = preprocessor(images=frames, return_tensors="pt")["pixel_values"]
 # Reshape from [T, C, H, W] to [B, C, T, H, W]
-video = video_pixel_values.permute(1, 0, 2, 3).unsqueeze(0).to("cuda")
+video = video_pixel_values.unsqueeze(0).permute(0, 2, 1, 3, 4).to("cuda")
 
 # Build visible_indices for temporal sampling
 frame_pos = torch.linspace(0, target_frames - 1, num_frames).long().cuda()