@@ -196,7 +196,7 @@ preprocessor = AutoImageProcessor.from_pretrained(
196 196 )
197 197
198 198 # Image inference: [B, C, H, W]
199 - image = Image.open("path/to/image.jpg")
199 + image = Image.open("path/to/your/image.jpg")  # Replace with your image path
200 200 pixel_values = preprocessor(images=image, return_tensors="pt")["pixel_values"].to("cuda")
201 201 with torch.no_grad():
202 202     outputs = model(pixel_values)
@@ -205,10 +205,11 @@ with torch.no_grad():
205 205
206 206 # Video inference: [B, C, T, H, W] with visible_indices
207 207 num_frames, frame_tokens, target_frames = 16, 256, 64
208 - # Load video frames and preprocess each frame
208 + # Load video frames and preprocess each frame (replace with your video frame paths)
209 209 frames = [Image.open(f"path/to/frame_{i}.jpg") for i in range(num_frames)]
210 210 video_pixel_values = preprocessor(images=frames, return_tensors="pt")["pixel_values"]
211 - video = video_pixel_values.permute(1, 0, 2, 3).unsqueeze(0).to("cuda")  # [B, C, T, H, W]
211 + # Reshape from [T, C, H, W] to [B, C, T, H, W]
212 + video = video_pixel_values.permute(1, 0, 2, 3).unsqueeze(0).to("cuda")
212 213
213 214 # Build visible_indices for temporal sampling
214 215 frame_pos = torch.linspace(0, target_frames - 1, num_frames).long().cuda()
0 commit comments