Add clarifying comments for image paths and tensor transformations

Copilot · anxiangsir · Copilot · commit b94ff54ab135 · 2025-12-24T17:49:50.000Z
Co-authored-by: anxiangsir <31175974+anxiangsir@users.noreply.github.com>
diff --git a/README.md b/README.md
@@ -196,7 +196,7 @@ preprocessor = AutoImageProcessor.from_pretrained(
 )
 
 # Image inference: [B, C, H, W]
-image = Image.open("path/to/image.jpg")
+image = Image.open("path/to/your/image.jpg")  # Replace with your image path
 pixel_values = preprocessor(images=image, return_tensors="pt")["pixel_values"].to("cuda")
 with torch.no_grad():
     outputs = model(pixel_values)
@@ -205,10 +205,11 @@ with torch.no_grad():
 
 # Video inference: [B, C, T, H, W] with visible_indices
 num_frames, frame_tokens, target_frames = 16, 256, 64
-# Load video frames and preprocess each frame
+# Load video frames and preprocess each frame (replace with your video frame paths)
 frames = [Image.open(f"path/to/frame_{i}.jpg") for i in range(num_frames)]
 video_pixel_values = preprocessor(images=frames, return_tensors="pt")["pixel_values"]
-video = video_pixel_values.permute(1, 0, 2, 3).unsqueeze(0).to("cuda")  # [B, C, T, H, W]
+# Permute [T, C, H, W] -> [C, T, H, W], then add a batch dim: [B, C, T, H, W] (B = 1)
+video = video_pixel_values.permute(1, 0, 2, 3).unsqueeze(0).to("cuda")
 
 # Build visible_indices for temporal sampling
 frame_pos = torch.linspace(0, target_frames - 1, num_frames).long().cuda()
diff --git a/onevision_encoder/modeling_onevision_encoder.py b/onevision_encoder/modeling_onevision_encoder.py
@@ -533,7 +533,7 @@ def forward(
 
         >>> model = AutoModel.from_pretrained("lmms-lab/onevision-encoder-large", trust_remote_code=True)
         >>> preprocessor = AutoImageProcessor.from_pretrained("lmms-lab/onevision-encoder-large", trust_remote_code=True)
-        >>> image = Image.open("path/to/image.jpg")
+        >>> image = Image.open("path/to/your/image.jpg")  # Replace with your image path
         >>> pixel_values = preprocessor(images=image, return_tensors="pt")["pixel_values"]
         >>> outputs = model(pixel_values)
         >>> last_hidden_states = outputs.last_hidden_state