@@ -172,118 +172,42 @@ pip install -e .
172172
173173## ⚡ Quick Start
174174
175- ### Loading the Model
176-
177- Load the pre-trained OneVision Encoder model using HuggingFace Transformers:
175+ > **Note:** This model supports native resolution input. For optimal performance:
176+ > - **Image**: 448×448 resolution (pre-trained)
177+ > - **Video**: 224×224 resolution with 256 tokens per frame (pre-trained)
178+ >
179+ > Use CLIP preprocessing from the [model repository](https://huggingface.co/lmms-lab/onevision-encoder-large).
178180
179181``` python
180182from transformers import AutoModel
181183import torch
182184
183- model = AutoModel.from_pretrained(
184- " lmms-lab/onevision-encoder-large" ,
185- trust_remote_code = True ,
186- attn_implementation = " flash_attention_2" # Optional: use flash attention for efficiency
187- )
188- model = model.to(" cuda" ).eval()
189- ```
190-
191- ### Image Inference
192-
193- For single image inference:
194-
195- ``` python
196- import torch
197- from transformers import AutoModel
198-
199185# Load model
200186model = AutoModel.from_pretrained(
201187 " lmms-lab/onevision-encoder-large" ,
202188 trust_remote_code = True ,
203189 attn_implementation = " flash_attention_2"
204- )
205- model = model.to(" cuda" ).eval()
190+ ).to(" cuda" ).eval()
206191
207- # Prepare image input: [batch_size, channels, height, width]
208- # The model expects 448x448 resolution by default
209- pixel_values = torch.randn(1 , 3 , 448 , 448 ).to(" cuda" )
210-
211- # Run inference
192+ # Image inference: [B, C, H, W]
193+ image = torch.randn(1 , 3 , 448 , 448 ).to(" cuda" )
212194with torch.no_grad():
213- with torch.cuda.amp.autocast(dtype = torch.bfloat16):
214- outputs = model(pixel_values)
215-
216- # Get outputs
217- last_hidden_state = outputs.last_hidden_state # [batch_size, num_patches, hidden_size]
218- pooler_output = outputs.pooler_output # [batch_size, hidden_size] - pooled representation
219- ```
195+ outputs = model(image)
196+ # outputs.last_hidden_state: [B, num_patches, hidden_size]
197+ # outputs.pooler_output: [B, hidden_size]
220198
221- ### Video Inference
199+ # Video inference: [B, C, T, H, W] with visible_indices
200+ num_frames, frame_tokens, target_frames = 16 , 256 , 64
201+ video = torch.randn(1 , 3 , num_frames, 224 , 224 ).to(" cuda" )
222202
223- For video inference with temporal sampling:
203+ # Build visible_indices for temporal sampling
204+ frame_pos = torch.linspace(0 , target_frames - 1 , num_frames).long().cuda()
205+ visible_indices = (frame_pos.unsqueeze(- 1 ) * frame_tokens + torch.arange(frame_tokens).cuda()).reshape(1 , - 1 )
224206
225- ``` python
226- import torch
227- from transformers import AutoModel
228-
229- # Load model
230- model = AutoModel.from_pretrained(
231- " lmms-lab/onevision-encoder-large" ,
232- trust_remote_code = True ,
233- attn_implementation = " flash_attention_2"
234- )
235- model = model.to(" cuda" ).eval()
236-
237- # Configuration
238- num_frames = 16 # Number of frames sampled from video
239- # frame_tokens: number of spatial tokens per frame
240- # For 448x448 resolution with patch_size=16: (448/16)^2 = 784 tokens per frame (full resolution)
241- # Use fewer tokens (e.g., 256) when applying spatial sampling/compression
242- frame_tokens = 256
243- target_frames = 64 # Target temporal resolution for RoPE (default in model config)
244-
245- # Prepare video input: [batch_size, channels, num_frames, height, width]
246- batch_size = 1
247- pixel_values = torch.randn(batch_size, 3 , num_frames, 448 , 448 ).to(" cuda" )
248-
249- # Create visible indices for temporal sampling
250- # visible_indices specifies which patches to process from the full spatio-temporal grid
251- device = pixel_values.device
252-
253- # Assume uniform sampling: map sampled frame positions to target_frames grid
254- # Example: if you sampled 16 frames uniformly from a video, map them to positions in [0, target_frames-1]
255- sampled_frame_positions = torch.linspace(0 , target_frames - 1 , num_frames).long().to(device)
256- per_frame_tokens = torch.arange(frame_tokens, device = device)
257-
258- # Calculate visible_indices: position in the full (target_frames * frame_tokens) grid
259- visible_indices = (sampled_frame_positions.unsqueeze(- 1 ) * frame_tokens + per_frame_tokens).reshape(1 , - 1 )
260- visible_indices = visible_indices.expand(batch_size, - 1 ).clamp(max = target_frames * frame_tokens - 1 )
261-
262- # Run inference
263207with torch.no_grad():
264- with torch.cuda.amp.autocast(dtype = torch.bfloat16):
265- outputs = model(pixel_values, visible_indices = visible_indices)
266-
267- # Get outputs
268- last_hidden_state = outputs.last_hidden_state # [batch_size, num_visible_patches, hidden_size]
269- pooler_output = outputs.pooler_output # [batch_size, hidden_size]
208+ outputs = model(video, visible_indices = visible_indices)
270209```
271210
272- ### Key Parameters
273-
274- | Parameter | Description | Default |
275- | -----------| -------------| ---------|
276- | ` pixel_values ` | Image: ` [B, C, H, W] ` or Video: ` [B, C, T, H, W] ` | Required |
277- | ` visible_indices ` | Indices of visible patches for sparse processing | Optional (processes all patches if not provided) |
278- | ` attn_implementation ` | Attention implementation: ` "eager" ` or ` "flash_attention_2" ` | ` "flash_attention_2" ` |
279-
280- ### Model Outputs
281-
282- - ` last_hidden_state ` : Hidden states from the last encoder layer ` [batch_size, sequence_length, hidden_size] `
283- - ` pooler_output ` : Pooled representation from the attention pooling head ` [batch_size, hidden_size] `
284- - ` hidden_states ` : (Optional) Hidden states from all layers
285- - ` attentions ` : (Optional) Attention weights from all layers
286-
287211---
288212
289213## 🚀 Training
0 commit comments