@@ -115,8 +115,9 @@ def test_automodel_vs_onevision_encoder_model_output_consistency(
 
         # Run inference with both models
         with torch.no_grad():
-            auto_output = auto_model(pixel_values)
-            onevision_output = onevision_model(pixel_values)
+            with torch.amp.autocast(dtype=torch.bfloat16, device_type="cuda"):
+                auto_output = auto_model(pixel_values)
+                onevision_output = onevision_model(pixel_values)
 
         # Compare last_hidden_state
         assert auto_output.last_hidden_state.shape == onevision_output.last_hidden_state.shape, (
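
# For reference, a minimal standalone sketch (not part of the diff) of the
# autocast pattern the hunk above introduces. `model` and `pixel_values` are
# hypothetical placeholders for any CUDA-resident module and input batch.
import torch

def run_bf16_inference(model, pixel_values):
    model.eval()
    with torch.no_grad():
        # Eligible ops (matmuls, convolutions) execute in bfloat16 inside this
        # context; numerically sensitive ops stay in float32.
        with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
            return model(pixel_values)
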
@@ -239,76 +240,6 @@ def test_automodel_vs_onevision_encoder_model_eager_attention(
         del auto_model, onevision_model
         torch.cuda.empty_cache()
 
-    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
-    def test_automodel_vs_onevision_encoder_model_batch_input(
-        self, model_name
-    ):
-        """
-        Test output consistency with batched input.
-
-        This ensures both loading methods handle batch processing identically.
-        """
-        from transformers import AutoModel, AutoImageProcessor
-
-        # Log transformers version
-        current_version = get_current_transformers_version()
-        print(f"\nRunning test with transformers version: {current_version}")
-
-        # Create multiple test images
-        images = [create_test_image(seed=i) for i in range(3)]
-
-        # Load models
-        auto_model = AutoModel.from_pretrained(
-            model_name,
-            trust_remote_code=True,
-            attn_implementation="flash_attention_2"
-        ).to("cuda").eval()
-
-        onevision_model = OneVisionEncoderModel.from_pretrained(
-            model_name,
-            trust_remote_code=True,
-            attn_implementation="flash_attention_2"
-        ).to("cuda").eval()
-
-        # Load preprocessor
-        preprocessor = AutoImageProcessor.from_pretrained(
-            model_name,
-            trust_remote_code=True
-        )
-
-        # Preprocess batch of images
-        inputs = preprocessor(images=images, return_tensors="pt")
-        pixel_values = inputs["pixel_values"].to("cuda")
-
-        # Run inference
-        with torch.no_grad():
-            auto_output = auto_model(pixel_values)
-            onevision_output = onevision_model(pixel_values)
-
-        # Compare outputs
-        assert auto_output.last_hidden_state.shape[0] == len(images), (
-            f"Expected batch size {len(images)}, got {auto_output.last_hidden_state.shape[0]}"
-        )
-
-        is_close = torch.allclose(
-            auto_output.last_hidden_state,
-            onevision_output.last_hidden_state,
-            rtol=1e-4,
-            atol=1e-4
-        )
-
-        if not is_close:
-            max_diff = (
-                auto_output.last_hidden_state - onevision_output.last_hidden_state
-            ).abs().max().item()
-            pytest.fail(
-                f"Batch output mismatch!\n"
-                f"Max difference: {max_diff}"
-            )
-
-        # Clean up
-        del auto_model, onevision_model
-        torch.cuda.empty_cache()
 
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     def test_automodel_vs_onevision_encoder_model_dtype_consistency(