@@ -316,6 +316,315 @@ def test_automodel_vs_onevision_encoder_model_dtype_consistency(
         torch.cuda.empty_cache()
 
 
+class TestDirectClassInstantiation:
+    """
+    Tests for output consistency using direct class instantiation.
+
+    These tests do NOT use AutoModel - they directly instantiate the
+    OneVisionEncoderModel class to verify output consistency across
+    different transformers versions.
+    """
+
+    @pytest.fixture
+    def test_image(self):
+        """Create a sample test image."""
+        return create_test_image()
+
+    @pytest.fixture
+    def model_name(self):
+        """Model name for loading from HuggingFace."""
+        return "lmms-lab-encoder/onevision-encoder-large"
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_direct_instantiation_flash_attention(self, test_image, model_name):
+        """
+        Test direct class instantiation with flash_attention_2.
+
+        This test directly uses OneVisionEncoderModel.from_pretrained()
+        without AutoModel to verify output consistency.
+        """
+        from transformers import AutoImageProcessor
+        from onevision_encoder import OneVisionEncoderModel
+
+        current_version = get_current_transformers_version()
+        print(f"\nRunning direct instantiation test with transformers version: {current_version}")
+
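+        # Note: flash_attention_2 assumes the flash-attn package is installed and
+        # a supported CUDA GPU is present; transformers raises at load time otherwise.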
+        # Load model directly using OneVisionEncoderModel class
+        model = OneVisionEncoderModel.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+            attn_implementation="flash_attention_2"
+        ).to("cuda").eval()
+
+        # Load preprocessor
+        preprocessor = AutoImageProcessor.from_pretrained(
+            model_name,
+            trust_remote_code=True
+        )
+
+        # Preprocess image
+        inputs = preprocessor(images=test_image, return_tensors="pt")
+        pixel_values = inputs["pixel_values"].to("cuda")
+
+        # Run inference
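+        # FlashAttention kernels only support fp16/bf16 inputs, which is why the
+        # forward pass below runs under a bfloat16 autocast context.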
+        with torch.no_grad():
+            with torch.amp.autocast(dtype=torch.bfloat16, device_type="cuda"):
+                output = model(pixel_values)
+
+        # Verify output shape and values
+        assert output.last_hidden_state is not None, "last_hidden_state should not be None"
+        assert output.last_hidden_state.shape[0] == 1, "Batch size should be 1"
+        assert not torch.isnan(output.last_hidden_state).any(), "Output contains NaN values"
+        assert not torch.isinf(output.last_hidden_state).any(), "Output contains Inf values"
+
+        print(f"Output shape: {output.last_hidden_state.shape}")
+        print(f"Output stats: min={output.last_hidden_state.min().item():.4f}, "
+              f"max={output.last_hidden_state.max().item():.4f}, "
+              f"mean={output.last_hidden_state.mean().item():.4f}")
+
+        # Clean up
+        del model
+        torch.cuda.empty_cache()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_direct_instantiation_eager_attention(self, test_image, model_name):
+        """
+        Test direct class instantiation with eager attention.
+
+        This test directly uses OneVisionEncoderModel.from_pretrained()
+        with the eager attention implementation.
+        """
+        from transformers import AutoImageProcessor
+        from onevision_encoder import OneVisionEncoderModel
+
+        current_version = get_current_transformers_version()
+        print(f"\nRunning direct instantiation (eager) test with transformers version: {current_version}")
+
+        # Load model directly using OneVisionEncoderModel class with eager attention
+        model = OneVisionEncoderModel.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+            attn_implementation="eager"
+        ).to("cuda").eval()
+
+        # Load preprocessor
+        preprocessor = AutoImageProcessor.from_pretrained(
+            model_name,
+            trust_remote_code=True
+        )
+
+        # Preprocess image
+        inputs = preprocessor(images=test_image, return_tensors="pt")
+        pixel_values = inputs["pixel_values"].to("cuda")
+
+        # Run inference
+        with torch.no_grad():
+            output = model(pixel_values)
+
+        # Verify output shape and values
+        assert output.last_hidden_state is not None, "last_hidden_state should not be None"
+        assert output.last_hidden_state.shape[0] == 1, "Batch size should be 1"
+        assert not torch.isnan(output.last_hidden_state).any(), "Output contains NaN values"
+        assert not torch.isinf(output.last_hidden_state).any(), "Output contains Inf values"
+
+        print(f"Output shape: {output.last_hidden_state.shape}")
+        print(f"Output stats: min={output.last_hidden_state.min().item():.4f}, "
+              f"max={output.last_hidden_state.max().item():.4f}, "
+              f"mean={output.last_hidden_state.mean().item():.4f}")
+
+        # Clean up
+        del model
+        torch.cuda.empty_cache()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_direct_instantiation_flash_vs_eager_consistency(self, test_image, model_name):
+        """
+        Test output consistency between flash_attention_2 and eager attention.
+
+        This test directly instantiates two models with different attention
+        implementations and compares their outputs to verify consistency.
+        """
+        from transformers import AutoImageProcessor
+        from onevision_encoder import OneVisionEncoderModel
+
+        current_version = get_current_transformers_version()
+        print(f"\nRunning flash vs eager consistency test with transformers version: {current_version}")
+
+        # Load model with flash_attention_2
+        model_flash = OneVisionEncoderModel.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+            attn_implementation="flash_attention_2"
+        ).to("cuda").eval()
+
+        # Load model with eager attention
+        model_eager = OneVisionEncoderModel.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+            attn_implementation="eager"
+        ).to("cuda").eval()
+
+        # Load preprocessor
+        preprocessor = AutoImageProcessor.from_pretrained(
+            model_name,
+            trust_remote_code=True
+        )
+
+        # Preprocess image
+        inputs = preprocessor(images=test_image, return_tensors="pt")
+        pixel_values = inputs["pixel_values"].to("cuda")
+
+        # Run inference with both models
+        with torch.no_grad():
+            with torch.amp.autocast(dtype=torch.bfloat16, device_type="cuda"):
+                output_flash = model_flash(pixel_values)
+                output_eager = model_eager(pixel_values)
+
+        # Compare shapes
+        assert output_flash.last_hidden_state.shape == output_eager.last_hidden_state.shape, (
+            f"Shape mismatch: flash={output_flash.last_hidden_state.shape}, "
+            f"eager={output_eager.last_hidden_state.shape}"
+        )
+
+        # Compare outputs (allow some tolerance due to different implementations)
+        max_diff = (output_flash.last_hidden_state - output_eager.last_hidden_state).abs().max().item()
+        mean_diff = (output_flash.last_hidden_state - output_eager.last_hidden_state).abs().mean().item()
+
+        print("Flash vs Eager comparison:")
+        print(f"  Max difference: {max_diff:.6f}")
+        print(f"  Mean difference: {mean_diff:.6f}")
+
+        # Flash and eager attention produce similar but not identical results due to:
+        # 1. Different numerical algorithms (FlashAttention uses online softmax)
+        # 2. bfloat16 precision limitations with autocast
+        # 3. Different memory access patterns affecting floating-point accumulation
+        # A tolerance of 1e-2 is appropriate for this comparison.
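+        # For scale: torch.finfo(torch.bfloat16).eps is 2**-7 ~ 7.8e-3, so 1e-2 is
+        # roughly one bf16 spacing step for values near 1.0.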
+        FLASH_EAGER_RTOL = 1e-2
+        FLASH_EAGER_ATOL = 1e-2
+        is_close = torch.allclose(
+            output_flash.last_hidden_state,
+            output_eager.last_hidden_state,
+            rtol=FLASH_EAGER_RTOL,
+            atol=FLASH_EAGER_ATOL
+        )
+
+        if not is_close:
+            pytest.fail(
+                f"Output mismatch between flash_attention_2 and eager!\n"
+                f"Max difference: {max_diff}\n"
+                f"Mean difference: {mean_diff}\n"
+                f"Flash stats: min={output_flash.last_hidden_state.min()}, max={output_flash.last_hidden_state.max()}\n"
+                f"Eager stats: min={output_eager.last_hidden_state.min()}, max={output_eager.last_hidden_state.max()}"
+            )
+
+        # Clean up
+        del model_flash, model_eager
+        torch.cuda.empty_cache()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_direct_instantiation_deterministic_output(self, test_image, model_name):
+        """
+        Test that direct class instantiation produces deterministic outputs.
+
+        Running the same model twice with the same input should produce
+        identical outputs.
+        """
+        from transformers import AutoImageProcessor
+        from onevision_encoder import OneVisionEncoderModel
+
+        current_version = get_current_transformers_version()
+        print(f"\nRunning deterministic output test with transformers version: {current_version}")
+
+        # Load model
+        model = OneVisionEncoderModel.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+            attn_implementation="flash_attention_2"
+        ).to("cuda").eval()
+
+        # Load preprocessor
+        preprocessor = AutoImageProcessor.from_pretrained(
+            model_name,
+            trust_remote_code=True
+        )
+
+        # Preprocess image
+        inputs = preprocessor(images=test_image, return_tensors="pt")
+        pixel_values = inputs["pixel_values"].to("cuda")
+
+        # Run inference twice
+        # Note: Using autocast with bfloat16 during eval mode should be deterministic
+        # since dropout is disabled and no stochastic operations are performed
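+        # (FlashAttention's forward kernel is deterministic; only its backward pass
+        # can be non-deterministic due to atomic adds, and no backward runs here.)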
+        with torch.no_grad():
+            with torch.amp.autocast(dtype=torch.bfloat16, device_type="cuda"):
+                output1 = model(pixel_values)
+                output2 = model(pixel_values)
+
+        # Outputs should be identical
+        is_identical = torch.equal(output1.last_hidden_state, output2.last_hidden_state)
+
+        if not is_identical:
+            max_diff = (output1.last_hidden_state - output2.last_hidden_state).abs().max().item()
+            pytest.fail(
+                f"Non-deterministic output detected!\n"
+                f"Max difference between two runs: {max_diff}"
+            )
+
+        print("Deterministic output verified: two identical runs produce identical outputs")
+
+        # Clean up
+        del model
+        torch.cuda.empty_cache()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_direct_instantiation_bfloat16(self, test_image, model_name):
+        """
+        Test direct class instantiation with bfloat16 dtype.
+        """
+        from transformers import AutoImageProcessor
+        from onevision_encoder import OneVisionEncoderModel
+
+        current_version = get_current_transformers_version()
+        print(f"\nRunning bfloat16 test with transformers version: {current_version}")
+
+        # Load model with bfloat16
+        model = OneVisionEncoderModel.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+            attn_implementation="flash_attention_2",
+            torch_dtype=torch.bfloat16
+        ).to("cuda").eval()
+
+        # Load preprocessor
+        preprocessor = AutoImageProcessor.from_pretrained(
+            model_name,
+            trust_remote_code=True
+        )
+
+        # Preprocess image
+        inputs = preprocessor(images=test_image, return_tensors="pt")
+        pixel_values = inputs["pixel_values"].to("cuda", dtype=torch.bfloat16)
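+        # The input is cast to bf16 to match the weights loaded via torch_dtype, so
+        # the whole forward pass runs natively in bf16 without an autocast context.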
+
+        # Run inference
+        with torch.no_grad():
+            output = model(pixel_values)
+
+        # Verify dtype
+        assert output.last_hidden_state.dtype == torch.bfloat16, (
+            f"Expected bfloat16 output, got: {output.last_hidden_state.dtype}"
+        )
+
+        # Verify no NaN/Inf
+        assert not torch.isnan(output.last_hidden_state).any(), "Output contains NaN values"
+        assert not torch.isinf(output.last_hidden_state).any(), "Output contains Inf values"
+
+        print(f"bfloat16 output verified: dtype={output.last_hidden_state.dtype}")
+        print(f"Output shape: {output.last_hidden_state.shape}")
+
+        # Clean up
+        del model
+        torch.cuda.empty_cache()
+
+
 class TestTransformersVersionInfo:
     """Test class to document and verify transformers version information."""
 