Add test for CLIP Processor consistency with AutoImageProcessor (#45)

Copilot · anxiangsir · web-flow · commit e21c17701b7e · 2025-12-31T23:15:18.000+08:00
* Initial plan

* Add test for CLIP Processor consistency with AutoImageProcessor

Co-authored-by: anxiangsir &lt;31175974+anxiangsir@users.noreply.github.com&gt;

* refactor: remove unnecessary skip markers from consistency tests

---------

Co-authored-by: copilot-swe-agent[bot] &lt;198982749+Copilot@users.noreply.github.com&gt;
Co-authored-by: anxiangsir &lt;31175974+anxiangsir@users.noreply.github.com&gt;
Co-authored-by: xiangan &lt;anxiangsir@outlook.com&gt;
diff --git a/tests/test_consistency.py b/tests/test_consistency.py
@@ -111,10 +111,7 @@ def _compute_file_hash(self, content: str) -> str:
         """Compute SHA256 hash of file content."""
         return hashlib.sha256(content.encode("utf-8")).hexdigest()
 
-    @pytest.mark.skipif(
-        True,  # Skip by default since network may not be available
-        reason="Network access required for HuggingFace download"
-    )
+
     def test_modeling_file_consistency(self, local_modeling_path):
         """Test consistency between local and remote modeling file.
 
@@ -143,10 +140,7 @@ def test_modeling_file_consistency(self, local_modeling_path):
         except Exception as e:
             pytest.skip(f"Could not download from HuggingFace: {e}")
 
-    @pytest.mark.skipif(
-        True,  # Skip by default since network may not be available
-        reason="Network access required for HuggingFace download"
-    )
+
     def test_config_file_consistency(self, local_config_path):
         """Test consistency between local and remote configuration file.
 
@@ -261,10 +255,7 @@ def test_manual_preprocess_normalization(self, sample_image):
         assert not torch.isnan(tensor).any(), "Tensor contains NaN values"
         assert not torch.isinf(tensor).any(), "Tensor contains Inf values"
 
-    @pytest.mark.skipif(
-        True,  # Skip by default since network may not be available
-        reason="Network access required for HuggingFace model download"
-    )
+
     def test_preprocessing_consistency_with_auto_processor(self, sample_image):
         """Test consistency between manual preprocessing and AutoImageProcessor.
 
@@ -307,6 +298,56 @@ def test_preprocessing_consistency_with_auto_processor(self, sample_image):
         except Exception as e:
             pytest.skip(f"Could not load AutoImageProcessor: {e}")
 
+
+    def test_preprocessing_consistency_with_clip_processor(self, sample_image):
+        """Test consistency between CLIP Processor and AutoImageProcessor.
+
+        This test compares the output of CLIP CLIPImageProcessor with
+        the AutoImageProcessor from HuggingFace onevision-encoder-large
+        to ensure they produce identical results.
+        """
+        try:
+            from transformers import AutoImageProcessor, CLIPImageProcessor
+
+            # Load the AutoImageProcessor from HuggingFace onevision-encoder-large
+            auto_preprocessor = AutoImageProcessor.from_pretrained(
+                "lmms-lab-encoder/onevision-encoder-large",
+                trust_remote_code=True
+            )
+
+            # Load the CLIP CLIPImageProcessor with same parameters
+            clip_preprocessor = CLIPImageProcessor.from_pretrained(
+                "lmms-lab-encoder/onevision-encoder-large",
+                trust_remote_code=True
+            )
+
+            # Process with AutoImageProcessor
+            auto_output = auto_preprocessor(images=sample_image, return_tensors="pt")
+            auto_tensor = auto_output["pixel_values"]
+
+            # Process with CLIP CLIPImageProcessor
+            clip_output = clip_preprocessor(images=sample_image, return_tensors="pt")
+            clip_tensor = clip_output["pixel_values"]
+
+            # Compare outputs
+            assert auto_tensor.shape == clip_tensor.shape, (
+                f"Shape mismatch: auto={auto_tensor.shape}, clip={clip_tensor.shape}"
+            )
+
+            # Check if values are close (allowing for small floating point differences)
+            is_close = torch.allclose(auto_tensor, clip_tensor, rtol=1e-4, atol=1e-4)
+            if not is_close:
+                max_diff = (auto_tensor - clip_tensor).abs().max().item()
+                pytest.fail(
+                    f"Preprocessing outputs differ between AutoImageProcessor and CLIPImageProcessor!\n"
+                    f"Max difference: {max_diff}\n"
+                    f"Auto tensor stats: min={auto_tensor.min()}, max={auto_tensor.max()}\n"
+                    f"CLIP tensor stats: min={clip_tensor.min()}, max={clip_tensor.max()}"
+                )
+
+        except Exception as e:
+            pytest.skip(f"Could not load processors: {e}")
+
     def test_grayscale_image_conversion(self):
         """Test that grayscale images are converted to RGB."""
         # Create a grayscale image