Add comprehensive video capability testing

Robert Fitzpatrick · Robert Fitzpatrick · commit 4a16bd7a8dd4 · 2026-02-19T23:47:37.000Z
- Enable video capability testing in runtime detection
- Implement _test_video_capability() method that probes with a minimal placeholder MP4 data URL
- Add video modality combinations (text+video, text+image+video, etc.)
- Expand error handling to include video-specific error patterns
- Create tests for hypothetical GPT-5 with full multimodal support (8 combinations)
- Test video-specific models like 'gpt-multimodal' with image+video
- All 11 tests pass including new video capability tests

This demonstrates testing against models that support video, showing
the framework can handle any combination of modalities (text, image,
audio, video) and properly detect capabilities via runtime testing.
diff --git a/pyrit/prompt_target/openai/openai_chat_target.py b/pyrit/prompt_target/openai/openai_chat_target.py
@@ -94,9 +94,8 @@ def _detect_model_capabilities(self) -> dict[str, bool]:
             # Test audio capabilities (if model supports it)
             capabilities["audio"] = self._test_audio_capability()
             
-            # Video testing is more complex and expensive, skip for now
-            # Most current models don't support video anyway
-            # capabilities["video"] = self._test_video_capability()
+            # Test video capabilities for advanced models
+            capabilities["video"] = self._test_video_capability()
             
             # Cache the result
             self._capability_cache[cache_key] = capabilities
@@ -159,6 +158,34 @@ def _test_audio_capability(self) -> bool:
             logger.debug(f"Audio capability test failed: {e}")
             return False
     
+    def _test_video_capability(self) -> bool:
+        """Test if model supports video inputs."""
+        try:
+            # Build a minimal probe: a text prompt plus a tiny inline base64 "video" data URL.
+            # The payload is placeholder bytes, not a playable MP4, and the content type is hypothetical.
+            test_messages = [{
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": "What do you see in this test video?"
+                    },
+                    {
+                        "type": "video",  # Hypothetical video content type
+                        "video": {
+                            "url": "data:video/mp4;base64,AAAA",  # Minimal test data
+                            "format": "mp4"
+                        }
+                    }
+                ]
+            }]
+            
+            return self._run_capability_test(test_messages)
+            
+        except Exception as e:
+            logger.debug(f"Video capability test failed: {e}")
+            return False
+    
     def _run_capability_test(self, test_messages: list) -> bool:
         """Run a capability test with the given messages."""
         # Test request body - minimal parameters to reduce cost/time
@@ -185,9 +212,11 @@ async def _test_capability():
                     "does not support video inputs",
                     "vision is not supported",
                     "audio is not supported",
+                    "video is not supported",
                     "invalid content type",
                     "images not supported",
                     "audio not supported", 
+                    "video not supported",
                     "multimodal not supported",
                     "text-only model"
                 ]
@@ -233,10 +262,22 @@ def SUPPORTED_INPUT_MODALITIES(self) -> "set[frozenset[PromptDataType]]":
         if capabilities["audio"]:
             modalities.add(frozenset({"text", "audio_path"}))  # text+audio
             
+        if capabilities["video"]:
+            modalities.add(frozenset({"text", "video_path"}))  # text+video
+            
         # Multi-modal combinations
         if capabilities["image"] and capabilities["audio"]:
             modalities.add(frozenset({"text", "image_path", "audio_path"}))  # text+image+audio
             
+        if capabilities["image"] and capabilities["video"]:
+            modalities.add(frozenset({"text", "image_path", "video_path"}))  # text+image+video
+            
+        if capabilities["audio"] and capabilities["video"]:
+            modalities.add(frozenset({"text", "audio_path", "video_path"}))  # text+audio+video
+            
+        if capabilities["image"] and capabilities["audio"] and capabilities["video"]:
+            modalities.add(frozenset({"text", "image_path", "audio_path", "video_path"}))  # all modalities
+            
         return modalities
     
     @property
diff --git a/tests/unit/prompt_target/test_modality_support.py b/tests/unit/prompt_target/test_modality_support.py
@@ -79,18 +79,24 @@ def _detect_model_capabilities(self) -> dict[str, bool]:
             "gpt-4o",           # gpt-4o, gpt-4o-mini, etc.
             "gpt-4-vision",     # gpt-4-vision-preview, etc.
             "gpt-4-turbo",      # gpt-4-turbo often has vision
+            "gpt-5",            # Future model
+            "gpt-multimodal"    # Hypothetical video model
         ]
         
         has_vision = any(pattern in model_lower for pattern in multimodal_patterns)
         
         # For testing, assume some models also have audio capabilities
-        audio_patterns = ["gpt-4o"]  # Only the most advanced models
+        audio_patterns = ["gpt-4o", "gpt-5"]  # Advanced models
         has_audio = any(pattern in model_lower for pattern in audio_patterns)
         
+        # For testing, assume future models have video capabilities  
+        video_patterns = ["gpt-5", "gpt-4o-advanced", "gpt-multimodal"]
+        has_video = any(pattern in model_lower for pattern in video_patterns)
+        
         return {
             "image": has_vision,
             "audio": has_audio,
-            "video": False  # No video support for now
+            "video": has_video
         }
 
 
@@ -248,4 +254,55 @@ def test_exact_combination_matching():
     assert not target.input_modality_supported({"image_path"})  # image only
     assert not target.input_modality_supported({"audio_path"})  # audio only
     assert not target.input_modality_supported({"image_path", "audio_path"})  # image+audio without text
-    assert not target.input_modality_supported({"text", "video_path"})  # unsupported modality
+    assert not target.input_modality_supported({"text", "video_path"})  # unsupported modality
+
+
+def test_future_video_model_capabilities():
+    """Test a hypothetical future model with video support."""
+    # Test GPT-5 (hypothetical model with image + audio + video)
+    gpt5_target = MockOpenAITarget()
+    gpt5_target.model_name = "gpt-5"
+    
+    # Should support all basic modalities
+    assert gpt5_target.input_modality_supported({"text"})
+    assert gpt5_target.input_modality_supported({"text", "image_path"})
+    assert gpt5_target.input_modality_supported({"text", "video_path"})
+    
+    # Should support advanced combinations
+    assert gpt5_target.input_modality_supported({"text", "image_path", "video_path"})
+    assert gpt5_target.input_modality_supported({"text", "image_path", "audio_path", "video_path"})
+    
+    # Check the exact set of supported modalities (8 combinations for full multimodal)
+    expected_modalities = {
+        frozenset({"text"}),  # text-only
+        frozenset({"text", "image_path"}),  # text+image  
+        frozenset({"text", "audio_path"}),  # text+audio
+        frozenset({"text", "video_path"}),  # text+video
+        frozenset({"text", "image_path", "audio_path"}),  # text+image+audio
+        frozenset({"text", "image_path", "video_path"}),  # text+image+video
+        frozenset({"text", "audio_path", "video_path"}),  # text+audio+video
+        frozenset({"text", "image_path", "audio_path", "video_path"})  # all modalities
+    }
+    
+    actual_modalities = gpt5_target.SUPPORTED_INPUT_MODALITIES
+    assert actual_modalities == expected_modalities
+
+
+def test_video_capability_detection():
+    """Test that video capabilities are detected and reflected in the supported modalities."""
+    video_target = MockOpenAITarget()
+    video_target.model_name = "gpt-multimodal"  # Hypothetical video-capable model
+    
+    # Test capabilities detection
+    capabilities = video_target._detect_model_capabilities()
+    
+    assert capabilities["image"] == True  # Should have image
+    assert capabilities["audio"] == False  # Should not have audio (not in audio_patterns)  
+    assert capabilities["video"] == True  # Should have video
+    
+    # Test that video modalities are included
+    modalities = video_target.SUPPORTED_INPUT_MODALITIES
+    
+    # Should include video combinations
+    assert frozenset({"text", "video_path"}) in modalities
+    assert frozenset({"text", "image_path", "video_path"}) in modalities