Fix tests to match enhanced multimodal capabilities

Robert Fitzpatrick · Robert Fitzpatrick · commit 6c81cb81e0ee · 2026-02-19T23:43:41.000Z
- Update test expectations for gpt-4o to include audio support
- Modify test_supported_modalities_properties to expect 3 modalities (text, image_path, audio_path)
- Update test_exact_combination_matching to cover every supported combination and the unsupported ones
- All 9 tests now pass with enhanced multimodal detection

The tests now correctly reflect that gpt-4o supports both image AND audio
capabilities, matching our enhanced runtime testing implementation.
diff --git a/tests/unit/prompt_target/test_modality_support.py b/tests/unit/prompt_target/test_modality_support.py
@@ -172,11 +172,12 @@ def test_supported_modalities_properties():
     openai_target.model_name = "gpt-4o"  # Use multimodal model
     text_target = MockTextTarget()
     
-    # OpenAI with gpt-4o should return text and image_path
+    # OpenAI with gpt-4o should return text, image_path, and audio_path
     openai_input_modalities = openai_target.supported_input_modalities
     assert "text" in openai_input_modalities
     assert "image_path" in openai_input_modalities
-    assert len(openai_input_modalities) == 2
+    assert "audio_path" in openai_input_modalities
+    assert len(openai_input_modalities) == 3  # text, image_path, audio_path
     
     # Text target should return only text
     text_input_modalities = text_target.supported_input_modalities  
@@ -235,13 +236,16 @@ def test_openai_model_specific_capabilities():
 def test_exact_combination_matching():
     """Test that modality support requires exact combination matching."""
     target = MockOpenAITarget()
-    target.model_name = "gpt-4o"  # Use multimodal model
+    target.model_name = "gpt-4o"  # Use multimodal model with image+audio support
     
-    # Supported combinations: {text} and {text, image_path}
+    # Supported combinations for gpt-4o: text, text+image, text+audio, text+image+audio
     assert target.input_modality_supported({"text"})
     assert target.input_modality_supported({"text", "image_path"})
+    assert target.input_modality_supported({"text", "audio_path"})
+    assert target.input_modality_supported({"text", "image_path", "audio_path"})
     
-    # Unsupported combinations
+    # Unsupported combinations (text missing, or an unsupported modality included)
     assert not target.input_modality_supported({"image_path"})  # image only
-    assert not target.input_modality_supported({"text", "audio_path"})  # text+audio
-    assert not target.input_modality_supported({"text", "image_path", "audio_path"})  # text+image+audio
+    assert not target.input_modality_supported({"audio_path"})  # audio only
+    assert not target.input_modality_supported({"image_path", "audio_path"})  # image+audio without text
+    assert not target.input_modality_supported({"text", "video_path"})  # unsupported modality