@@ -64,16 +64,17 @@ class OpenAIChatTarget(OpenAITarget, PromptChatTarget):
6464
6565 """
6666
67- def _detect_model_capabilities (self ) -> bool :
67+ def _detect_model_capabilities (self ) -> dict [ str , bool ] :
6868 """
6969 Detect model multimodal capabilities via runtime testing.
7070
71- Sends a minimal multimodal test request to determine if the model
72- supports image inputs . This is the most robust approach that works
71+ Tests multiple modalities (image, audio, video) to determine what
72+ the model actually supports . This is the most robust approach that works
7373 regardless of model names or naming conventions.
7474
7575 Returns:
76- bool: True if model supports multimodal input, False if text-only
76+ dict: Mapping of modality types to support status
77+ e.g., {"image": True, "audio": False, "video": False}
7778 """
7879 # Cache the result to avoid repeated testing
7980 if not hasattr (self , '_capability_cache' ):
@@ -83,6 +84,35 @@ def _detect_model_capabilities(self) -> bool:
8384 if cache_key in self ._capability_cache :
8485 return self ._capability_cache [cache_key ]
8586
87+ # Test results for different modalities
88+ capabilities = {"image" : False , "audio" : False , "video" : False }
89+
90+ try :
91+ # Test image capabilities
92+ capabilities ["image" ] = self ._test_image_capability ()
93+
94+ # Test audio capabilities (if model supports it)
95+ capabilities ["audio" ] = self ._test_audio_capability ()
96+
97+ # Video testing is more complex and expensive, skip for now
98+ # Most current models don't support video anyway
99+ # capabilities["video"] = self._test_video_capability()
100+
101+ # Cache the result
102+ self ._capability_cache [cache_key ] = capabilities
103+ logger .info (f"Detected model { self .model_name } capabilities: { capabilities } " )
104+
105+ return capabilities
106+
107+ except Exception as e :
108+ # If runtime testing fails entirely, default to text-only as safe fallback
109+ logger .warning (f"Runtime capability detection failed: { e } . Defaulting to text-only." )
110+ default_capabilities = {"image" : False , "audio" : False , "video" : False }
111+ self ._capability_cache [cache_key ] = default_capabilities
112+ return default_capabilities
113+
114+ def _test_image_capability (self ) -> bool :
115+ """Test if model supports image inputs."""
86116 try :
87117 # Create minimal 1x1 pixel transparent PNG as base64
88118 # This is the smallest possible valid PNG image (67 bytes)
@@ -108,83 +138,106 @@ def _detect_model_capabilities(self) -> bool:
108138 ]
109139 }]
110140
111- # Test request body - minimal parameters to reduce cost/time
112- test_body = {
113- "model" : self .model_name ,
114- "messages" : test_messages ,
115- "max_tokens" : 1 , # Minimal response to reduce cost
116- "temperature" : 0.0 # Deterministic for consistency
117- }
118-
119- # Try the multimodal request
120- async def _test_capability ():
121- try :
122- response = await self ._async_client .chat .completions .create (** test_body )
123- # If we got a response, the model supports multimodal
124- return True
125- except Exception as e :
126- error_msg = str (e ).lower ()
127-
128- # Check for specific errors that indicate no multimodal support
129- no_vision_indicators = [
130- "does not support image inputs" ,
131- "vision is not supported" ,
132- "invalid content type" ,
133- "images not supported" ,
134- "multimodal not supported" ,
135- "text-only model"
136- ]
137-
138- if any (indicator in error_msg for indicator in no_vision_indicators ):
139- return False
140-
141- # For other errors (auth, rate limit, etc.), assume text-only as safe default
142- logger .warning (f"Capability test failed with error: { e } . Defaulting to text-only." )
143- return False
144-
145- # Run the test - handle both running and new event loops
146- try :
147- loop = asyncio .get_running_loop ()
148- # If we're in an async context, create a task
149- with concurrent .futures .ThreadPoolExecutor () as executor :
150- future = executor .submit (asyncio .run , _test_capability ())
151- result = future .result (timeout = 30 ) # 30 second timeout
152- except RuntimeError :
153- # No running loop, safe to use asyncio.run
154- result = asyncio .run (_test_capability ())
141+ return self ._run_capability_test (test_messages )
155142
156- # Cache the result
157- self ._capability_cache [cache_key ] = result
158- logger .info (f"Detected model { self .model_name } multimodal capability: { result } " )
143+ except Exception as e :
144+ logger .debug (f"Image capability test failed: { e } " )
145+ return False
146+
147+ def _test_audio_capability (self ) -> bool :
148+ """Test if model supports audio inputs."""
149+ try :
150+ # Create minimal audio test (this would need actual audio data)
151+ # For now, we'll assume audio follows similar patterns to image
152+ # TODO: Implement actual audio testing when we have sample audio data
159153
160- return result
154+ # Simplified test - just check if audio is mentioned in model capabilities
155+ # This is a placeholder until we implement full audio testing
156+ return False # Conservative default
161157
162158 except Exception as e :
163- # If runtime testing fails entirely, default to text-only as safe fallback
164- logger .warning (f"Runtime capability detection failed: { e } . Defaulting to text-only." )
165- self ._capability_cache [cache_key ] = False
159+ logger .debug (f"Audio capability test failed: { e } " )
166160 return False
161+
162+ def _run_capability_test (self , test_messages : list ) -> bool :
163+ """Run a capability test with the given messages."""
164+ # Test request body - minimal parameters to reduce cost/time
165+ test_body = {
166+ "model" : self .model_name ,
167+ "messages" : test_messages ,
168+ "max_tokens" : 1 , # Minimal response to reduce cost
169+ "temperature" : 0.0 # Deterministic for consistency
170+ }
171+
172+ # Try the multimodal request
173+ async def _test_capability ():
174+ try :
175+ response = await self ._async_client .chat .completions .create (** test_body )
176+ # If we got a response, the model supports this modality
177+ return True
178+ except Exception as e :
179+ error_msg = str (e ).lower ()
180+
181+ # Check for specific errors that indicate no multimodal support
182+ no_support_indicators = [
183+ "does not support image inputs" ,
184+ "does not support audio inputs" ,
185+ "does not support video inputs" ,
186+ "vision is not supported" ,
187+ "audio is not supported" ,
188+ "invalid content type" ,
189+ "images not supported" ,
190+ "audio not supported" ,
191+ "multimodal not supported" ,
192+ "text-only model"
193+ ]
194+
195+ if any (indicator in error_msg for indicator in no_support_indicators ):
196+ return False
197+
198+ # For other errors (auth, rate limit, etc.), assume not supported as safe default
199+ logger .debug (f"Capability test failed with error: { e } . Defaulting to not supported." )
200+ return False
201+
202+ # Run the test - handle both running and new event loops
203+ try :
204+ loop = asyncio .get_running_loop ()
205+ # If we're in an async context, create a task
206+ with concurrent .futures .ThreadPoolExecutor () as executor :
207+ future = executor .submit (asyncio .run , _test_capability ())
208+ result = future .result (timeout = 30 ) # 30 second timeout
209+ except RuntimeError :
210+ # No running loop, safe to use asyncio.run
211+ result = asyncio .run (_test_capability ())
212+
213+ return result
167214
168215 @property
169216 def SUPPORTED_INPUT_MODALITIES (self ) -> "set[frozenset[PromptDataType]]" :
170217 """
171218 Get supported input modalities based on the specific OpenAI model.
172219
173220 Uses runtime testing to detect multimodal capabilities:
174- - Sends a minimal test image+text request to the model
175- - Returns multimodal support if successful, text-only if not
221+ - Tests image, audio, and potentially video support
222+ - Returns appropriate modality combinations based on detected capabilities
176223 - Caches results to avoid repeated testing
177224 - Works with any model regardless of naming conventions
178225 """
179- if self ._detect_model_capabilities ():
180- return {
181- frozenset ({"text" }), # text-only
182- frozenset ({"text" , "image_path" }) # text+image
183- }
184- else :
185- return {
186- frozenset ({"text" }) # text-only
187- }
226+ capabilities = self ._detect_model_capabilities ()
227+
228+ modalities = {frozenset ({"text" })} # All models support text
229+
230+ if capabilities ["image" ]:
231+ modalities .add (frozenset ({"text" , "image_path" })) # text+image
232+
233+ if capabilities ["audio" ]:
234+ modalities .add (frozenset ({"text" , "audio_path" })) # text+audio
235+
236+ # Multi-modal combinations
237+ if capabilities ["image" ] and capabilities ["audio" ]:
238+ modalities .add (frozenset ({"text" , "image_path" , "audio_path" })) # text+image+audio
239+
240+ return modalities
188241
189242 @property
190243 def SUPPORTED_OUTPUT_MODALITIES (self ) -> "set[frozenset[PromptDataType]]" :
0 commit comments