Skip to content

Commit a4f4d3c

Browse files
sena-labs and claude committed
feat: handle audio and image output models from OpenRouter
OpenRouter exposes models that return audio transcripts (message.audio) and generated images (message.images) instead of text in message.content. Without this fix, those models return an empty response in Open WebUI.

- Add _format_image_output() to render image URLs as markdown image tags.
- In _non_stream_response: fall back to audio.transcript when content is empty/None, and append any images as markdown after the text content.
- In _stream_response: use delta.audio.transcript as the streamed text when delta.content is absent (audio-streaming models).
- Add 23 unit tests covering all new paths (section 34 in test_pipe.py).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent a7e973d commit a4f4d3c

2 files changed

Lines changed: 192 additions & 2 deletions

File tree

openrouter_pipe.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,18 @@ def _format_cost_info(usage: dict, currency: str = "USD") -> str:
156156
return f"\n\n---\n*{' · '.join(parts)}*"
157157

158158

159+
def _format_image_output(images: list) -> str:
160+
"""Format OpenRouter image output objects as markdown image tags."""
161+
parts = []
162+
for img in (images or []):
163+
if not isinstance(img, dict):
164+
continue
165+
url = (img.get("image_url") or {}).get("url", "")
166+
if url:
167+
parts.append(f"![Generated image]({url})")
168+
return "\n\n".join(parts)
169+
170+
159171
class Pipe:
160172
class Valves(BaseModel):
161173
OPENROUTER_API_KEY: str = Field(
@@ -859,14 +871,25 @@ def _non_stream_response(self, headers: dict, payload: dict) -> str:
859871
citations = res.get("citations", [])
860872

861873
reasoning = _insert_citations(message.get("reasoning", ""), citations)
862-
content = _insert_citations(message.get("content", ""), citations)
874+
content = _insert_citations(message.get("content") or "", citations)
863875
rendered_citations = _format_citation_list(citations)
864876

877+
# Audio output: show transcript when the model returns audio instead of text
878+
audio_obj = message.get("audio") or {}
879+
if audio_obj and not content:
880+
transcript = audio_obj.get("transcript", "")
881+
content = transcript or "*[Audio response — transcript not available.]*"
882+
883+
# Image output: render generated images as markdown
884+
image_md = _format_image_output(message.get("images") or [])
885+
865886
final_parts = []
866887
if reasoning:
867888
final_parts.append(f"<think>\n{reasoning}\n</think>\n")
868889
if content:
869890
final_parts.append(content)
891+
if image_md:
892+
final_parts.append(image_md)
870893

871894
# Show which fallback model actually responded
872895
actual_model = res.get("model", "")
@@ -951,7 +974,13 @@ def _close_think_tag():
951974
first_choice = choices[0] if choices and isinstance(choices[0], dict) else {}
952975
delta = first_choice.get("delta", {})
953976
reasoning = delta.get("reasoning", "")
954-
content = delta.get("content", "")
977+
content = delta.get("content") or ""
978+
979+
# Audio transcript fallback: stream the transcript when the model
980+
# returns audio instead of text (e.g. openai/gpt-audio).
981+
if not content:
982+
audio_delta = delta.get("audio") or {}
983+
content = audio_delta.get("transcript", "")
955984

956985
if reasoning:
957986
if not in_think:

test_pipe.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2109,6 +2109,167 @@ async def _test_pipe_no_msgs_key():
21092109
_assert("USD" in _cc_values, "COST_CURRENCY options: USD present")
21102110
_assert("EUR" in _cc_values, "COST_CURRENCY options: EUR present")
21112111

2112+
# ══════════════════════════════════════════════════════════════════════════════
2113+
# 34. Audio and image output model support
2114+
# ══════════════════════════════════════════════════════════════════════════════
2115+
2116+
_section("34. Audio / image output model support")
2117+
2118+
# 34a. _format_image_output — empty list returns empty string
2119+
from openrouter_pipe import _format_image_output
2120+
_assert(_format_image_output([]) == "", "_format_image_output: empty list → empty string")
2121+
_assert(_format_image_output(None) == "", "_format_image_output: None → empty string")
2122+
2123+
# 34b. Single image with valid URL → markdown tag
2124+
_img_single = [{"image_url": {"url": "data:image/png;base64,ABC=="}}]
2125+
_result_img = _format_image_output(_img_single)
2126+
_assert(_result_img == "![Generated image](data:image/png;base64,ABC==)", "_format_image_output: single image → markdown tag")
2127+
2128+
# 34c. Multiple images → joined with double newline
2129+
_img_multi = [
2130+
{"image_url": {"url": "data:image/png;base64,AAA=="}},
2131+
{"image_url": {"url": "data:image/png;base64,BBB=="}},
2132+
]
2133+
_result_multi = _format_image_output(_img_multi)
2134+
_assert("![Generated image](data:image/png;base64,AAA==)" in _result_multi, "_format_image_output: multi — first image present")
2135+
_assert("![Generated image](data:image/png;base64,BBB==)" in _result_multi, "_format_image_output: multi — second image present")
2136+
_assert("\n\n" in _result_multi, "_format_image_output: multi — separated by double newline")
2137+
2138+
# 34d. Non-dict items in list are skipped gracefully
2139+
_img_mixed = ["not_a_dict", {"image_url": {"url": "https://example.com/img.png"}}, 42]
2140+
_result_mixed = _format_image_output(_img_mixed)
2141+
_assert("https://example.com/img.png" in _result_mixed, "_format_image_output: non-dict items skipped, valid item rendered")
2142+
_assert("not_a_dict" not in _result_mixed, "_format_image_output: string item not in output")
2143+
2144+
# 34e. Image dict missing 'url' key → skipped
2145+
_img_no_url = [{"image_url": {}}, {"image_url": {"url": ""}}]
2146+
_assert(_format_image_output(_img_no_url) == "", "_format_image_output: missing/empty url → empty string")
2147+
2148+
# ── Non-streaming audio response ───────────────────────────────────────────
2149+
2150+
_pipe34 = Pipe()
2151+
_pipe34.valves = Pipe.Valves(OPENROUTER_API_KEY="k")
2152+
2153+
# 34f. Audio model with transcript → transcript used as content
2154+
_mock_audio = MagicMock()
2155+
_mock_audio.json.return_value = {
2156+
"choices": [{
2157+
"message": {
2158+
"content": None,
2159+
"audio": {"transcript": "Hello from audio", "data": "base64data...", "id": "audio_123"},
2160+
}
2161+
}]
2162+
}
2163+
with patch.object(_pipe34, "_retryable_request", return_value=_mock_audio):
2164+
_audio_result = _pipe34._non_stream_response({}, {})
2165+
_assert("Hello from audio" in _audio_result, "non-stream audio: transcript used as content")
2166+
2167+
# 34g. Audio model without transcript → placeholder message returned
2168+
_mock_audio_no_transcript = MagicMock()
2169+
_mock_audio_no_transcript.json.return_value = {
2170+
"choices": [{
2171+
"message": {
2172+
"content": None,
2173+
"audio": {"data": "base64audiodata...", "id": "audio_456"},
2174+
}
2175+
}]
2176+
}
2177+
with patch.object(_pipe34, "_retryable_request", return_value=_mock_audio_no_transcript):
2178+
_audio_no_tx_result = _pipe34._non_stream_response({}, {})
2179+
_assert("transcript not available" in _audio_no_tx_result, "non-stream audio no transcript: placeholder shown")
2180+
2181+
# 34h. Audio model with both content and audio → text content takes priority
2182+
_mock_audio_with_content = MagicMock()
2183+
_mock_audio_with_content.json.return_value = {
2184+
"choices": [{
2185+
"message": {
2186+
"content": "Text response",
2187+
"audio": {"transcript": "Audio transcript", "data": "base64..."},
2188+
}
2189+
}]
2190+
}
2191+
with patch.object(_pipe34, "_retryable_request", return_value=_mock_audio_with_content):
2192+
_audio_content_result = _pipe34._non_stream_response({}, {})
2193+
_assert("Text response" in _audio_content_result, "non-stream audio+content: text content preserved")
2194+
_assert("Audio transcript" not in _audio_content_result, "non-stream audio+content: transcript not used when content present")
2195+
2196+
# 34i. Image output model → markdown image tag in response
2197+
_mock_image = MagicMock()
2198+
_mock_image.json.return_value = {
2199+
"choices": [{
2200+
"message": {
2201+
"content": None,
2202+
"images": [{"image_url": {"url": "data:image/png;base64,IMGDATA=="}}],
2203+
}
2204+
}]
2205+
}
2206+
with patch.object(_pipe34, "_retryable_request", return_value=_mock_image):
2207+
_image_result = _pipe34._non_stream_response({}, {})
2208+
_assert("![Generated image]" in _image_result, "non-stream image: markdown image tag present")
2209+
_assert("IMGDATA==" in _image_result, "non-stream image: URL data in output")
2210+
2211+
# 34j. Image output with text content → both text and image in response
2212+
_mock_image_with_text = MagicMock()
2213+
_mock_image_with_text.json.return_value = {
2214+
"choices": [{
2215+
"message": {
2216+
"content": "Here is the image:",
2217+
"images": [{"image_url": {"url": "data:image/png;base64,IMGDATA2=="}}],
2218+
}
2219+
}]
2220+
}
2221+
with patch.object(_pipe34, "_retryable_request", return_value=_mock_image_with_text):
2222+
_image_text_result = _pipe34._non_stream_response({}, {})
2223+
_assert("Here is the image:" in _image_text_result, "non-stream image+text: text preserved")
2224+
_assert("![Generated image]" in _image_text_result, "non-stream image+text: image markdown present")
2225+
2226+
# 34k. message.content = None (or "") is handled without a crash
2227+
_mock_content_null = MagicMock()
2228+
_mock_content_null.json.return_value = {
2229+
"choices": [{"message": {"content": None}}]
2230+
}
2231+
with patch.object(_pipe34, "_retryable_request", return_value=_mock_content_null):
2232+
_null_result = _pipe34._non_stream_response({}, {})
2233+
_assert(isinstance(_null_result, str), "non-stream content=None: returns string (no crash)")
2234+
2235+
# ── Streaming audio response ────────────────────────────────────────────────
2236+
2237+
_pipe34s = Pipe()
2238+
_pipe34s.valves = Pipe.Valves(OPENROUTER_API_KEY="k")
2239+
2240+
# 34l. Audio transcript in streaming delta → yielded as content
2241+
_sse_audio_chunks = [
2242+
b"data: " + json.dumps({"choices": [{"delta": {"content": "", "audio": {"transcript": "Hello "}}}]}).encode(),
2243+
b"data: " + json.dumps({"choices": [{"delta": {"content": "", "audio": {"transcript": "world"}}}]}).encode(),
2244+
b"data: [DONE]",
2245+
]
2246+
with patch.object(_pipe34s, "_retryable_request", return_value=_make_sse_response(_sse_audio_chunks)):
2247+
_stream_audio_chunks = list(_pipe34s._stream_response({}, {}))
2248+
_stream_audio_full = "".join(_stream_audio_chunks)
2249+
_assert("Hello " in _stream_audio_full, "stream audio: first transcript chunk yielded")
2250+
_assert("world" in _stream_audio_full, "stream audio: second transcript chunk yielded")
2251+
2252+
# 34m. Mixed stream: normal content chunks + audio transcript fallback
2253+
_sse_mixed_chunks = [
2254+
b"data: " + json.dumps({"choices": [{"delta": {"content": "Text first"}}]}).encode(),
2255+
b"data: " + json.dumps({"choices": [{"delta": {"content": "", "audio": {"transcript": " then audio"}}}]}).encode(),
2256+
b"data: [DONE]",
2257+
]
2258+
with patch.object(_pipe34s, "_retryable_request", return_value=_make_sse_response(_sse_mixed_chunks)):
2259+
_mixed_chunks = list(_pipe34s._stream_response({}, {}))
2260+
_mixed_full = "".join(_mixed_chunks)
2261+
_assert("Text first" in _mixed_full, "stream mixed: text content chunk present")
2262+
_assert("then audio" in _mixed_full, "stream mixed: audio transcript chunk present")
2263+
2264+
# 34n. Stream delta with content=None → handled as empty (no crash)
2265+
_sse_null_content = [
2266+
b"data: " + json.dumps({"choices": [{"delta": {"content": None}}]}).encode(),
2267+
b"data: [DONE]",
2268+
]
2269+
with patch.object(_pipe34s, "_retryable_request", return_value=_make_sse_response(_sse_null_content)):
2270+
_null_chunks = list(_pipe34s._stream_response({}, {}))
2271+
_assert(isinstance("".join(_null_chunks), str), "stream content=None delta: no crash")
2272+
21122273
# ══════════════════════════════════════════════════════════════════════════════
21132274
# Summary
21142275
# ══════════════════════════════════════════════════════════════════════════════

0 commit comments

Comments
 (0)