Skip to content

Commit 7019ea1

Browse files
authored
Merge pull request #9 from sena-labs/claude/laughing-beaver-f67164
feat: handle audio and image output models from OpenRouter
2 parents 0f31796 + c2c7881 commit 7019ea1

2 files changed

Lines changed: 229 additions & 2 deletions

File tree

openrouter_pipe.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,26 @@ def _format_cost_info(usage: dict, currency: str = "USD") -> str:
156156
return f"\n\n---\n*{' · '.join(parts)}*"
157157

158158

159+
def _format_image_output(images: list) -> str:
160+
"""Format OpenRouter image output objects as markdown image tags.
161+
162+
Only http(s) and data:image/* URLs are rendered; others are dropped.
163+
Closing parentheses in URLs are percent-encoded to avoid breaking markdown.
164+
"""
165+
parts = []
166+
for img in (images or []):
167+
if not isinstance(img, dict):
168+
continue
169+
url = (img.get("image_url") or {}).get("url", "")
170+
if not url:
171+
continue
172+
lower = url.lower()
173+
if not (lower.startswith(("http://", "https://")) or lower.startswith("data:image/")):
174+
continue
175+
parts.append(f"![Generated image]({url.replace(')', '%29')})")
176+
return "\n\n".join(parts)
177+
178+
159179
class Pipe:
160180
class Valves(BaseModel):
161181
OPENROUTER_API_KEY: str = Field(
@@ -859,14 +879,27 @@ def _non_stream_response(self, headers: dict, payload: dict) -> str:
859879
citations = res.get("citations", [])
860880

861881
reasoning = _insert_citations(message.get("reasoning", ""), citations)
862-
content = _insert_citations(message.get("content", ""), citations)
882+
content = _insert_citations(message.get("content") or "", citations)
863883
rendered_citations = _format_citation_list(citations)
864884

885+
# Audio output: show transcript when the model returns audio instead of text
886+
audio_obj = message.get("audio") or {}
887+
if audio_obj and not content:
888+
transcript = audio_obj.get("transcript", "")
889+
content = transcript or "*[Audio response — transcript not available.]*"
890+
891+
# Image output: render generated images as markdown
892+
image_md = _format_image_output(message.get("images") or [])
893+
865894
final_parts = []
866895
if reasoning:
867896
final_parts.append(f"<think>\n{reasoning}\n</think>\n")
868897
if content:
869898
final_parts.append(content)
899+
if image_md:
900+
# Ensure a blank line before the image when there is preceding text
901+
prefix = "\n\n" if final_parts else ""
902+
final_parts.append(prefix + image_md)
870903

871904
# Show which fallback model actually responded
872905
actual_model = res.get("model", "")
@@ -951,7 +984,13 @@ def _close_think_tag():
951984
first_choice = choices[0] if choices and isinstance(choices[0], dict) else {}
952985
delta = first_choice.get("delta", {})
953986
reasoning = delta.get("reasoning", "")
954-
content = delta.get("content", "")
987+
content = delta.get("content") or ""
988+
989+
# Audio transcript fallback: stream the transcript when the model
990+
# returns audio instead of text (e.g. openai/gpt-audio).
991+
if not content:
992+
audio_delta = delta.get("audio") or {}
993+
content = audio_delta.get("transcript", "")
955994

956995
if reasoning:
957996
if not in_think:

test_pipe.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2109,6 +2109,194 @@ async def _test_pipe_no_msgs_key():
21092109
_assert("USD" in _cc_values, "COST_CURRENCY options: USD present")
21102110
_assert("EUR" in _cc_values, "COST_CURRENCY options: EUR present")
21112111

2112+
# ══════════════════════════════════════════════════════════════════════════════
2113+
# 34. Audio and image output model support
2114+
# ══════════════════════════════════════════════════════════════════════════════
2115+
2116+
_section("34. Audio / image output model support")
2117+
2118+
# 34a. _format_image_output — empty list returns empty string
2119+
from openrouter_pipe import _format_image_output
2120+
_assert(_format_image_output([]) == "", "_format_image_output: empty list → empty string")
2121+
_assert(_format_image_output(None) == "", "_format_image_output: None → empty string")
2122+
2123+
# 34b. Single image with valid URL → markdown tag
2124+
_img_single = [{"image_url": {"url": "data:image/png;base64,ABC=="}}]
2125+
_result_img = _format_image_output(_img_single)
2126+
_assert(_result_img == "![Generated image](data:image/png;base64,ABC==)", "_format_image_output: single image → markdown tag")
2127+
2128+
# 34c. Multiple images → joined with double newline
2129+
_img_multi = [
2130+
{"image_url": {"url": "data:image/png;base64,AAA=="}},
2131+
{"image_url": {"url": "data:image/png;base64,BBB=="}},
2132+
]
2133+
_result_multi = _format_image_output(_img_multi)
2134+
_assert("![Generated image](data:image/png;base64,AAA==)" in _result_multi, "_format_image_output: multi — first image present")
2135+
_assert("![Generated image](data:image/png;base64,BBB==)" in _result_multi, "_format_image_output: multi — second image present")
2136+
_assert("\n\n" in _result_multi, "_format_image_output: multi — separated by double newline")
2137+
2138+
# 34d. Non-dict items in list are skipped gracefully
2139+
_img_mixed = ["not_a_dict", {"image_url": {"url": "https://example.com/img.png"}}, 42]
2140+
_result_mixed = _format_image_output(_img_mixed)
2141+
_assert("https://example.com/img.png" in _result_mixed, "_format_image_output: non-dict items skipped, valid item rendered")
2142+
_assert("not_a_dict" not in _result_mixed, "_format_image_output: string item not in output")
2143+
2144+
# 34e. Image dict missing 'url' key → skipped
2145+
_img_no_url = [{"image_url": {}}, {"image_url": {"url": ""}}]
2146+
_assert(_format_image_output(_img_no_url) == "", "_format_image_output: missing/empty url → empty string")
2147+
2148+
# 34e2. Unsafe URL schemes are rejected
2149+
_img_js = [{"image_url": {"url": "javascript:alert(1)"}}]
2150+
_assert(_format_image_output(_img_js) == "", "_format_image_output: javascript: scheme → empty (rejected)")
2151+
_img_file = [{"image_url": {"url": "file:///etc/passwd"}}]
2152+
_assert(_format_image_output(_img_file) == "", "_format_image_output: file: scheme → empty (rejected)")
2153+
2154+
# 34e3. Closing parenthesis in URL is percent-encoded
2155+
_img_paren = [{"image_url": {"url": "https://example.com/img(1).png"}}]
2156+
_result_paren = _format_image_output(_img_paren)
2157+
_assert("%29" in _result_paren, "_format_image_output: ) in URL → percent-encoded as %29")
2158+
_assert("(1)" not in _result_paren, "_format_image_output: raw ) not in output")
2159+
2160+
# ── Non-streaming audio response ───────────────────────────────────────────
2161+
2162+
_pipe34 = Pipe()
2163+
_pipe34.valves = Pipe.Valves(OPENROUTER_API_KEY="k")
2164+
2165+
# 34f. Audio model with transcript → transcript used as content
2166+
_mock_audio = MagicMock()
2167+
_mock_audio.json.return_value = {
2168+
"choices": [{
2169+
"message": {
2170+
"content": None,
2171+
"audio": {"transcript": "Hello from audio", "data": "base64data...", "id": "audio_123"},
2172+
}
2173+
}]
2174+
}
2175+
with patch.object(_pipe34, "_retryable_request", return_value=_mock_audio):
2176+
_audio_result = _pipe34._non_stream_response({}, {})
2177+
_assert("Hello from audio" in _audio_result, "non-stream audio: transcript used as content")
2178+
2179+
# 34g. Audio model without transcript → placeholder message returned
2180+
_mock_audio_no_transcript = MagicMock()
2181+
_mock_audio_no_transcript.json.return_value = {
2182+
"choices": [{
2183+
"message": {
2184+
"content": None,
2185+
"audio": {"data": "base64audiodata...", "id": "audio_456"},
2186+
}
2187+
}]
2188+
}
2189+
with patch.object(_pipe34, "_retryable_request", return_value=_mock_audio_no_transcript):
2190+
_audio_no_tx_result = _pipe34._non_stream_response({}, {})
2191+
_assert("transcript not available" in _audio_no_tx_result, "non-stream audio no transcript: placeholder shown")
2192+
2193+
# 34h. Audio model with both content and audio → text content takes priority
2194+
_mock_audio_with_content = MagicMock()
2195+
_mock_audio_with_content.json.return_value = {
2196+
"choices": [{
2197+
"message": {
2198+
"content": "Text response",
2199+
"audio": {"transcript": "Audio transcript", "data": "base64..."},
2200+
}
2201+
}]
2202+
}
2203+
with patch.object(_pipe34, "_retryable_request", return_value=_mock_audio_with_content):
2204+
_audio_content_result = _pipe34._non_stream_response({}, {})
2205+
_assert("Text response" in _audio_content_result, "non-stream audio+content: text content preserved")
2206+
_assert("Audio transcript" not in _audio_content_result, "non-stream audio+content: transcript not used when content present")
2207+
2208+
# 34i. Image output model → markdown image tag in response
2209+
_mock_image = MagicMock()
2210+
_mock_image.json.return_value = {
2211+
"choices": [{
2212+
"message": {
2213+
"content": None,
2214+
"images": [{"image_url": {"url": "data:image/png;base64,IMGDATA=="}}],
2215+
}
2216+
}]
2217+
}
2218+
with patch.object(_pipe34, "_retryable_request", return_value=_mock_image):
2219+
_image_result = _pipe34._non_stream_response({}, {})
2220+
_assert("![Generated image]" in _image_result, "non-stream image: markdown image tag present")
2221+
_assert("IMGDATA==" in _image_result, "non-stream image: URL data in output")
2222+
2223+
# 34j. Image output with text content → both text and image in response, separated by blank line
2224+
_mock_image_with_text = MagicMock()
2225+
_mock_image_with_text.json.return_value = {
2226+
"choices": [{
2227+
"message": {
2228+
"content": "Here is the image:",
2229+
"images": [{"image_url": {"url": "data:image/png;base64,IMGDATA2=="}}],
2230+
}
2231+
}]
2232+
}
2233+
with patch.object(_pipe34, "_retryable_request", return_value=_mock_image_with_text):
2234+
_image_text_result = _pipe34._non_stream_response({}, {})
2235+
_assert("Here is the image:" in _image_text_result, "non-stream image+text: text preserved")
2236+
_assert("![Generated image]" in _image_text_result, "non-stream image+text: image markdown present")
2237+
_assert("\n\n![Generated image]" in _image_text_result, "non-stream image+text: blank line before image tag")
2238+
2239+
# 34j2. Image-only (no text) → no leading blank lines before image tag
2240+
_mock_image_only = MagicMock()
2241+
_mock_image_only.json.return_value = {
2242+
"choices": [{
2243+
"message": {
2244+
"content": None,
2245+
"images": [{"image_url": {"url": "data:image/png;base64,ONLY=="}}],
2246+
}
2247+
}]
2248+
}
2249+
with patch.object(_pipe34, "_retryable_request", return_value=_mock_image_only):
2250+
_image_only_result = _pipe34._non_stream_response({}, {})
2251+
_assert(_image_only_result.startswith("![Generated image]"), "non-stream image-only: no leading blank lines")
2252+
2253+
# 34k. message.content = None (or "") is handled without crashing
2254+
_mock_content_null = MagicMock()
2255+
_mock_content_null.json.return_value = {
2256+
"choices": [{"message": {"content": None}}]
2257+
}
2258+
with patch.object(_pipe34, "_retryable_request", return_value=_mock_content_null):
2259+
_null_result = _pipe34._non_stream_response({}, {})
2260+
_assert(isinstance(_null_result, str), "non-stream content=None: returns string (no crash)")
2261+
2262+
# ── Streaming audio response ────────────────────────────────────────────────
2263+
2264+
_pipe34s = Pipe()
2265+
_pipe34s.valves = Pipe.Valves(OPENROUTER_API_KEY="k")
2266+
2267+
# 34l. Audio transcript in streaming delta → yielded as content
2268+
_sse_audio_chunks = [
2269+
b"data: " + json.dumps({"choices": [{"delta": {"content": "", "audio": {"transcript": "Hello "}}}]}).encode(),
2270+
b"data: " + json.dumps({"choices": [{"delta": {"content": "", "audio": {"transcript": "world"}}}]}).encode(),
2271+
b"data: [DONE]",
2272+
]
2273+
with patch.object(_pipe34s, "_retryable_request", return_value=_make_sse_response(_sse_audio_chunks)):
2274+
_stream_audio_chunks = list(_pipe34s._stream_response({}, {}))
2275+
_stream_audio_full = "".join(_stream_audio_chunks)
2276+
_assert("Hello " in _stream_audio_full, "stream audio: first transcript chunk yielded")
2277+
_assert("world" in _stream_audio_full, "stream audio: second transcript chunk yielded")
2278+
2279+
# 34m. Mixed stream: normal content chunks + audio transcript fallback
2280+
_sse_mixed_chunks = [
2281+
b"data: " + json.dumps({"choices": [{"delta": {"content": "Text first"}}]}).encode(),
2282+
b"data: " + json.dumps({"choices": [{"delta": {"content": "", "audio": {"transcript": " then audio"}}}]}).encode(),
2283+
b"data: [DONE]",
2284+
]
2285+
with patch.object(_pipe34s, "_retryable_request", return_value=_make_sse_response(_sse_mixed_chunks)):
2286+
_mixed_chunks = list(_pipe34s._stream_response({}, {}))
2287+
_mixed_full = "".join(_mixed_chunks)
2288+
_assert("Text first" in _mixed_full, "stream mixed: text content chunk present")
2289+
_assert("then audio" in _mixed_full, "stream mixed: audio transcript chunk present")
2290+
2291+
# 34n. Stream delta with content=None → handled as empty (no crash)
2292+
_sse_null_content = [
2293+
b"data: " + json.dumps({"choices": [{"delta": {"content": None}}]}).encode(),
2294+
b"data: [DONE]",
2295+
]
2296+
with patch.object(_pipe34s, "_retryable_request", return_value=_make_sse_response(_sse_null_content)):
2297+
_null_chunks = list(_pipe34s._stream_response({}, {}))
2298+
_assert(isinstance("".join(_null_chunks), str), "stream content=None delta: no crash")
2299+
21122300
# ══════════════════════════════════════════════════════════════════════════════
21132301
# Summary
21142302
# ══════════════════════════════════════════════════════════════════════════════

0 commit comments

Comments
 (0)