# Characters that would terminate or corrupt a markdown link destination,
# mapped to their percent-encoded form.  Both parentheses are encoded: encoding
# only ")" leaves "(" unbalanced, which stops CommonMark parsers from
# recognising the destination at all.
_MD_URL_ESCAPES = str.maketrans({
    "(": "%28",
    ")": "%29",
    " ": "%20",
    "\t": "%09",
    "\n": "%0A",
    "\r": "%0D",
})

# URL prefixes (compared case-insensitively) that may be rendered as images.
_ALLOWED_IMAGE_URL_PREFIXES = ("http://", "https://", "data:image/")


def _format_image_output(images: list) -> str:
    """Format OpenRouter image output objects as markdown image tags.

    Each entry is expected to look like ``{"image_url": {"url": "..."}}``.
    Entries with any other shape (non-dict entry, non-dict ``image_url``,
    missing / empty / non-string ``url``) are skipped instead of raising.

    Only http(s) and data:image/* URLs are rendered; others are dropped.
    Surrounding whitespace is stripped, and parentheses and whitespace inside
    the URL are percent-encoded so they cannot break the markdown syntax.

    Args:
        images: List of image objects; ``None`` or any non-list value is
            treated as "no images".

    Returns:
        One ``![Generated image](url)`` tag per accepted image, joined by a
        blank line, or an empty string when nothing was rendered.
    """
    if not isinstance(images, (list, tuple)):
        return ""

    tags = []
    for img in images:
        if not isinstance(img, dict):
            continue
        image_url = img.get("image_url")
        if not isinstance(image_url, dict):
            continue
        url = image_url.get("url")
        if not isinstance(url, str):
            continue
        url = url.strip()
        # Scheme allow-list; an empty string matches none of the prefixes.
        if not url.lower().startswith(_ALLOWED_IMAGE_URL_PREFIXES):
            continue
        tags.append(f"![Generated image]({url.translate(_MD_URL_ESCAPES)})")
    return "\n\n".join(tags)
final_parts else "" + final_parts.append(prefix + image_md) # Show which fallback model actually responded actual_model = res.get("model", "") @@ -951,7 +984,13 @@ def _close_think_tag(): first_choice = choices[0] if choices and isinstance(choices[0], dict) else {} delta = first_choice.get("delta", {}) reasoning = delta.get("reasoning", "") - content = delta.get("content", "") + content = delta.get("content") or "" + + # Audio transcript fallback: stream the transcript when the model + # returns audio instead of text (e.g. openai/gpt-audio). + if not content: + audio_delta = delta.get("audio") or {} + content = audio_delta.get("transcript", "") if reasoning: if not in_think: diff --git a/test_pipe.py b/test_pipe.py index 7f3a8f3..09fec7a 100644 --- a/test_pipe.py +++ b/test_pipe.py @@ -2109,6 +2109,194 @@ async def _test_pipe_no_msgs_key(): _assert("USD" in _cc_values, "COST_CURRENCY options: USD present") _assert("EUR" in _cc_values, "COST_CURRENCY options: EUR present") +# ══════════════════════════════════════════════════════════════════════════════ +# 34. Audio and image output model support +# ══════════════════════════════════════════════════════════════════════════════ + +_section("34. Audio / image output model support") + +# 34a. _format_image_output — empty list returns empty string +from openrouter_pipe import _format_image_output +_assert(_format_image_output([]) == "", "_format_image_output: empty list → empty string") +_assert(_format_image_output(None) == "", "_format_image_output: None → empty string") + +# 34b. Single image with valid URL → markdown tag +_img_single = [{"image_url": {"url": "data:image/png;base64,ABC=="}}] +_result_img = _format_image_output(_img_single) +_assert(_result_img == "![Generated image](data:image/png;base64,ABC==)", "_format_image_output: single image → markdown tag") + +# 34c. 
# 34c. Multiple images → joined with double newline
# Two data-URL images; each one should appear as its own markdown tag.
_img_multi = [
    {"image_url": {"url": f"data:image/png;base64,{_b64}"}}
    for _b64 in ("AAA==", "BBB==")
]
_result_multi = _format_image_output(_img_multi)
for _ok, _label in (
    (
        "![Generated image](data:image/png;base64,AAA==)" in _result_multi,
        "_format_image_output: multi — first image present",
    ),
    (
        "![Generated image](data:image/png;base64,BBB==)" in _result_multi,
        "_format_image_output: multi — second image present",
    ),
    (
        "\n\n" in _result_multi,
        "_format_image_output: multi — separated by double newline",
    ),
):
    _assert(_ok, _label)

# 34d. Non-dict items in list are skipped gracefully
_valid_entry = {"image_url": {"url": "https://example.com/img.png"}}
_img_mixed = ["not_a_dict", _valid_entry, 42]
_result_mixed = _format_image_output(_img_mixed)
_assert(
    "https://example.com/img.png" in _result_mixed,
    "_format_image_output: non-dict items skipped, valid item rendered",
)
_assert(
    "not_a_dict" not in _result_mixed,
    "_format_image_output: string item not in output",
)

# 34e. Image dict missing 'url' key → skipped
_img_no_url = [{"image_url": {}}, {"image_url": {"url": ""}}]
_no_url_rendered = _format_image_output(_img_no_url)
_assert(
    _no_url_rendered == "",
    "_format_image_output: missing/empty url → empty string",
)

# 34e2. Unsafe URL schemes are rejected
_img_js = [{"image_url": {"url": "javascript:alert(1)"}}]
_js_rendered = _format_image_output(_img_js)
_assert(
    _js_rendered == "",
    "_format_image_output: javascript: scheme → empty (rejected)",
)
_img_file = [{"image_url": {"url": "file:///etc/passwd"}}]
_file_rendered = _format_image_output(_img_file)
_assert(
    _file_rendered == "",
    "_format_image_output: file: scheme → empty (rejected)",
)
# 34e3. Closing parenthesis in URL is percent-encoded
_img_paren = [{"image_url": {"url": "https://example.com/img(1).png"}}]
_result_paren = _format_image_output(_img_paren)
_assert(
    "%29" in _result_paren,
    "_format_image_output: ) in URL → percent-encoded as %29",
)
_assert(
    "(1)" not in _result_paren,
    "_format_image_output: raw ) not in output",
)

# ── Non-streaming audio response ───────────────────────────────────────────

# Pipe instance used by the non-streaming checks in this section.
_pipe34 = Pipe()
_pipe34.valves = Pipe.Valves(OPENROUTER_API_KEY="k")

# 34f. Audio model with transcript → transcript used as content
_audio_message = {
    "content": None,
    "audio": {"transcript": "Hello from audio", "data": "base64data...", "id": "audio_123"},
}
_mock_audio = MagicMock(**{"json.return_value": {"choices": [{"message": _audio_message}]}})
with patch.object(_pipe34, "_retryable_request", return_value=_mock_audio):
    _audio_result = _pipe34._non_stream_response({}, {})
_assert(
    "Hello from audio" in _audio_result,
    "non-stream audio: transcript used as content",
)

# 34g. Audio model without transcript → placeholder message returned
_audio_message_no_tx = {
    "content": None,
    "audio": {"data": "base64audiodata...", "id": "audio_456"},
}
_mock_audio_no_transcript = MagicMock(
    **{"json.return_value": {"choices": [{"message": _audio_message_no_tx}]}}
)
with patch.object(_pipe34, "_retryable_request", return_value=_mock_audio_no_transcript):
    _audio_no_tx_result = _pipe34._non_stream_response({}, {})
_assert(
    "transcript not available" in _audio_no_tx_result,
    "non-stream audio no transcript: placeholder shown",
)
# 34h. Audio model with both content and audio → text content takes priority
_both_message = {
    "content": "Text response",
    "audio": {"transcript": "Audio transcript", "data": "base64..."},
}
_mock_audio_with_content = MagicMock(
    **{"json.return_value": {"choices": [{"message": _both_message}]}}
)
with patch.object(_pipe34, "_retryable_request", return_value=_mock_audio_with_content):
    _audio_content_result = _pipe34._non_stream_response({}, {})
_assert(
    "Text response" in _audio_content_result,
    "non-stream audio+content: text content preserved",
)
_assert(
    "Audio transcript" not in _audio_content_result,
    "non-stream audio+content: transcript not used when content present",
)

# 34i. Image output model → markdown image tag in response
_image_message = {
    "content": None,
    "images": [{"image_url": {"url": "data:image/png;base64,IMGDATA=="}}],
}
_mock_image = MagicMock(
    **{"json.return_value": {"choices": [{"message": _image_message}]}}
)
with patch.object(_pipe34, "_retryable_request", return_value=_mock_image):
    _image_result = _pipe34._non_stream_response({}, {})
_assert(
    "![Generated image]" in _image_result,
    "non-stream image: markdown image tag present",
)
_assert(
    "IMGDATA==" in _image_result,
    "non-stream image: URL data in output",
)
# 34j. Image output with text content → both text and image in response, separated by blank line
_image_text_message = {
    "content": "Here is the image:",
    "images": [{"image_url": {"url": "data:image/png;base64,IMGDATA2=="}}],
}
_mock_image_with_text = MagicMock(
    **{"json.return_value": {"choices": [{"message": _image_text_message}]}}
)
with patch.object(_pipe34, "_retryable_request", return_value=_mock_image_with_text):
    _image_text_result = _pipe34._non_stream_response({}, {})
for _ok, _label in (
    (
        "Here is the image:" in _image_text_result,
        "non-stream image+text: text preserved",
    ),
    (
        "![Generated image]" in _image_text_result,
        "non-stream image+text: image markdown present",
    ),
    (
        "\n\n![Generated image]" in _image_text_result,
        "non-stream image+text: blank line before image tag",
    ),
):
    _assert(_ok, _label)

# 34j2. Image-only (no text) → no leading blank lines before image tag
_image_only_message = {
    "content": None,
    "images": [{"image_url": {"url": "data:image/png;base64,ONLY=="}}],
}
_mock_image_only = MagicMock(
    **{"json.return_value": {"choices": [{"message": _image_only_message}]}}
)
with patch.object(_pipe34, "_retryable_request", return_value=_mock_image_only):
    _image_only_result = _pipe34._non_stream_response({}, {})
_assert(
    _image_only_result.startswith("![Generated image]"),
    "non-stream image-only: no leading blank lines",
)

# 34k. message.content = None handled without crash (or "")
_mock_content_null = MagicMock(
    **{"json.return_value": {"choices": [{"message": {"content": None}}]}}
)
with patch.object(_pipe34, "_retryable_request", return_value=_mock_content_null):
    _null_result = _pipe34._non_stream_response({}, {})
_assert(
    isinstance(_null_result, str),
    "non-stream content=None: returns string (no crash)",
)

# ── Streaming audio response ────────────────────────────────────────────────

# Pipe instance used by the streaming checks in this section.
_pipe34s = Pipe()
_pipe34s.valves = Pipe.Valves(OPENROUTER_API_KEY="k")
# 34l. Audio transcript in streaming delta → yielded as content
# One SSE data line per transcript piece, followed by the [DONE] sentinel.
_sse_audio_chunks = [
    b"data: " + json.dumps(
        {"choices": [{"delta": {"content": "", "audio": {"transcript": _piece}}}]}
    ).encode()
    for _piece in ("Hello ", "world")
]
_sse_audio_chunks.append(b"data: [DONE]")
_audio_sse_response = _make_sse_response(_sse_audio_chunks)
with patch.object(_pipe34s, "_retryable_request", return_value=_audio_sse_response):
    _stream_audio_chunks = list(_pipe34s._stream_response({}, {}))
_stream_audio_full = "".join(_stream_audio_chunks)
_assert(
    "Hello " in _stream_audio_full,
    "stream audio: first transcript chunk yielded",
)
_assert(
    "world" in _stream_audio_full,
    "stream audio: second transcript chunk yielded",
)

# 34m. Mixed stream: normal content chunks + audio transcript fallback
_mixed_deltas = (
    {"content": "Text first"},
    {"content": "", "audio": {"transcript": " then audio"}},
)
_sse_mixed_chunks = [
    b"data: " + json.dumps({"choices": [{"delta": _delta}]}).encode()
    for _delta in _mixed_deltas
]
_sse_mixed_chunks.append(b"data: [DONE]")
_mixed_sse_response = _make_sse_response(_sse_mixed_chunks)
with patch.object(_pipe34s, "_retryable_request", return_value=_mixed_sse_response):
    _mixed_chunks = list(_pipe34s._stream_response({}, {}))
_mixed_full = "".join(_mixed_chunks)
_assert(
    "Text first" in _mixed_full,
    "stream mixed: text content chunk present",
)
_assert(
    "then audio" in _mixed_full,
    "stream mixed: audio transcript chunk present",
)
# 34n. Stream delta with content=None → handled as empty (no crash)
_null_delta_line = b"data: " + json.dumps(
    {"choices": [{"delta": {"content": None}}]}
).encode()
_sse_null_content = [_null_delta_line, b"data: [DONE]"]
_null_sse_response = _make_sse_response(_sse_null_content)
with patch.object(_pipe34s, "_retryable_request", return_value=_null_sse_response):
    _null_chunks = list(_pipe34s._stream_response({}, {}))
# Joining fails if anything other than str was yielded.
_null_joined = "".join(_null_chunks)
_assert(
    isinstance(_null_joined, str),
    "stream content=None delta: no crash",
)

# ══════════════════════════════════════════════════════════════════════════════
# Summary
# ══════════════════════════════════════════════════════════════════════════════