PolyTalkIO
diff --git a/‎.env.example‎
Lines changed: 22 additions & 0 deletions b/‎.env.example‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 11 additions & 0 deletions b/‎README.md‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎app/config.py‎
Lines changed: 10 additions & 1 deletion b/‎app/config.py‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎app/main.py‎
Lines changed: 6 additions & 2 deletions b/‎app/main.py‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎app/routers/api.py‎
Lines changed: 102 additions & 1 deletion b/‎app/routers/api.py‎
Lines changed: 102 additions & 1 deletion
diff --git a/‎app/services/base.py‎
Lines changed: 3 additions & 0 deletions b/‎app/services/base.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎app/services/pipeline_service.py‎
Lines changed: 33 additions & 0 deletions b/‎app/services/pipeline_service.py‎
Lines changed: 33 additions & 0 deletions
@@ -111,6 +111,28 @@ TRANSLATION_MODEL=qwen3-8b
 # allow enough room for Indic-script targets and longer sentence buffers.
 TRANSLATION_MAX_TOKENS=240
 
+# ============================================================================
+# VISUAL CONTEXT SERVICE (Shared Tab/Page Screenshot)
+# ============================================================================
+# Enable one-time shared tab/page screenshot summarization when tab audio
+# sharing starts. The summary is used as translation context only.
+VISUAL_CONTEXT_ENABLED=false
+
+# Keep mock mode enabled for local setup. For real visual context, set this to
+# false and configure a vision-capable provider below.
+VISUAL_CONTEXT_MOCK_MODE=true
+
+# Visual context can use a separate vision-capable provider/model, or mirror the
+# translation provider if it supports image inputs.
+VISUAL_CONTEXT_BASE_URL=https://ai.example.com
+VISUAL_CONTEXT_ENDPOINT=/v1/chat/completions
+VISUAL_CONTEXT_API_FORMAT=openai_chat
+VISUAL_CONTEXT_API_KEY=your_visual_context_api_key_here
+VISUAL_CONTEXT_MODEL=gpt-4o-mini
+
+# Maximum output tokens for the compact screenshot summary.
+VISUAL_CONTEXT_MAX_TOKENS=300
+
 # ============================================================================
 # TTS SERVICE (Local Text-to-Speech with Piper)
 # ============================================================================
 
@@ -301,6 +301,13 @@ The main latency knobs are:
 | `STT_TRANSCRIBE_WORKERS` | `2` | Per-stream STT transcription workers. Use more than 1 only when the GPU has spare compute. |
 | `STT_TRANSCRIBE_QUEUE_SIZE` | `8` | Max queued audio windows per stream before receiver backpressure. |
 | `STT_MODEL_WORKERS` | `2` | faster-whisper/CTranslate2 model workers for concurrent transcribe calls. |
+| `VISUAL_CONTEXT_ENABLED` | unset (treated as `false`) | Enable one-time shared tab/page screenshot summarization when tab audio sharing starts. |
+| `VISUAL_CONTEXT_BASE_URL` | `TRANSLATION_BASE_URL` | Optional separate base URL for the vision-capable visual context provider. |
+| `VISUAL_CONTEXT_API_KEY` | `TRANSLATION_API_KEY` | Optional separate API key for the visual context provider. |
+| `VISUAL_CONTEXT_ENDPOINT` | `TRANSLATION_ENDPOINT` | Optional separate endpoint for the visual context provider. |
+| `VISUAL_CONTEXT_API_FORMAT` | `TRANSLATION_API_FORMAT` | Optional separate API format for the visual context provider. |
+| `VISUAL_CONTEXT_MODEL` | `TRANSLATION_MODEL` | Vision-capable model used to summarize the shared tab/page screenshot. |
+| `VISUAL_CONTEXT_MAX_TOKENS` | `240` | Maximum output tokens for the visual context summary. |
 | `app.translation_flush_chars` | `300` | Translate buffered text once this many characters are available. |
 | `app.translation_flush_seconds` | `5.0` | Translate buffered text after this many seconds if enough text is available. |
 | `app.translation_flush_min_chars` | `120` | Minimum text required for time-based translation flushing. |
@@ -563,3 +570,7 @@ guidance for adding custom provider adapters.
 - Persist `media/output` if generated audio should survive restarts.
 - Treat transcripts, translations, and generated audio as user data.
 - Review AGPL-3.0 obligations before offering a modified hosted service.
+
+### Shared Tab Visual Context
+
+When `VISUAL_CONTEXT_ENABLED=true`, tab-audio sessions capture one browser-approved shared tab/page screenshot after sharing starts. PolyTalk sends the image for immediate summarization and does not store the raw screenshot. The generated summary is used as a translation hint for visible titles, names, labels, and domain vocabulary; spoken audio remains authoritative if it conflicts with the visual hint.
@@ -125,6 +125,11 @@ def translation(self) -> dict:
         """Get translation AI configuration."""
         return self._config.get("translation", {})
 
+    @property
+    def visual_context(self) -> dict:
+        """Return the ``visual_context`` config section, or ``{}`` if absent."""
+        section = self._config.get("visual_context", {})
+        return section
+
     @property
     def tts(self) -> dict:
         """Get TTS configuration."""
@@ -153,7 +158,11 @@ def host(self) -> str:
     @property
     def port(self) -> int:
         """Get application port."""
-        return int(self.app.get("port", 8000))
+        value = self.app.get("port", 8000)
+        try:
+            return int(value)
+        except (TypeError, ValueError):
+            # A configured port that int() cannot convert (e.g. None or a
+            # non-numeric string) falls back to 8000 instead of raising.
+            return 8000
 
 
 def get_config() -> Config:
 
@@ -17,6 +17,7 @@
 
 from .config import get_config
 from .routers import api_router, web_router
+from .routers.api import close_visual_context_service
 from .utils.logger import get_logger
 from .version import __version__
 
@@ -77,8 +78,11 @@ async def lifespan(app: FastAPI):
     logger.info(
         f"Server will be available at http://{get_config().host}:{get_config().port}"
     )
-    yield
-    logger.info("PolyTalk shutting down...")
+    try:
+        yield
+    finally:
+        await close_visual_context_service()
+        logger.info("PolyTalk shutting down...")
 
 
 app = create_app()
 
@@ -19,6 +19,7 @@
 )
 
 from ..services.pipeline_service import TranslationPipelineService
+from ..services.visual_context_service import VisualContextService
 from ..utils.logger import get_logger
 from ..version import __version__
 
@@ -27,6 +28,7 @@
 router = APIRouter(prefix="/api", tags=["api"])
 
 pipeline_service: Optional[TranslationPipelineService] = None
+visual_context_service: Optional[VisualContextService] = None
 
 
 def get_pipeline_service() -> TranslationPipelineService:
@@ -42,6 +44,31 @@ def get_pipeline_service() -> TranslationPipelineService:
     return pipeline_service
 
 
+def get_visual_context_service() -> VisualContextService:
+    """Return the shared VisualContextService, constructing it on first use."""
+    global visual_context_service
+    service = visual_context_service
+    if service is None:
+        # First call: build the instance and cache it in the module global.
+        service = visual_context_service = VisualContextService()
+    return service
+
+
+async def close_visual_context_service() -> None:
+    """Close and reset the visual context service singleton.
+
+    The module-level reference is cleared *before* ``close()`` is awaited, so
+    the singleton is reset even when ``close()`` raises, and a concurrent
+    ``get_visual_context_service()`` call cannot be handed an instance that
+    is in the middle of closing. Does nothing if no service was created.
+    """
+    global visual_context_service
+    # Detach first; any exception from close() still propagates to the caller.
+    service, visual_context_service = visual_context_service, None
+    if service is not None:
+        await service.close()
+
+
+def should_start_visual_context_request(
+    image_data_url: str, in_flight: bool, ready: bool
+) -> bool:
+    """Decide whether a new visual context request may be started.
+
+    A request is accepted only when a non-empty image payload is supplied and
+    no summary is currently being produced (``in_flight``) or has already
+    been produced (``ready``).
+    """
+    if in_flight or ready:
+        return False
+    return bool(image_data_url)
+
+
 @router.get("/health")
 async def health_check() -> dict:
     """
@@ -81,6 +108,10 @@ async def websocket_translate(
     client_disconnected = False
     pause_event = asyncio.Event()
     language_swap_queue = asyncio.Queue()
+    visual_context_queue = asyncio.Queue(maxsize=1)
+    visual_context_tasks: set[asyncio.Task] = set()
+    visual_context_in_flight = False
+    visual_context_ready = False
 
     connection_start = time.time()
     idle_timeout_seconds = 300
@@ -102,7 +133,7 @@ async def send_pipeline_status(
 
     async def audio_generator() -> AsyncGenerator[bytes, None]:
         """Generate audio chunks from WebSocket messages."""
-        nonlocal client_disconnected, connection_start
+        nonlocal client_disconnected, connection_start, visual_context_in_flight
         is_paused = False
         try:
             while True:
@@ -146,6 +177,21 @@ async def audio_generator() -> AsyncGenerator[bytes, None]:
                         logger.info(
                             "Client sent 'resume' signal, resuming audio transmission"
                         )
+                    elif data.get("type") == "visual_context":
+                        image_data_url = data.get("image_data_url") or ""
+                        if should_start_visual_context_request(
+                            image_data_url,
+                            visual_context_in_flight,
+                            visual_context_ready,
+                        ):
+                            visual_context_in_flight = True
+                            task = asyncio.create_task(
+                                summarize_visual_context(image_data_url)
+                            )
+                            visual_context_tasks.add(task)
+                            task.add_done_callback(visual_context_tasks.discard)
+                        elif image_data_url:
+                            logger.debug("Ignoring duplicate visual context request")
                     elif data.get("type") == "swap_languages":
                         new_source = data.get("source_language")
                         new_target = data.get("target_language")
@@ -189,6 +235,55 @@ async def send_result(result: dict):
         except Exception as e:
             logger.debug(f"Error sending result: {e}")
 
+    async def summarize_visual_context(image_data_url: str) -> None:
+        """Summarize a shared tab/page screenshot without blocking audio receive.
+
+        Sends an "active" status, asks the visual context service for a
+        summary, and on success replaces anything waiting in
+        ``visual_context_queue`` with the new summary, marks the context as
+        ready and sends a "done" status. An empty summary or any exception
+        results in a "warning" status instead. ``visual_context_in_flight``
+        is always cleared on exit.
+
+        Args:
+            image_data_url: Screenshot payload taken from the client's
+                ``visual_context`` message.
+        """
+        nonlocal visual_context_in_flight, visual_context_ready
+        try:
+            await send_pipeline_status(
+                "visual_context",
+                "active",
+                "Reading shared tab context",
+            )
+            service = get_visual_context_service()
+            summary = await service.summarize_screenshot(
+                image_data_url,
+                source_language,
+                target_language,
+            )
+            if not summary:
+                await send_pipeline_status(
+                    "visual_context",
+                    "warning",
+                    "Shared tab context unavailable",
+                )
+                return
+
+            # The queue holds at most one item: drop any stale summary so the
+            # put_nowait() below cannot raise QueueFull.
+            while not visual_context_queue.empty():
+                try:
+                    visual_context_queue.get_nowait()
+                except asyncio.QueueEmpty:
+                    break
+            visual_context_queue.put_nowait(summary)
+            visual_context_ready = True
+            # Log only the size: the summary is derived from the user's shared
+            # screen, so its content is kept out of the logs.
+            logger.info(f"Visual context summary received: chars={len(summary)}")
+            await send_pipeline_status(
+                "visual_context",
+                "done",
+                "Shared tab context ready",
+            )
+        except Exception as exc:
+            logger.warning(f"Visual context service call failed: {exc}")
+            await send_pipeline_status(
+                "visual_context",
+                "warning",
+                "Shared tab context unavailable",
+            )
+        finally:
+            visual_context_in_flight = False
+
     audio_gen = audio_generator()
 
     try:
@@ -223,6 +318,7 @@ async def send_result(result: dict):
             target_language,
             pause_event=pause_event,
             language_swap_queue=language_swap_queue,
+            visual_context_queue=visual_context_queue,
         ):
             if client_disconnected:
                 logger.info("Client disconnected, stopping pipeline")
@@ -256,6 +352,11 @@ async def send_result(result: dict):
         except Exception as e:
             logger.error(f"Error closing audio generator: {e}")
 
+        for task in visual_context_tasks:
+            task.cancel()
+        if visual_context_tasks:
+            await asyncio.gather(*visual_context_tasks, return_exceptions=True)
+
         if not client_disconnected:
             try:
                 await websocket.close()
 
@@ -114,6 +114,7 @@ async def translate(
         source_language: str,
         target_language: str,
         context: Optional[list[dict[str, str]]] = None,
+        visual_context: Optional[str] = None,
     ) -> TranslationResult:
         """
         Translate text from source to target language.
@@ -124,6 +125,8 @@ async def translate(
             target_language: Target language code
             context: Optional prior source/target translations to use as
                 read-only context
+            visual_context: Optional shared tab/page visual summary to use as
+                a read-only hint
 
         Returns:
             TranslationResult with translated text
 
@@ -275,6 +275,7 @@ async def process_streaming(
         save_media: bool = True,
         pause_event: Optional[asyncio.Event] = None,
         language_swap_queue: Optional[asyncio.Queue] = None,
+        visual_context_queue: Optional[asyncio.Queue] = None,
     ) -> AsyncGenerator[dict, None]:
         """
         Process streaming audio with the real-time translation pipeline.
@@ -295,6 +296,8 @@ async def process_streaming(
             save_media: Whether to save generated media files
             pause_event: Optional asyncio.Event to signal pause (set=paused, clear=resume)
             language_swap_queue: Optional asyncio.Queue for receiving language swap updates
+            visual_context_queue: Optional asyncio.Queue for shared tab/page visual
+                context summary updates
 
         Yields:
             Dictionary with streaming pipeline results
@@ -560,6 +563,29 @@ async def translation_worker():
                 max_chunks=translation_context_max_chunks,
                 max_chars=translation_context_max_chars,
             )
+            visual_context_summary = None
+
+            async def drain_visual_context_updates() -> None:
+                """Pull every pending summary off the visual context queue.
+
+                The last item drained wins. Each summary is whitespace-collapsed,
+                and an empty result is stored as ``None``. Logs the new length
+                once if anything was drained; no-op when no queue was supplied.
+                """
+                nonlocal visual_context_summary
+                if visual_context_queue is None:
+                    return
+
+                drained_any = False
+                try:
+                    while True:
+                        raw_summary = visual_context_queue.get_nowait()
+                        collapsed = " ".join(str(raw_summary or "").split())
+                        visual_context_summary = collapsed if collapsed else None
+                        drained_any = True
+                except asyncio.QueueEmpty:
+                    # Queue exhausted: nothing more to apply.
+                    pass
+
+                if not drained_any:
+                    return
+                logger.info(
+                    "Visual context summary updated: "
+                    f"chars={len(visual_context_summary or '')}"
+                )
 
             async def enqueue_tts(text: str, sequence: int) -> None:
                 await tts_queue.put(
@@ -578,6 +604,8 @@ async def flush_translation_buffer(reason: str) -> None:
                 nonlocal full_translation, translation_buffer
                 nonlocal translation_buffer_started_at, translation_sequence
 
+                await drain_visual_context_updates()
+
                 remaining_text = translation_buffer.strip()
                 if not remaining_text:
                     return
@@ -594,6 +622,7 @@ async def flush_translation_buffer(reason: str) -> None:
                         translation_source_lang,
                         target_lang,
                         context=translation_context.snapshot(),
+                        visual_context=visual_context_summary,
                     )
                     if result.success:
                         translation_context.remember(remaining_text, result.text)
@@ -638,6 +667,7 @@ async def flush_translation_buffer(reason: str) -> None:
                     try:
                         msg = await asyncio.wait_for(trans_queue.get(), timeout=0.5)
                     except asyncio.TimeoutError:
+                        await drain_visual_context_updates()
                         if translation_buffer.strip() and translation_buffer_started_at:
                             buffer_age = time.time() - translation_buffer_started_at
                             if buffer_age >= translation_flush_seconds:
@@ -661,6 +691,8 @@ async def flush_translation_buffer(reason: str) -> None:
                         await result_queue.put(msg)
                         continue
 
+                    await drain_visual_context_updates()
+
                     trans_result = msg["result"]
                     asr_translation_queue_wait = (
                         time.time() - msg["queued_at"] if msg.get("queued_at") else 0.0
@@ -798,6 +830,7 @@ async def flush_translation_buffer(reason: str) -> None:
                                     translation_source_lang,
                                     target_lang,
                                     context=translation_context.snapshot(),
+                                    visual_context=visual_context_summary,
                                 )
                                 if result.success:
                                     break