Skip to content

Commit 8a86953

Browse files
committed
Merge branch 'T11667-RUS-shared-tab-visual-context' into 'main'
T11667 RUS: add shared tab visual context See merge request bizzappdev/ai/polytalkio/polytalk!9
2 parents f9fa49e + 51b24f4 commit 8a86953

17 files changed

Lines changed: 1201 additions & 30 deletions

.env.example

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,28 @@ TRANSLATION_MODEL=qwen3-8b
111111
# allow enough room for Indic-script targets and longer sentence buffers.
112112
TRANSLATION_MAX_TOKENS=240
113113

114+
# ============================================================================
115+
# VISUAL CONTEXT SERVICE (Shared Tab/Page Screenshot)
116+
# ============================================================================
117+
# Enable one-time shared tab/page screenshot summarization when tab audio
118+
# sharing starts. The summary is used as translation context only.
119+
VISUAL_CONTEXT_ENABLED=false
120+
121+
# Keep mock mode enabled for local setup. For real visual context, set this to
122+
# false and configure a vision-capable provider below.
123+
VISUAL_CONTEXT_MOCK_MODE=true
124+
125+
# Visual context can use a separate vision-capable provider/model, or mirror the
126+
# translation provider if it supports image inputs.
127+
VISUAL_CONTEXT_BASE_URL=https://ai.example.com
128+
VISUAL_CONTEXT_ENDPOINT=/v1/chat/completions
129+
VISUAL_CONTEXT_API_FORMAT=openai_chat
130+
VISUAL_CONTEXT_API_KEY=your_visual_context_api_key_here
131+
VISUAL_CONTEXT_MODEL=gpt-4o-mini
132+
133+
# Maximum output tokens for the compact screenshot summary.
134+
VISUAL_CONTEXT_MAX_TOKENS=300
135+
114136
# ============================================================================
115137
# TTS SERVICE (Local Text-to-Speech with Piper)
116138
# ============================================================================

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,13 @@ The main latency knobs are:
301301
| `STT_TRANSCRIBE_WORKERS` | `2` | Per-stream STT transcription workers. Use more than 1 only when the GPU has spare compute. |
302302
| `STT_TRANSCRIBE_QUEUE_SIZE` | `8` | Max queued audio windows per stream before receiver backpressure. |
303303
| `STT_MODEL_WORKERS` | `2` | faster-whisper/CTranslate2 model workers for concurrent transcribe calls. |
304+
| `VISUAL_CONTEXT_ENABLED` | empty/false | Enable one-time shared tab/page screenshot summarization when tab audio sharing starts. |
305+
| `VISUAL_CONTEXT_BASE_URL` | `TRANSLATION_BASE_URL` | Optional separate base URL for the vision-capable visual context provider. |
306+
| `VISUAL_CONTEXT_API_KEY` | `TRANSLATION_API_KEY` | Optional separate API key for the visual context provider. |
307+
| `VISUAL_CONTEXT_ENDPOINT` | `TRANSLATION_ENDPOINT` | Optional separate endpoint for the visual context provider. |
308+
| `VISUAL_CONTEXT_API_FORMAT` | `TRANSLATION_API_FORMAT` | Optional separate API format for the visual context provider. |
309+
| `VISUAL_CONTEXT_MODEL` | `TRANSLATION_MODEL` | Vision-capable model used to summarize the shared tab/page screenshot. |
310+
| `VISUAL_CONTEXT_MAX_TOKENS` | `240` | Maximum output tokens for the visual context summary. |
304311
| `app.translation_flush_chars` | `300` | Translate buffered text once this many characters are available. |
305312
| `app.translation_flush_seconds` | `5.0` | Translate buffered text after this many seconds if enough text is available. |
306313
| `app.translation_flush_min_chars` | `120` | Minimum text required for time-based translation flushing. |
@@ -563,3 +570,7 @@ guidance for adding custom provider adapters.
563570
- Persist `media/output` if generated audio should survive restarts.
564571
- Treat transcripts, translations, and generated audio as user data.
565572
- Review AGPL-3.0 obligations before offering a modified hosted service.
573+
574+
### Shared Tab Visual Context
575+
576+
When `VISUAL_CONTEXT_ENABLED=true`, tab-audio sessions capture one browser-approved shared tab/page screenshot after sharing starts. PolyTalk sends the image for immediate summarization and does not store the raw screenshot. The generated summary is used as a translation hint for visible titles, names, labels, and domain vocabulary; spoken audio remains authoritative if it conflicts with the visual hint.

app/config.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,11 @@ def translation(self) -> dict:
125125
"""Get translation AI configuration."""
126126
return self._config.get("translation", {})
127127

128+
@property
129+
def visual_context(self) -> dict:
130+
"""Get shared tab/page visual context configuration."""
131+
return self._config.get("visual_context", {})
132+
128133
@property
129134
def tts(self) -> dict:
130135
"""Get TTS configuration."""
@@ -153,7 +158,11 @@ def host(self) -> str:
153158
@property
154159
def port(self) -> int:
155160
"""Get application port."""
156-
return int(self.app.get("port", 8000))
161+
value = self.app.get("port", 8000)
162+
try:
163+
return int(value)
164+
except (TypeError, ValueError):
165+
return 8000
157166

158167

159168
def get_config() -> Config:

app/main.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
from .config import get_config
1919
from .routers import api_router, web_router
20+
from .routers.api import close_visual_context_service
2021
from .utils.logger import get_logger
2122
from .version import __version__
2223

@@ -77,8 +78,11 @@ async def lifespan(app: FastAPI):
7778
logger.info(
7879
f"Server will be available at http://{get_config().host}:{get_config().port}"
7980
)
80-
yield
81-
logger.info("PolyTalk shutting down...")
81+
try:
82+
yield
83+
finally:
84+
await close_visual_context_service()
85+
logger.info("PolyTalk shutting down...")
8286

8387

8488
app = create_app()

app/routers/api.py

Lines changed: 102 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
)
2020

2121
from ..services.pipeline_service import TranslationPipelineService
22+
from ..services.visual_context_service import VisualContextService
2223
from ..utils.logger import get_logger
2324
from ..version import __version__
2425

@@ -27,6 +28,7 @@
2728
router = APIRouter(prefix="/api", tags=["api"])
2829

2930
pipeline_service: Optional[TranslationPipelineService] = None
31+
visual_context_service: Optional[VisualContextService] = None
3032

3133

3234
def get_pipeline_service() -> TranslationPipelineService:
@@ -42,6 +44,31 @@ def get_pipeline_service() -> TranslationPipelineService:
4244
return pipeline_service
4345

4446

47+
def get_visual_context_service() -> VisualContextService:
48+
"""Get or create the visual context service singleton."""
49+
global visual_context_service
50+
if visual_context_service is None:
51+
visual_context_service = VisualContextService()
52+
return visual_context_service
53+
54+
55+
async def close_visual_context_service() -> None:
56+
"""Close and reset the visual context service singleton."""
57+
global visual_context_service
58+
if visual_context_service is None:
59+
return
60+
61+
await visual_context_service.close()
62+
visual_context_service = None
63+
64+
65+
def should_start_visual_context_request(
66+
image_data_url: str, in_flight: bool, ready: bool
67+
) -> bool:
68+
"""Return whether a visual context request should be accepted."""
69+
return bool(image_data_url) and not (in_flight or ready)
70+
71+
4572
@router.get("/health")
4673
async def health_check() -> dict:
4774
"""
@@ -81,6 +108,10 @@ async def websocket_translate(
81108
client_disconnected = False
82109
pause_event = asyncio.Event()
83110
language_swap_queue = asyncio.Queue()
111+
visual_context_queue = asyncio.Queue(maxsize=1)
112+
visual_context_tasks: set[asyncio.Task] = set()
113+
visual_context_in_flight = False
114+
visual_context_ready = False
84115

85116
connection_start = time.time()
86117
idle_timeout_seconds = 300
@@ -102,7 +133,7 @@ async def send_pipeline_status(
102133

103134
async def audio_generator() -> AsyncGenerator[bytes, None]:
104135
"""Generate audio chunks from WebSocket messages."""
105-
nonlocal client_disconnected, connection_start
136+
nonlocal client_disconnected, connection_start, visual_context_in_flight
106137
is_paused = False
107138
try:
108139
while True:
@@ -146,6 +177,21 @@ async def audio_generator() -> AsyncGenerator[bytes, None]:
146177
logger.info(
147178
"Client sent 'resume' signal, resuming audio transmission"
148179
)
180+
elif data.get("type") == "visual_context":
181+
image_data_url = data.get("image_data_url") or ""
182+
if should_start_visual_context_request(
183+
image_data_url,
184+
visual_context_in_flight,
185+
visual_context_ready,
186+
):
187+
visual_context_in_flight = True
188+
task = asyncio.create_task(
189+
summarize_visual_context(image_data_url)
190+
)
191+
visual_context_tasks.add(task)
192+
task.add_done_callback(visual_context_tasks.discard)
193+
elif image_data_url:
194+
logger.debug("Ignoring duplicate visual context request")
149195
elif data.get("type") == "swap_languages":
150196
new_source = data.get("source_language")
151197
new_target = data.get("target_language")
@@ -189,6 +235,55 @@ async def send_result(result: dict):
189235
except Exception as e:
190236
logger.debug(f"Error sending result: {e}")
191237

238+
async def summarize_visual_context(image_data_url: str) -> None:
239+
"""Summarize a shared tab/page screenshot without blocking audio receive."""
240+
nonlocal visual_context_in_flight, visual_context_ready
241+
try:
242+
await send_pipeline_status(
243+
"visual_context",
244+
"active",
245+
"Reading shared tab context",
246+
)
247+
service = get_visual_context_service()
248+
summary = await service.summarize_screenshot(
249+
image_data_url,
250+
source_language,
251+
target_language,
252+
)
253+
if not summary:
254+
await send_pipeline_status(
255+
"visual_context",
256+
"warning",
257+
"Shared tab context unavailable",
258+
)
259+
return
260+
261+
while not visual_context_queue.empty():
262+
try:
263+
visual_context_queue.get_nowait()
264+
except asyncio.QueueEmpty:
265+
break
266+
visual_context_queue.put_nowait(summary)
267+
visual_context_ready = True
268+
logger.info(
269+
"Visual context summary received: "
270+
f"chars={len(summary)} summary={summary[:1200]!r}"
271+
)
272+
await send_pipeline_status(
273+
"visual_context",
274+
"done",
275+
"Shared tab context ready",
276+
)
277+
except Exception as exc:
278+
logger.warning(f"Visual context service call failed: {exc}")
279+
await send_pipeline_status(
280+
"visual_context",
281+
"warning",
282+
"Shared tab context unavailable",
283+
)
284+
finally:
285+
visual_context_in_flight = False
286+
192287
audio_gen = audio_generator()
193288

194289
try:
@@ -223,6 +318,7 @@ async def send_result(result: dict):
223318
target_language,
224319
pause_event=pause_event,
225320
language_swap_queue=language_swap_queue,
321+
visual_context_queue=visual_context_queue,
226322
):
227323
if client_disconnected:
228324
logger.info("Client disconnected, stopping pipeline")
@@ -256,6 +352,11 @@ async def send_result(result: dict):
256352
except Exception as e:
257353
logger.error(f"Error closing audio generator: {e}")
258354

355+
for task in visual_context_tasks:
356+
task.cancel()
357+
if visual_context_tasks:
358+
await asyncio.gather(*visual_context_tasks, return_exceptions=True)
359+
259360
if not client_disconnected:
260361
try:
261362
await websocket.close()

app/services/base.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ async def translate(
114114
source_language: str,
115115
target_language: str,
116116
context: Optional[list[dict[str, str]]] = None,
117+
visual_context: Optional[str] = None,
117118
) -> TranslationResult:
118119
"""
119120
Translate text from source to target language.
@@ -124,6 +125,8 @@ async def translate(
124125
target_language: Target language code
125126
context: Optional prior source/target translations to use as
126127
read-only context
128+
visual_context: Optional shared tab/page visual summary to use as
129+
a read-only hint
127130
128131
Returns:
129132
TranslationResult with translated text

app/services/pipeline_service.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ async def process_streaming(
275275
save_media: bool = True,
276276
pause_event: Optional[asyncio.Event] = None,
277277
language_swap_queue: Optional[asyncio.Queue] = None,
278+
visual_context_queue: Optional[asyncio.Queue] = None,
278279
) -> AsyncGenerator[dict, None]:
279280
"""
280281
Process streaming audio with the real-time translation pipeline.
@@ -295,6 +296,8 @@ async def process_streaming(
295296
save_media: Whether to save generated media files
296297
pause_event: Optional asyncio.Event to signal pause (set=paused, clear=resume)
297298
language_swap_queue: Optional asyncio.Queue for receiving language swap updates
299+
visual_context_queue: Optional asyncio.Queue for shared tab/page visual
300+
context summary updates
298301
299302
Yields:
300303
Dictionary with streaming pipeline results
@@ -560,6 +563,29 @@ async def translation_worker():
560563
max_chunks=translation_context_max_chunks,
561564
max_chars=translation_context_max_chars,
562565
)
566+
visual_context_summary = None
567+
568+
async def drain_visual_context_updates() -> None:
569+
nonlocal visual_context_summary
570+
if visual_context_queue is None:
571+
return
572+
573+
updated = False
574+
while True:
575+
try:
576+
summary = visual_context_queue.get_nowait()
577+
except asyncio.QueueEmpty:
578+
break
579+
visual_context_summary = (
580+
" ".join(str(summary or "").split()) or None
581+
)
582+
updated = True
583+
584+
if updated:
585+
logger.info(
586+
"Visual context summary updated: "
587+
f"chars={len(visual_context_summary or '')}"
588+
)
563589

564590
async def enqueue_tts(text: str, sequence: int) -> None:
565591
await tts_queue.put(
@@ -578,6 +604,8 @@ async def flush_translation_buffer(reason: str) -> None:
578604
nonlocal full_translation, translation_buffer
579605
nonlocal translation_buffer_started_at, translation_sequence
580606

607+
await drain_visual_context_updates()
608+
581609
remaining_text = translation_buffer.strip()
582610
if not remaining_text:
583611
return
@@ -594,6 +622,7 @@ async def flush_translation_buffer(reason: str) -> None:
594622
translation_source_lang,
595623
target_lang,
596624
context=translation_context.snapshot(),
625+
visual_context=visual_context_summary,
597626
)
598627
if result.success:
599628
translation_context.remember(remaining_text, result.text)
@@ -638,6 +667,7 @@ async def flush_translation_buffer(reason: str) -> None:
638667
try:
639668
msg = await asyncio.wait_for(trans_queue.get(), timeout=0.5)
640669
except asyncio.TimeoutError:
670+
await drain_visual_context_updates()
641671
if translation_buffer.strip() and translation_buffer_started_at:
642672
buffer_age = time.time() - translation_buffer_started_at
643673
if buffer_age >= translation_flush_seconds:
@@ -661,6 +691,8 @@ async def flush_translation_buffer(reason: str) -> None:
661691
await result_queue.put(msg)
662692
continue
663693

694+
await drain_visual_context_updates()
695+
664696
trans_result = msg["result"]
665697
asr_translation_queue_wait = (
666698
time.time() - msg["queued_at"] if msg.get("queued_at") else 0.0
@@ -798,6 +830,7 @@ async def flush_translation_buffer(reason: str) -> None:
798830
translation_source_lang,
799831
target_lang,
800832
context=translation_context.snapshot(),
833+
visual_context=visual_context_summary,
801834
)
802835
if result.success:
803836
break

0 commit comments

Comments
 (0)