|
| 1 | +# PolyTalk Environment Variables |
| 2 | +# Copy this file to .env and update values as needed |
| 3 | +# All ${VAR} references in config/config.yaml will use these values |
| 4 | + |
| 5 | +# ============================================================================ |
| 6 | +# APPLICATION LOGGING |
| 7 | +# ============================================================================ |
| 8 | +# Logging level: DEBUG, INFO, WARNING, ERROR, CRITICAL |
| 9 | +LOG_LEVEL=INFO |
| 10 | + |
| 11 | +# ============================================================================ |
| 12 | +# STT SERVICE (Local Speech-to-Text with faster-whisper) |
| 13 | +# ============================================================================ |
| 14 | +# STT model to use: small, small-v3, medium, large-v3 |
| 15 | +STT_MODEL=small |
| 16 | + |
| 17 | +# Device to run STT: cpu or cuda |
| 18 | +STT_DEVICE=cpu |
| 19 | + |
| 20 | +# Compute type: int8 (CPU) or float16 (CUDA) |
| 21 | +STT_COMPUTE_TYPE=int8 |
| 22 | + |
| 23 | +# Number of STT web workers. Each worker loads its own Whisper model. |
| 24 | +STT_WORKERS=1 |
| 25 | + |
| 26 | +# Load the Whisper model during STT service startup instead of on first stream. |
| 27 | +STT_PRELOAD_MODEL=true |
| 28 | + |
| 29 | +# Max file upload size in MB |
| 30 | +STT_MAX_UPLOAD_MB=200 |
| 31 | + |
| 32 | +# Streaming audio window in seconds. Lower values reduce latency but can reduce |
| 33 | +# transcript stability. 3.0 gives Whisper more context while pause flush handles utterance endings. |
| 34 | +STT_STREAM_CHUNK_SECONDS=3.0 |
| 35 | + |
| 36 | +# Audio overlap between STT windows. Helps avoid missing words at chunk boundaries. |
| 37 | +# Keep this modest; too much overlap can increase repeated/hallucinated text. |
| 38 | +STT_CHUNK_OVERLAP_SECONDS=0.25 |
| 39 | + |
| 40 | +# Parallel STT queue workers. A value of 2 (the default here) helps when STT |
| 41 | +# inference is slower than incoming audio windows and the GPU has spare compute. |
| 42 | +STT_TRANSCRIBE_WORKERS=2 |
| 43 | +STT_TRANSCRIBE_QUEUE_SIZE=8 |
| 44 | +STT_MODEL_WORKERS=2 |
| 45 | + |
| 46 | +# Transcript emit batching. STT may infer more often than it emits to PolyTalk. |
| 47 | +# Increase these values if live transcript/translation/TTS chunks are too small. |
| 48 | +STT_EMIT_MIN_CHARS=120 |
| 49 | +STT_EMIT_INTERVAL_SECONDS=4.5 |
| 50 | +# Flush the current speech window after this much trailing silence, even if the |
| 51 | +# normal stream window or emit thresholds have not been reached. Set 0 to disable. |
| 52 | +STT_PAUSE_FLUSH_SECONDS=1.2 |
| 53 | + |
| 54 | +# Silence/hallucination guards for streaming STT. These balanced defaults work |
| 55 | +# well for typical microphone input: raise RMS/no-speech strictness if Whisper |
| 56 | +# hallucinates during silence; lower it if quiet speech is missed. |
| 57 | +STT_SILENCE_RMS_THRESHOLD=0.003 |
| 58 | +STT_NO_SPEECH_PROB_THRESHOLD=0.50 |
| 59 | +STT_LOG_PROB_THRESHOLD=-1.0 |
| 60 | +STT_MAX_CROSS_DELTA_WORD_REPEATS=6 |
| 61 | + |
| 62 | +# faster-whisper decoding/VAD knobs. Keep previous-text conditioning disabled |
| 63 | +# by default for streaming because it can repeat or invent text during silence. |
| 64 | +STT_VAD_FILTER=true |
| 65 | +STT_VAD_MIN_SILENCE_MS=500 |
| 66 | +STT_VAD_SPEECH_PAD_MS=200 |
| 67 | +STT_WORD_TIMESTAMPS=true |
| 68 | +STT_CONDITION_ON_PREVIOUS_TEXT=false |
| 69 | +STT_TEMPERATURE=0.0 |
| 70 | +# Optional domain/context prompt for Whisper, for example names or product terms. |
| 71 | +# STT_INITIAL_PROMPT= |
| 72 | + |
| 73 | +# ============================================================================ |
| 74 | +# WHISPER SERVICE CONFIGURATION (Points to local STT or external) |
| 75 | +# ============================================================================ |
| 76 | +# Base URL for Whisper API (defaults to local STT service in Docker) |
| 77 | +# For external service, use: https://whisper.your-domain.com |
| 78 | +WHISPER_BASE_URL=http://stt:8000 |
| 79 | + |
| 80 | +# WebSocket endpoint for streaming transcription (used by PolyTalk) |
| 81 | +WHISPER_WS_ENDPOINT=/v1/stream/transcriptions |
| 82 | + |
| 83 | +# Optional: API key for external Whisper API (e.g., OpenAI, custom deployment) |
| 84 | +# Set this if your Whisper service requires authentication |
| 85 | +# WHISPER_API_KEY=your-api-key-here |
| 86 | + |
| 87 | +# ============================================================================ |
| 88 | +# TRANSLATION SERVICE (AI Translation) |
| 89 | +# ============================================================================ |
| 90 | +# Translation API format: openai_chat, openai_responses, anthropic_messages, |
| 91 | +# or gemini_generate_content. |
| 92 | +TRANSLATION_API_FORMAT=openai_chat |
| 93 | + |
| 94 | +# Base URL and endpoint for Translation API. Use your self-hosted AI server |
| 95 | +# URL here, or an OpenAI-compatible provider URL. |
| 96 | +TRANSLATION_BASE_URL=https://ai.example.com |
| 97 | +TRANSLATION_ENDPOINT=/v1/chat/completions |
| 98 | + |
| 99 | +# API key for Translation service |
| 100 | +TRANSLATION_API_KEY=your_translation_api_key_here |
| 101 | + |
| 102 | +# AI model to use for translation. For self-hosted translation, use models such |
| 103 | +# as qwen3-8b, TranslateGemma, or other open-source/open-weight models supported |
| 104 | +# by your model server. |
| 105 | +TRANSLATION_MODEL=qwen3-8b |
| 106 | + |
| 107 | +# Maximum translation output tokens. Keep this bounded for live streaming, but |
| 108 | +# allow enough room for Indic-script targets and longer sentence buffers. |
| 109 | +TRANSLATION_MAX_TOKENS=240 |
| 110 | + |
| 111 | +# ============================================================================ |
| 112 | +# TTS SERVICE (Local Text-to-Speech with Piper) |
| 113 | +# ============================================================================ |
| 114 | +# Piper model to use (voice model name in tts/voices directory) |
| 115 | +TTS_MODEL=en_GB-jenny_dioco-medium |
| 116 | + |
| 117 | +# Base URL for TTS API (local Piper service in Docker) |
| 118 | +# For external service, use: https://tts.your-domain.com |
| 119 | +TTS_BASE_URL=http://tts:5000 |
| 120 | + |
| 121 | +# ============================================================================ |
| 122 | +# APPLICATION SETTINGS |
| 123 | +# ============================================================================ |
| 124 | +# Host to bind the application (0.0.0.0 for all interfaces) |
| 125 | +APP_HOST=0.0.0.0 |
| 126 | + |
| 127 | +# Port to run the application |
| 128 | +APP_PORT=9000 |
| 129 | + |
| 130 | +# Enable debug mode (set to false for production) |
| 131 | +APP_DEBUG=true |
| 132 | + |
| 133 | +# Comma-separated browser origins allowed to call the app. |
| 134 | +# Use the exact HTTPS origin in production, for example: |
| 135 | +# ALLOWED_ORIGINS=https://polytalk.example.com |
| 136 | +ALLOWED_ORIGINS=http://localhost:9000,http://127.0.0.1:9000 |
| 137 | + |
| 138 | +# Translate partial speech after this many buffered characters or seconds. |
| 139 | +# Lower values reduce latency; higher values improve context and quality. |
| 140 | +TRANSLATION_FLUSH_CHARS=300 |
| 141 | +TRANSLATION_FLUSH_SECONDS=5.0 |
| 142 | +TRANSLATION_FLUSH_MIN_CHARS=120 |
0 commit comments