feat: add endpoint API keys

Donach · Donach · commit effcc220ff82 · 2026-05-26T10:38:59.000+02:00
diff --git a/README.md b/README.md
@@ -26,17 +26,19 @@ whisperlivekit-server --host 0.0.0.0 --port 8090 --pcm-input
 
 For TTS, run Kokoro-FastAPI on your machine or tailnet. The app discovers healthy TTS servers on port `8880` by default and uses the OpenAI-compatible `/v1/audio/speech` endpoint.
 
+LiteLLM/OpenAI-compatible proxies are supported for REST STT and TTS. Set the proxy URL manually, add the matching STT/TTS API key in Settings, and use the proxy model name for TTS, for example `kokoro-tts`. Some proxies do not expose `/v1/audio/voices`; in that case enter the voice name manually.
+
 ### App
 
 1. Install the APK (grab from [Actions artifacts](../../actions) or build yourself)
-2. Open the app. Leave the server URL blank to auto-discover WhisperLiveKit on local networks and Tailscale port `8090`, or set a URL manually in **Settings**.
+2. Open the app. Leave the server URL blank to auto-discover WhisperLiveKit on local networks and Tailscale port `8090`, or set a URL manually in **Settings**. If the endpoint requires auth, fill in **STT API Key**; the app sends it as `Authorization: Bearer ...`.
 3. Grant permissions when prompted:
    - **Microphone** — for recording audio
    - **Display over other apps** — for the floating bubble
    - **Notifications** — for the foreground service
 4. Enable the **Whisper Transcriber** accessibility service in Android Settings → Accessibility (needed to type into other apps' text fields)
 5. Tap **Start Overlay** — the floating bubble appears
-6. Optional: in **Settings → Text To Speech**, discover your Kokoro server, test the connection to load voices, pick a voice/speed, and play sample text
+6. Optional: in **Settings → Text To Speech**, discover your Kokoro server, test the connection to load voices, pick a model/voice/speed, and play sample text. If the endpoint requires auth, fill in **TTS API Key**.
 
 ### Permissions
 
diff --git a/app/src/main/java/com/whispertranscriber/data/SettingsStore.kt b/app/src/main/java/com/whispertranscriber/data/SettingsStore.kt
@@ -17,9 +17,12 @@ private val Context.dataStore: DataStore<Preferences> by preferencesDataStore(na
 data class AppSettings(
     val whisperServerUrl: String = "",
     val whisperServerPort: Int = WhisperServerDiscovery.DEFAULT_PORT,
+    val whisperApiKey: String = "",
     val audioQuality: String = "medium",
     val ttsServerUrl: String = "",
     val ttsServerPort: Int = 8880,
+    val ttsApiKey: String = "",
+    val ttsModel: String = "kokoro",
     val ttsVoice: String = "af_heart",
     val ttsSpeed: Float = 1.0f
 )
@@ -29,9 +32,12 @@ class SettingsStore(private val context: Context) {
     companion object {
         private val KEY_SERVER_URL = stringPreferencesKey("whisper_server_url")
         private val KEY_SERVER_PORT = intPreferencesKey("whisper_server_port")
+        private val KEY_WHISPER_API_KEY = stringPreferencesKey("whisper_api_key")
         private val KEY_AUDIO_QUALITY = stringPreferencesKey("audio_quality")
         private val KEY_TTS_SERVER_URL = stringPreferencesKey("tts_server_url")
         private val KEY_TTS_SERVER_PORT = intPreferencesKey("tts_server_port")
+        private val KEY_TTS_API_KEY = stringPreferencesKey("tts_api_key")
+        private val KEY_TTS_MODEL = stringPreferencesKey("tts_model")
         private val KEY_TTS_VOICE = stringPreferencesKey("tts_voice")
         private val KEY_TTS_SPEED = floatPreferencesKey("tts_speed")
     }
@@ -40,9 +46,12 @@ class SettingsStore(private val context: Context) {
         AppSettings(
             whisperServerUrl = prefs[KEY_SERVER_URL] ?: AppSettings().whisperServerUrl,
             whisperServerPort = prefs[KEY_SERVER_PORT] ?: AppSettings().whisperServerPort,
+            whisperApiKey = prefs[KEY_WHISPER_API_KEY] ?: AppSettings().whisperApiKey,
             audioQuality = prefs[KEY_AUDIO_QUALITY] ?: AppSettings().audioQuality,
             ttsServerUrl = prefs[KEY_TTS_SERVER_URL] ?: AppSettings().ttsServerUrl,
             ttsServerPort = prefs[KEY_TTS_SERVER_PORT] ?: AppSettings().ttsServerPort,
+            ttsApiKey = prefs[KEY_TTS_API_KEY] ?: AppSettings().ttsApiKey,
+            ttsModel = prefs[KEY_TTS_MODEL] ?: AppSettings().ttsModel,
             ttsVoice = prefs[KEY_TTS_VOICE] ?: AppSettings().ttsVoice,
             ttsSpeed = prefs[KEY_TTS_SPEED] ?: AppSettings().ttsSpeed
         )
@@ -56,6 +65,10 @@ class SettingsStore(private val context: Context) {
         context.dataStore.edit { it[KEY_SERVER_PORT] = port }
     }
 
+    /** Writes [apiKey] to the `whisper_api_key` preference exactly as given. */
+    suspend fun updateWhisperApiKey(apiKey: String) {
+        context.dataStore.edit { prefs ->
+            prefs[KEY_WHISPER_API_KEY] = apiKey
+        }
+    }
+
     suspend fun updateAudioQuality(quality: String) {
         context.dataStore.edit { it[KEY_AUDIO_QUALITY] = quality }
     }
@@ -68,6 +81,14 @@ class SettingsStore(private val context: Context) {
         context.dataStore.edit { it[KEY_TTS_SERVER_PORT] = port }
     }
 
+    /** Writes [apiKey] to the `tts_api_key` preference exactly as given. */
+    suspend fun updateTtsApiKey(apiKey: String) {
+        context.dataStore.edit { prefs ->
+            prefs[KEY_TTS_API_KEY] = apiKey
+        }
+    }
+
+    /** Writes [model] to the `tts_model` preference exactly as given. */
+    suspend fun updateTtsModel(model: String) {
+        context.dataStore.edit { prefs ->
+            prefs[KEY_TTS_MODEL] = model
+        }
+    }
+
     suspend fun updateTtsVoice(voice: String) {
         context.dataStore.edit { it[KEY_TTS_VOICE] = voice }
     }
diff --git a/app/src/main/java/com/whispertranscriber/network/KokoroTtsClient.kt b/app/src/main/java/com/whispertranscriber/network/KokoroTtsClient.kt
@@ -23,10 +23,11 @@ class KokoroTtsClient {
         .readTimeout(120, TimeUnit.SECONDS)
         .build()
 
-    suspend fun voices(serverUrl: String): List<String> = withContext(Dispatchers.IO) {
+    suspend fun voices(serverUrl: String, apiKey: String = ""): List<String> = withContext(Dispatchers.IO) {
         val request = Request.Builder()
             .url(serverUrl.trimEnd('/') + "/v1/audio/voices")
             .header("Cache-Control", "no-cache")
+            .withBearerAuth(apiKey)
             .build()
         val response = client.newCall(request).await()
         response.use {
@@ -35,15 +36,31 @@ class KokoroTtsClient {
         }
     }
 
+    /**
+     * Requests `<serverUrl>/v1/models` and returns the model ids found in the response.
+     *
+     * [apiKey] is passed to [withBearerAuth]; the response body is handed to
+     * [OpenAiModelParser.parse], with a missing body treated as an empty string.
+     *
+     * @throws IOException with the message `Models HTTP <code>` when the response is not successful.
+     */
+    suspend fun models(serverUrl: String, apiKey: String = ""): List<String> = withContext(Dispatchers.IO) {
+        val endpoint = serverUrl.trimEnd('/') + "/v1/models"
+        val request = Request.Builder()
+            .url(endpoint)
+            .header("Cache-Control", "no-cache")
+            .withBearerAuth(apiKey)
+            .build()
+        // `use` closes the response on both the success and the failure path.
+        client.newCall(request).await().use { response ->
+            if (!response.isSuccessful) throw IOException("Models HTTP ${response.code}")
+            OpenAiModelParser.parse(response.body?.string().orEmpty())
+        }
+    }
+
     suspend fun synthesizeWav(
         serverUrl: String,
         text: String,
         voice: String,
-        speed: Float
+        speed: Float,
+        apiKey: String = "",
+        model: String = "kokoro"
     ): ByteArray = withContext(Dispatchers.IO) {
         val request = Request.Builder()
             .url(serverUrl.trimEnd('/') + "/v1/audio/speech")
-            .post(KokoroSpeechRequest.json(text, voice, speed).toRequestBody("application/json".toMediaType()))
+            .withBearerAuth(apiKey)
+            .post(KokoroSpeechRequest.json(text, voice, speed, model).toRequestBody("application/json".toMediaType()))
             .build()
         val response = client.newCall(request).await()
         response.use {
@@ -81,10 +98,26 @@ object KokoroVoiceParser {
     }
 }
 
+/** Parses the JSON model listing of an OpenAI-style `/v1/models` response. */
+object OpenAiModelParser {
+    /**
+     * Extracts the non-blank `id` values from the `data` array of [jsonText].
+     *
+     * A missing, null, or non-array `data` member yields an empty list, and entries that are
+     * not JSON objects or whose `id` is not a primitive are skipped. Text that is not valid
+     * JSON, or whose root is not a JSON object, still throws so callers can report the failure.
+     */
+    fun parse(jsonText: String): List<String> {
+        val root = JsonParser.parseString(jsonText).asJsonObject
+        // `get` plus an explicit type check avoids the ClassCastException that
+        // `getAsJsonArray` raises when `data` is present but is not an array (e.g. `null`).
+        val data = root.get("data")?.takeIf { it.isJsonArray }?.asJsonArray ?: return emptyList()
+        return data.mapNotNull { entry ->
+            entry.takeIf { it.isJsonObject }
+                ?.asJsonObject
+                ?.get("id")
+                ?.takeIf { it.isJsonPrimitive }
+                ?.asString
+                ?.takeIf { it.isNotBlank() }
+        }
+    }
+}
+
 object KokoroSpeechRequest {
-    fun json(text: String, voice: String, speed: Float): String {
+    fun json(text: String, voice: String, speed: Float, model: String = "kokoro"): String {
         val request = JsonObject().apply {
-            addProperty("model", "kokoro")
+            addProperty("model", model.ifBlank { "kokoro" })
             addProperty("input", text)
             addProperty("voice", voice)
             addProperty("response_format", "wav")
diff --git a/app/src/main/java/com/whispertranscriber/network/RequestAuth.kt b/app/src/main/java/com/whispertranscriber/network/RequestAuth.kt
@@ -0,0 +1,11 @@
+package com.whispertranscriber.network
+
+import okhttp3.Request
+
+/**
+ * Sets an `Authorization: Bearer <key>` header when [apiKey] contains non-whitespace text.
+ *
+ * The key is trimmed before use. When nothing remains after trimming, the builder is left
+ * untouched. The receiver is returned so the call can sit inside a builder chain.
+ */
+fun Request.Builder.withBearerAuth(apiKey: String): Request.Builder = apply {
+    apiKey.trim()
+        .takeIf { it.isNotEmpty() }
+        ?.let { token -> header("Authorization", "Bearer $token") }
+}
diff --git a/app/src/main/java/com/whispertranscriber/network/WhisperApiClient.kt b/app/src/main/java/com/whispertranscriber/network/WhisperApiClient.kt
@@ -46,6 +46,7 @@ class WhisperApiClient {
     suspend fun transcribe(
         serverUrl: String,
         audioData: ByteArray,
+        apiKey: String = "",
         fileName: String = "audio.wav"
     ): TranscriptionResult = withContext(Dispatchers.IO) {
         val url = serverUrl.trimEnd('/') + "/v1/audio/transcriptions"
@@ -60,6 +61,7 @@ class WhisperApiClient {
 
         val request = Request.Builder()
             .url(url)
+            .withBearerAuth(apiKey)
             .post(multipartBody)
             .build()
 
diff --git a/app/src/main/java/com/whispertranscriber/network/WhisperLiveKitClient.kt b/app/src/main/java/com/whispertranscriber/network/WhisperLiveKitClient.kt
@@ -24,6 +24,7 @@ class WhisperLiveKitClient {
 
     suspend fun connect(
         serverUrl: String,
+        apiKey: String = "",
         onPartial: (String) -> Unit,
         onReadyToStop: (TranscriptionResult) -> Unit = {}
     ): WhisperLiveKitSession = withContext(Dispatchers.IO) {
@@ -34,6 +35,7 @@ class WhisperLiveKitClient {
 
         val request = Request.Builder()
             .url(toWebSocketUrl(serverUrl))
+            .withBearerAuth(apiKey)
             .build()
 
         lateinit var socket: WebSocket
diff --git a/app/src/main/java/com/whispertranscriber/service/FloatingOverlayService.kt b/app/src/main/java/com/whispertranscriber/service/FloatingOverlayService.kt
@@ -221,6 +221,7 @@ class FloatingOverlayService : Service() {
             liveKitSession = try {
                 liveKitClient.connect(
                     serverUrl = serverUrl,
+                    apiKey = settings.whisperApiKey,
                     onPartial = { partial ->
                         serviceScope.launch {
                             handleLivePartial(partial)
@@ -276,21 +277,23 @@ class FloatingOverlayService : Service() {
                 val session = liveKitSession
                 liveKitSession = null
                 val result = liveResult?.let {
-                    retryRestIfLiveResultIsBlank(serverUrl, wavData, it)
+                    retryRestIfLiveResultIsBlank(serverUrl, wavData, settings.whisperApiKey, it)
                 } ?: if (liveKitReady && session != null) {
                     try {
-                        retryRestIfLiveResultIsBlank(serverUrl, wavData, session.finish())
+                        retryRestIfLiveResultIsBlank(serverUrl, wavData, settings.whisperApiKey, session.finish())
                     } catch (e: Exception) {
                         Log.w(TAG, "Live transcription finalization failed, retrying with REST", e)
                         whisperClient.transcribe(
                             serverUrl = serverUrl,
-                            audioData = wavData
+                            audioData = wavData,
+                            apiKey = settings.whisperApiKey
                         )
                     }
                 } else {
                     whisperClient.transcribe(
                         serverUrl = serverUrl,
-                        audioData = wavData
+                        audioData = wavData,
+                        apiKey = settings.whisperApiKey
                     )
                 }
                 liveKitReady = false
@@ -335,13 +338,15 @@ class FloatingOverlayService : Service() {
     private suspend fun retryRestIfLiveResultIsBlank(
         serverUrl: String,
         wavData: ByteArray,
+        apiKey: String,
         liveResult: TranscriptionResult
     ): TranscriptionResult {
         if (!liveResult.shouldRetryRestAfterLive()) return liveResult
         Log.w(TAG, "Live transcription returned empty text, retrying with REST")
         return whisperClient.transcribe(
             serverUrl = serverUrl,
-            audioData = wavData
+            audioData = wavData,
+            apiKey = apiKey
         )
     }
 
@@ -550,7 +555,9 @@ class FloatingOverlayService : Service() {
                     serverUrl = serverUrl,
                     text = text,
                     voice = settings.ttsVoice,
-                    speed = settings.ttsSpeed
+                    speed = settings.ttsSpeed,
+                    apiKey = settings.ttsApiKey,
+                    model = settings.ttsModel
                 )
                 Log.d(TAG, "TTS synthesized ${audio.size} bytes for clipboard playback")
                 Toast.makeText(this@FloatingOverlayService, "Playing clipboard", Toast.LENGTH_SHORT).show()
diff --git a/app/src/main/java/com/whispertranscriber/ui/SettingsScreen.kt b/app/src/main/java/com/whispertranscriber/ui/SettingsScreen.kt
@@ -39,6 +39,7 @@ import androidx.compose.runtime.rememberCoroutineScope
 import androidx.compose.runtime.setValue
 import androidx.compose.ui.Modifier
 import androidx.compose.ui.platform.LocalContext
+import androidx.compose.ui.text.input.PasswordVisualTransformation
 import androidx.compose.ui.text.input.KeyboardType
 import androidx.compose.ui.unit.dp
 import com.whispertranscriber.audio.TtsAudioPlayer
@@ -220,6 +221,19 @@ fun SettingsScreen(
                 singleLine = true
             )
 
+            Spacer(Modifier.height(8.dp))
+            OutlinedTextField(
+                value = settings.whisperApiKey,
+                onValueChange = { newValue ->
+                    scope.launch { settingsStore.updateWhisperApiKey(newValue) }
+                },
+                label = { Text("STT API Key") },
+                placeholder = { Text("Optional Bearer token") },
+                modifier = Modifier.fillMaxWidth(),
+                visualTransformation = PasswordVisualTransformation(),
+                singleLine = true
+            )
+
             Spacer(Modifier.height(8.dp))
             OutlinedTextField(
                 value = displayPort,
@@ -339,6 +353,31 @@ fun SettingsScreen(
                 singleLine = true
             )
 
+            Spacer(Modifier.height(8.dp))
+            OutlinedTextField(
+                value = settings.ttsApiKey,
+                onValueChange = { newValue ->
+                    scope.launch { settingsStore.updateTtsApiKey(newValue) }
+                },
+                label = { Text("TTS API Key") },
+                placeholder = { Text("Optional Bearer token") },
+                modifier = Modifier.fillMaxWidth(),
+                visualTransformation = PasswordVisualTransformation(),
+                singleLine = true
+            )
+
+            Spacer(Modifier.height(8.dp))
+            OutlinedTextField(
+                value = settings.ttsModel,
+                onValueChange = { newValue ->
+                    scope.launch { settingsStore.updateTtsModel(newValue) }
+                },
+                label = { Text("TTS Model") },
+                placeholder = { Text("kokoro or kokoro-tts") },
+                modifier = Modifier.fillMaxWidth(),
+                singleLine = true
+            )
+
             Spacer(Modifier.height(8.dp))
             OutlinedTextField(
                 value = displayTtsPort,
@@ -401,7 +440,7 @@ fun SettingsScreen(
                         ttsDiscoveryStatus = "Fetching voices..."
                         scope.launch {
                             try {
-                                val voices = ttsClient.voices(url)
+                                val voices = ttsClient.voices(url, settings.ttsApiKey)
                                 ttsVoices = voices
                                 val selectedVoice = when {
                                     settings.ttsVoice in voices -> settings.ttsVoice
@@ -415,7 +454,21 @@ fun SettingsScreen(
                                 ttsDiscoveryStatus = "Loaded ${voices.size} voice(s)."
                             } catch (e: Exception) {
                                 Log.e(TAG, "TTS voice fetch failed", e)
-                                ttsDiscoveryStatus = "TTS connection failed: ${e.message}"
+                                ttsDiscoveryStatus = if (e.message?.contains("404") == true) {
+                                    try {
+                                        val models = ttsClient.models(url, settings.ttsApiKey)
+                                        val modelHint = models.firstOrNull { it.contains("tts", ignoreCase = true) }
+                                        if (modelHint != null && settings.ttsModel.isBlank()) {
+                                            settingsStore.updateTtsModel(modelHint)
+                                        }
+                                        "Connected. Voice list unavailable; enter voice manually."
+                                    } catch (modelsError: Exception) {
+                                        Log.e(TAG, "TTS model fetch failed", modelsError)
+                                        "Voice list unavailable, and model check failed: ${modelsError.message}"
+                                    }
+                                } else {
+                                    "TTS connection failed: ${e.message}"
+                                }
                             }
                             ttsTesting = false
                         }
@@ -440,8 +493,9 @@ fun SettingsScreen(
             ) {
                 OutlinedTextField(
                     value = settings.ttsVoice,
-                    onValueChange = {},
-                    readOnly = true,
+                    onValueChange = { newValue ->
+                        scope.launch { settingsStore.updateTtsVoice(newValue) }
+                    },
                     label = { Text("Voice") },
                     trailingIcon = { ExposedDropdownMenuDefaults.TrailingIcon(expanded = ttsVoiceDropdownExpanded) },
                     modifier = Modifier
@@ -507,7 +561,9 @@ fun SettingsScreen(
                                 serverUrl = url,
                                 text = text,
                                 voice = settings.ttsVoice,
-                                speed = displayTtsSpeed
+                                speed = displayTtsSpeed,
+                                apiKey = settings.ttsApiKey,
+                                model = settings.ttsModel
                             )
                             Log.d(TAG, "TTS synthesized ${audio.size} bytes for test playback")
                             settingsStore.updateTtsSpeed(displayTtsSpeed)
diff --git a/app/src/test/java/com/whispertranscriber/network/KokoroTtsClientTest.kt b/app/src/test/java/com/whispertranscriber/network/KokoroTtsClientTest.kt
diff --git a/app/src/test/java/com/whispertranscriber/network/RequestAuthTest.kt b/app/src/test/java/com/whispertranscriber/network/RequestAuthTest.kt