Skip to content

Commit effcc22

Browse files
committed
feat: add endpoint API keys
1 parent efeecf0 commit effcc22

10 files changed

Lines changed: 191 additions & 21 deletions

File tree

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,19 @@ whisperlivekit-server --host 0.0.0.0 --port 8090 --pcm-input
2626

2727
For TTS, run Kokoro-FastAPI on your machine or tailnet. The app discovers healthy TTS servers on port `8880` by default and uses the OpenAI-compatible `/v1/audio/speech` endpoint.
2828

29+
LiteLLM/OpenAI-compatible proxies are supported for REST STT and TTS. Set the proxy URL manually, add the matching STT/TTS API key in Settings, and use the proxy model name for TTS, for example `kokoro-tts`. Some proxies do not expose `/v1/audio/voices`; in that case enter the voice name manually.
30+
2931
### App
3032

3133
1. Install the APK (grab from [Actions artifacts](../../actions) or build yourself)
32-
2. Open the app. Leave the server URL blank to auto-discover WhisperLiveKit on local networks and Tailscale port `8090`, or set a URL manually in **Settings**.
34+
2. Open the app. Leave the server URL blank to auto-discover WhisperLiveKit on port `8090` across local networks and Tailscale, or set a URL manually in **Settings**. If the endpoint requires auth, fill in **STT API Key**; the app sends it as `Authorization: Bearer ...`.
3335
3. Grant permissions when prompted:
3436
- **Microphone** — for recording audio
3537
- **Display over other apps** — for the floating bubble
3638
- **Notifications** — for the foreground service
3739
4. Enable the **Whisper Transcriber** accessibility service in Android Settings → Accessibility (needed to type into other apps' text fields)
3840
5. Tap **Start Overlay** — the floating bubble appears
39-
6. Optional: in **Settings → Text To Speech**, discover your Kokoro server, test the connection to load voices, pick a voice/speed, and play sample text
41+
6. Optional: in **Settings → Text To Speech**, discover your Kokoro server, test the connection to load voices, pick a model/voice/speed, and play sample text. If the endpoint requires auth, fill **TTS API Key**.
4042

4143
### Permissions
4244

app/src/main/java/com/whispertranscriber/data/SettingsStore.kt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,12 @@ private val Context.dataStore: DataStore<Preferences> by preferencesDataStore(na
1717
/**
 * Snapshot of the user-configurable preferences.
 *
 * Each default declared here is also the value used when the corresponding
 * preference has not been stored yet.
 *
 * NOTE(review): both API keys are carried as plain strings; confirm the
 * backing preference store is an acceptable place for secrets.
 */
data class AppSettings(
    // Speech-to-text (Whisper) endpoint.
    val whisperServerUrl: String = "",
    val whisperServerPort: Int = WhisperServerDiscovery.DEFAULT_PORT,
    val whisperApiKey: String = "",
    val audioQuality: String = "medium",

    // Text-to-speech endpoint.
    val ttsServerUrl: String = "",
    val ttsServerPort: Int = 8880,
    val ttsApiKey: String = "",
    val ttsModel: String = "kokoro",
    val ttsVoice: String = "af_heart",
    val ttsSpeed: Float = 1.0f,
)
@@ -29,9 +32,12 @@ class SettingsStore(private val context: Context) {
2932
companion object {
    // Speech-to-text (Whisper) preference keys.
    private val KEY_SERVER_URL = stringPreferencesKey("whisper_server_url")
    private val KEY_SERVER_PORT = intPreferencesKey("whisper_server_port")
    private val KEY_WHISPER_API_KEY = stringPreferencesKey("whisper_api_key")
    private val KEY_AUDIO_QUALITY = stringPreferencesKey("audio_quality")

    // Text-to-speech preference keys.
    private val KEY_TTS_SERVER_URL = stringPreferencesKey("tts_server_url")
    private val KEY_TTS_SERVER_PORT = intPreferencesKey("tts_server_port")
    private val KEY_TTS_API_KEY = stringPreferencesKey("tts_api_key")
    private val KEY_TTS_MODEL = stringPreferencesKey("tts_model")
    private val KEY_TTS_VOICE = stringPreferencesKey("tts_voice")
    private val KEY_TTS_SPEED = floatPreferencesKey("tts_speed")
}
@@ -40,9 +46,12 @@ class SettingsStore(private val context: Context) {
4046
AppSettings(
4147
whisperServerUrl = prefs[KEY_SERVER_URL] ?: AppSettings().whisperServerUrl,
4248
whisperServerPort = prefs[KEY_SERVER_PORT] ?: AppSettings().whisperServerPort,
49+
whisperApiKey = prefs[KEY_WHISPER_API_KEY] ?: AppSettings().whisperApiKey,
4350
audioQuality = prefs[KEY_AUDIO_QUALITY] ?: AppSettings().audioQuality,
4451
ttsServerUrl = prefs[KEY_TTS_SERVER_URL] ?: AppSettings().ttsServerUrl,
4552
ttsServerPort = prefs[KEY_TTS_SERVER_PORT] ?: AppSettings().ttsServerPort,
53+
ttsApiKey = prefs[KEY_TTS_API_KEY] ?: AppSettings().ttsApiKey,
54+
ttsModel = prefs[KEY_TTS_MODEL] ?: AppSettings().ttsModel,
4655
ttsVoice = prefs[KEY_TTS_VOICE] ?: AppSettings().ttsVoice,
4756
ttsSpeed = prefs[KEY_TTS_SPEED] ?: AppSettings().ttsSpeed
4857
)
@@ -56,6 +65,10 @@ class SettingsStore(private val context: Context) {
5665
context.dataStore.edit { it[KEY_SERVER_PORT] = port }
5766
}
5867

68+
/** Stores [apiKey] under the Whisper (STT) API key preference, exactly as given. */
suspend fun updateWhisperApiKey(apiKey: String) {
    context.dataStore.edit { prefs ->
        prefs[KEY_WHISPER_API_KEY] = apiKey
    }
}
71+
5972
/** Stores [quality] under the audio quality preference. */
suspend fun updateAudioQuality(quality: String) {
    context.dataStore.edit { prefs ->
        prefs[KEY_AUDIO_QUALITY] = quality
    }
}
@@ -68,6 +81,14 @@ class SettingsStore(private val context: Context) {
6881
context.dataStore.edit { it[KEY_TTS_SERVER_PORT] = port }
6982
}
7083

84+
/** Stores [apiKey] under the TTS API key preference, exactly as given. */
suspend fun updateTtsApiKey(apiKey: String) {
    context.dataStore.edit { prefs ->
        prefs[KEY_TTS_API_KEY] = apiKey
    }
}
87+
88+
/** Stores [model] under the TTS model preference, exactly as given. */
suspend fun updateTtsModel(model: String) {
    context.dataStore.edit { prefs ->
        prefs[KEY_TTS_MODEL] = model
    }
}
91+
7192
/** Stores [voice] under the TTS voice preference, exactly as given. */
suspend fun updateTtsVoice(voice: String) {
    context.dataStore.edit { prefs ->
        prefs[KEY_TTS_VOICE] = voice
    }
}

app/src/main/java/com/whispertranscriber/network/KokoroTtsClient.kt

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,11 @@ class KokoroTtsClient {
2323
.readTimeout(120, TimeUnit.SECONDS)
2424
.build()
2525

26-
suspend fun voices(serverUrl: String): List<String> = withContext(Dispatchers.IO) {
26+
suspend fun voices(serverUrl: String, apiKey: String = ""): List<String> = withContext(Dispatchers.IO) {
2727
val request = Request.Builder()
2828
.url(serverUrl.trimEnd('/') + "/v1/audio/voices")
2929
.header("Cache-Control", "no-cache")
30+
.withBearerAuth(apiKey)
3031
.build()
3132
val response = client.newCall(request).await()
3233
response.use {
@@ -35,15 +36,31 @@ class KokoroTtsClient {
3536
}
3637
}
3738

39+
/**
 * Fetches the model ids advertised at `/v1/models` under [serverUrl].
 *
 * Runs on [Dispatchers.IO]. [apiKey] is passed to `withBearerAuth` and defaults
 * to an empty string.
 *
 * @throws IOException with message `Models HTTP <code>` when the response is not successful.
 */
suspend fun models(serverUrl: String, apiKey: String = ""): List<String> = withContext(Dispatchers.IO) {
    val modelsUrl = serverUrl.trimEnd('/') + "/v1/models"
    val request = Request.Builder()
        .url(modelsUrl)
        .header("Cache-Control", "no-cache")
        .withBearerAuth(apiKey)
        .build()

    // `use` closes the response whether parsing succeeds or throws.
    client.newCall(request).await().use { response ->
        if (!response.isSuccessful) throw IOException("Models HTTP ${response.code}")
        OpenAiModelParser.parse(response.body?.string().orEmpty())
    }
}
51+
3852
suspend fun synthesizeWav(
3953
serverUrl: String,
4054
text: String,
4155
voice: String,
42-
speed: Float
56+
speed: Float,
57+
apiKey: String = "",
58+
model: String = "kokoro"
4359
): ByteArray = withContext(Dispatchers.IO) {
4460
val request = Request.Builder()
4561
.url(serverUrl.trimEnd('/') + "/v1/audio/speech")
46-
.post(KokoroSpeechRequest.json(text, voice, speed).toRequestBody("application/json".toMediaType()))
62+
.withBearerAuth(apiKey)
63+
.post(KokoroSpeechRequest.json(text, voice, speed, model).toRequestBody("application/json".toMediaType()))
4764
.build()
4865
val response = client.newCall(request).await()
4966
response.use {
@@ -81,10 +98,26 @@ object KokoroVoiceParser {
8198
}
8299
}
83100

101+
/**
 * Extracts model identifiers from a model-listing response: a JSON object whose
 * `data` array holds entries that each carry an `id`.
 */
object OpenAiModelParser {
    /**
     * Returns the non-blank `id` of every object entry in the `data` array of [jsonText].
     *
     * Entries that are not JSON objects are skipped rather than aborting the whole
     * parse, as are ids that are missing, null, or not primitive values. A missing
     * or non-array `data` member yields an empty list.
     *
     * Text that is not valid JSON, or whose root is not a JSON object, still throws
     * (from `JsonParser.parseString` / `asJsonObject`), so callers can tell "not a
     * model listing" apart from "listing with no usable models".
     */
    fun parse(jsonText: String): List<String> {
        val root = JsonParser.parseString(jsonText).asJsonObject

        // Read `data` defensively: getAsJsonArray would fail with a raw
        // ClassCastException if a proxy returned a non-array here.
        val entries = root.get("data")
            ?.takeIf { it.isJsonArray }
            ?.asJsonArray
            ?: return emptyList()

        return entries.mapNotNull { entry ->
            entry.takeIf { it.isJsonObject }
                ?.asJsonObject
                ?.get("id")
                // Only primitives have a well-defined string form; objects/arrays are skipped.
                ?.takeIf { it.isJsonPrimitive }
                ?.asString
                ?.takeIf { it.isNotBlank() }
        }
    }
}
116+
84117
object KokoroSpeechRequest {
85-
fun json(text: String, voice: String, speed: Float): String {
118+
fun json(text: String, voice: String, speed: Float, model: String = "kokoro"): String {
86119
val request = JsonObject().apply {
87-
addProperty("model", "kokoro")
120+
addProperty("model", model.ifBlank { "kokoro" })
88121
addProperty("input", text)
89122
addProperty("voice", voice)
90123
addProperty("response_format", "wav")
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package com.whispertranscriber.network
2+
3+
import okhttp3.Request
4+
5+
/**
 * Adds an `Authorization: Bearer <key>` header when [apiKey] contains anything
 * other than whitespace; otherwise leaves the builder untouched.
 *
 * Leading and trailing whitespace is trimmed from the key before it is sent.
 *
 * @return this builder, for chaining.
 */
fun Request.Builder.withBearerAuth(apiKey: String): Request.Builder {
    val token = apiKey.trim()
    return if (token.isEmpty()) this else header("Authorization", "Bearer $token")
}

app/src/main/java/com/whispertranscriber/network/WhisperApiClient.kt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class WhisperApiClient {
4646
suspend fun transcribe(
4747
serverUrl: String,
4848
audioData: ByteArray,
49+
apiKey: String = "",
4950
fileName: String = "audio.wav"
5051
): TranscriptionResult = withContext(Dispatchers.IO) {
5152
val url = serverUrl.trimEnd('/') + "/v1/audio/transcriptions"
@@ -60,6 +61,7 @@ class WhisperApiClient {
6061

6162
val request = Request.Builder()
6263
.url(url)
64+
.withBearerAuth(apiKey)
6365
.post(multipartBody)
6466
.build()
6567

app/src/main/java/com/whispertranscriber/network/WhisperLiveKitClient.kt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ class WhisperLiveKitClient {
2424

2525
suspend fun connect(
2626
serverUrl: String,
27+
apiKey: String = "",
2728
onPartial: (String) -> Unit,
2829
onReadyToStop: (TranscriptionResult) -> Unit = {}
2930
): WhisperLiveKitSession = withContext(Dispatchers.IO) {
@@ -34,6 +35,7 @@ class WhisperLiveKitClient {
3435

3536
val request = Request.Builder()
3637
.url(toWebSocketUrl(serverUrl))
38+
.withBearerAuth(apiKey)
3739
.build()
3840

3941
lateinit var socket: WebSocket

app/src/main/java/com/whispertranscriber/service/FloatingOverlayService.kt

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ class FloatingOverlayService : Service() {
221221
liveKitSession = try {
222222
liveKitClient.connect(
223223
serverUrl = serverUrl,
224+
apiKey = settings.whisperApiKey,
224225
onPartial = { partial ->
225226
serviceScope.launch {
226227
handleLivePartial(partial)
@@ -276,21 +277,23 @@ class FloatingOverlayService : Service() {
276277
val session = liveKitSession
277278
liveKitSession = null
278279
val result = liveResult?.let {
279-
retryRestIfLiveResultIsBlank(serverUrl, wavData, it)
280+
retryRestIfLiveResultIsBlank(serverUrl, wavData, settings.whisperApiKey, it)
280281
} ?: if (liveKitReady && session != null) {
281282
try {
282-
retryRestIfLiveResultIsBlank(serverUrl, wavData, session.finish())
283+
retryRestIfLiveResultIsBlank(serverUrl, wavData, settings.whisperApiKey, session.finish())
283284
} catch (e: Exception) {
284285
Log.w(TAG, "Live transcription finalization failed, retrying with REST", e)
285286
whisperClient.transcribe(
286287
serverUrl = serverUrl,
287-
audioData = wavData
288+
audioData = wavData,
289+
apiKey = settings.whisperApiKey
288290
)
289291
}
290292
} else {
291293
whisperClient.transcribe(
292294
serverUrl = serverUrl,
293-
audioData = wavData
295+
audioData = wavData,
296+
apiKey = settings.whisperApiKey
294297
)
295298
}
296299
liveKitReady = false
@@ -335,13 +338,15 @@ class FloatingOverlayService : Service() {
335338
/**
 * Returns [liveResult] unless it reports that a REST retry is needed, in which
 * case [wavData] is transcribed again through the REST client.
 *
 * [apiKey] is forwarded to the REST transcription call.
 */
private suspend fun retryRestIfLiveResultIsBlank(
    serverUrl: String,
    wavData: ByteArray,
    apiKey: String,
    liveResult: TranscriptionResult
): TranscriptionResult {
    if (liveResult.shouldRetryRestAfterLive()) {
        Log.w(TAG, "Live transcription returned empty text, retrying with REST")
        return whisperClient.transcribe(
            serverUrl = serverUrl,
            audioData = wavData,
            apiKey = apiKey
        )
    }
    return liveResult
}
347352

@@ -550,7 +555,9 @@ class FloatingOverlayService : Service() {
550555
serverUrl = serverUrl,
551556
text = text,
552557
voice = settings.ttsVoice,
553-
speed = settings.ttsSpeed
558+
speed = settings.ttsSpeed,
559+
apiKey = settings.ttsApiKey,
560+
model = settings.ttsModel
554561
)
555562
Log.d(TAG, "TTS synthesized ${audio.size} bytes for clipboard playback")
556563
Toast.makeText(this@FloatingOverlayService, "Playing clipboard", Toast.LENGTH_SHORT).show()

app/src/main/java/com/whispertranscriber/ui/SettingsScreen.kt

Lines changed: 61 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import androidx.compose.runtime.rememberCoroutineScope
3939
import androidx.compose.runtime.setValue
4040
import androidx.compose.ui.Modifier
4141
import androidx.compose.ui.platform.LocalContext
42+
import androidx.compose.ui.text.input.PasswordVisualTransformation
4243
import androidx.compose.ui.text.input.KeyboardType
4344
import androidx.compose.ui.unit.dp
4445
import com.whispertranscriber.audio.TtsAudioPlayer
@@ -220,6 +221,19 @@ fun SettingsScreen(
220221
singleLine = true
221222
)
222223

224+
Spacer(Modifier.height(8.dp))
225+
OutlinedTextField(
226+
value = settings.whisperApiKey,
227+
onValueChange = { newValue ->
228+
scope.launch { settingsStore.updateWhisperApiKey(newValue) }
229+
},
230+
label = { Text("STT API Key") },
231+
placeholder = { Text("Optional Bearer token") },
232+
modifier = Modifier.fillMaxWidth(),
233+
visualTransformation = PasswordVisualTransformation(),
234+
singleLine = true
235+
)
236+
223237
Spacer(Modifier.height(8.dp))
224238
OutlinedTextField(
225239
value = displayPort,
@@ -339,6 +353,31 @@ fun SettingsScreen(
339353
singleLine = true
340354
)
341355

356+
Spacer(Modifier.height(8.dp))
357+
OutlinedTextField(
358+
value = settings.ttsApiKey,
359+
onValueChange = { newValue ->
360+
scope.launch { settingsStore.updateTtsApiKey(newValue) }
361+
},
362+
label = { Text("TTS API Key") },
363+
placeholder = { Text("Optional Bearer token") },
364+
modifier = Modifier.fillMaxWidth(),
365+
visualTransformation = PasswordVisualTransformation(),
366+
singleLine = true
367+
)
368+
369+
Spacer(Modifier.height(8.dp))
370+
OutlinedTextField(
371+
value = settings.ttsModel,
372+
onValueChange = { newValue ->
373+
scope.launch { settingsStore.updateTtsModel(newValue) }
374+
},
375+
label = { Text("TTS Model") },
376+
placeholder = { Text("kokoro or kokoro-tts") },
377+
modifier = Modifier.fillMaxWidth(),
378+
singleLine = true
379+
)
380+
342381
Spacer(Modifier.height(8.dp))
343382
OutlinedTextField(
344383
value = displayTtsPort,
@@ -401,7 +440,7 @@ fun SettingsScreen(
401440
ttsDiscoveryStatus = "Fetching voices..."
402441
scope.launch {
403442
try {
404-
val voices = ttsClient.voices(url)
443+
val voices = ttsClient.voices(url, settings.ttsApiKey)
405444
ttsVoices = voices
406445
val selectedVoice = when {
407446
settings.ttsVoice in voices -> settings.ttsVoice
@@ -415,7 +454,21 @@ fun SettingsScreen(
415454
ttsDiscoveryStatus = "Loaded ${voices.size} voice(s)."
416455
} catch (e: Exception) {
417456
Log.e(TAG, "TTS voice fetch failed", e)
418-
ttsDiscoveryStatus = "TTS connection failed: ${e.message}"
457+
ttsDiscoveryStatus = if (e.message?.contains("404") == true) {
458+
try {
459+
val models = ttsClient.models(url, settings.ttsApiKey)
460+
val modelHint = models.firstOrNull { it.contains("tts", ignoreCase = true) }
461+
if (modelHint != null && settings.ttsModel.isBlank()) {
462+
settingsStore.updateTtsModel(modelHint)
463+
}
464+
"Connected. Voice list unavailable; enter voice manually."
465+
} catch (modelsError: Exception) {
466+
Log.e(TAG, "TTS model fetch failed", modelsError)
467+
"Voice list unavailable, and model check failed: ${modelsError.message}"
468+
}
469+
} else {
470+
"TTS connection failed: ${e.message}"
471+
}
419472
}
420473
ttsTesting = false
421474
}
@@ -440,8 +493,9 @@ fun SettingsScreen(
440493
) {
441494
OutlinedTextField(
442495
value = settings.ttsVoice,
443-
onValueChange = {},
444-
readOnly = true,
496+
onValueChange = { newValue ->
497+
scope.launch { settingsStore.updateTtsVoice(newValue) }
498+
},
445499
label = { Text("Voice") },
446500
trailingIcon = { ExposedDropdownMenuDefaults.TrailingIcon(expanded = ttsVoiceDropdownExpanded) },
447501
modifier = Modifier
@@ -507,7 +561,9 @@ fun SettingsScreen(
507561
serverUrl = url,
508562
text = text,
509563
voice = settings.ttsVoice,
510-
speed = displayTtsSpeed
564+
speed = displayTtsSpeed,
565+
apiKey = settings.ttsApiKey,
566+
model = settings.ttsModel
511567
)
512568
Log.d(TAG, "TTS synthesized ${audio.size} bytes for test playback")
513569
settingsStore.updateTtsSpeed(displayTtsSpeed)

0 commit comments

Comments
 (0)