Skip to content

Commit 1bd8800

Browse files
committed
feat: add Kokoro TTS playback
1 parent 0a8d0dd commit 1bd8800

7 files changed

Lines changed: 559 additions & 4 deletions

File tree

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ Works over Tailscale / ZeroTier — just point it at your server's VPN IP.
1111
3. Audio is streamed to WhisperLiveKit via native WebSocket (`/asr`) when PCM input is enabled
1212
4. Partial transcripts replace the in-progress text in the focused input field in real time
1313
5. If no editable field is focused, the final transcript is copied to the clipboard once the utterance is silent
14-
6. If live streaming is unavailable, the app falls back to the OpenAI-compatible REST API (`/v1/audio/transcriptions`)
14+
6. Optional Kokoro TTS can read clipboard text aloud through the overlay
15+
7. If live streaming is unavailable, the app falls back to the OpenAI-compatible REST API (`/v1/audio/transcriptions`)
1516

1617
## Setup
1718

@@ -23,6 +24,8 @@ Run [WhisperLiveKit](https://github.com/QuentinFuxa/WhisperLiveKit) on your mach
2324
whisperlivekit-server --host 0.0.0.0 --port 8090 --pcm-input
2425
```
2526

27+
For TTS, run Kokoro-FastAPI on your machine or tailnet. The app discovers healthy TTS servers on port `8880` by default and uses the OpenAI-compatible `/v1/audio/speech` endpoint.
28+
2629
### App
2730

2831
1. Install the APK (grab from [Actions artifacts](../../actions) or build yourself)
@@ -33,6 +36,7 @@ whisperlivekit-server --host 0.0.0.0 --port 8090 --pcm-input
3336
- **Notifications** — for the foreground service
3437
4. Enable the **Whisper Transcriber** accessibility service in Android Settings → Accessibility (needed to type into other apps' text fields)
3538
5. Tap **Start Overlay** — the floating bubble appears
39+
6. Optional: in **Settings → Text To Speech**, discover your Kokoro server, test the connection to load voices, pick a voice/speed, and play sample text
3640

3741
### Permissions
3842

@@ -122,6 +126,7 @@ The app checks a rolling GitHub Release manifest at `app-latest`. When a newer `
122126
- **HTTP / ws://** works out of the box to any IP (cleartext traffic is allowed via network security config)
123127
- **HTTPS with self-signed certs** works — the client trusts all certificates (this is a private VPN tool, not a public app)
124128
- Works over **Tailscale**, **ZeroTier**, or any VPN — just use the VPN IP as the server URL
129+
- Long-press the overlay to open the panel, then tap **SPEAK** to read the current clipboard with the selected Kokoro voice
125130

126131
## Live endpoint probe
127132

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package com.whispertranscriber.audio
2+
3+
import android.content.Context
4+
import android.media.MediaPlayer
5+
import java.io.File
6+
7+
/**
 * Plays a single WAV clip through [MediaPlayer], replacing any clip that is already playing.
 *
 * The clip bytes are written to `tts/kokoro-test.wav` under the context's cache directory and
 * played from that file. At most one [MediaPlayer] is held at a time; it is released when
 * playback completes, when it reports an error, or when [stop] is called.
 *
 * NOTE(review): the file write and `prepare()` run synchronously on the calling thread.
 */
class TtsAudioPlayer(private val context: Context) {
    private var mediaPlayer: MediaPlayer? = null

    /**
     * Stops any current playback, then plays [bytes] as a WAV file.
     *
     * Any exception raised while writing the file or preparing/starting the player is rethrown
     * to the caller; in that case the newly created player is released first so it does not leak.
     */
    fun playWav(bytes: ByteArray) {
        stop()
        val file = File(context.cacheDir, "tts/kokoro-test.wav").apply {
            parentFile?.mkdirs()
            writeBytes(bytes)
        }
        val player = MediaPlayer()
        try {
            player.setDataSource(file.absolutePath)
            player.setOnCompletionListener { finished -> releaseAndForget(finished) }
            player.setOnErrorListener { failed, _, _ ->
                releaseAndForget(failed)
                true // error handled here; suppress the follow-up completion callback
            }
            // Publish the player before starting so stop() can reach it as soon as audio plays.
            mediaPlayer = player
            player.prepare()
            player.start()
        } catch (e: Exception) {
            // The player never reached a usable state: free its native resources and make sure
            // the field does not keep pointing at a released instance.
            if (mediaPlayer === player) mediaPlayer = null
            player.release()
            throw e
        }
    }

    /** Stops playback if a clip is active and releases the player. Safe to call when idle. */
    fun stop() {
        val player = mediaPlayer ?: return
        mediaPlayer = null
        try {
            if (player.isPlaying) player.stop()
        } catch (e: IllegalStateException) {
            // The player is in a state where isPlaying/stop is not valid; releasing is still correct.
        } finally {
            player.release()
        }
    }

    /** Releases [player] and clears the field if it still refers to that same instance. */
    private fun releaseAndForget(player: MediaPlayer) {
        player.release()
        if (mediaPlayer === player) mediaPlayer = null
    }
}

app/src/main/java/com/whispertranscriber/data/SettingsStore.kt

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import android.content.Context
44
import androidx.datastore.core.DataStore
55
import androidx.datastore.preferences.core.Preferences
66
import androidx.datastore.preferences.core.edit
7+
import androidx.datastore.preferences.core.floatPreferencesKey
78
import androidx.datastore.preferences.core.intPreferencesKey
89
import androidx.datastore.preferences.core.stringPreferencesKey
910
import androidx.datastore.preferences.preferencesDataStore
@@ -16,7 +17,11 @@ private val Context.dataStore: DataStore<Preferences> by preferencesDataStore(na
1617
data class AppSettings(
1718
val whisperServerUrl: String = "",
1819
val whisperServerPort: Int = WhisperServerDiscovery.DEFAULT_PORT,
19-
val audioQuality: String = "medium"
20+
val audioQuality: String = "medium",
21+
val ttsServerUrl: String = "",
22+
val ttsServerPort: Int = 8880,
23+
val ttsVoice: String = "af_heart",
24+
val ttsSpeed: Float = 1.0f
2025
)
2126

2227
class SettingsStore(private val context: Context) {
@@ -25,13 +30,21 @@ class SettingsStore(private val context: Context) {
2530
private val KEY_SERVER_URL = stringPreferencesKey("whisper_server_url")
2631
private val KEY_SERVER_PORT = intPreferencesKey("whisper_server_port")
2732
private val KEY_AUDIO_QUALITY = stringPreferencesKey("audio_quality")
33+
private val KEY_TTS_SERVER_URL = stringPreferencesKey("tts_server_url")
34+
private val KEY_TTS_SERVER_PORT = intPreferencesKey("tts_server_port")
35+
private val KEY_TTS_VOICE = stringPreferencesKey("tts_voice")
36+
private val KEY_TTS_SPEED = floatPreferencesKey("tts_speed")
2837
}
2938

3039
val settings: Flow<AppSettings> = context.dataStore.data.map { prefs ->
3140
AppSettings(
3241
whisperServerUrl = prefs[KEY_SERVER_URL] ?: AppSettings().whisperServerUrl,
3342
whisperServerPort = prefs[KEY_SERVER_PORT] ?: AppSettings().whisperServerPort,
34-
audioQuality = prefs[KEY_AUDIO_QUALITY] ?: AppSettings().audioQuality
43+
audioQuality = prefs[KEY_AUDIO_QUALITY] ?: AppSettings().audioQuality,
44+
ttsServerUrl = prefs[KEY_TTS_SERVER_URL] ?: AppSettings().ttsServerUrl,
45+
ttsServerPort = prefs[KEY_TTS_SERVER_PORT] ?: AppSettings().ttsServerPort,
46+
ttsVoice = prefs[KEY_TTS_VOICE] ?: AppSettings().ttsVoice,
47+
ttsSpeed = prefs[KEY_TTS_SPEED] ?: AppSettings().ttsSpeed
3548
)
3649
}
3750

@@ -46,4 +59,20 @@ class SettingsStore(private val context: Context) {
4659
suspend fun updateAudioQuality(quality: String) {
4760
context.dataStore.edit { it[KEY_AUDIO_QUALITY] = quality }
4861
}
62+
63+
/** Persists [url] under the `tts_server_url` preference key. */
suspend fun updateTtsServerUrl(url: String) {
    context.dataStore.edit { prefs ->
        prefs[KEY_TTS_SERVER_URL] = url
    }
}
66+
67+
/** Persists [port] under the `tts_server_port` preference key. */
suspend fun updateTtsServerPort(port: Int) {
    context.dataStore.edit { prefs ->
        prefs[KEY_TTS_SERVER_PORT] = port
    }
}
70+
71+
/** Persists [voice] under the `tts_voice` preference key. */
suspend fun updateTtsVoice(voice: String) {
    context.dataStore.edit { prefs ->
        prefs[KEY_TTS_VOICE] = voice
    }
}
74+
75+
/** Persists [speed] under the `tts_speed` preference key. */
suspend fun updateTtsSpeed(speed: Float) {
    context.dataStore.edit { prefs ->
        prefs[KEY_TTS_SPEED] = speed
    }
}
4978
}
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
package com.whispertranscriber.network
2+
3+
import com.google.gson.JsonObject
4+
import com.google.gson.JsonParser
5+
import kotlinx.coroutines.Dispatchers
6+
import kotlinx.coroutines.suspendCancellableCoroutine
7+
import kotlinx.coroutines.withContext
8+
import okhttp3.Call
9+
import okhttp3.Callback
10+
import okhttp3.MediaType.Companion.toMediaType
11+
import okhttp3.OkHttpClient
12+
import okhttp3.Request
13+
import okhttp3.RequestBody.Companion.toRequestBody
14+
import okhttp3.Response
15+
import java.io.IOException
16+
import java.util.concurrent.TimeUnit
17+
import kotlin.coroutines.resume
18+
import kotlin.coroutines.resumeWithException
19+
20+
/**
 * HTTP client for the two Kokoro endpoints this app uses: `/v1/audio/voices` and
 * `/v1/audio/speech`.
 *
 * Both calls run on [Dispatchers.IO] and are cancellable: cancelling the calling coroutine
 * cancels the underlying OkHttp call.
 */
class KokoroTtsClient {
    private val client = OkHttpClient.Builder()
        .connectTimeout(15, TimeUnit.SECONDS)
        .readTimeout(120, TimeUnit.SECONDS)
        .build()

    /**
     * Fetches the voice names advertised by the server at [serverUrl].
     *
     * @throws IOException if the request fails or the server answers with a non-2xx status.
     */
    suspend fun voices(serverUrl: String): List<String> = withContext(Dispatchers.IO) {
        val request = Request.Builder()
            .url(serverUrl.trimEnd('/') + "/v1/audio/voices")
            .header("Cache-Control", "no-cache")
            .build()
        client.newCall(request).await().use { response ->
            if (!response.isSuccessful) throw IOException("Voices HTTP ${response.code}")
            KokoroVoiceParser.parse(response.body?.string().orEmpty())
        }
    }

    /**
     * Requests non-streamed WAV audio for [text] and returns the raw response bytes.
     *
     * @param serverUrl base URL of the server; a trailing `/` is tolerated.
     * @param voice voice identifier sent as the `voice` field.
     * @param speed playback speed; range-limited by [KokoroSpeechRequest.json].
     * @throws IOException if the request fails, the server answers with a non-2xx status
     *   (the response body is included in the message), or the response has no body.
     */
    suspend fun synthesizeWav(
        serverUrl: String,
        text: String,
        voice: String,
        speed: Float
    ): ByteArray = withContext(Dispatchers.IO) {
        val request = Request.Builder()
            .url(serverUrl.trimEnd('/') + "/v1/audio/speech")
            .post(KokoroSpeechRequest.json(text, voice, speed).toRequestBody("application/json".toMediaType()))
            .build()
        client.newCall(request).await().use { response ->
            if (!response.isSuccessful) {
                throw IOException("Speech HTTP ${response.code}: ${response.body?.string().orEmpty()}")
            }
            response.body?.bytes() ?: throw IOException("Empty speech response")
        }
    }

    /** Shuts down the dispatcher's executor and evicts pooled connections. */
    fun shutdown() {
        client.dispatcher.executorService.shutdown()
        client.connectionPool.evictAll()
    }

    /**
     * Suspends until this call completes, returning its [Response] or throwing its failure.
     * The caller owns the returned response and must close it.
     */
    private suspend fun Call.await(): Response = suspendCancellableCoroutine { continuation ->
        continuation.invokeOnCancellation { cancel() }
        enqueue(object : Callback {
            override fun onResponse(call: Call, response: Response) {
                // A response that arrives after the coroutine was cancelled has no receiver
                // left to close it, so close it here instead of leaking the connection.
                if (continuation.isActive) {
                    continuation.resume(response)
                } else {
                    response.close()
                }
            }

            override fun onFailure(call: Call, e: IOException) {
                continuation.resumeWithException(e)
            }
        })
    }
}
73+
74+
/** Extracts voice names from the JSON body of the voices endpoint. */
object KokoroVoiceParser {
    /**
     * Returns the non-blank primitive entries of the top-level `"voices"` array in [jsonText].
     *
     * A blank document, a missing/null/non-array `"voices"` value, and individual entries that
     * are not JSON primitives (null, objects, arrays) all contribute no voices instead of
     * failing the whole parse. Syntactically invalid JSON, or a document whose top level is not
     * an object, still throws as reported by Gson.
     */
    fun parse(jsonText: String): List<String> {
        // The HTTP client substitutes "" for a missing body; Gson parses that to JsonNull,
        // which would make asJsonObject throw, so treat it as "no voices" up front.
        if (jsonText.isBlank()) return emptyList()
        val root = JsonParser.parseString(jsonText).asJsonObject
        val voices = root.get("voices")?.takeIf { it.isJsonArray }?.asJsonArray
            ?: return emptyList()
        return voices
            .mapNotNull { entry -> entry.takeIf { it.isJsonPrimitive }?.asString }
            .filter { it.isNotBlank() }
    }
}
83+
84+
/** Builds the JSON request body sent to the speech endpoint. */
object KokoroSpeechRequest {
    /**
     * Serializes a non-streaming WAV synthesis request for [text] using [voice].
     * [speed] is clamped to the range 0.25..4.0 before being written.
     */
    fun json(text: String, voice: String, speed: Float): String {
        val clampedSpeed = speed.coerceIn(0.25f, 4.0f)
        val payload = JsonObject()
        payload.addProperty("model", "kokoro")
        payload.addProperty("input", text)
        payload.addProperty("voice", voice)
        payload.addProperty("response_format", "wav")
        payload.addProperty("speed", clampedSpeed)
        payload.addProperty("stream", false)
        return payload.toString()
    }
}

app/src/main/java/com/whispertranscriber/service/FloatingOverlayService.kt

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,10 @@ import android.widget.Toast
2727
import com.whispertranscriber.MainActivity
2828
import com.whispertranscriber.R
2929
import com.whispertranscriber.audio.AudioRecorder
30+
import com.whispertranscriber.audio.TtsAudioPlayer
3031
import com.whispertranscriber.data.SettingsStore
3132
import com.whispertranscriber.data.TranscriptionLog
33+
import com.whispertranscriber.network.KokoroTtsClient
3234
import com.whispertranscriber.network.WhisperApiClient
3335
import com.whispertranscriber.network.WhisperLiveKitClient
3436
import com.whispertranscriber.network.WhisperLiveKitSession
@@ -58,7 +60,9 @@ class FloatingOverlayService : Service() {
5860
private val audioRecorder = AudioRecorder()
5961
private val whisperClient = WhisperApiClient()
6062
private val liveKitClient = WhisperLiveKitClient()
63+
private val ttsClient = KokoroTtsClient()
6164
private val serviceScope = CoroutineScope(SupervisorJob() + Dispatchers.Main)
65+
private lateinit var ttsAudioPlayer: TtsAudioPlayer
6266

6367
private var bubbleView: View? = null
6468
private var expandedView: View? = null
@@ -80,6 +84,7 @@ class FloatingOverlayService : Service() {
8084
windowManager = getSystemService(WINDOW_SERVICE) as WindowManager
8185
settingsStore = SettingsStore(this)
8286
transcriptionLog = TranscriptionLog(this)
87+
ttsAudioPlayer = TtsAudioPlayer(this)
8388
createNotificationChannel()
8489
startForeground(NOTIFICATION_ID, buildNotification())
8590
createBubbleView()
@@ -350,6 +355,14 @@ class FloatingOverlayService : Service() {
350355
return discovered.url
351356
}
352357

358+
/**
 * Returns [configuredUrl] when it is not blank; otherwise runs server discovery on
 * [discoveryPort], stores the discovered URL as the TTS server URL, and returns it.
 *
 * @throws IllegalStateException if discovery yields no server.
 */
private suspend fun resolveTtsServerUrl(configuredUrl: String, discoveryPort: Int): String {
    if (configuredUrl.isBlank()) {
        val found = WhisperServerDiscovery.discover(port = discoveryPort)
            ?: throw IllegalStateException("No Kokoro TTS server found on local networks or Tailscale port $discoveryPort")
        val foundUrl = found.url
        settingsStore.updateTtsServerUrl(foundUrl)
        return foundUrl
    }
    return configuredUrl
}
365+
353366
private fun toggleExpandedView() {
354367
if (isExpanded) {
355368
removeExpandedView()
@@ -394,6 +407,15 @@ class FloatingOverlayService : Service() {
394407
}
395408
titleBar.addView(copyButton)
396409

410+
val speakButton = TextView(this).apply {
411+
text = "SPEAK"
412+
textSize = 12f
413+
setTextColor(0xFF6750A4.toInt())
414+
setPadding((8 * density).toInt(), (4 * density).toInt(), (8 * density).toInt(), (4 * density).toInt())
415+
setOnClickListener { speakClipboardText() }
416+
}
417+
titleBar.addView(speakButton)
418+
397419
val closeButton = TextView(this).apply {
398420
text = "X"
399421
textSize = 14f
@@ -493,6 +515,40 @@ class FloatingOverlayService : Service() {
493515
Toast.makeText(this, "Copied to clipboard", Toast.LENGTH_SHORT).show()
494516
}
495517

518+
/**
 * Reads the first item of the primary clip as text and plays it back as synthesized speech.
 *
 * Shows a toast and returns if the clipboard holds no non-blank text. Otherwise launches a
 * coroutine on [serviceScope] that loads the current settings, resolves the TTS server URL,
 * requests WAV audio and hands it to [ttsAudioPlayer]. Failures are reported with a toast and
 * logged; cancellation is propagated without being reported as a failure.
 */
private fun speakClipboardText() {
    val clipboard = getSystemService(Context.CLIPBOARD_SERVICE) as ClipboardManager
    val text = clipboard.primaryClip
        ?.takeIf { it.itemCount > 0 }
        ?.getItemAt(0)
        ?.coerceToText(this)
        ?.toString()
        ?.trim()
        .orEmpty()

    if (text.isBlank()) {
        Toast.makeText(this, "Clipboard is empty", Toast.LENGTH_SHORT).show()
        return
    }

    serviceScope.launch {
        try {
            val settings = settingsStore.settings.first()
            val serverUrl = resolveTtsServerUrl(settings.ttsServerUrl, settings.ttsServerPort)
            val audio = ttsClient.synthesizeWav(
                serverUrl = serverUrl,
                text = text,
                voice = settings.ttsVoice,
                speed = settings.ttsSpeed
            )
            ttsAudioPlayer.playWav(audio)
            Toast.makeText(this@FloatingOverlayService, "Playing clipboard", Toast.LENGTH_SHORT).show()
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Cancellation of the scope is not a TTS failure: rethrow so the coroutine
            // finishes as cancelled and no error toast is shown.
            throw e
        } catch (e: Exception) {
            // Some exceptions carry no message; fall back to the type name instead of "null".
            val reason = e.message ?: e.javaClass.simpleName
            Toast.makeText(this@FloatingOverlayService, "TTS failed: $reason", Toast.LENGTH_LONG).show()
            Log.e(TAG, "TTS playback failed", e)
        }
    }
}
551+
496552
private fun createNotificationChannel() {
497553
val channel = NotificationChannel(
498554
CHANNEL_ID,
@@ -532,8 +588,10 @@ class FloatingOverlayService : Service() {
532588
transcriptionJob?.cancel()
533589
serviceScope.cancel()
534590
audioRecorder.release()
591+
ttsAudioPlayer.stop()
535592
whisperClient.shutdown()
536593
liveKitClient.shutdown()
594+
ttsClient.shutdown()
537595
removeExpandedView()
538596
bubbleView?.let {
539597
try {

0 commit comments

Comments
 (0)