Skip to content

Commit a5e0041

Browse files
authored
Add LiveSession testing (#8009)
Adds LiveSession integration testing using recorded clips to our daily AI integration tests. This was based on the iOS testing. Currently, there is a disabled test due to the inability of the Android SDK to specify turn complete manually.
1 parent cc4edd6 commit a5e0041

3 files changed

Lines changed: 286 additions & 0 deletions

File tree

Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
/*
2+
* Copyright 2026 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.google.firebase.ai
18+
19+
import android.content.Context
20+
import android.graphics.Bitmap
21+
import android.media.MediaMetadataRetriever
22+
import androidx.test.core.app.ApplicationProvider
23+
import com.google.firebase.ai.type.AudioTranscriptionConfig
24+
import com.google.firebase.ai.type.Content
25+
import com.google.firebase.ai.type.FunctionResponsePart
26+
import com.google.firebase.ai.type.GenerativeBackend
27+
import com.google.firebase.ai.type.InlineData
28+
import com.google.firebase.ai.type.LiveGenerationConfig
29+
import com.google.firebase.ai.type.LiveServerContent
30+
import com.google.firebase.ai.type.LiveServerToolCall
31+
import com.google.firebase.ai.type.LiveSession
32+
import com.google.firebase.ai.type.PublicPreviewAPI
33+
import com.google.firebase.ai.type.ResponseModality
34+
import com.google.firebase.ai.type.Schema
35+
import com.google.firebase.ai.type.Tool
36+
import com.google.firebase.ai.type.content
37+
import com.google.firebase.ai.type.liveGenerationConfig
38+
import io.kotest.matchers.longs.shouldBeGreaterThan
39+
import io.kotest.matchers.nulls.shouldNotBeNull
40+
import io.kotest.matchers.shouldBe
41+
import io.kotest.matchers.shouldNotBe
42+
import io.kotest.matchers.string.shouldContain
43+
import io.ktor.util.toLowerCasePreservingASCIIRules
44+
import java.io.ByteArrayOutputStream
45+
import kotlin.time.Duration.Companion.seconds
46+
import kotlinx.coroutines.flow.filterIsInstance
47+
import kotlinx.coroutines.flow.first
48+
import kotlinx.coroutines.flow.takeWhile
49+
import kotlinx.coroutines.runBlocking
50+
import kotlinx.coroutines.withTimeoutOrNull
51+
import kotlinx.serialization.json.JsonObject
52+
import kotlinx.serialization.json.JsonPrimitive
53+
import org.junit.Ignore
54+
import org.junit.Test
55+
56+
@OptIn(PublicPreviewAPI::class)
57+
class LiveSessionTests {
58+
/** Identifier of the live model that every test in this class connects to. */
private val modelName = "gemini-2.5-flash-native-audio-preview-12-2025"

/** Declaration of the `getLastName` function offered to the model in the tool-calling test. */
private val getLastNameDeclaration =
  com.google.firebase.ai.type.FunctionDeclaration(
    name = "getLastName",
    description = "Gets the last name of a person.",
    parameters =
      mapOf(
        "firstName" to Schema.string(description = "The first name of the person to lookup.")
      )
  )

/** Tool list containing only the `getLastName` function declaration. */
private val tools = listOf(Tool.functionDeclarations(listOf(getLastNameDeclaration)))

/** Shared generation config: audio responses with output transcription enabled. */
private val generationConfig = liveGenerationConfig {
  responseModality = ResponseModality.AUDIO
  outputAudioTranscription = AudioTranscriptionConfig()
}
81+
82+
/** System prompts used by the individual tests, one per scenario. */
object SystemInstructions {
  /** Restricts the model to answering "yes" or "no". */
  val yesOrNo = systemPrompt("You can only respond with \"yes\" or \"no\".")

  /** Makes the model answer "Goodbye" to "Hello" and report a broken audio file otherwise. */
  val helloGoodbye =
    systemPrompt(
      "When you hear \"Hello\" say \"Goodbye\". If you hear anything else, say \"The audio file is broken\"."
    )

  /** Directs the model to resolve a single-word first name through the getLastName tool. */
  val lastNames =
    systemPrompt(
      "When you receive a message, if the message is a single word, assume it's the first name of a person, and call the getLastName tool to get the last name of said person. Once you get the response, say the response."
    )

  /** Asks for a one-word answer naming the animal in the video that was sent. */
  val animalInVideo =
    systemPrompt(
      "Send a one word response of what ANIMAL is in the video. If you don't receive a video, send \"Test is broken, I didn't receive a video.\"."
    )

  /** Wraps [instruction] in a [Content] carrying the "system" role. */
  private fun systemPrompt(instruction: String): Content =
    content(role = "system") { text(instruction) }
}
106+
107+
/**
 * Builds a [LiveGenerativeModel] on the Google AI backend for the shared test app.
 *
 * @param modelName identifier of the live model to use.
 * @param config optional generation config forwarded as `generationConfig`.
 * @param systemInstruction optional system prompt for the model.
 * @param tools optional tools the model is allowed to call.
 */
private fun getLiveModel(
  modelName: String,
  config: LiveGenerationConfig? = null,
  systemInstruction: Content? = null,
  tools: List<Tool>? = null
): LiveGenerativeModel =
  FirebaseAI.getInstance(AIModels.app(), GenerativeBackend.googleAI())
    .liveModel(
      modelName = modelName,
      generationConfig = config,
      systemInstruction = systemInstruction,
      tools = tools
    )
121+
122+
/**
 * Reads the raw resource identified by [resource] fully into memory.
 *
 * The stream is opened from the application context and closed before returning.
 */
fun resourceAsBytes(resource: Int): ByteArray =
  ApplicationProvider.getApplicationContext<Context>()
    .resources
    .openRawResource(resource)
    .use { stream -> stream.readBytes() }
126+
127+
/**
 * Streams the recorded "hello" clip to the model and expects the transcript of the audio reply
 * to contain "goodbye", as required by [SystemInstructions.helloGoodbye].
 */
@Test
fun testSendAudioRealtime_receiveAudioOutputTranscripts(): Unit = runBlocking {
  val liveModel =
    getLiveModel(
      modelName = modelName,
      config = generationConfig,
      systemInstruction = SystemInstructions.helloGoodbye
    )

  val session = liveModel.connect()
  try {
    val audioBytes = resourceAsBytes(R.raw.hello)
    session.sendAudioRealtime(InlineData(audioBytes, "audio/pcm"))
    // Follow the clip with an equally long buffer of zero bytes.
    // NOTE(review): presumably this trailing silence lets the server detect end of speech.
    session.sendAudioRealtime(InlineData(ByteArray(audioBytes.size), "audio/pcm"))

    // Fail explicitly on timeout instead of asserting against an empty transcript, so a
    // missing response is distinguishable from a wrong one.
    val text =
      withTimeoutOrNull(30.seconds) { session.collectNextAudioOutputTranscript() }
        ?: throw AssertionError(
          "Timed out after 30 seconds waiting for the model to complete its turn"
        )
    text.toLowerCasePreservingASCIIRules() shouldContain "goodbye"
  } finally {
    session.close()
  }
}
148+
149+
/**
 * Sends one PNG frame per second of the recorded video, followed by the "hello" audio clip, and
 * expects the transcript of the audio reply to name the animal in the video.
 */
@Test
fun testSendVideoRealtime_receiveAudioOutputTranscripts(): Unit = runBlocking {
  val liveModel =
    getLiveModel(
      modelName = modelName,
      config = generationConfig,
      systemInstruction = SystemInstructions.animalInVideo
    )

  val session = liveModel.connect()
  try {
    val context = ApplicationProvider.getApplicationContext<Context>()
    val retriever = MediaMetadataRetriever()
    try {
      // `use` closes the descriptor even when setDataSource throws.
      context.resources.openRawResourceFd(R.raw.videoplayback).use { fd ->
        retriever.setDataSource(fd.fileDescriptor, fd.startOffset, fd.length)
      }

      // A missing or non-numeric duration becomes 0 and is reported by the assertion below.
      val durationMs =
        retriever.extractMetadata(MediaMetadataRetriever.METADATA_KEY_DURATION)?.toLongOrNull()
          ?: 0L

      durationMs shouldBeGreaterThan 100

      // Extract frames every 1 second
      for (timeMs in 0 until durationMs step 1000) {
        // getFrameAtTime expects microseconds, hence the conversion from milliseconds.
        val bitmap =
          retriever.getFrameAtTime(timeMs * 1000, MediaMetadataRetriever.OPTION_CLOSEST_SYNC)

        bitmap shouldNotBe null

        // shouldNotBe does not smart-cast, so the explicit null check is still needed.
        if (bitmap != null) {
          try {
            val stream = ByteArrayOutputStream()
            bitmap.compress(Bitmap.CompressFormat.PNG, 100, stream)
            session.sendVideoRealtime(InlineData(stream.toByteArray(), "image/png"))
          } finally {
            // Release the frame's pixel memory right away instead of waiting for GC.
            bitmap.recycle()
          }
        }
      }
    } finally {
      retriever.release()
    }

    // The model doesn't respond unless we send some audio too (according to iOS test)
    val audioBytes = resourceAsBytes(R.raw.hello)
    session.sendAudioRealtime(InlineData(audioBytes, "audio/pcm"))
    session.sendAudioRealtime(InlineData(ByteArray(audioBytes.size), "audio/pcm"))

    // Fail explicitly on timeout instead of asserting against an empty transcript.
    val text =
      withTimeoutOrNull(30.seconds) { session.collectNextAudioOutputTranscript() }
        ?: throw AssertionError(
          "Timed out after 30 seconds waiting for the model to complete its turn"
        )
    val response = text.toLowerCasePreservingASCIIRules()
    // Expected responses for the video could be "cat", "kitten", "kitty"
    // Based on iOS: #expect(["kitten", "cat", "kitty"].contains(modelResponse))
    // Real model calls might be flaky, so the failure message includes the actual transcript.
    val expectedAnimals = listOf("cat", "kitten", "kitty")
    if (expectedAnimals.none { response.contains(it) }) {
      throw AssertionError(
        "Expected the transcript to contain one of $expectedAnimals but it was \"$response\""
      )
    }
  } finally {
    session.close()
  }
}
204+
205+
/**
 * Sends a first name as text, expects a single `getLastName` tool call for it, answers the call
 * with a last name, and expects the transcript of the audio reply to contain that last name.
 */
@Test
fun testRealtime_functionCalling(): Unit = runBlocking {
  val liveModel =
    getLiveModel(
      modelName = modelName,
      config = generationConfig,
      tools = tools,
      systemInstruction = SystemInstructions.lastNames
    )

  val session = liveModel.connect()
  try {
    session.sendTextRealtime("Alex")

    // Null here means no tool call arrived within the timeout.
    val toolCall =
      withTimeoutOrNull(30.seconds) {
        session.receive().filterIsInstance<LiveServerToolCall>().first()
      }

    toolCall.shouldNotBeNull()
    toolCall.functionCalls.size shouldBe 1
    val functionCall = toolCall.functionCalls.first()
    functionCall.name shouldBe "getLastName"

    val firstName = (functionCall.args["firstName"] as? JsonPrimitive)?.content
    firstName shouldBe "Alex"

    val response = "Smith"
    session.sendFunctionResponse(
      listOf(
        FunctionResponsePart(
          name = functionCall.name,
          response = JsonObject(mapOf("lastName" to JsonPrimitive(response))),
          id = functionCall.id
        )
      )
    )

    // Fail explicitly on timeout instead of asserting against an empty transcript, so a
    // missing response is distinguishable from a wrong one.
    val text =
      withTimeoutOrNull(30.seconds) { session.collectNextAudioOutputTranscript() }
        ?: throw AssertionError(
          "Timed out after 30 seconds waiting for the model to complete its turn"
        )
    text.toLowerCasePreservingASCIIRules() shouldContain "smith"
  } finally {
    session.close()
  }
}
249+
250+
/**
 * Sends one question split across two `send` calls and expects the transcript of the audio reply
 * to contain "yes". Currently disabled; see the [Ignore] reason.
 */
@Test
@Ignore("This test fails because we do not implement setting turnComplete at all")
fun testIncremental_works(): Unit = runBlocking {
  val liveModel =
    getLiveModel(
      modelName = modelName,
      config = generationConfig,
      systemInstruction = SystemInstructions.yesOrNo
    )

  val session = liveModel.connect()
  try {
    session.send("Does five plus")
    session.send(" five equal ten?")

    // Fail explicitly on timeout instead of asserting against an empty transcript, so a
    // missing response is distinguishable from a wrong one.
    val text =
      withTimeoutOrNull(30.seconds) { session.collectNextAudioOutputTranscript() }
        ?: throw AssertionError(
          "Timed out after 30 seconds waiting for the model to complete its turn"
        )
    text.toLowerCasePreservingASCIIRules() shouldContain "yes"
  } finally {
    session.close()
  }
}
271+
272+
/**
 * Collects messages from [LiveSession.receive] and concatenates the output-transcription text of
 * every [LiveServerContent] message, up to and including the first one with `turnComplete` set.
 *
 * Messages of any other type are skipped without ending collection. If the flow completes
 * before a turn finishes, the text gathered so far is returned.
 */
private suspend fun LiveSession.collectNextAudioOutputTranscript(): String = buildString {
  receive()
    .takeWhile { message ->
      // Only server content carries transcription; keep listening through everything else.
      val serverContent = message as? LiveServerContent ?: return@takeWhile true
      serverContent.outputTranscription?.text?.let { append(it) }
      !serverContent.turnComplete
    }
    .collect {}
}
286+
}
26.5 KB
Binary file not shown.
201 KB
Binary file not shown.

0 commit comments

Comments
 (0)