// Source: java-llama.cpp/src/test/java/net/ladenthin/llama/AudioInputIntegrationTest.java at 55a6fa03b1da53ea1bc922ad9422913d240bacce (vaiju1981/java-llama.cpp, GitHub)

// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
//
// SPDX-License-Identifier: MIT

package net.ladenthin.llama;

import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.concurrent.TimeUnit;
import net.ladenthin.llama.parameters.InferenceParameters;
import net.ladenthin.llama.parameters.ModelParameters;
import net.ladenthin.llama.value.ChatMessage;
import net.ladenthin.llama.value.ContentPart;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assumptions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Timeout;

/**
 * Real-model coverage for <b>audio input</b> (llama.cpp discussion #13759). Loads an audio-capable
 * model (Ultravox / Qwen2.5-Omni) with its audio {@code --mmproj} and sends a multipart message
 * carrying a {@link ContentPart#audioFile(java.nio.file.Path)} clip, exercising:
 * <ul>
 *   <li>{@link ModelParameters#setMmproj(String)} wiring an audio encoder;</li>
 *   <li>{@code ParameterJsonSerializer.buildMessages} emitting the OAI {@code input_audio} part;</li>
 *   <li>the upstream {@code oaicompat_chat_params_parse} routing {@code input_audio} through the
 *       compiled-in {@code mtmd} audio pipeline.</li>
 * </ul>
 *
 * <p>Self-skips when any of the three system properties
 * ({@link TestConstants#PROP_AUDIO_MODEL_PATH} / {@link TestConstants#PROP_AUDIO_MMPROJ_PATH} /
 * {@link TestConstants#PROP_AUDIO_PATH}) is unset or its file is missing, so it runs only in CI or on a
 * dev machine where the (large) audio model and a clip have been staged.
 */
public class AudioInputIntegrationTest {

    /** Model shared by the tests of this class; remains {@code null} when setup aborts before loading. */
    private static LlamaModel model;

    /** Value of the {@link TestConstants#PROP_AUDIO_PATH} system property, i.e. the clip sent to the model. */
    private static String audioPath;

    /**
     * Reads the model, mmproj and clip paths from system properties, aborts (skips) the class when
     * any of them is unset, empty or not present on disk, and otherwise loads the model and asserts
     * that it reports audio support.
     */
    @BeforeAll
    public static void setup() {
        final String modelFile = System.getProperty(TestConstants.PROP_AUDIO_MODEL_PATH);
        final String mmprojFile = System.getProperty(TestConstants.PROP_AUDIO_MMPROJ_PATH);
        audioPath = System.getProperty(TestConstants.PROP_AUDIO_PATH);

        // Pass 1: all three properties must carry a value before any path is touched.
        assumeConfigured(
                modelFile, "Audio model path not set (-D" + TestConstants.PROP_AUDIO_MODEL_PATH + "=...)");
        assumeConfigured(
                mmprojFile, "Audio mmproj path not set (-D" + TestConstants.PROP_AUDIO_MMPROJ_PATH + "=...)");
        assumeConfigured(
                audioPath, "Audio clip path not set (-D" + TestConstants.PROP_AUDIO_PATH + "=...)");

        // Pass 2: each configured path must exist on disk.
        assumeOnDisk(modelFile, "Audio model file missing: " + modelFile);
        assumeOnDisk(mmprojFile, "Audio mmproj file missing: " + mmprojFile);
        assumeOnDisk(audioPath, "Audio clip missing: " + audioPath);

        final int layerCount =
                Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL);
        final ModelParameters loadOptions = new ModelParameters()
                .setCtxSize(4096)
                .setModel(modelFile)
                .setMmproj(mmprojFile)
                .setGpuLayers(layerCount)
                .setFit(false);
        final boolean noGpuLayers = layerCount == 0;
        if (noGpuLayers) {
            // With zero GPU layers, additionally set devices to "none" and disable mmproj offload.
            loadOptions.setDevices("none").setMmprojOffload(false);
        }

        // Assign the field before asserting so tearDown can still close the model on failure.
        model = new LlamaModel(loadOptions);
        final boolean audioCapable = model.supportsAudio();
        assertTrue(audioCapable, "loaded model + mmproj must advertise audio input");
    }

    /** Closes the shared model if setup got far enough to create one. */
    @AfterAll
    public static void tearDown() {
        if (model == null) {
            return;
        }
        model.close();
    }

    /**
     * Sends a single multimodal user message (a text instruction plus the audio clip) and asserts
     * that the reply is not empty after trimming.
     *
     * @throws IOException declared by the test signature; which call raises it is not visible here
     */
    @Test
    @DisplayName("an input_audio content part reaches the model and yields a non-empty reply")
    @Timeout(value = 240_000, unit = TimeUnit.MILLISECONDS)
    public void audioInputProducesNonEmptyReply() throws IOException {
        final ChatMessage userTurn = ChatMessage.userMultimodal(
                ContentPart.text("Transcribe the audio."),
                ContentPart.audioFile(Paths.get(audioPath)));

        final InferenceParameters request = InferenceParameters.empty()
                .withMessages(Collections.singletonList(userTurn))
                .withNPredict(64);
        final String reply = model.chatCompleteText(request);

        final boolean blank = reply.trim().isEmpty();
        assertFalse(blank, "reply must be non-empty for an audio prompt; got: \"" + reply + "\"");
    }

    /** Aborts the class (as skipped) with {@code message} unless {@code value} is non-null and non-empty. */
    private static void assumeConfigured(String value, String message) {
        Assumptions.assumeTrue(value != null && !value.isEmpty(), message);
    }

    /** Aborts the class (as skipped) with {@code message} unless {@code path} exists on disk. */
    private static void assumeOnDisk(String path, String message) {
        Assumptions.assumeTrue(new File(path).exists(), message);
    }
}