java-llama.cpp/src/test/java/net/ladenthin/llama/TtsIntegrationTest.java at 02192f3c333681b9e9a7d17f2172d588770e44e5 · bernardladenthin/java-llama.cpp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
//
// SPDX-License-Identifier: MIT

package net.ladenthin.llama;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.File;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.TimeUnit;
import org.junit.jupiter.api.Assumptions;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Timeout;

/**
 * Real-model coverage for {@link TextToSpeech} (OuteTTS audio output, llama.cpp {@code llama-tts}
 * pipeline). Loads the two-model TTS pipeline and synthesizes a short clip, checking the WAV
 * container is well-formed.
 *
 * <p>Self-skips when {@link TestConstants#PROP_TTS_TTC_MODEL} or
 * {@link TestConstants#PROP_TTS_VOCODER_MODEL} is unset or its file is missing, so it runs only where
 * the (large) OuteTTS + WavTokenizer GGUFs have been staged.
 */
public class TtsIntegrationTest {

    /** Canonical RIFF/WAVE header size in bytes (16-bit PCM, no extra chunks). */
    private static final int WAV_HEADER_BYTES = 44;

    @Test
    @DisplayName("synthesize() returns a well-formed, non-silent 24 kHz mono 16-bit WAV")
    @Timeout(value = 300_000, unit = TimeUnit.MILLISECONDS)
    public void synthesizesWellFormedWav() {
        String ttc = System.getProperty(TestConstants.PROP_TTS_TTC_MODEL);
        String vocoder = System.getProperty(TestConstants.PROP_TTS_VOCODER_MODEL);
        Assumptions.assumeTrue(
                ttc != null && !ttc.isEmpty(), "TTS model not set (-D" + TestConstants.PROP_TTS_TTC_MODEL + "=...)");
        Assumptions.assumeTrue(
                vocoder != null && !vocoder.isEmpty(),
                "TTS vocoder not set (-D" + TestConstants.PROP_TTS_VOCODER_MODEL + "=...)");
        Assumptions.assumeTrue(new File(ttc).exists(), "TTS model file missing: " + ttc);
        Assumptions.assumeTrue(new File(vocoder).exists(), "TTS vocoder file missing: " + vocoder);

        int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, 0);
        try (TextToSpeech tts = new TextToSpeech(ttc, vocoder, gpuLayers, 0)) {
            byte[] wav = tts.synthesize("hello from llama");

            assertNotNull(wav, "WAV bytes must not be null");
            // A bare 44-byte header with no payload is not a valid clip: require real samples beyond it.
            assertTrue(
                    wav.length > WAV_HEADER_BYTES,
                    "WAV must carry a header plus samples; got " + wav.length + " bytes");

            // RIFF/WAVE container magic.
            assertEquals("RIFF", tag(wav, 0), "RIFF magic");
            assertEquals("WAVE", tag(wav, 8), "WAVE magic");
            assertEquals("fmt ", tag(wav, 12), "fmt subchunk tag");
            assertEquals("data", tag(wav, 36), "data subchunk tag");

            // fmt fields must match the documented output format: 24 kHz mono 16-bit PCM. A mis-loaded
            // model that still framed a header would not silently pass with the wrong rate/channels.
            ByteBuffer header = ByteBuffer.wrap(wav).order(ByteOrder.LITTLE_ENDIAN);
            assertEquals(1, header.getShort(20) & 0xFFFF, "audio format must be PCM (1)");
            assertEquals(1, header.getShort(22) & 0xFFFF, "must be mono (1 channel)");
            assertEquals(24_000, header.getInt(24), "sample rate must be 24 kHz");
            assertEquals(16, header.getShort(34) & 0xFFFF, "must be 16-bit samples");

            // Declared chunk sizes must be self-consistent with the actual byte-array length.
            assertEquals(wav.length - 8, header.getInt(4), "RIFF chunk size must equal fileLength - 8");
            int dataSize = header.getInt(40);
            assertEquals(wav.length - WAV_HEADER_BYTES, dataSize, "data chunk size must equal fileLength - 44");
            assertEquals(0, dataSize % 2, "16-bit PCM data size must be even");

            // The clip must contain real audio, not just the zeroed 0.25 s lead-in (or the all-silent
            // buffer a mis-configured model could still frame inside an otherwise valid header). The
            // original `length > 44` check passed on a single padding byte; scan the PCM payload instead.
            assertTrue(
                    hasNonZeroSample(wav, WAV_HEADER_BYTES),
                    "synthesized PCM must contain audible (non-zero) samples, not pure silence");
        }
    }

    /** Reads the 4-byte ASCII chunk tag at {@code offset}. */
    private static String tag(byte[] wav, int offset) {
        return new String(wav, offset, 4, StandardCharsets.US_ASCII);
    }

    /** Returns {@code true} if any byte of the PCM payload at or after {@code from} is non-zero. */
    private static boolean hasNonZeroSample(byte[] wav, int from) {
        for (int i = from; i < wav.length; i++) {
            if (wav[i] != 0) {
                return true;
            }
        }
        return false;
    }
}