Skip to content

Commit 2375a40

Browse files
Merge pull request #268 from vaiju1981/feat/tts-output
feat(tts): text-to-speech via the OuteTTS + WavTokenizer pipeline
2 parents 0ae24cf + 3457125 commit 2375a40

18 files changed

Lines changed: 982 additions & 8 deletions

.github/validate-models.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ OPTIONAL_MODELS=(
2626
"models/nomic-embed-text-v1.5.f16.gguf"
2727
"models/SmolVLM-500M-Instruct-Q8_0.gguf"
2828
"models/mmproj-SmolVLM-500M-Instruct-Q8_0.gguf"
29+
"models/OuteTTS-0.2-500M-Q4_K_M.gguf"
30+
"models/WavTokenizer-Large-75-F16.gguf"
2931
)
3032

3133
validate_gguf() {

.github/workflows/publish.yml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,11 @@ env:
4141
VISION_MODEL_NAME: "SmolVLM-500M-Instruct-Q8_0.gguf"
4242
VISION_MMPROJ_URL: "https://huggingface.co/ggml-org/SmolVLM-500M-Instruct-GGUF/resolve/main/mmproj-SmolVLM-500M-Instruct-Q8_0.gguf"
4343
VISION_MMPROJ_NAME: "mmproj-SmolVLM-500M-Instruct-Q8_0.gguf"
44+
# Text-to-speech models for TtsIntegrationTest (OuteTTS text-to-codes model + WavTokenizer vocoder).
45+
TTS_MODEL_URL: "https://huggingface.co/second-state/OuteTTS-0.2-500M-GGUF/resolve/main/OuteTTS-0.2-500M-Q4_K_M.gguf"
46+
TTS_MODEL_NAME: "OuteTTS-0.2-500M-Q4_K_M.gguf"
47+
TTS_VOCODER_URL: "https://huggingface.co/ggml-org/WavTokenizer/resolve/main/WavTokenizer-Large-75-F16.gguf"
48+
TTS_VOCODER_NAME: "WavTokenizer-Large-75-F16.gguf"
4449
# Test image used by MultimodalIntegrationTest is committed to the repo
4550
# at src/test/resources/images/test-image.jpg (see the README in that
4651
# directory for licensing). No download step is needed; CI just points
@@ -797,14 +802,20 @@ jobs:
797802
run: |
798803
ulimit -c unlimited
799804
echo "${{ github.workspace }}/core.%e.%p" | sudo tee /proc/sys/kernel/core_pattern
805+
- name: Download TTS model (OuteTTS)
806+
run: test -f models/${TTS_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TTS_MODEL_URL} --create-dirs -o models/${TTS_MODEL_NAME}
807+
- name: Download TTS vocoder (WavTokenizer)
808+
run: test -f models/${TTS_VOCODER_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TTS_VOCODER_URL} --create-dirs -o models/${TTS_VOCODER_NAME}
800809
- name: Run tests
801810
run: |
802811
mvn -e --no-transfer-progress -P jcstress test \
803812
-Dnet.ladenthin.llama.tool.model=models/${TOOL_MODEL_NAME} \
804813
-Dnet.ladenthin.llama.nomic.path=models/${NOMIC_EMBED_MODEL_NAME} \
805814
-Dnet.ladenthin.llama.vision.model=models/${VISION_MODEL_NAME} \
806815
-Dnet.ladenthin.llama.vision.mmproj=models/${VISION_MMPROJ_NAME} \
807-
-Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH}
816+
-Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH} \
817+
-Dnet.ladenthin.llama.tts.ttc.model=models/${TTS_MODEL_NAME} \
818+
-Dnet.ladenthin.llama.tts.vocoder.model=models/${TTS_VOCODER_NAME}
808819
- uses: actions/upload-artifact@v7
809820
if: success()
810821
with:

CLAUDE.md

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,38 @@ Current patches:
384384
|-------|-------|
385385
| `0001-win32-arg-parse-embed-guard.patch` | Windows JNI regression from llama.cpp **#24779** (b9739): `common_params_parse` unconditionally replaced the caller's argv with the process command line (`GetCommandLineW`), so an embedded/JNI caller (`java.exe`) lost its `--model …` args → "Failed to parse model parameters". The patch **drops the override for our build** (keeps the `make_utf8_argv()` call referenced so there's no `-Wunused-function`, but never adopts its result), so the caller's already-UTF-8 argv is always used. This is **deterministic** — an earlier count-guard variant (only override when the re-derived arg count equals `argc`) collided on the server-integration tests whose argv length happened to equal `java.exe`'s and kept them failing. The upstream PR can instead expose an opt-out / `common_params_parse_argv` that preserves the standalone tools' UTF-8 fix. |
386386

387+
## OuteTTS build-time extraction (`cmake/generate-tts-upstream.cmake`)
388+
389+
The `TextToSpeech` native pipeline reuses llama.cpp's OuteTTS helpers (`tools/tts/tts.cpp`)
390+
**without hand-copying them**. A verbatim copy would be a DRY/maintenance hazard that silently
391+
diverges on every upgrade, and `tts.cpp` cannot simply be added to `target_sources` — it defines its
392+
own `main()`, which would clash at link time (the same reason `tools/server/server.cpp` is excluded
393+
while `server-*.cpp` are compiled in), and all its helpers are `static` (internal linkage), so they
394+
are unreachable from another TU even if it were linked.
395+
396+
Instead the helpers are **DERIVED mechanically at configure time** from the pinned upstream source:
397+
398+
- **`cmake/generate-tts-upstream.cmake`** — reads `${llama.cpp_SOURCE_DIR}/tools/tts/tts.cpp`, keeps
399+
the pre-`main()` span (the DSP `fill_hann_window`/`irfft`/`fold`/`embd_to_audio`, the prompt/text
400+
helpers incl. `process_text`'s number-to-words, the `outetts_version` enum), strips `static` from
401+
the handful the JNI engine calls (giving them external linkage), and extracts the two hard-coded
402+
default-speaker literals out of `main()` into `extern const` strings. Writes
403+
`build/tts_generated/tts_upstream_gen.cpp`.
404+
- **`CMakeLists.txt`** — runs the generator via `execute_process` right after
405+
`FetchContent_MakeAvailable(llama.cpp)`, then compiles the generated TU into `jllama`. The file is
406+
**never committed** (build artifact, like the native libs / WebUI assets); it is regenerated from
407+
whatever `tts.cpp` the pinned `GIT_TAG` resolves to, so a version bump is picked up automatically.
408+
- **`src/main/cpp/tts_upstream.h`** — committed, hand-written declarations of the extracted symbols
409+
(interface facts, not the implementation). `tts_engine.cpp` includes it and links against the
410+
generated definitions. The in-memory WAV writer (`tts_wav.hpp`) is ours, not extracted.
411+
412+
**Fail-loud on drift (same contract as `patches/`):** the generator asserts every anchor — the
413+
`int main(` split point, each `static <signature>` it de-statics, and both speaker literals. If an
414+
upgrade renames a helper or moves a literal, the **configure step aborts** with a pointer to the
415+
generator; if upstream changes a *type*, `tts_upstream.h` stops matching and the **link fails**.
416+
Either way a silent divergence is impossible. On a llama.cpp bump, re-verify the generator the same
417+
way you re-verify `patches/`.
418+
387419
## Upgrading/Downgrading llama.cpp Version
388420

389421
To change the llama.cpp version, update the following **three** files (and re-verify `patches/`):
@@ -588,6 +620,8 @@ the README. The summary below covers only the optional-model bindings:
588620
| `net.ladenthin.llama.audio.model` | `AudioInputIntegrationTest` (llama.cpp discussion #13759) | audio-input model GGUF, e.g. `ultravox-v0_5-llama-3_2-1b.gguf` |
589621
| `net.ladenthin.llama.audio.mmproj` | `AudioInputIntegrationTest` | matching audio mmproj/encoder, e.g. `mmproj-ultravox-v0_5-llama-3_2-1b-f16.gguf` |
590622
| `net.ladenthin.llama.audio.input` | `AudioInputIntegrationTest` | a `.wav`/`.mp3` clip on disk (no committed default — audio is not committed) |
623+
| `net.ladenthin.llama.tts.ttc.model` | `TtsIntegrationTest` | OuteTTS text-to-codes model, e.g. `OuteTTS-0.2-500M-Q4_K_M.gguf` |
624+
| `net.ladenthin.llama.tts.vocoder.model` | `TtsIntegrationTest` | matching codes-to-speech vocoder, e.g. `WavTokenizer-Large-75-F16.gguf` |
591625

592626
Run those tests by setting the property:
593627
```bash
@@ -605,6 +639,9 @@ mvn test -Dtest=AudioInputIntegrationTest \
605639
-Dnet.ladenthin.llama.audio.model=models/ultravox-v0_5-llama-3_2-1b.gguf \
606640
-Dnet.ladenthin.llama.audio.mmproj=models/mmproj-ultravox-v0_5-llama-3_2-1b-f16.gguf \
607641
-Dnet.ladenthin.llama.audio.input=/path/to/speech.wav
642+
mvn test -Dtest=TtsIntegrationTest \
643+
-Dnet.ladenthin.llama.tts.ttc.model=models/OuteTTS-0.2-500M-Q4_K_M.gguf \
644+
-Dnet.ladenthin.llama.tts.vocoder.model=models/WavTokenizer-Large-75-F16.gguf
608645
```
609646

610647
`MultimodalIntegrationTest` self-skips when any of the three vision properties
@@ -739,6 +776,7 @@ If the local check passes (`BUILD SUCCESS`), the `mvn package` job in
739776

740777
**Java layer** (`src/main/java/net/ladenthin/llama/`):
741778
- `LlamaModel` — Main API class (AutoCloseable). Wraps native context for inference, embeddings, re-ranking, and tokenization.
779+
- `TextToSpeech` — Separate AutoCloseable native type for speech synthesis over the two-model OuteTTS (text-to-codes) + WavTokenizer (codes-to-speech vocoder) pipeline; `synthesize(text)` returns a 24 kHz mono 16-bit WAV byte stream. Native orchestration in `tts_engine.{h,cpp}`; the OuteTTS DSP / prompt / text helpers + default speaker are **derived at build time from upstream `tts.cpp`** (see "OuteTTS build-time extraction" above), not hand-copied; the in-memory WAV writer is `tts_wav.hpp`.
742780
- `ModelParameters` / `InferenceParameters` — Builder-pattern parameter classes that serialize to JSON (extend `JsonParameters`) for passing to native code.
743781
- `LlamaIterator` / `LlamaIterable` — Streaming generation via Java `Iterator`/`Iterable`.
744782
- `LlamaLoader` — Extracts the platform-specific native library from the JAR to a temp directory, or finds it on `java.library.path`.
@@ -750,7 +788,7 @@ If the local check passes (`BUILD SUCCESS`), the `mvn package` job in
750788
- The `server` package is a dedicated top layer in the ArchUnit `layeredArchitecture` rule (the only layer allowed to access the root `Api`); `noInternalJdkImports` carries an explicit exception for the supported `com.sun.net.httpserver` (the exported `jdk.httpserver` module, which `module-info.java` `requires`). See README "OpenAI-compatible HTTP server".
751789

752790
**Native layer** (`src/main/cpp/`):
753-
- `jllama.cpp` — JNI implementation bridging Java calls to llama.cpp. ~1,215 lines; 17 native methods.
791+
- `jllama.cpp` — JNI implementation bridging Java calls to llama.cpp. ~1,516 lines; 30 native methods (27 `LlamaModel` + 3 `TextToSpeech`).
754792
- `utils.hpp` — Helper utilities (format helpers, argv stripping, token-piece serialisation).
755793
- `json_helpers.hpp` — Pure JSON transformation helpers (no JNI, no llama state). Independently unit-testable.
756794
- `jni_helpers.hpp` — JNI bridge helpers (handle management + server orchestration). Includes `json_helpers.hpp`.
@@ -905,12 +943,13 @@ ctest --test-dir build --output-on-failure -R "ResultsToJson"
905943
| File | Tests | Scope |
906944
|------|-------|-------|
907945
| `src/test/cpp/test_utils.cpp` | 156 | Upstream helpers: `server_tokens`, `server_grammar_trigger`, `gen_tool_call_id`, `json_value`, `json_get_nested_values`, UTF-8 helpers, `format_response_rerank`, `format_embeddings_response_oaicompat`, `oaicompat_completion_params_parse`, `oaicompat_chat_params_parse`, `are_lora_equal`, `strip_flag_from_argv`, `token_piece_value`, `json_is_array_and_contains_numbers`, `format_oai_sse`, `format_oai_resp_sse`, `format_anthropic_sse` |
908-
| `src/test/cpp/test_server.cpp` | 188 | Upstream result types: `result_timings`, `task_params::to_json()` (incl. `dry_sequence_breakers`, `preserved_tokens`, `timings_per_token`), `completion_token_output`, `server_task_result_cmpl_partial` (non-oaicompat + `to_json_oaicompat` + logprobs + `to_json_oaicompat_chat` + `to_json_anthropic` + dispatcher), `server_task_result_cmpl_final` (non-oaicompat + `to_json_oaicompat` + `to_json_oaicompat_chat` + `to_json_oaicompat_chat_stream` + `to_json_anthropic` + `to_json_anthropic_stream` + tool_calls + dispatcher), `server_task_result_embd`, `server_task_result_rerank`, `server_task_result_metrics`, `server_task_result_slot_save_load`, `server_task_result_slot_erase`, `server_task_result_apply_lora`, `server_task_result_error`, `format_error_response`, `server_task::need_sampling()`, `server_task::n_tokens()`, `server_schema::eval_llama_cmpl_schema()` (parsing pipeline + grammar routing + error paths), `response_fields` projection |
946+
| `src/test/cpp/test_server.cpp` | 189 | Upstream result types: `result_timings`, `task_params::to_json()` (incl. `dry_sequence_breakers`, `preserved_tokens`, `timings_per_token`), `completion_token_output`, `server_task_result_cmpl_partial` (non-oaicompat + `to_json_oaicompat` + logprobs + `to_json_oaicompat_chat` + `to_json_anthropic` + dispatcher), `server_task_result_cmpl_final` (non-oaicompat + `to_json_oaicompat` + `to_json_oaicompat_chat` + `to_json_oaicompat_chat_stream` + `to_json_anthropic` + `to_json_anthropic_stream` + tool_calls + dispatcher), `server_task_result_embd`, `server_task_result_rerank`, `server_task_result_metrics`, `server_task_result_slot_save_load`, `server_task_result_slot_erase`, `server_task_result_apply_lora`, `server_task_result_error`, `format_error_response`, `server_task::need_sampling()`, `server_task::n_tokens()`, `server_schema::eval_llama_cmpl_schema()` (parsing pipeline + grammar routing + error paths), `response_fields` projection |
909947
| `src/test/cpp/test_json_helpers.cpp` | 47 | All functions in `json_helpers.hpp`: `get_result_error_message`, `results_to_json`, `rerank_results_to_json`, `parse_encoding_format`, `extract_embedding_prompt`, `is_infill_request`, `parse_slot_prompt_similarity`, `parse_positive_int_config`, `wrap_stream_chunk` |
910948
| `src/test/cpp/test_log_helpers.cpp` | 13 | All functions in `log_helpers.hpp`: `log_level_name`, `format_log_as_json` |
911-
| `src/test/cpp/test_jni_helpers.cpp` | 41 | All functions in `jni_helpers.hpp` using a zero-filled `JNINativeInterface_` mock |
949+
| `src/test/cpp/test_jni_helpers.cpp` | 47 | All functions in `jni_helpers.hpp` using a zero-filled `JNINativeInterface_` mock |
950+
| `src/test/cpp/test_tts_wav.cpp` | 2 | The in-memory WAV writer `pcm_to_wav16_bytes` in `tts_wav.hpp` (WAV header/payload + little-endian clamping). The OuteTTS DSP it pairs with is derived from upstream `tts.cpp` and covered end-to-end by the Java `TtsIntegrationTest`, not unit-tested here. |
912951

913-
**Current total: 445 tests (all passing).**
952+
**Current total: 454 tests (all passing).**
914953

915954
#### Upstream source location (in CMake build tree)
916955

CMakeLists.txt

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,29 @@ FetchContent_Declare(
151151
)
152152
FetchContent_MakeAvailable(llama.cpp)
153153

154+
# OuteTTS native pipeline: DERIVE the upstream tts.cpp helpers (DSP + prompt + text + the default
155+
# speaker profile) into a compilable translation unit at configure time, rather than hand-copying
156+
# them — a hand copy is a DRY/maintenance hazard that silently diverges on every llama.cpp upgrade.
157+
# tts.cpp cannot simply be added to target_sources because it defines its own main(); the generator
158+
# drops main() and gives the helpers external linkage. See cmake/generate-tts-upstream.cmake. The
159+
# generated file is never committed; it is regenerated from whatever tts.cpp the pinned GIT_TAG
160+
# resolves to, so a version bump is picked up automatically. The tag below is cosmetic provenance in
161+
# the generated banner — keep it in sync with the llama.cpp GIT_TAG above.
162+
set(JLLAMA_TTS_GEN_DIR ${CMAKE_BINARY_DIR}/tts_generated)
163+
set(JLLAMA_TTS_GEN_CPP ${JLLAMA_TTS_GEN_DIR}/tts_upstream_gen.cpp)
164+
file(MAKE_DIRECTORY ${JLLAMA_TTS_GEN_DIR})
165+
execute_process(
166+
COMMAND ${CMAKE_COMMAND}
167+
-DTTS_SRC=${llama.cpp_SOURCE_DIR}/tools/tts/tts.cpp
168+
-DOUT_CPP=${JLLAMA_TTS_GEN_CPP}
169+
-DLLAMA_TAG=b9739
170+
-P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/generate-tts-upstream.cmake
171+
RESULT_VARIABLE JLLAMA_TTS_GEN_RESULT
172+
)
173+
if(NOT JLLAMA_TTS_GEN_RESULT EQUAL 0)
174+
message(FATAL_ERROR "OuteTTS extraction failed; see cmake/generate-tts-upstream.cmake")
175+
endif()
176+
154177
# b8831 added ggml_graph_next_uid() which calls _InterlockedIncrement64 via
155178
# <intrin.h> on x86. The intrinsic only exists on x64; provide the
156179
# implementation in a compat TU so the linker resolves __InterlockedIncrement64.
@@ -263,10 +286,19 @@ endif()
263286

264287
add_library(jllama SHARED
265288
src/main/cpp/jllama.cpp
289+
src/main/cpp/tts_engine.cpp
290+
${JLLAMA_TTS_GEN_CPP}
266291
src/main/cpp/utils.hpp
267292
${llama.cpp_SOURCE_DIR}/tools/server/server-common.cpp
268293
${llama.cpp_SOURCE_DIR}/tools/server/server-chat.cpp)
269294

295+
# The generated TU keeps the whole pre-main() span of tts.cpp, so a few upstream CLI-only
296+
# helpers (print_usage, save_wav16, xterm colour) come along unused. Silence the resulting
297+
# unused-function warning on that one file (non-MSVC; MSVC's C4505 is off by default).
298+
if(NOT MSVC)
299+
set_source_files_properties(${JLLAMA_TTS_GEN_CPP} PROPERTIES COMPILE_FLAGS "-Wno-unused-function")
300+
endif()
301+
270302
# Phase 1 refactoring: compile upstream server library units directly into jllama
271303
# server.hpp has been replaced by direct upstream includes in jllama.cpp.
272304
# server-context.cpp, server-queue.cpp, server-task.cpp compile on all platforms
@@ -411,6 +443,7 @@ if(BUILD_TESTING)
411443
src/test/cpp/test_jni_helpers.cpp
412444
src/test/cpp/test_json_helpers.cpp
413445
src/test/cpp/test_log_helpers.cpp
446+
src/test/cpp/test_tts_wav.cpp
414447
${llama.cpp_SOURCE_DIR}/tools/server/server-common.cpp
415448
${llama.cpp_SOURCE_DIR}/tools/server/server-chat.cpp
416449
${llama.cpp_SOURCE_DIR}/tools/server/server-context.cpp

0 commit comments

Comments
 (0)