Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 49 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ find_package(Threads REQUIRED)
# Location of the vendored GGML checkout and of its (pre-built) build tree.
set(GGML_DIR "${CMAKE_CURRENT_SOURCE_DIR}/ggml")
set(GGML_BUILD_DIR "${GGML_DIR}/build")

# Metal backend detection.
# GGML_HAS_METAL starts OFF and is switched ON only on Apple platforms where
# the Metal backend dylib already exists in the GGML build tree (which requires
# GGML to have been built with -DGGML_METAL=ON).
set(GGML_HAS_METAL OFF)
if(APPLE AND EXISTS "${GGML_BUILD_DIR}/src/ggml-metal/libggml-metal.dylib")
  set(GGML_HAS_METAL ON)
  message(STATUS "GGML Metal backend found — GPU acceleration enabled")
endif()

# Text tokenizer library
add_library(text_tokenizer STATIC
src/text_tokenizer.cpp
Expand All @@ -42,13 +50,17 @@ target_include_directories(text_tokenizer PUBLIC
)
# Link text_tokenizer against the pre-built GGML core libraries.
target_link_directories(text_tokenizer PUBLIC
  "${GGML_BUILD_DIR}/src"
)
target_link_libraries(text_tokenizer PUBLIC
  ggml
  ggml-base
  ggml-cpu
  Threads::Threads
)
# The Metal backend lives in its own build subdirectory. Add that search path
# (together with the backend library and frameworks) only when the backend was
# detected, so non-Metal builds do not carry a dangling linker search path.
if(GGML_HAS_METAL)
  target_link_directories(text_tokenizer PUBLIC
    "${GGML_BUILD_DIR}/src/ggml-metal"
  )
  target_link_libraries(text_tokenizer PUBLIC
    ggml-metal
    "-framework Metal"
    "-framework MetalKit"
  )
endif()

# TTS Transformer library (GGML-based + optional CoreML bridge)
set(TTS_TRANSFORMER_SOURCES
Expand All @@ -71,13 +83,17 @@ target_include_directories(tts_transformer PUBLIC
)
# Link tts_transformer against the pre-built GGML core libraries.
target_link_directories(tts_transformer PUBLIC
  "${GGML_BUILD_DIR}/src"
)
target_link_libraries(tts_transformer PUBLIC
  ggml
  ggml-base
  ggml-cpu
  Threads::Threads
)
# The Metal backend lives in its own build subdirectory. Add that search path
# (together with the backend library and frameworks) only when the backend was
# detected, so non-Metal builds do not carry a dangling linker search path.
if(GGML_HAS_METAL)
  target_link_directories(tts_transformer PUBLIC
    "${GGML_BUILD_DIR}/src/ggml-metal"
  )
  target_link_libraries(tts_transformer PUBLIC
    ggml-metal
    "-framework Metal"
    "-framework MetalKit"
  )
endif()
if(APPLE AND QWEN3_TTS_COREML)
target_link_libraries(tts_transformer PUBLIC
"-framework Foundation"
Expand All @@ -96,13 +112,17 @@ target_include_directories(audio_tokenizer_encoder PUBLIC
)
# Link audio_tokenizer_encoder against the pre-built GGML core libraries.
target_link_directories(audio_tokenizer_encoder PUBLIC
  "${GGML_BUILD_DIR}/src"
)
target_link_libraries(audio_tokenizer_encoder PUBLIC
  ggml
  ggml-base
  ggml-cpu
  Threads::Threads
)
# The Metal backend lives in its own build subdirectory. Add that search path
# (together with the backend library and frameworks) only when the backend was
# detected, so non-Metal builds do not carry a dangling linker search path.
if(GGML_HAS_METAL)
  target_link_directories(audio_tokenizer_encoder PUBLIC
    "${GGML_BUILD_DIR}/src/ggml-metal"
  )
  target_link_libraries(audio_tokenizer_encoder PUBLIC
    ggml-metal
    "-framework Metal"
    "-framework MetalKit"
  )
endif()

# Audio tokenizer decoder library (GGML-based)
add_library(audio_tokenizer_decoder STATIC
Expand All @@ -115,13 +135,17 @@ target_include_directories(audio_tokenizer_decoder PUBLIC
)
# Link audio_tokenizer_decoder against the pre-built GGML core libraries.
target_link_directories(audio_tokenizer_decoder PUBLIC
  "${GGML_BUILD_DIR}/src"
)
target_link_libraries(audio_tokenizer_decoder PUBLIC
  ggml
  ggml-base
  ggml-cpu
  Threads::Threads
)
# The Metal backend lives in its own build subdirectory. Add that search path
# (together with the backend library and frameworks) only when the backend was
# detected, so non-Metal builds do not carry a dangling linker search path.
if(GGML_HAS_METAL)
  target_link_directories(audio_tokenizer_decoder PUBLIC
    "${GGML_BUILD_DIR}/src/ggml-metal"
  )
  target_link_libraries(audio_tokenizer_decoder PUBLIC
    ggml-metal
    "-framework Metal"
    "-framework MetalKit"
  )
endif()

# Qwen3 TTS library (full pipeline)
add_library(qwen3_tts STATIC
Expand All @@ -139,6 +163,22 @@ target_link_libraries(qwen3_tts PUBLIC
Threads::Threads
)

# Shared library with C API (for FFI integration with Python, Nim, Rust, etc.)
add_library(qwen3tts_shared SHARED
  src/qwen3tts_c_api.cpp
)
set_target_properties(qwen3tts_shared PROPERTIES
  OUTPUT_NAME "qwen3tts"
  SOVERSION 0
)
# Set VERSION only when project() declared one: an empty, unquoted expansion
# would leave set_target_properties() with an unpaired property name and abort
# the configure step.
if(NOT "${PROJECT_VERSION}" STREQUAL "")
  set_target_properties(qwen3tts_shared PROPERTIES
    VERSION "${PROJECT_VERSION}"
  )
endif()
# The static pipeline libraries end up inside this shared object, so their
# object code must be position-independent (ELF linkers reject non-PIC
# archives when producing a shared library).
set_target_properties(
  text_tokenizer
  tts_transformer
  audio_tokenizer_encoder
  audio_tokenizer_decoder
  qwen3_tts
  PROPERTIES POSITION_INDEPENDENT_CODE ON
)
target_include_directories(qwen3tts_shared PUBLIC
  "${CMAKE_CURRENT_SOURCE_DIR}/src"
)
target_link_libraries(qwen3tts_shared PRIVATE
  qwen3_tts
)

# CLI executable
add_executable(qwen3-tts-cli
src/main.cpp
Expand Down Expand Up @@ -184,18 +224,20 @@ target_link_libraries(test_decoder PRIVATE
)

# Install targets
install(TARGETS text_tokenizer tts_transformer audio_tokenizer_encoder audio_tokenizer_decoder qwen3_tts qwen3-tts-cli
install(TARGETS text_tokenizer tts_transformer audio_tokenizer_encoder audio_tokenizer_decoder qwen3_tts qwen3tts_shared qwen3-tts-cli
ARCHIVE DESTINATION lib
LIBRARY DESTINATION lib
RUNTIME DESTINATION bin
)
install(FILES
src/gguf_loader.h
src/text_tokenizer.h
src/tts_transformer.h
install(FILES
src/gguf_loader.h
src/text_tokenizer.h
src/tts_transformer.h
src/coreml_code_predictor.h
src/audio_tokenizer_encoder.h
src/audio_tokenizer_decoder.h
src/audio_tokenizer_encoder.h
src/audio_tokenizer_decoder.h
src/qwen3_tts.h
src/qwen3tts_c_api.h
DESTINATION include
)

Expand Down
13 changes: 11 additions & 2 deletions src/qwen3_tts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,17 @@ bool Qwen3TTS::load_models(const std::string & model_dir) {
transformer_loaded_ = false;
decoder_loaded_ = false;

// Construct model paths
std::string tts_model_path = model_dir + "/qwen3-tts-0.6b-f16.gguf";
// Construct model paths — prefer quantized (q8_0) over full-precision (f16)
std::string tts_model_path;
std::string q8_path = model_dir + "/qwen3-tts-0.6b-q8_0.gguf";
std::string f16_path = model_dir + "/qwen3-tts-0.6b-f16.gguf";
FILE * q8_check = fopen(q8_path.c_str(), "r");
if (q8_check) {
fclose(q8_check);
tts_model_path = q8_path;
} else {
tts_model_path = f16_path;
}
std::string tokenizer_model_path = model_dir + "/qwen3-tts-tokenizer-f16.gguf";
tts_model_path_ = tts_model_path;
decoder_model_path_ = tokenizer_model_path;
Expand Down
155 changes: 155 additions & 0 deletions src/qwen3tts_c_api.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
/* qwen3tts_c_api.cpp — C API wrapper for qwen3-tts.cpp */

#include "qwen3tts_c_api.h"
#include "qwen3_tts.h"

#include <cstdlib>
#include <cstring>
#include <exception>
#include <limits>
#include <new>
#include <string>

/* Concrete definition of the Qwen3Tts handle used by the C API functions in
 * this file. */
struct Qwen3Tts {
    /* The C++ TTS engine that loads models and performs synthesis. */
    qwen3_tts::Qwen3TTS engine;
    /* Most recent error message; exposed through qwen3_tts_get_error(). */
    std::string last_error;
};

/* Internal helpers (C++ linkage, not exported) */

/* Translates the C parameter struct into the engine's parameter struct.
 * `params` must be non-null (callers check this before calling).
 * Progress and timing output are always disabled for C API callers. */
static qwen3_tts::tts_params convert_params(const Qwen3TtsParams* params) {
    const Qwen3TtsParams& in = *params;
    qwen3_tts::tts_params out;

    /* Console output is never wanted when driven through the C API. */
    out.print_progress = false;
    out.print_timing   = false;

    /* Field-by-field copy of the caller-provided settings. */
    out.max_audio_tokens   = in.max_audio_tokens;
    out.temperature        = in.temperature;
    out.top_p              = in.top_p;
    out.top_k              = in.top_k;
    out.n_threads          = in.n_threads;
    out.repetition_penalty = in.repetition_penalty;
    out.language_id        = in.language_id;

    return out;
}

/* Copies an engine synthesis result into a malloc-allocated Qwen3TtsAudio.
 *
 * Both the returned struct and its sample buffer are allocated with
 * std::malloc and are released by qwen3_tts_free_audio().
 *
 * Returns nullptr when the sample count does not fit in int32_t or when an
 * allocation fails. An empty result is valid: it yields n_samples == 0 with a
 * non-null sample buffer. */
static Qwen3TtsAudio* make_audio_result(const qwen3_tts::tts_result& result) {
    const std::size_t count = result.audio.size();

    /* n_samples is an int32_t; refuse to silently truncate larger results. */
    if (count > static_cast<std::size_t>(std::numeric_limits<int32_t>::max())) {
        return nullptr;
    }

    auto* audio = static_cast<Qwen3TtsAudio*>(
        std::malloc(sizeof(Qwen3TtsAudio)));
    if (!audio) return nullptr;

    /* malloc(0) is allowed to return NULL; always request at least one element
     * so that an empty result is not mistaken for an allocation failure. */
    const std::size_t bytes = count * sizeof(float);
    auto* buf = static_cast<float*>(
        std::malloc(bytes != 0 ? bytes : sizeof(float)));
    if (!buf) {
        std::free(audio);
        return nullptr;
    }

    /* Skip the copy for empty results: data() may be null in that case. */
    if (count != 0) {
        std::memcpy(buf, result.audio.data(), bytes);
    }

    audio->samples = buf;
    audio->n_samples = static_cast<int32_t>(count);
    audio->sample_rate = result.sample_rate;
    return audio;
}

extern "C" {

/* Fills `params` with the default synthesis settings.
 * A null `params` is ignored. */
void qwen3_tts_default_params(Qwen3TtsParams* params) {
    if (params == nullptr) {
        return;
    }

    Qwen3TtsParams& out = *params;

    /* Generation limits and threading. */
    out.max_audio_tokens = 4096;
    out.n_threads = 4;

    /* Sampling settings. */
    out.temperature = 0.9f;
    out.top_k = 50;
    out.top_p = 1.0f;
    out.repetition_penalty = 1.05f;

    /* Language selection. */
    out.language_id = 2050; /* English */
}

/* Creates a TTS handle and loads the models found in `model_dir`.
 *
 * model_dir: directory containing the model files; must be non-null.
 * n_threads: currently unused — threads are per-synthesis via params.
 *
 * Returns a handle to be released with qwen3_tts_destroy(), or nullptr on any
 * failure (null argument, allocation failure, model load failure, or a C++
 * exception). No handle exists on failure, so the reason cannot be queried
 * through qwen3_tts_get_error(). */
Qwen3Tts* qwen3_tts_create(const char* model_dir, int32_t n_threads) {
    (void)n_threads; /* threads are per-synthesis via params */
    if (!model_dir) return nullptr;

    Qwen3Tts* tts = nullptr;
    try {
        tts = new (std::nothrow) Qwen3Tts();
        if (!tts) return nullptr;

        if (!tts->engine.load_models(model_dir)) {
            delete tts;
            return nullptr;
        }
        return tts;
    } catch (...) {
        /* Never let a C++ exception unwind into C/FFI callers, and do not
         * leak the handle if construction or loading threw part-way. */
        delete tts;
        return nullptr;
    }
}

/* Returns 1 when `tts` is non-null and its engine reports that it is loaded,
 * otherwise 0. */
int qwen3_tts_is_loaded(const Qwen3Tts* tts) {
    if (tts == nullptr) {
        return 0;
    }
    if (tts->engine.is_loaded()) {
        return 1;
    }
    return 0;
}

/* Synthesizes speech for `text` using the settings in `params`.
 *
 * Returns a newly allocated audio result (release it with
 * qwen3_tts_free_audio()), or nullptr on failure. When `tts` is valid, the
 * failure reason is stored and can be read with qwen3_tts_get_error().
 * Null arguments yield nullptr without touching the stored error. */
Qwen3TtsAudio* qwen3_tts_synthesize(
    Qwen3Tts* tts,
    const char* text,
    const Qwen3TtsParams* params)
{
    if (!tts || !text || !params) return nullptr;

    try {
        auto p = convert_params(params);
        auto result = tts->engine.synthesize(std::string(text), p);
        if (!result.success) {
            tts->last_error = result.error_msg;
            return nullptr;
        }

        Qwen3TtsAudio* audio = make_audio_result(result);
        if (!audio) {
            /* Make the failure visible to callers polling the error string. */
            tts->last_error = "failed to allocate audio result buffer";
        }
        return audio;
    } catch (const std::exception& e) {
        /* Never let a C++ exception unwind into C/FFI callers. */
        try { tts->last_error = e.what(); } catch (...) { /* keep old message */ }
        return nullptr;
    } catch (...) {
        try { tts->last_error = "unknown exception during synthesis"; } catch (...) { /* keep old message */ }
        return nullptr;
    }
}

/* Synthesizes speech for `text`, passing the path of a reference audio file
 * to the engine's synthesize_with_voice().
 *
 * Returns a newly allocated audio result (release it with
 * qwen3_tts_free_audio()), or nullptr on failure. When `tts` is valid, the
 * failure reason is stored and can be read with qwen3_tts_get_error().
 * Null arguments yield nullptr without touching the stored error. */
Qwen3TtsAudio* qwen3_tts_synthesize_with_voice_file(
    Qwen3Tts* tts,
    const char* text,
    const char* reference_audio_path,
    const Qwen3TtsParams* params)
{
    if (!tts || !text || !reference_audio_path || !params) return nullptr;

    try {
        auto p = convert_params(params);
        auto result = tts->engine.synthesize_with_voice(
            std::string(text), std::string(reference_audio_path), p);
        if (!result.success) {
            tts->last_error = result.error_msg;
            return nullptr;
        }

        Qwen3TtsAudio* audio = make_audio_result(result);
        if (!audio) {
            /* Make the failure visible to callers polling the error string. */
            tts->last_error = "failed to allocate audio result buffer";
        }
        return audio;
    } catch (const std::exception& e) {
        /* Never let a C++ exception unwind into C/FFI callers. */
        try { tts->last_error = e.what(); } catch (...) { /* keep old message */ }
        return nullptr;
    } catch (...) {
        try { tts->last_error = "unknown exception during synthesis"; } catch (...) { /* keep old message */ }
        return nullptr;
    }
}

/* Synthesizes speech for `text`, passing in-memory reference samples to the
 * engine's synthesize_with_voice().
 *
 * ref_samples / n_ref_samples: reference audio buffer and its length; the
 * pointer must be non-null and the count positive.
 *
 * Returns a newly allocated audio result (release it with
 * qwen3_tts_free_audio()), or nullptr on failure. When `tts` is valid, the
 * failure reason is stored and can be read with qwen3_tts_get_error().
 * Invalid arguments yield nullptr without touching the stored error. */
Qwen3TtsAudio* qwen3_tts_synthesize_with_voice_samples(
    Qwen3Tts* tts,
    const char* text,
    const float* ref_samples,
    int32_t n_ref_samples,
    const Qwen3TtsParams* params)
{
    if (!tts || !text || !ref_samples || n_ref_samples <= 0 || !params)
        return nullptr;

    try {
        auto p = convert_params(params);
        auto result = tts->engine.synthesize_with_voice(
            std::string(text), ref_samples, n_ref_samples, p);
        if (!result.success) {
            tts->last_error = result.error_msg;
            return nullptr;
        }

        Qwen3TtsAudio* audio = make_audio_result(result);
        if (!audio) {
            /* Make the failure visible to callers polling the error string. */
            tts->last_error = "failed to allocate audio result buffer";
        }
        return audio;
    } catch (const std::exception& e) {
        /* Never let a C++ exception unwind into C/FFI callers. */
        try { tts->last_error = e.what(); } catch (...) { /* keep old message */ }
        return nullptr;
    } catch (...) {
        try { tts->last_error = "unknown exception during synthesis"; } catch (...) { /* keep old message */ }
        return nullptr;
    }
}

/* Returns the sample rate reported by this API. The value is a fixed 24000
 * and does not depend on the handle, which is ignored. */
int32_t qwen3_tts_sample_rate(const Qwen3Tts* tts) {
    static_cast<void>(tts);
    const int32_t fixed_rate = 24000;
    return fixed_rate;
}

/* Releases an audio result: frees the sample buffer first, then the struct
 * itself. A null `audio` is ignored. */
void qwen3_tts_free_audio(Qwen3TtsAudio* audio) {
    if (audio != nullptr) {
        /* The buffer is exposed through a pointer-to-const; drop the
         * qualifier to hand it back to free(). */
        float* sample_buffer = const_cast<float*>(audio->samples);
        std::free(sample_buffer);
        std::free(audio);
    }
}

/* Destroys a handle created by qwen3_tts_create(). A null handle is a no-op. */
void qwen3_tts_destroy(Qwen3Tts* tts) {
    if (tts == nullptr) {
        return;
    }
    delete tts;
}

/* Returns the handle's stored error message as a C string, or an empty string
 * when `tts` is null. The pointer refers to storage owned by the handle. */
const char* qwen3_tts_get_error(const Qwen3Tts* tts) {
    return (tts != nullptr) ? tts->last_error.c_str() : "";
}

} /* extern "C" */
Loading