diff --git a/CMakeLists.txt b/CMakeLists.txt
index ebbfd59..67a181d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,6 +31,14 @@ find_package(Threads REQUIRED)
 set(GGML_DIR "${CMAKE_CURRENT_SOURCE_DIR}/ggml")
 set(GGML_BUILD_DIR "${GGML_DIR}/build")
 
+# Detect Metal backend availability (requires building GGML with -DGGML_METAL=ON)
+if(APPLE AND EXISTS "${GGML_BUILD_DIR}/src/ggml-metal/libggml-metal.dylib")
+    set(GGML_HAS_METAL ON)
+    message(STATUS "GGML Metal backend found — GPU acceleration enabled")
+else()
+    set(GGML_HAS_METAL OFF)
+endif()
+
 # Text tokenizer library
 add_library(text_tokenizer STATIC
     src/text_tokenizer.cpp
@@ -42,6 +50,7 @@ target_include_directories(text_tokenizer PUBLIC
 )
 target_link_directories(text_tokenizer PUBLIC
     ${GGML_BUILD_DIR}/src
+    ${GGML_BUILD_DIR}/src/ggml-metal
 )
 target_link_libraries(text_tokenizer PUBLIC
     ggml
@@ -49,6 +58,9 @@ target_link_libraries(text_tokenizer PUBLIC
     ggml-cpu
     Threads::Threads
 )
+if(GGML_HAS_METAL)
+    target_link_libraries(text_tokenizer PUBLIC ggml-metal "-framework Metal" "-framework MetalKit")
+endif()
 
 # TTS Transformer library (GGML-based + optional CoreML bridge)
 set(TTS_TRANSFORMER_SOURCES
@@ -71,6 +83,7 @@ target_include_directories(tts_transformer PUBLIC
 )
 target_link_directories(tts_transformer PUBLIC
     ${GGML_BUILD_DIR}/src
+    ${GGML_BUILD_DIR}/src/ggml-metal
 )
 target_link_libraries(tts_transformer PUBLIC
     ggml
@@ -78,6 +91,9 @@ target_link_libraries(tts_transformer PUBLIC
     ggml-cpu
     Threads::Threads
 )
+if(GGML_HAS_METAL)
+    target_link_libraries(tts_transformer PUBLIC ggml-metal "-framework Metal" "-framework MetalKit")
+endif()
 if(APPLE AND QWEN3_TTS_COREML)
     target_link_libraries(tts_transformer PUBLIC
         "-framework Foundation"
@@ -96,6 +112,7 @@ target_include_directories(audio_tokenizer_encoder PUBLIC
 )
 target_link_directories(audio_tokenizer_encoder PUBLIC
     ${GGML_BUILD_DIR}/src
+    ${GGML_BUILD_DIR}/src/ggml-metal
 )
 target_link_libraries(audio_tokenizer_encoder PUBLIC
     ggml
@@ -103,6 +120,9 @@ target_link_libraries(audio_tokenizer_encoder PUBLIC
     ggml-cpu
     Threads::Threads
 )
+if(GGML_HAS_METAL)
+    target_link_libraries(audio_tokenizer_encoder PUBLIC ggml-metal "-framework Metal" "-framework MetalKit")
+endif()
 
 # Audio tokenizer decoder library (GGML-based)
 add_library(audio_tokenizer_decoder STATIC
@@ -115,6 +135,7 @@ target_include_directories(audio_tokenizer_decoder PUBLIC
 )
 target_link_directories(audio_tokenizer_decoder PUBLIC
     ${GGML_BUILD_DIR}/src
+    ${GGML_BUILD_DIR}/src/ggml-metal
 )
 target_link_libraries(audio_tokenizer_decoder PUBLIC
     ggml
@@ -122,6 +143,9 @@ target_link_libraries(audio_tokenizer_decoder PUBLIC
     ggml-cpu
     Threads::Threads
 )
+if(GGML_HAS_METAL)
+    target_link_libraries(audio_tokenizer_decoder PUBLIC ggml-metal "-framework Metal" "-framework MetalKit")
+endif()
 
 # Qwen3 TTS library (full pipeline)
 add_library(qwen3_tts STATIC
@@ -139,6 +163,22 @@ target_link_libraries(qwen3_tts PUBLIC
     Threads::Threads
 )
 
+# Shared library with C API (for FFI integration with Python, Nim, Rust, etc.)
+add_library(qwen3tts_shared SHARED
+    src/qwen3tts_c_api.cpp
+)
+set_target_properties(qwen3tts_shared PROPERTIES
+    OUTPUT_NAME "qwen3tts"
+    VERSION ${PROJECT_VERSION}
+    SOVERSION 0
+)
+target_include_directories(qwen3tts_shared PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/src
+)
+target_link_libraries(qwen3tts_shared PRIVATE
+    qwen3_tts
+)
+
 # CLI executable
 add_executable(qwen3-tts-cli
     src/main.cpp
@@ -184,18 +224,20 @@ target_link_libraries(test_decoder PRIVATE
 )
 
 # Install targets
-install(TARGETS text_tokenizer tts_transformer audio_tokenizer_encoder audio_tokenizer_decoder qwen3_tts qwen3-tts-cli
+install(TARGETS text_tokenizer tts_transformer audio_tokenizer_encoder audio_tokenizer_decoder qwen3_tts qwen3tts_shared qwen3-tts-cli
     ARCHIVE DESTINATION lib
+    LIBRARY DESTINATION lib
     RUNTIME DESTINATION bin
 )
 
-install(FILES
-    src/gguf_loader.h
-    src/text_tokenizer.h
-    src/tts_transformer.h
+install(FILES
+    src/gguf_loader.h
+    src/text_tokenizer.h
+    src/tts_transformer.h
     src/coreml_code_predictor.h
-    src/audio_tokenizer_encoder.h
-    src/audio_tokenizer_decoder.h
+    src/audio_tokenizer_encoder.h
+    src/audio_tokenizer_decoder.h
     src/qwen3_tts.h
+    src/qwen3tts_c_api.h
     DESTINATION include
 )
diff --git a/src/qwen3_tts.cpp b/src/qwen3_tts.cpp
index c2394b5..7554554 100644
--- a/src/qwen3_tts.cpp
+++ b/src/qwen3_tts.cpp
@@ -114,8 +114,17 @@ bool Qwen3TTS::load_models(const std::string & model_dir) {
     transformer_loaded_ = false;
     decoder_loaded_ = false;
 
-    // Construct model paths
-    std::string tts_model_path = model_dir + "/qwen3-tts-0.6b-f16.gguf";
+    // Construct model paths — prefer quantized (q8_0) over full-precision (f16)
+    std::string tts_model_path;
+    std::string q8_path = model_dir + "/qwen3-tts-0.6b-q8_0.gguf";
+    std::string f16_path = model_dir + "/qwen3-tts-0.6b-f16.gguf";
+    FILE * q8_check = fopen(q8_path.c_str(), "r");
+    if (q8_check) {
+        fclose(q8_check);
+        tts_model_path = q8_path;
+    } else {
+        tts_model_path = f16_path;
+    }
     std::string tokenizer_model_path = model_dir + "/qwen3-tts-tokenizer-f16.gguf";
     tts_model_path_ = tts_model_path;
     decoder_model_path_ = tokenizer_model_path;
diff --git a/src/qwen3tts_c_api.cpp b/src/qwen3tts_c_api.cpp
new file mode 100644
index 0000000..50a9267
--- /dev/null
+++ b/src/qwen3tts_c_api.cpp
@@ -0,0 +1,194 @@
+/* qwen3tts_c_api.cpp — C API wrapper for qwen3-tts.cpp */
+
+#include "qwen3tts_c_api.h"
+#include "qwen3_tts.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <memory>
+#include <string>
+
+/* Opaque handle behind the C API: the C++ engine plus the message returned
+ * by qwen3_tts_get_error(). */
+struct Qwen3Tts {
+    qwen3_tts::Qwen3TTS engine;
+    std::string last_error;
+};
+
+/* Internal helpers (C++ linkage, not exported) */
+
+/* Record an error message without letting an allocation failure escape. */
+static void set_error(Qwen3Tts* tts, const char* msg) noexcept {
+    try {
+        tts->last_error = msg;
+    } catch (...) {
+        tts->last_error.clear();
+    }
+}
+
+/* Copy the C parameter struct into the engine's parameter type. Progress and
+ * timing output are always switched off for API callers. */
+static qwen3_tts::tts_params convert_params(const Qwen3TtsParams* params) {
+    qwen3_tts::tts_params p;
+    p.max_audio_tokens = params->max_audio_tokens;
+    p.temperature = params->temperature;
+    p.top_p = params->top_p;
+    p.top_k = params->top_k;
+    p.n_threads = params->n_threads;
+    p.repetition_penalty = params->repetition_penalty;
+    p.language_id = params->language_id;
+    p.print_progress = false;
+    p.print_timing = false;
+    return p;
+}
+
+/* Deep-copy the engine's samples into malloc-owned storage that
+ * qwen3_tts_free_audio() releases. Returns nullptr if an allocation fails or
+ * the sample count does not fit in the int32_t n_samples field. */
+static Qwen3TtsAudio* make_audio_result(const qwen3_tts::tts_result& result) {
+    const size_t count = result.audio.size();
+    if (count > static_cast<size_t>(INT32_MAX)) return nullptr;
+
+    auto* audio = static_cast<Qwen3TtsAudio*>(
+        std::malloc(sizeof(Qwen3TtsAudio)));
+    if (!audio) return nullptr;
+
+    /* malloc(0) may return NULL, so always request at least one element;
+     * otherwise an empty result would be reported as an allocation failure. */
+    auto* buf = static_cast<float*>(
+        std::malloc((count > 0 ? count : 1) * sizeof(float)));
+    if (!buf) {
+        std::free(audio);
+        return nullptr;
+    }
+    if (count > 0) {
+        std::memcpy(buf, result.audio.data(), count * sizeof(float));
+    }
+
+    audio->samples = buf;
+    audio->n_samples = static_cast<int32_t>(count);
+    audio->sample_rate = result.sample_rate;
+    return audio;
+}
+
+/* Run one synthesis call and translate its outcome for C callers: engine
+ * failures, allocation failures and C++ exceptions all become a nullptr
+ * return with last_error set, so nothing unwinds across the C boundary. */
+template <typename SynthFn>
+static Qwen3TtsAudio* run_synthesis(Qwen3Tts* tts, SynthFn synth) {
+    try {
+        const auto result = synth();
+        if (!result.success) {
+            tts->last_error = result.error_msg;
+            return nullptr;
+        }
+        Qwen3TtsAudio* audio = make_audio_result(result);
+        if (!audio) set_error(tts, "failed to allocate output audio");
+        return audio;
+    } catch (const std::exception& e) {
+        set_error(tts, e.what());
+    } catch (...) {
+        set_error(tts, "unknown exception during synthesis");
+    }
+    return nullptr;
+}
+
+extern "C" {
+
+void qwen3_tts_default_params(Qwen3TtsParams* params) {
+    if (!params) return;
+    params->max_audio_tokens = 4096;
+    params->temperature = 0.9f;
+    params->top_p = 1.0f;
+    params->top_k = 50;
+    params->n_threads = 4;
+    params->repetition_penalty = 1.05f;
+    params->language_id = 2050; /* English */
+}
+
+Qwen3Tts* qwen3_tts_create(const char* model_dir, int32_t n_threads) {
+    (void)n_threads; /* threads are per-synthesis via params */
+    if (!model_dir) return nullptr;
+
+    try {
+        std::unique_ptr<Qwen3Tts> tts(new Qwen3Tts());
+        /* No handle exists on failure, so the engine's error text cannot be
+         * reported through qwen3_tts_get_error(); callers only see NULL. */
+        if (!tts->engine.load_models(model_dir)) return nullptr;
+        return tts.release();
+    } catch (...) {
+        return nullptr;
+    }
+}
+
+int qwen3_tts_is_loaded(const Qwen3Tts* tts) {
+    return (tts && tts->engine.is_loaded()) ? 1 : 0;
+}
+
+Qwen3TtsAudio* qwen3_tts_synthesize(
+    Qwen3Tts* tts,
+    const char* text,
+    const Qwen3TtsParams* params)
+{
+    if (!tts || !text || !params) return nullptr;
+
+    return run_synthesis(tts, [&] {
+        auto p = convert_params(params);
+        return tts->engine.synthesize(std::string(text), p);
+    });
+}
+
+Qwen3TtsAudio* qwen3_tts_synthesize_with_voice_file(
+    Qwen3Tts* tts,
+    const char* text,
+    const char* reference_audio_path,
+    const Qwen3TtsParams* params)
+{
+    if (!tts || !text || !reference_audio_path || !params) return nullptr;
+
+    return run_synthesis(tts, [&] {
+        auto p = convert_params(params);
+        return tts->engine.synthesize_with_voice(
+            std::string(text), std::string(reference_audio_path), p);
+    });
+}
+
+Qwen3TtsAudio* qwen3_tts_synthesize_with_voice_samples(
+    Qwen3Tts* tts,
+    const char* text,
+    const float* ref_samples,
+    int32_t n_ref_samples,
+    const Qwen3TtsParams* params)
+{
+    if (!tts || !text || !ref_samples || n_ref_samples <= 0 || !params)
+        return nullptr;
+
+    return run_synthesis(tts, [&] {
+        auto p = convert_params(params);
+        return tts->engine.synthesize_with_voice(
+            std::string(text), ref_samples, n_ref_samples, p);
+    });
+}
+
+int32_t qwen3_tts_sample_rate(const Qwen3Tts* tts) {
+    (void)tts;
+    return 24000;
+}
+
+void qwen3_tts_free_audio(Qwen3TtsAudio* audio) {
+    if (!audio) return;
+    std::free(const_cast<float*>(audio->samples));
+    std::free(audio);
+}
+
+void qwen3_tts_destroy(Qwen3Tts* tts) {
+    delete tts;
+}
+
+const char* qwen3_tts_get_error(const Qwen3Tts* tts) {
+    if (!tts) return "";
+    return tts->last_error.c_str();
+}
+
+} /* extern "C" */
diff --git a/src/qwen3tts_c_api.h b/src/qwen3tts_c_api.h
new file mode 100644
index 0000000..6b4745d
--- /dev/null
+++ b/src/qwen3tts_c_api.h
@@ -0,0 +1,89 @@
+/* qwen3tts_c_api.h — C API wrapper for qwen3-tts.cpp (Nim FFI) */
+#ifndef QWEN3TTS_C_API_H
+#define QWEN3TTS_C_API_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Opaque handle */
+typedef struct Qwen3Tts Qwen3Tts;
+
+/* Generation parameters */
+typedef struct Qwen3TtsParams {
+    int32_t max_audio_tokens; /* default: 4096 */
+    float temperature; /* default: 0.9, 0=greedy */
+    float top_p; /* default: 1.0 */
+    int32_t top_k; /* default: 50, 0=disabled */
+    int32_t n_threads; /* default: 4 */
+    float repetition_penalty; /* default: 1.05 */
+    int32_t language_id; /* 2050=en, 2058=ja, 2055=zh, etc. */
+} Qwen3TtsParams;
+
+/* Generated audio result */
+typedef struct Qwen3TtsAudio {
+    const float* samples; /* PCM float32 mono */
+    int32_t n_samples;
+    int32_t sample_rate; /* always 24000 */
+} Qwen3TtsAudio;
+
+/* Fill params with defaults */
+void qwen3_tts_default_params(Qwen3TtsParams* params);
+
+/* Create TTS engine and load models from directory.
+ * model_dir must contain qwen3-tts-tokenizer-f16.gguf plus either
+ * qwen3-tts-0.6b-q8_0.gguf (preferred when present) or
+ * qwen3-tts-0.6b-f16.gguf.
+ * n_threads is currently ignored; set Qwen3TtsParams.n_threads instead.
+ * Returns NULL on failure. */
+Qwen3Tts* qwen3_tts_create(const char* model_dir, int32_t n_threads);
+
+/* Check if models are loaded */
+int qwen3_tts_is_loaded(const Qwen3Tts* tts);
+
+/* Synthesize text to audio. Returns NULL on failure.
+ * Caller must free with qwen3_tts_free_audio(). */
+Qwen3TtsAudio* qwen3_tts_synthesize(
+    Qwen3Tts* tts,
+    const char* text,
+    const Qwen3TtsParams* params);
+
+/* Get sample rate (always 24000) */
+int32_t qwen3_tts_sample_rate(const Qwen3Tts* tts);
+
+/* Free generated audio */
+void qwen3_tts_free_audio(Qwen3TtsAudio* audio);
+
+/* Destroy TTS engine */
+void qwen3_tts_destroy(Qwen3Tts* tts);
+
+/* Synthesize with voice cloning from WAV file.
+ * reference_audio_path: path to reference WAV (24kHz mono recommended).
+ * Returns NULL on failure. Caller must free with qwen3_tts_free_audio(). */
+Qwen3TtsAudio* qwen3_tts_synthesize_with_voice_file(
+    Qwen3Tts* tts,
+    const char* text,
+    const char* reference_audio_path,
+    const Qwen3TtsParams* params);
+
+/* Synthesize with voice cloning from raw samples.
+ * ref_samples: 24kHz mono float32 normalized to [-1, 1].
+ * Returns NULL on failure. Caller must free with qwen3_tts_free_audio(). */
+Qwen3TtsAudio* qwen3_tts_synthesize_with_voice_samples(
+    Qwen3Tts* tts,
+    const char* text,
+    const float* ref_samples,
+    int32_t n_ref_samples,
+    const Qwen3TtsParams* params);
+
+/* Get last error message (or empty string). The pointer is owned by the
+ * handle and stays valid only until the next call on that handle. */
+const char* qwen3_tts_get_error(const Qwen3Tts* tts);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* QWEN3TTS_C_API_H */