diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 633be55f5..488cdbfd9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -35,6 +35,11 @@ jobs: run: | python -m pip install pytest python -m pip install -e . + python -m pip install -r requirements-dev.txt + + - name: Build documentation + run: | + mkdocs build --strict - name: Run Python tests (TTS) run: | diff --git a/README.md b/README.md index 821074c3d..7856da7ec 100644 --- a/README.md +++ b/README.md @@ -217,7 +217,16 @@ mx.save_safetensors("./8bit/kokoro-v1_0.safetensors", weights, metadata={"format - For the web interface and API: - FastAPI - Uvicorn - + +## Documentation + +To build the documentation locally: + +```bash +pip install -r requirements-dev.txt +mkdocs build --strict +``` + ## License [MIT License](LICENSE) @@ -227,3 +236,4 @@ mx.save_safetensors("./8bit/kokoro-v1_0.safetensors", weights, metadata={"format - Thanks to the Apple MLX team for providing a great framework for building TTS and STS models. - This project uses the Kokoro model architecture for text-to-speech synthesis. - The 3D visualization uses Three.js for rendering. + diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 000000000..c7175dd0a --- /dev/null +++ b/docs/api.md @@ -0,0 +1,8 @@ +# API Overview + +MLX-Audio exposes several modules for generating speech and running a web server. + +- `mlx_audio.tts.generate` – command line entry point and Python functions for TTS generation. +- `mlx_audio.server` – launch the interactive web interface and REST API. + +For full details see the source code and docstrings. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 000000000..c5c8f7c9f --- /dev/null +++ b/docs/index.md @@ -0,0 +1,13 @@ +# MLX-Audio Documentation + +Welcome to **MLX-Audio**, a text-to-speech (TTS) and speech-to-speech (STS) library built on Apple's MLX framework. 
+ +## Features + +- Fast inference on Apple Silicon +- Multiple language and voice options +- Adjustable speaking speed from 0.5x to 2.0x +- Interactive web interface with 3D visualization +- REST API for TTS generation +- Quantization support for optimized performance + diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 000000000..3cf53963c --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,33 @@ +# Usage + +## Installation + +```bash +pip install mlx-audio + +# For web interface and API dependencies (run from a clone of the repository) +pip install -r requirements.txt +``` + +## Command Line Example + +```bash +# Basic usage +mlx_audio.tts.generate --text "Hello, world" + +# Specify a prefix for the output file +mlx_audio.tts.generate --text "Hello, world" --file_prefix hello + +# Adjust speaking speed +mlx_audio.tts.generate --text "Hello, world" --speed 1.4 +``` + +## Python Example + +```python +from mlx_audio.tts.generate import generate_audio + +text = "The MLX King lives. Let him cook!" +generate_audio(text=text) +``` + diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 000000000..62eb7aa2a --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,10 @@ +site_name: MLX-Audio +nav: + - Home: index.md + - Usage: usage.md + - API: api.md +markdown_extensions: + - toc + - tables +plugins: + - search diff --git a/mlx_audio/tts/models/base.py b/mlx_audio/tts/models/base.py index 10bc444cd..1959845a9 100644 --- a/mlx_audio/tts/models/base.py +++ b/mlx_audio/tts/models/base.py @@ -75,7 +75,6 @@ class GenerationResult: sample_rate: int segment_idx: int token_count: int - audio_samples: int audio_duration: str real_time_factor: float prompt: dict diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 000000000..1e6b80d6f --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +mkdocs>=1.6.0 +mkdocs-material>=9.5.13 + diff --git a/requirements.txt b/requirements.txt index 23ac2ac67..42f8b0dbc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,4 +23,4 @@ 
einops==0.8.1 einx==0.3.0 fastrtc[vad, stt] webrtcvad>=2.0.10 -dacite>=1.9.2 \ No newline at end of file +dacite>=1.9.2