Skip to content

Commit 2b30aca

Browse files
committed
feat(audio): add whisper audio transcription; fix bugs; update docs and CI
Audio transcription: - Add audio_model_transcribe() accepting TEXT (file path) or BLOB input - Auto-detect audio format from file extension and magic bytes (WAV, MP3, FLAC) - Convert audio to mono 16kHz PCM as required by Whisper - Support 15 transcription options (language, translate, n_threads, temperature, etc.) - Uncomment and register audio_model_load/audio_model_free functions - Add 8 new unit tests covering transcription, options, error handling, and lifecycle Bug fixes: - Fix strncasecmp prefix-matching bug: add KEY_MATCHES macro that checks both key length and case-insensitive match, replacing all 39 occurrences - Fix wrong variable in logger call: warn_buf was formatted but buffer was passed to ai_logger, logging raw option values instead of the warning message - Fix NULL dereference in role_normalize when role is NULL - Fix NULL dereference in llm_model_cls_label when label is NULL - Fix potential buffer overflow in prompt_len underflow (negative-to-size_t wraparound) - Fix uint16_t type mismatch in audio PCM functions: miniaudio writes unsigned int (4 bytes) for channels, but uint16_t (2 bytes) was used, causing stack corruption - Fix whisper params string lifetime: track default vs dynamically allocated strings to avoid sqlite3_free on static string literals - Fix PCM buffer free: use sqlite3_free instead of free for buffers allocated via miniaudio's custom sqlite3_malloc allocator - Remove redundant NULL assignment in buffer_destroy Submodule updates: - Update miniaudio from 0.11.22 to 0.11.25 - Update llama.cpp and whisper.cpp submodules CI (main.yml): - Add whisper model and audio test file download with caching - Rename download-model job to download-models - Cache whisper model (ggml-tiny.bin) and audio test WAV (jfk.wav) across runs Makefile: - Add whisper model and audio test file download targets - Pass --whisper-model and --audio arguments to unit test binary Documentation: - API.md: add llm_chat_system_prompt, audio_model_load, 
audio_model_free, audio_model_transcribe with full options table; fix version and typos - README.md: move Getting Started after Features; add examples for embedding generation, chat, and audio transcription; add Audio Transcription feature; update description to reflect shipped audio support Version bump: 0.7.59 → 0.9.0
1 parent 7de5728 commit 2b30aca

File tree

12 files changed

+1879
-420
lines changed

12 files changed

+1879
-420
lines changed

.github/workflows/main.yml

Lines changed: 82 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,44 +15,91 @@ env:
1515
GGUF_MODEL_DIR: tests/models/unsloth/gemma-3-270m-it-GGUF
1616
GGUF_MODEL_NAME: gemma-3-270m-it-UD-IQ2_M.gguf
1717
GGUF_MODEL_URL: https://huggingface.co/unsloth/gemma-3-270m-it-GGUF/resolve/main/gemma-3-270m-it-UD-IQ2_M.gguf
18+
WHISPER_MODEL_DIR: tests/models/ggerganov/whisper-tiny
19+
WHISPER_MODEL_NAME: ggml-tiny.bin
20+
WHISPER_MODEL_URL: https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin
21+
AUDIO_TEST_DIR: tests/audio
22+
AUDIO_TEST_WAV: tests/audio/jfk.wav
23+
AUDIO_TEST_WAV_URL: https://github.com/ggml-org/whisper.cpp/raw/master/samples/jfk.wav
1824

1925
jobs:
20-
download-model:
26+
download-models:
2127
outputs:
22-
cache-key: gguf-${{ steps.meta.outputs.hash }}
23-
model-path: ${{ env.GGUF_MODEL_DIR }}/${{ env.GGUF_MODEL_NAME }}
24-
name: Download GGUF model
28+
gguf-cache-key: gguf-${{ steps.meta.outputs.gguf-hash }}
29+
gguf-model-path: ${{ env.GGUF_MODEL_DIR }}/${{ env.GGUF_MODEL_NAME }}
30+
whisper-cache-key: whisper-${{ steps.meta.outputs.whisper-hash }}
31+
whisper-model-path: ${{ env.WHISPER_MODEL_DIR }}/${{ env.WHISPER_MODEL_NAME }}
32+
audio-cache-key: audio-${{ steps.meta.outputs.audio-hash }}
33+
audio-test-path: ${{ env.AUDIO_TEST_WAV }}
34+
name: Download models and test assets
2535
runs-on: ubuntu-22.04
2636
steps:
27-
- name: Compute model URL hash
37+
- name: Compute URL hashes
2838
id: meta
2939
run: |
3040
if command -v sha256sum >/dev/null 2>&1; then
31-
hash=$(echo -n "${{ env.GGUF_MODEL_URL }}" | sha256sum | cut -d' ' -f1)
41+
gguf_hash=$(echo -n "${{ env.GGUF_MODEL_URL }}" | sha256sum | cut -d' ' -f1)
42+
whisper_hash=$(echo -n "${{ env.WHISPER_MODEL_URL }}" | sha256sum | cut -d' ' -f1)
43+
audio_hash=$(echo -n "${{ env.AUDIO_TEST_WAV_URL }}" | sha256sum | cut -d' ' -f1)
3244
else
33-
hash=$(echo -n "${{ env.GGUF_MODEL_URL }}" | shasum -a 256 | cut -d' ' -f1)
45+
gguf_hash=$(echo -n "${{ env.GGUF_MODEL_URL }}" | shasum -a 256 | cut -d' ' -f1)
46+
whisper_hash=$(echo -n "${{ env.WHISPER_MODEL_URL }}" | shasum -a 256 | cut -d' ' -f1)
47+
audio_hash=$(echo -n "${{ env.AUDIO_TEST_WAV_URL }}" | shasum -a 256 | cut -d' ' -f1)
3448
fi
35-
echo "hash=$hash" >> "$GITHUB_OUTPUT"
49+
echo "gguf-hash=$gguf_hash" >> "$GITHUB_OUTPUT"
50+
echo "whisper-hash=$whisper_hash" >> "$GITHUB_OUTPUT"
51+
echo "audio-hash=$audio_hash" >> "$GITHUB_OUTPUT"
3652
37-
- name: Prepare model directory
38-
run: mkdir -p "${{ env.GGUF_MODEL_DIR }}"
53+
- name: Prepare directories
54+
run: |
55+
mkdir -p "${{ env.GGUF_MODEL_DIR }}"
56+
mkdir -p "${{ env.WHISPER_MODEL_DIR }}"
57+
mkdir -p "${{ env.AUDIO_TEST_DIR }}"
3958
4059
- name: Restore GGUF cache
41-
id: cache
60+
id: cache-gguf
4261
uses: actions/cache@v4
4362
with:
4463
path: ${{ env.GGUF_MODEL_DIR }}/${{ env.GGUF_MODEL_NAME }}
45-
key: gguf-${{ steps.meta.outputs.hash }}
64+
key: gguf-${{ steps.meta.outputs.gguf-hash }}
4665

4766
- name: Download GGUF model
48-
if: steps.cache.outputs.cache-hit != 'true'
49-
run: |
50-
curl -L --fail --retry 3 "${{ env.GGUF_MODEL_URL }}" -o "${{ env.GGUF_MODEL_DIR }}/${{ env.GGUF_MODEL_NAME }}"
67+
if: steps.cache-gguf.outputs.cache-hit != 'true'
68+
run: curl -L --fail --retry 3 "${{ env.GGUF_MODEL_URL }}" -o "${{ env.GGUF_MODEL_DIR }}/${{ env.GGUF_MODEL_NAME }}"
69+
5170
- name: Verify GGUF model
5271
run: test -f "${{ env.GGUF_MODEL_DIR }}/${{ env.GGUF_MODEL_NAME }}"
5372

73+
- name: Restore Whisper cache
74+
id: cache-whisper
75+
uses: actions/cache@v4
76+
with:
77+
path: ${{ env.WHISPER_MODEL_DIR }}/${{ env.WHISPER_MODEL_NAME }}
78+
key: whisper-${{ steps.meta.outputs.whisper-hash }}
79+
80+
- name: Download Whisper model
81+
if: steps.cache-whisper.outputs.cache-hit != 'true'
82+
run: curl -L --fail --retry 3 "${{ env.WHISPER_MODEL_URL }}" -o "${{ env.WHISPER_MODEL_DIR }}/${{ env.WHISPER_MODEL_NAME }}"
83+
84+
- name: Verify Whisper model
85+
run: test -f "${{ env.WHISPER_MODEL_DIR }}/${{ env.WHISPER_MODEL_NAME }}"
86+
87+
- name: Restore audio test file cache
88+
id: cache-audio
89+
uses: actions/cache@v4
90+
with:
91+
path: ${{ env.AUDIO_TEST_WAV }}
92+
key: audio-${{ steps.meta.outputs.audio-hash }}
93+
94+
- name: Download audio test file
95+
if: steps.cache-audio.outputs.cache-hit != 'true'
96+
run: curl -L --fail --retry 3 "${{ env.AUDIO_TEST_WAV_URL }}" -o "${{ env.AUDIO_TEST_WAV }}"
97+
98+
- name: Verify audio test file
99+
run: test -f "${{ env.AUDIO_TEST_WAV }}"
100+
54101
build:
55-
needs: download-model
102+
needs: download-models
56103
runs-on: ${{ matrix.os }}
57104
container: ${{ matrix.container && matrix.container || '' }}
58105
name: ${{ matrix.name }}${{ matrix.arch && format('-{0}', matrix.arch) || '' }} build${{ matrix.arch != 'arm64-v8a' && matrix.name != 'ios-sim' && matrix.name != 'ios' && matrix.name != 'apple-xcframework' && matrix.name != 'android-aar' && ( matrix.name != 'macos' || matrix.arch != 'x86_64' ) && ' + test' || ''}}
@@ -150,14 +197,29 @@ jobs:
150197
with:
151198
submodules: true
152199

153-
- name: Prepare GGUF model directory
154-
run: mkdir -p "${{ env.GGUF_MODEL_DIR }}"
200+
- name: Prepare test asset directories
201+
run: |
202+
mkdir -p "${{ env.GGUF_MODEL_DIR }}"
203+
mkdir -p "${{ env.WHISPER_MODEL_DIR }}"
204+
mkdir -p "${{ env.AUDIO_TEST_DIR }}"
155205
156206
- name: Restore GGUF cache
157207
uses: actions/cache@v4
158208
with:
159-
path: ${{ needs.download-model.outputs.model-path }}
160-
key: ${{ needs.download-model.outputs.cache-key }}
209+
path: ${{ needs.download-models.outputs.gguf-model-path }}
210+
key: ${{ needs.download-models.outputs.gguf-cache-key }}
211+
212+
- name: Restore Whisper cache
213+
uses: actions/cache@v4
214+
with:
215+
path: ${{ needs.download-models.outputs.whisper-model-path }}
216+
key: ${{ needs.download-models.outputs.whisper-cache-key }}
217+
218+
- name: Restore audio test file cache
219+
uses: actions/cache@v4
220+
with:
221+
path: ${{ needs.download-models.outputs.audio-test-path }}
222+
key: ${{ needs.download-models.outputs.audio-cache-key }}
161223

162224
- name: android setup java
163225
if: matrix.name == 'android-aar'

API.md

Lines changed: 108 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ Returns the current version of the SQLite-AI extension.
1616

1717
```sql
1818
SELECT ai_version();
19-
-- e.g., '0.5.1'
19+
-- e.g., '0.9.0'
2020
```
2121

2222
---
@@ -706,7 +706,7 @@ SELECT llm_chat_restore('b59e...');
706706

707707
**Description:**
708708
Generates a context-aware reply using chat memory, returned as a single, complete response.
709-
For a streams model reply, use the llm_chat virtual table.
709+
For a streaming model reply, use the llm_chat virtual table.
710710

711711
**Example:**
712712

@@ -716,6 +716,112 @@ SELECT llm_chat_respond('What are the most visited cities in Italy?');
716716

717717
---
718718

719+
## `llm_chat_system_prompt(text TEXT)`
720+
721+
**Returns:** `TEXT` or `NULL`
722+
723+
**Description:**
724+
Gets or sets the system prompt for chat sessions. When called without arguments, returns the current system prompt (or `NULL` if none is set). When called with a text argument, sets the system prompt and returns `NULL`. The system prompt is automatically prepended as a system-role message at the beginning of chat conversations.
725+
726+
**Example:**
727+
728+
```sql
729+
-- Set a system prompt
730+
SELECT llm_chat_system_prompt('You are a helpful assistant that speaks Italian.');
731+
732+
-- Get the current system prompt
733+
SELECT llm_chat_system_prompt();
734+
```
735+
736+
---
737+
738+
## Audio Functions
739+
740+
### `audio_model_load(path TEXT, options TEXT)`
741+
742+
**Returns:** `NULL`
743+
744+
**Description:**
745+
Loads a Whisper model from the specified file path. An optional second argument accepts comma-separated `key=value` pairs that configure the model. The loaded model is used for audio transcription via `audio_model_transcribe`. Only one Whisper model can be loaded at a time per connection.
746+
747+
**Example:**
748+
749+
```sql
750+
-- Load with defaults
751+
SELECT audio_model_load('./models/ggml-tiny.bin');
752+
753+
-- Load with options
754+
SELECT audio_model_load('./models/ggml-base.bin', 'gpu_layers=0');
755+
```
756+
757+
---
758+
759+
### `audio_model_free()`
760+
761+
**Returns:** `NULL`
762+
763+
**Description:**
764+
Unloads the current Whisper model and frees associated memory.
765+
766+
**Example:**
767+
768+
```sql
769+
SELECT audio_model_free();
770+
```
771+
772+
---
773+
774+
### `audio_model_transcribe(input TEXT/BLOB, options TEXT)`
775+
776+
**Returns:** `TEXT`
777+
778+
**Description:**
779+
Transcribes audio to text using the loaded Whisper model. The input can be either:
780+
- **TEXT**: A file path to an audio file (WAV, MP3, or FLAC)
781+
- **BLOB**: Raw audio data (format auto-detected from magic bytes)
782+
783+
An optional second parameter accepts comma-separated `key=value` pairs to configure transcription behavior (see the transcription options table below).
784+
785+
Supported audio formats: WAV, MP3, FLAC. Audio is automatically converted to mono 16kHz PCM as required by Whisper.
786+
787+
**Transcription options:**
788+
789+
| Key | Type | Default | Meaning |
790+
| ------------------ | -------- | ------- | ---------------------------------------------------------- |
791+
| `language` | `text` | `en` | Language code (e.g., `en`, `it`, `fr`, `de`). |
792+
| `translate` | `1 or 0` | `0` | Translate to English. |
793+
| `n_threads` | `number` | `4` | Number of threads for decoding. |
794+
| `offset_ms` | `number` | `0` | Start transcription at this offset (milliseconds). |
795+
| `duration_ms` | `number` | `0` | Transcribe only this duration (0 = full audio). |
796+
| `no_timestamps` | `1 or 0` | `0` | Suppress timestamps in output. |
797+
| `single_segment` | `1 or 0` | `0` | Force single segment output. |
798+
| `token_timestamps` | `1 or 0` | `0` | Enable token-level timestamps. |
799+
| `initial_prompt` | `text` | | Initial prompt to guide the model. |
800+
| `temperature` | `float` | `0.0` | Sampling temperature. |
801+
| `beam_size` | `number` | `-1` | Beam search size (-1 = use default). |
802+
| `audio_ctx` | `number` | `0` | Audio context size (0 = use default). |
803+
| `suppress_regex` | `text` | | Regex pattern for suppressing tokens. |
804+
| `max_len` | `number` | `0` | Maximum segment length in characters (0 = no limit). |
805+
| `print_timestamps` | `1 or 0` | `0` | Include timestamps in transcribed text. |
806+
807+
**Examples:**
808+
809+
```sql
810+
-- Transcribe from a file path
811+
SELECT audio_model_transcribe('./audio/speech.wav');
812+
813+
-- Transcribe from a BLOB column
814+
SELECT audio_model_transcribe(audio_data) FROM recordings WHERE id = 1;
815+
816+
-- Transcribe with options
817+
SELECT audio_model_transcribe('./audio/speech.mp3', 'language=it,translate=1');
818+
819+
-- Transcribe a single segment with no timestamps
820+
SELECT audio_model_transcribe('./audio/clip.flac', 'single_segment=1,no_timestamps=1');
821+
```
822+
823+
---
824+
719825
## Model Metadata
720826

721827
These functions return internal model properties:

Makefile

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,15 @@ GGUF_MODEL_DIR ?= tests/models/unsloth/gemma-3-270m-it-GGUF
4444
GGUF_MODEL_NAME ?= gemma-3-270m-it-UD-IQ2_M.gguf
4545
GGUF_MODEL_URL ?= https://huggingface.co/unsloth/gemma-3-270m-it-GGUF/resolve/main/gemma-3-270m-it-UD-IQ2_M.gguf
4646
GGUF_MODEL_PATH := $(GGUF_MODEL_DIR)/$(GGUF_MODEL_NAME)
47+
48+
WHISPER_MODEL_DIR ?= tests/models/ggerganov/whisper-tiny
49+
WHISPER_MODEL_NAME ?= ggml-tiny.bin
50+
WHISPER_MODEL_URL ?= https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin
51+
WHISPER_MODEL_PATH := $(WHISPER_MODEL_DIR)/$(WHISPER_MODEL_NAME)
52+
53+
AUDIO_TEST_DIR ?= tests/audio
54+
AUDIO_TEST_WAV ?= $(AUDIO_TEST_DIR)/jfk.wav
55+
AUDIO_TEST_WAV_URL ?= https://github.com/ggml-org/whisper.cpp/raw/master/samples/jfk.wav
4756
SKIP_UNITTEST ?= 0
4857

4958
# Compiler and flags
@@ -88,7 +97,8 @@ ifeq ($(PLATFORM),windows)
8897
STRIP = strip --strip-unneeded $@
8998
else ifeq ($(PLATFORM),macos)
9099
TARGET := $(DIST_DIR)/ai.dylib
91-
LLAMA_LIBS += $(BUILD_GGML)/lib/libggml-metal.a
100+
LLAMA_LIBS += $(BUILD_GGML)/lib/libggml-metal.a $(BUILD_GGML)/lib/libggml-blas.a
101+
LLAMA_LDFLAGS += $(L)ggml-blas$(A)
92102
ifndef ARCH
93103
LDFLAGS += -arch x86_64 -arch arm64
94104
CFLAGS += -arch x86_64 -arch arm64
@@ -102,7 +112,7 @@ else ifeq ($(PLATFORM),macos)
102112
WHISPER_OPTIONS += -DGGML_OPENMP=OFF -DCMAKE_OSX_ARCHITECTURES="$(ARCH)" -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
103113
MINIAUDIO_OPTIONS += -DCMAKE_OSX_ARCHITECTURES="$(ARCH)" -DCMAKE_OSX_DEPLOYMENT_TARGET=11.0
104114
endif
105-
LDFLAGS += -L./$(BUILD_GGML)/lib -lggml-metal -L./$(BUILD_GGML)/lib -framework Metal -framework Foundation -framework CoreFoundation -framework QuartzCore -dynamiclib -undefined dynamic_lookup -headerpad_max_install_names
115+
LDFLAGS += -L./$(BUILD_GGML)/lib -lggml-metal -L./$(BUILD_GGML)/lib -framework Accelerate -framework Metal -framework Foundation -framework CoreFoundation -framework QuartzCore -dynamiclib -undefined dynamic_lookup -headerpad_max_install_names
106116
STRIP = strip -x -S $@
107117
else ifeq ($(PLATFORM),android)
108118
ifndef ARCH # Set ARCH to find Android NDK's Clang compiler, the user should set the ARCH
@@ -235,16 +245,24 @@ $(GGUF_MODEL_PATH):
235245
@mkdir -p $(GGUF_MODEL_DIR)
236246
curl -L --fail --retry 3 -o $@ $(GGUF_MODEL_URL)
237247

248+
$(WHISPER_MODEL_PATH):
249+
@mkdir -p $(WHISPER_MODEL_DIR)
250+
curl -L --fail --retry 3 -o $@ $(WHISPER_MODEL_URL)
251+
252+
$(AUDIO_TEST_WAV):
253+
@mkdir -p $(AUDIO_TEST_DIR)
254+
curl -L --fail --retry 3 -o $@ $(AUDIO_TEST_WAV_URL)
255+
238256
TEST_DEPS := $(TARGET)
239257
ifeq ($(SKIP_UNITTEST),0)
240-
TEST_DEPS += $(CTEST_BIN) $(GGUF_MODEL_PATH)
258+
TEST_DEPS += $(CTEST_BIN) $(GGUF_MODEL_PATH) $(WHISPER_MODEL_PATH) $(AUDIO_TEST_WAV)
241259
endif
242260

243261
test: $(TEST_DEPS)
244262
@echo "Running sqlite3 CLI smoke test (ensures .load works)..."
245263
$(SQLITE3) ":memory:" -cmd ".bail on" ".load ./dist/ai" "SELECT ai_version();"
246264
ifeq ($(SKIP_UNITTEST),0)
247-
$(CTEST_BIN) --extension "$(TARGET)" --model "$(GGUF_MODEL_PATH)"
265+
$(CTEST_BIN) --extension "$(TARGET)" --model "$(GGUF_MODEL_PATH)" --whisper-model "$(WHISPER_MODEL_PATH)" --audio "$(AUDIO_TEST_WAV)"
248266
else
249267
@echo "Skipping C unit tests (SKIP_UNITTEST=$(SKIP_UNITTEST))."
250268
endif

0 commit comments

Comments
 (0)