danbev
diff --git a/‎CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎include/parakeet.h‎
Lines changed: 455 additions & 0 deletions b/‎include/parakeet.h‎
Lines changed: 455 additions & 0 deletions
diff --git a/‎models/__pycache__/convert-parakeet-to-ggml.cpython-312.pyc‎
11.8 KB b/‎models/__pycache__/convert-parakeet-to-ggml.cpython-312.pyc‎
11.8 KB
diff --git a/‎models/convert-parakeet-to-ggml.py‎
Lines changed: 375 additions & 0 deletions b/‎models/convert-parakeet-to-ggml.py‎
Lines changed: 375 additions & 0 deletions
diff --git a/‎models/requirements-parakeet.txt‎
Lines changed: 1 addition & 0 deletions b/‎models/requirements-parakeet.txt‎
Lines changed: 1 addition & 0 deletions
@@ -185,6 +185,10 @@ target_compile_definitions(whisper PRIVATE
     WHISPER_VERSION="${PROJECT_VERSION}"
 )
 
+# Define PARAKEET_VERSION (set to the project version) when compiling the
+# parakeet target's own sources; PRIVATE keeps it from propagating to targets
+# that link against parakeet.
+target_compile_definitions(parakeet PRIVATE
+    PARAKEET_VERSION="${PROJECT_VERSION}"
+)
+
 configure_package_config_file(
         ${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in
         ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
 
@@ -0,0 +1,375 @@
+#!/usr/bin/env python3
+# Convert Parakeet TDT model from NeMo format to ggml format
+#
+# Usage: python convert-parakeet-to-ggml.py --model parakeet-model.nemo --out-dir output-dir [--use-f32] [--out-name file-name]
+#
+# The NeMo file is a tar archive containing:
+#   - model_weights.ckpt (PyTorch checkpoint)
+#   - model_config.yaml (model configuration)
+#   - tokenizer files (BPE tokenizer)
+#
+# This script extracts the NeMo archive, loads the model weights and configuration,
+# and saves them in ggml format compatible with whisper.cpp.
+#
+
+import torch
+import argparse
+import io
+import os
+import sys
+import struct
+import tarfile
+import tempfile
+import shutil
+import yaml
+import numpy as np
+from pathlib import Path
+from typing import Optional
+
+def hz_to_mel(freq):
+    """Map a frequency in Hz (scalar or NumPy array) onto the mel scale.
+
+    Uses mel = 2595 * log10(1 + f / 700).
+    """
+    scaled = freq / 700.0
+    return 2595.0 * np.log10(1.0 + scaled)
+
+def mel_to_hz(mel):
+    """Map a mel-scale value (scalar or NumPy array) back to Hz.
+
+    Inverse of hz_to_mel: f = 700 * (10 ** (mel / 2595) - 1).
+    """
+    exponent = mel / 2595.0
+    return 700.0 * (10.0**exponent - 1.0)
+
+def create_mel_filterbank(
+    sample_rate: int = 16000,
+    n_fft: int = 512,
+    n_mels: int = 128,
+    fmin: float = 0.0,
+    fmax: Optional[float] = None
+) -> np.ndarray:
+    """
+    Build a bank of triangular mel filters over the one-sided FFT bins.
+
+    Band edges are spaced evenly on the mel scale (via hz_to_mel / mel_to_hz),
+    rounded down to integer FFT bin indices, and every triangle is scaled by
+    2 / (upper_edge_hz - lower_edge_hz).
+
+    Note: nothing in this script calls this function; the converter writes the
+    filterbank tensor stored in the checkpoint instead.
+
+    Args:
+        sample_rate: Audio sample rate (Hz)
+        n_fft: FFT size
+        n_mels: Number of mel bands
+        fmin: Minimum frequency (Hz)
+        fmax: Maximum frequency (Hz), defaults to sample_rate/2
+
+    Returns:
+        float32 mel filterbank matrix of shape (n_mels, n_fft//2 + 1)
+    """
+    if fmax is None:
+        fmax = float(sample_rate / 2)
+
+    # Number of one-sided FFT frequency bins
+    n_freqs = n_fft // 2 + 1
+
+    # n_mels triangles need n_mels + 2 edge points, because each filter shares
+    # its left/right edges with the centers of its neighbours.
+    mel_points = np.linspace(hz_to_mel(fmin), hz_to_mel(fmax), n_mels + 2)
+    hz_points = mel_to_hz(mel_points)
+
+    # Convert edge frequencies to FFT bin indices (rounded down)
+    bin_points = np.floor((n_fft + 1) * hz_points / sample_rate).astype(int)
+
+    filterbank = np.zeros((n_mels, n_freqs))
+
+    for m in range(n_mels):
+        left, center, right = bin_points[m], bin_points[m + 1], bin_points[m + 2]
+
+        # Rising slope: 0 at `left`, approaching 1 just before `center`.
+        # When center == left the range is empty, so no division by zero occurs.
+        for k in range(left, center):
+            filterbank[m, k] = (k - left) / (center - left)
+
+        # Falling slope: 1 at `center`, approaching 0 just before `right`.
+        # Likewise empty (and therefore safe) when right == center.
+        for k in range(center, right):
+            filterbank[m, k] = (right - k) / (right - center)
+
+    # Scale each filter by 2 / bandwidth-in-Hz (area normalization, like librosa)
+    enorm = 2.0 / (hz_points[2:n_mels + 2] - hz_points[:n_mels])
+    filterbank *= enorm[:, np.newaxis]
+
+    return filterbank.astype(np.float32)
+
+def extract_nemo_archive(nemo_path, extract_dir):
+    """Extract a .nemo tar archive into extract_dir.
+
+    The archive is treated as untrusted input: members are not allowed to land
+    outside extract_dir.
+
+    Args:
+        nemo_path: Path to the .nemo (tar) archive.
+        extract_dir: Existing directory that receives the archive contents.
+
+    Raises:
+        tarfile.TarError: If the archive cannot be read, or (when extraction
+            filters are available) a member is rejected by the 'data' filter.
+        ValueError: On Python versions without extraction filters, if a
+            member's path resolves outside extract_dir.
+    """
+    print(f"Extracting {nemo_path} to {extract_dir}")
+    with tarfile.open(nemo_path, 'r') as tar:
+        if hasattr(tarfile, 'data_filter'):
+            # The stdlib 'data' filter rejects absolute paths, '..' escapes,
+            # links pointing outside the destination and special files.
+            tar.extractall(path=extract_dir, filter='data')
+        else:
+            # Older Python without extraction filters: at least make sure no
+            # member path escapes the destination before extracting anything.
+            root = os.path.realpath(extract_dir)
+            for member in tar.getmembers():
+                target = os.path.realpath(os.path.join(root, member.name))
+                if os.path.commonpath([root, target]) != root:
+                    raise ValueError(
+                        f"Refusing to extract {member.name!r}: path escapes {extract_dir}"
+                    )
+            tar.extractall(path=extract_dir)
+    print("Extraction complete")
+
+def load_model_config(config_path):
+    """Parse the model configuration YAML and return the resulting object.
+
+    Args:
+        config_path: Path to the YAML file (model_config.yaml in the archive).
+
+    Returns:
+        Whatever yaml.safe_load produces for the document (the converter
+        indexes it like a nested dict).
+    """
+    # Decode explicitly as UTF-8 instead of the platform default encoding, so
+    # the result does not depend on the locale (the vocab file is read the
+    # same way).
+    with open(config_path, 'r', encoding='utf-8') as f:
+        return yaml.safe_load(f)
+
+def load_tokenizer(extract_dir, config):
+    """Load the tokenizer vocabulary from an extracted NeMo archive.
+
+    Looks in extract_dir for a '*_tokenizer.model' file and a
+    '*tokenizer.vocab' file. Both must exist, although only the .vocab file
+    (one 'token<TAB>score' entry per line) is actually read. The zero-based
+    line number of each entry is its token id.
+
+    Args:
+        extract_dir: Directory containing the extracted archive files.
+        config: Parsed model configuration. If it has an integer
+            config['decoder']['vocab_size'], that value is the token count a
+            warning is checked against; otherwise 8192 is assumed.
+
+    Returns:
+        Dict mapping each token's UTF-8 bytes to its id.
+
+    Raises:
+        FileNotFoundError: If the tokenizer model or vocab file is missing.
+    """
+    tokenizer_model_path = None
+    tokenizer_vocab_path = None
+
+    # os.listdir order is arbitrary; sort so the pick is deterministic when
+    # several files match.
+    for file_name in sorted(os.listdir(extract_dir)):
+        if file_name.endswith('_tokenizer.model'):
+            tokenizer_model_path = os.path.join(extract_dir, file_name)
+        elif file_name.endswith('tokenizer.vocab'):
+            tokenizer_vocab_path = os.path.join(extract_dir, file_name)
+
+    if not tokenizer_model_path:
+        raise FileNotFoundError("Tokenizer model file not found")
+
+    if not tokenizer_vocab_path:
+        raise FileNotFoundError("Tokenizer vocab file not found")
+
+    tokens = {}
+    n_lines = 0
+    with open(tokenizer_vocab_path, 'r', encoding='utf-8') as f:
+        for idx, line in enumerate(f):
+            # Strip only the line terminator: a plain strip() would also eat
+            # whitespace that belongs to the token column (and the separating
+            # tab with it), turning a whitespace-only token into its score.
+            token = line.rstrip('\r\n').split('\t', 1)[0]
+            tokens[token.encode('utf-8')] = idx
+            n_lines = idx + 1
+
+    print(f"Loaded {len(tokens)} tokens from {os.path.basename(tokenizer_vocab_path)}")
+
+    if len(tokens) != n_lines:
+        # Identical tokens collapse into one dict entry (the last id wins), so
+        # ids written from this dict would no longer be contiguous.
+        print(f"WARNING: {n_lines - len(tokens)} duplicate token(s) in vocab file")
+
+    # Prefer the vocabulary size declared by the model config (the converter
+    # writes the same value as n_vocab); fall back to 8192 if it is absent.
+    expected = 8192
+    decoder_cfg = config.get('decoder') if isinstance(config, dict) else None
+    if isinstance(decoder_cfg, dict) and isinstance(decoder_cfg.get('vocab_size'), int):
+        expected = decoder_cfg['vocab_size']
+
+    if len(tokens) != expected:
+        print(f"WARNING: Expected {expected} tokens, got {len(tokens)}")
+
+    return tokens
+
+def convert_parakeet_to_ggml(nemo_path, output_dir, use_f16=True, out_name=None):
+    """Convert a Parakeet .nemo archive into a single ggml binary file.
+
+    The archive is extracted to a temporary directory, then the output file is
+    written in this exact order (every integer is a native C int, struct
+    format "i"):
+
+      1. magic number 0x67676d6c
+      2. 17 header ints (hyperparameters plus the f16 flag; see the writes below)
+      3. mel filterbank: n_mels, n_freqs, then the float32 matrix row by row
+      4. window function: length, then the float32 samples
+      5. vocabulary: token count, then (byte length, UTF-8 bytes) for each
+         token in ascending id order
+      6. every other state_dict tensor: n_dims, name length, ftype
+         (0 = f32, 1 = f16), the dims in reversed order, the name, the data
+
+    Args:
+        nemo_path: Path to the .nemo archive.
+        output_dir: Directory for the output file; created if missing.
+        use_f16: Store tensors as float16, except those with fewer than two
+            dims or with 'bias' or 'norm' in their name, which stay float32.
+        out_name: Output file name. Defaults to "ggml-model.bin" when use_f16
+            is true and "ggml-model-f32.bin" otherwise.
+
+    Raises:
+        ValueError: If the mel filterbank or window tensor cannot be found in
+            the checkpoint, or does not have the expected number of dims.
+    """
+    nemo_path = Path(nemo_path)
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Create temporary directory for extraction
+    with tempfile.TemporaryDirectory() as temp_dir:
+        extract_nemo_archive(nemo_path, temp_dir)
+
+        config_path = os.path.join(temp_dir, 'model_config.yaml')
+        config = load_model_config(config_path)
+
+        print("Model configuration:")
+        print(f"  Sample rate: {config['sample_rate']}")
+        print(f"  Encoder layers: {config['encoder']['n_layers']}")
+        print(f"  Encoder d_model: {config['encoder']['d_model']}")
+        print(f"  Mel features: {config['preprocessor']['features']}")
+
+        weights_path = os.path.join(temp_dir, 'model_weights.ckpt')
+        print(f"\nLoading model weights from {weights_path}")
+        # SECURITY NOTE: torch.load unpickles the checkpoint, and unpickling
+        # can execute arbitrary code. Only convert archives from a trusted
+        # source (or pass weights_only=True on PyTorch versions that have it).
+        checkpoint = torch.load(weights_path, map_location='cpu')
+
+        # The tensors are either nested under a 'state_dict' key or the
+        # checkpoint is itself the name -> tensor mapping.
+        if 'state_dict' in checkpoint:
+            state_dict = checkpoint['state_dict']
+        else:
+            state_dict = checkpoint
+
+        print(f"Loaded {len(state_dict)} tensors")
+
+        # Load tokenizer
+        print("\nLoading tokenizer...")
+        tokens = load_tokenizer(temp_dir, config)
+        print(f"Loaded {len(tokens)} tokens")
+
+        # Prepare hyperparameters for ggml format
+        # Map Parakeet config to Whisper-like structure
+        hparams = {
+            'n_audio_ctx': 1500,  # Parakeet uses different context, approximate
+            'n_audio_state': config['encoder']['d_model'],
+            'n_audio_head': config['encoder']['n_heads'],
+            'n_audio_layer': config['encoder']['n_layers'],
+            'n_text_ctx': 448,  # Placeholder - Parakeet TDT doesn't have decoder
+            'n_text_state': config.get('model_defaults', {}).get('pred_hidden', 640),
+            'n_text_head': 8,  # Placeholder
+            'n_text_layer': 0,  # No text decoder layers
+            'n_mels': config['preprocessor']['features'],
+            'n_fft': config['preprocessor']['n_fft'],
+            'subsampling_factor': config['encoder']['subsampling_factor'],
+            'n_subsampling_channels': config['encoder']['subsampling_conv_channels'],
+            'n_pos_max_len': config['encoder']['pos_emb_max_len'],
+
+            'n_pred_dim': config['decoder']['prednet']['pred_hidden'],
+            'n_pred_layers': config['decoder']['prednet']['pred_rnn_layers'],
+            'n_vocab': config['decoder']['vocab_size'],
+        }
+
+        print("\nGGML hyperparameters:")
+        for key, value in hparams.items():
+            print(f"  {key}: {value}")
+
+        # Create output file
+        if out_name:
+            fname_out = output_dir / out_name
+        else:
+            fname_out = output_dir / ("ggml-model-f32.bin" if not use_f16 else "ggml-model.bin")
+        print(f"\nWriting to {fname_out}")
+
+        with open(fname_out, 'wb') as fout:
+            # Write magic number
+            fout.write(struct.pack("i", 0x67676d6c))  # 'ggml' in hex
+
+            # Write hyperparameters. The order is part of the file format;
+            # the f16 flag sits between n_mels and n_fft.
+            fout.write(struct.pack("i", hparams['n_vocab']))
+            fout.write(struct.pack("i", hparams['n_audio_ctx']))
+            fout.write(struct.pack("i", hparams['n_audio_state']))
+            fout.write(struct.pack("i", hparams['n_audio_head']))
+            fout.write(struct.pack("i", hparams['n_audio_layer']))
+            fout.write(struct.pack("i", hparams['n_text_ctx']))
+            fout.write(struct.pack("i", hparams['n_text_state']))
+            fout.write(struct.pack("i", hparams['n_text_head']))
+            fout.write(struct.pack("i", hparams['n_text_layer']))
+            fout.write(struct.pack("i", hparams['n_mels']))
+            fout.write(struct.pack("i", 1 if use_f16 else 0))
+            fout.write(struct.pack("i", hparams['n_fft']))
+            fout.write(struct.pack("i", hparams['subsampling_factor']))
+            fout.write(struct.pack("i", hparams['n_subsampling_channels']))
+            fout.write(struct.pack("i", hparams['n_pos_max_len']))
+            fout.write(struct.pack("i", hparams['n_pred_dim']))
+            fout.write(struct.pack("i", hparams['n_pred_layers']))
+
+            # Extract mel filterbank from model: first tensor whose name
+            # mentions 'featurizer.fb' or (case-insensitively) 'filterbank'.
+            fb_key = next(
+                (key for key in state_dict
+                 if 'featurizer.fb' in key or 'filterbank' in key.lower()),
+                None,
+            )
+
+            if fb_key is None:
+                print("\nERROR: Mel filterbank not found in model!")
+                print("Expected tensor with 'featurizer.fb' or 'filterbank' in name")
+                print("\nAvailable preprocessor tensors:")
+                for key in sorted(state_dict):
+                    if 'preprocessor' in key or 'featurizer' in key:
+                        print(f"  {key}: {state_dict[key].shape}")
+                raise ValueError("Mel filterbank tensor not found in model")
+
+            print(f"\nUsing model's mel filterbank from: {fb_key}")
+            mel_filters = state_dict[fb_key].squeeze().numpy().astype(np.float32)
+            print(f"  Filterbank shape: {mel_filters.shape}")
+
+            if len(mel_filters.shape) != 2:
+                raise ValueError(f"Expected 2D filterbank, got shape {mel_filters.shape}")
+
+            n_mels, n_freqs = mel_filters.shape
+            fout.write(struct.pack("i", n_mels))      # n_mel
+            fout.write(struct.pack("i", n_freqs))     # n_fb (frequency bins)
+
+            # Write mel filterbank. tofile always emits C (row-major) order, so
+            # this produces the same bytes as packing each element row by row.
+            mel_filters.tofile(fout)
+
+            # Extract window function from model: first tensor whose name
+            # contains 'featurizer.window', or both 'preproc' and 'window'.
+            window_key = next(
+                (key for key in state_dict
+                 if 'featurizer.window' in key or ('preproc' in key and 'window' in key)),
+                None,
+            )
+
+            if window_key is None:
+                print("\nERROR: Window function not found in model!")
+                print("Expected tensor with 'featurizer.window' in name")
+                raise ValueError("Window function tensor not found in model")
+
+            print(f"\nUsing model's window function from: {window_key}")
+            window = state_dict[window_key].squeeze().numpy().astype(np.float32)
+            print(f"  Window shape: {window.shape}")
+
+            if len(window.shape) != 1:
+                raise ValueError(f"Expected 1D window, got shape {window.shape}")
+
+            n_window = window.shape[0]
+            fout.write(struct.pack("i", n_window))
+
+            # Write window function (raw float32 samples)
+            window.tofile(fout)
+
+            # Write vocabulary in ascending token-id order
+            fout.write(struct.pack("i", len(tokens)))
+            for token_bytes, _ in sorted(tokens.items(), key=lambda item: item[1]):
+                fout.write(struct.pack("i", len(token_bytes)))
+                fout.write(token_bytes)
+
+            print("\nConverting model weights...")
+            for name, tensor in state_dict.items():
+                # Skip the filterbank and window - already written in preprocessing section
+                if name == fb_key:
+                    print(f"Skipping {name} (already written as mel filterbank)")
+                    continue
+                if name == window_key:
+                    print(f"Skipping {name} (already written as window function)")
+                    continue
+
+                # Don't squeeze Conv2d weights - they need to preserve all 4 dimensions
+                if 'conv' in name and 'weight' in name and len(tensor.shape) == 4:
+                    data = tensor.numpy()
+                else:
+                    # NOTE(review): squeeze() drops every size-1 axis, so a
+                    # single-element tensor becomes 0-d and is written with
+                    # n_dims == 0 and no dims - confirm the loader expects that.
+                    data = tensor.squeeze().numpy()
+
+                # Reshape Conv2d bias from [out_channels] to [1, out_channels, 1, 1] for broadcasting
+                # This will be written reversed as [1, 1, out_channels, 1] in the file
+                # which matches ggml conv2d output layout [W, H, C, batch]
+                if 'pre_encode.conv' in name and 'bias' in name and len(data.shape) == 1:
+                    data = data.reshape(1, -1, 1, 1)
+                    print(f"  Reshaped conv bias {name} to {data.shape}")
+
+                n_dims = len(data.shape)
+
+                ftype = 1 if use_f16 else 0
+                if use_f16:
+                    # Keep some tensors in f32 for better accuracy
+                    if n_dims < 2 or 'bias' in name or 'norm' in name:
+                        data = data.astype(np.float32)
+                        ftype = 0
+                    else:
+                        data = data.astype(np.float16)
+                else:
+                    data = data.astype(np.float32)
+
+                # Dims are stored innermost-first, i.e. reversed relative to
+                # the NumPy shape.
+                dims_reversed = list(data.shape[::-1])
+                print(f"Processing: {name} {list(data.shape)}, dtype: {data.dtype}, n_dims: {n_dims}, reversed: {dims_reversed}")
+                name_bytes = name.encode('utf-8')
+                fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype))
+                for dim in dims_reversed:
+                    fout.write(struct.pack("i", dim))
+                fout.write(name_bytes)
+
+                data.tofile(fout)
+
+        print("\nConversion complete!")
+        print(f"Output file: {fname_out}")
+        print(f"File size: {fname_out.stat().st_size / (1024**2):.2f} MB")
+
+if __name__ == '__main__':
+    # Command-line entry point: parse the options, make sure the input archive
+    # exists, then run the conversion.
+    cli = argparse.ArgumentParser(
+        description='Convert Parakeet TDT model from NeMo format to ggml format'
+    )
+    cli.add_argument(
+        '--model', type=str, required=True,
+        help='Path to Parakeet .nemo model file',
+    )
+    cli.add_argument(
+        '--out-dir', type=str, required=True,
+        help='Directory to write ggml model file',
+    )
+    cli.add_argument(
+        '--use-f32', action='store_true', default=False,
+        help='Use f32 instead of f16 (default: f16)',
+    )
+    cli.add_argument(
+        '--out-name', type=str, default=None,
+        help='Output file name (default: ggml-model.bin or ggml-model-f32.bin)',
+    )
+
+    opts = cli.parse_args()
+
+    # Bail out with exit status 1 before doing any work if the input is missing.
+    if not os.path.exists(opts.model):
+        print(f"Error: {opts.model} not found")
+        sys.exit(1)
+
+    # f16 is the default; --use-f32 switches it off.
+    convert_parakeet_to_ggml(opts.model, opts.out_dir, not opts.use_f32, opts.out_name)
@@ -0,0 +1 @@
+pyyaml