parakeet : got up to pre_conv_6_relu working

danbev · danbev · commit 7de9732014a2 · 2026-03-19T07:12:36.000+01:00
diff --git a/src/parakeet.cpp b/src/parakeet.cpp
@@ -1462,24 +1462,34 @@ static bool parakeet_model_load(struct parakeet_model_loader * loader, parakeet_
     // Encoder pre_encode
     const int n_subsampling_channels = hparams.n_subsampling_channels;
     model.enc_pre_out_w = create_tensor(PARAKEET_TENSOR_ENC_PRE_OUT_WEIGHT, ggml_new_tensor_2d(ctx, wtype, 4096, n_audio_state));
+    ggml_set_name(model.enc_pre_out_w, "enc_pre_out_w");
     model.enc_pre_out_b = create_tensor(PARAKEET_TENSOR_ENC_PRE_OUT_BIAS, ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state));
+    ggml_set_name(model.enc_pre_out_b, "enc_pre_out_b");
 
     model.enc_pre_conv_0_w = create_tensor(PARAKEET_TENSOR_ENC_PRE_CONV_0_WEIGHT, ggml_new_tensor_4d(ctx, vtype, 3, 3, 1, n_subsampling_channels));
     ggml_set_name(model.enc_pre_conv_0_w, "enc_pre_conv_0_w");
     model.enc_pre_conv_0_b = create_tensor(PARAKEET_TENSOR_ENC_PRE_CONV_0_BIAS, ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, 1, n_subsampling_channels, 1));
     ggml_set_name(model.enc_pre_conv_0_b, "enc_pre_conv_0_b");
 
     model.enc_pre_conv_2_w = create_tensor(PARAKEET_TENSOR_ENC_PRE_CONV_2_WEIGHT, ggml_new_tensor_4d(ctx, vtype, 3, 3, 1, n_subsampling_channels));
+    ggml_set_name(model.enc_pre_conv_2_w, "enc_pre_conv_2_w");
     model.enc_pre_conv_2_b = create_tensor(PARAKEET_TENSOR_ENC_PRE_CONV_2_BIAS, ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, 1, n_subsampling_channels, 1));
+    ggml_set_name(model.enc_pre_conv_2_b, "enc_pre_conv_2_b");
 
     model.enc_pre_conv_3_w = create_tensor(PARAKEET_TENSOR_ENC_PRE_CONV_3_WEIGHT, ggml_new_tensor_4d(ctx, wtype, 1, 1, n_subsampling_channels, n_subsampling_channels));
+    ggml_set_name(model.enc_pre_conv_3_w, "enc_pre_conv_3_w");
     model.enc_pre_conv_3_b = create_tensor(PARAKEET_TENSOR_ENC_PRE_CONV_3_BIAS, ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, 1, n_subsampling_channels, 1));
+    ggml_set_name(model.enc_pre_conv_3_b, "enc_pre_conv_3_b");
 
     model.enc_pre_conv_5_w = create_tensor(PARAKEET_TENSOR_ENC_PRE_CONV_5_WEIGHT, ggml_new_tensor_4d(ctx, vtype, 3, 3, 1, n_subsampling_channels));
+    ggml_set_name(model.enc_pre_conv_5_w, "enc_pre_conv_5_w");
     model.enc_pre_conv_5_b = create_tensor(PARAKEET_TENSOR_ENC_PRE_CONV_5_BIAS, ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, 1, n_subsampling_channels, 1));
+    ggml_set_name(model.enc_pre_conv_5_b, "enc_pre_conv_5_b");
 
     model.enc_pre_conv_6_w = create_tensor(PARAKEET_TENSOR_ENC_PRE_CONV_6_WEIGHT, ggml_new_tensor_4d(ctx, wtype, 1, 1, n_subsampling_channels, n_subsampling_channels));
+    ggml_set_name(model.enc_pre_conv_6_w, "enc_pre_conv_6_w");
     model.enc_pre_conv_6_b = create_tensor(PARAKEET_TENSOR_ENC_PRE_CONV_6_BIAS, ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 1, 1, n_subsampling_channels, 1));
+    ggml_set_name(model.enc_pre_conv_6_b, "enc_pre_conv_6_b");
 
     // Encoder layers
     for (int i = 0; i < n_audio_layer; ++i) {
@@ -1722,69 +1732,64 @@ static struct ggml_cgraph * parakeet_build_graph_conv(parakeet_context & pctx, p
     struct ggml_context * ctx0 = ggml_init(params);
     ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    // [n_time, n_mels] {1500, 128, 1, 1}
-    struct ggml_tensor * mel = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_time, n_mels, 1, 1);
+    // [freq, time]
+    struct ggml_tensor * mel = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_mels, n_time, 1, 1);
     ggml_set_name(mel, "mel");
     ggml_set_input(mel);
     ggml_set_output(mel);
 
-    struct ggml_tensor * cur = mel;
-    //ggml_set_name(cur, "input_to_conv_0");
-    //ggml_set_output(cur);
-
-    // enc_pre_conv_0_w: {3, 3, 1, 256}
-    // {1500, 128, 1, 1} -> {750, 64, 256, 1}
-    cur = ggml_conv_2d(ctx0, model.enc_pre_conv_0_w, cur, 2, 2, 1, 1, 1, 1);
-    ggml_set_name(cur, "pre_conv_0");
-    ggml_set_output(cur);
-
+    // [freq, time, channels, batch]
+    struct ggml_tensor * cur = ggml_conv_2d(ctx0, model.enc_pre_conv_0_w, mel, 2, 2, 1, 1, 1, 1);
     cur = ggml_add(ctx0, cur, model.enc_pre_conv_0_b);
-    ggml_set_name(cur, "pre_conv_0_bias");
+    ggml_set_name(cur, "pre_conv_0");
     ggml_set_output(cur);
 
     cur = ggml_relu(ctx0, cur);
     ggml_set_name(cur, "pre_conv_0_relu");
     ggml_set_output(cur);
 
-    // enc_pre_conv_2_w: {3, 3, 1, 256}
-    // {750, 64, 256, 1} -> {375, 32, 256, 1}
+    // enc_pre_conv_2_w: {3, 3, 1, 256} (depthwise)
+    // [freq, time, channels, batch]
     cur = ggml_conv_2d_dw_direct(ctx0, model.enc_pre_conv_2_w, cur, 2, 2, 1, 1, 1, 1);
-    ggml_set_name(cur, "pre_conv_2");
     cur = ggml_add(ctx0, cur, model.enc_pre_conv_2_b);
+    ggml_set_output(cur);
+    ggml_set_name(cur, "pre_conv_2");
 
-    // enc_pre_conv: {1, 1, 256, 256}
-    // {375, 32, 256, 1} -> {375, 32, 256, 1}
+    // enc_pre_conv_3_w: {1, 1, 256, 256} (pointwise)
+    // [freq, time, channels, batch]
     cur = ggml_conv_2d(ctx0, model.enc_pre_conv_3_w, cur, 1, 1, 0, 0, 1, 1);
-    ggml_set_name(cur, "pre_conv_3");
     cur = ggml_add(ctx0, cur, model.enc_pre_conv_3_b);
+    ggml_set_name(cur, "pre_conv_3");
+    ggml_set_output(cur);
 
     cur = ggml_relu(ctx0, cur);
+    ggml_set_output(cur);
     ggml_set_name(cur, "pre_conv_3_relu");
 
-    // enc_pre_conv_5_w: {3, 3, 1, 256}
-    // {375, 32, 256, 1} -> {188, 16, 256, 1}
+    // enc_pre_conv_5_w: {3, 3, 1, 256} (depthwise)
+    // [freq, time, channels, batch]
     cur = ggml_conv_2d_dw_direct(ctx0, model.enc_pre_conv_5_w, cur, 2, 2, 1, 1, 1, 1);
-    ggml_set_name(cur, "pre_conv_5");
+    ggml_set_name(cur, "pre_conv_5_direct");
+    ggml_set_output(cur);
     cur = ggml_add(ctx0, cur, model.enc_pre_conv_5_b);
+    ggml_set_name(cur, "pre_conv_5");
+    ggml_set_output(cur);
 
-    // enc_pre_conv_6_w: {1, 1, 256, 256}
-    // {188, 16, 256, 1} -> {188, 16, 256, 1}
+    // enc_pre_conv_6_w: {1, 1, 256, 256} (pointwise)
+    // [freq, time, channels, batch]
     cur = ggml_conv_2d(ctx0, model.enc_pre_conv_6_w, cur, 1, 1, 0, 0, 1, 1);
-    ggml_set_name(cur, "pre_conv_6");
     cur = ggml_add(ctx0, cur, model.enc_pre_conv_6_b);
+    ggml_set_output(cur);
+    ggml_set_name(cur, "pre_conv_6");
 
     cur = ggml_relu(ctx0, cur);
     ggml_set_name(cur, "pre_conv_6_relu");
+    ggml_set_output(cur);
 
-    const int n_frames = cur->ne[0]; // 188
-    const int n_freq   = cur->ne[1]; // 16
+    const int n_freq   = cur->ne[0]; // 16
+    const int n_frames = cur->ne[1]; // 188
     const int n_chan   = cur->ne[2]; // 256
 
-    // {n_frames, n_freq, n_chan, batch} -> {n_chan, n_frames, n_freq, batch}
-    // {188, 16, 256, 1} -> {256, 188, 16, 1}
-    cur = ggml_permute(ctx0, cur, 1, 2, 0, 3);
-    cur = ggml_cont(ctx0, cur);
-
     // {n_chan, n_frames, n_freq, batch} -> {d_feat (n_freq * n_chan), n_frames}
     // {256, 188, 16, 1} -> {4096, 188, 1, 1}
     cur = ggml_reshape_2d(ctx0, cur, n_freq * n_chan, n_frames);
@@ -2077,7 +2082,7 @@ static bool parakeet_encode_internal(
 
             for (int j = 0; j < mel_inp.n_mel; ++j) {
                 for (int i = i0; i < i1; ++i) {
-                    dst[j*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
+                    dst[(i - i0) * mel_inp.n_mel + j] = mel_inp.data[j * mel_inp.n_len + i]; // i is absolute (starts at i0): rebase dst only, transpose to [n_mel fastest, frame]
                 }
             }
 
@@ -2090,11 +2095,38 @@ static bool parakeet_encode_internal(
 
         // TODO: remove after debugging
         {
-            //parakeet_print_tensor_gf(pctx.model.enc_pre_conv_0_w, sched, gf, 10);
-            // Comparing bias as that is what the pytorch conv layer does.
-            parakeet_print_tensor_gf(pctx.model.enc_pre_conv_0_b, sched, gf, 10);
             parakeet_print_tensor_gf("mel", sched, gf, 10);
-            parakeet_print_tensor_gf("pre_conv_0_bias", sched, gf, 10);
+            // pre_conv_0 is the conv output after the bias has been added:
+            //parakeet_print_tensor_gf(pctx.model.enc_pre_conv_0_w, sched, gf, 10);
+            //parakeet_print_tensor_gf(pctx.model.enc_pre_conv_0_b, sched, gf, 10);
+            parakeet_print_tensor_gf("pre_conv_0", sched, gf, 10);
+
+            parakeet_print_tensor_gf("pre_conv_0_relu", sched, gf, 10);
+
+            //parakeet_print_tensor_gf(pctx.model.enc_pre_conv_2_w, sched, gf, 10);
+            //parakeet_print_tensor_gf(pctx.model.enc_pre_conv_2_b, sched, gf, 10);
+            parakeet_print_tensor_gf("pre_conv_2", sched, gf, 10);
+
+            //parakeet_print_tensor_gf(pctx.model.enc_pre_conv_3_w, sched, gf, 10);
+            //parakeet_print_tensor_gf(pctx.model.enc_pre_conv_3_b, sched, gf, 10);
+            parakeet_print_tensor_gf("pre_conv_3", sched, gf, 10);
+
+            parakeet_print_tensor_gf("pre_conv_3_relu", sched, gf, 10);
+
+            //parakeet_print_tensor_gf(pctx.model.enc_pre_conv_5_w, sched, gf, 10);
+            //parakeet_print_tensor_gf(pctx.model.enc_pre_conv_5_b, sched, gf, 10);
+            parakeet_print_tensor_gf("pre_conv_5", sched, gf, 10);
+
+            //parakeet_print_tensor_gf(pctx.model.enc_pre_conv_6_w, sched, gf, 10);
+            //parakeet_print_tensor_gf(pctx.model.enc_pre_conv_6_b, sched, gf, 10);
+            parakeet_print_tensor_gf("pre_conv_6", sched, gf, 10);
+
+            parakeet_print_tensor_gf("pre_conv_6_relu", sched, gf, 10);
+
+            parakeet_print_tensor_gf("embd_conv", sched, gf, 10);
+
+            // compare final output:
+            //parakeet_print_tensor_gf(pstate.embd_conv, sched, gf, 10);
         }
 
     }
@@ -2634,13 +2666,32 @@ static bool log_mel_spectrogram(
     {
         std::vector<std::thread> workers(n_threads - 1);
         for (int iw = 0; iw < n_threads - 1; ++iw) {
-            workers[iw] = std::thread(
-                    log_mel_spectrogram_worker_thread, iw + 1, window_func, window_size, std::cref(samples_padded),
-                    samples_padded.size(), frame_size, frame_step, n_threads,
-                    std::cref(filters), std::ref(mel), std::cref(cache));
+            workers[iw] = std::thread(log_mel_spectrogram_worker_thread,
+                    iw + 1,                      // thread index
+                    window_func,
+                    window_size,
+                    std::cref(samples_padded),
+                    samples_padded.size(),
+                    frame_size,
+                    frame_step,
+                    n_threads,
+                    std::cref(filters),
+                    std::ref(mel),
+                    std::cref(cache));
         }
 
-        log_mel_spectrogram_worker_thread(0, window_func, window_size, samples_padded, samples_padded.size(), frame_size, frame_step, n_threads, filters, mel, cache);
+        log_mel_spectrogram_worker_thread(
+                0,
+                window_func,
+                window_size,
+                samples_padded,
+                samples_padded.size(),
+                frame_size,
+                frame_step,
+                n_threads,
+                filters,
+                mel,
+                cache);
 
         for (int iw = 0; iw < n_threads - 1; ++iw) {
             workers[iw].join();
@@ -3027,7 +3078,18 @@ void parakeet_free_params(struct parakeet_full_params * params) {
 }
 
 int parakeet_pcm_to_mel_with_state(struct parakeet_context * ctx, struct parakeet_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, PARAKEET_SAMPLE_RATE, ctx->model.hparams.n_fft, PARAKEET_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel, ctx->mel_cache)) {
+    if (!log_mel_spectrogram(*state,
+                samples,
+                n_samples,
+                PARAKEET_SAMPLE_RATE,
+                ctx->model.hparams.n_fft,
+                PARAKEET_HOP_LENGTH,
+                ctx->model.filters.n_mel,
+                n_threads,
+                ctx->model.filters,
+                false,                        // debug
+                state->mel,
+                ctx->mel_cache)) {
         PARAKEET_LOG_ERROR("%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
diff --git a/test-parakeet.sh b/test-parakeet.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+# Build the test-parakeet target in the debug build tree and run it under gdb.
+set -e
+
+build_dir=build-debug
+cmd=test-parakeet
+bin="${build_dir}/bin/${cmd}"
+cmake --build "$build_dir" --target "$cmd" -j 12
+
+#ctest -R "^${cmd}$" --test-dir "$build_dir" --output-on-failure -VV
+echo "running ${bin} with gdb"
+gdb --args "$bin"
+#"$bin"