vad : fix tensor dimensions for VAD operations

danbev · danbev · commit fab4ec8bb88b · 2025-04-10T10:51:55.000+02:00
diff --git a/src/whisper.cpp b/src/whisper.cpp
@@ -4477,10 +4477,11 @@ static ggml_backend_buffer_type_t select_weight_buft(const whisper_vad_hparams &
 
 static ggml_tensor * whisper_vad_build_stft_layer(ggml_context* ctx0,
         const whisper_vad_model & model, ggml_tensor * cur) {
-    // We need the stft tensor to be in {258, 1, 256},
-    // that is a kernel size of 258, 1 channel, and 256 frequency bins (output)
-    struct ggml_tensor * stft_reshaped = ggml_reshape_3d(ctx0, model.stft_forward_basis, 258, 1, 256);
-    cur = ggml_conv_1d(ctx0, stft_reshaped, cur, 1, 1, 1);
+    ggml_tensor* padded = ggml_pad(ctx0, cur, 64, 0, 0, 0);
+    // We need the stft tensor to be in {256, 1, 258}, which is presumably
+    // a kernel size of 256, 1 channel (input), and 258 outputs (the 258 seen in the STFT output shape) - TODO confirm.
+    struct ggml_tensor * stft_reshaped = ggml_reshape_3d(ctx0, model.stft_forward_basis, 256, 1, 258);
+    cur = ggml_conv_1d(ctx0, stft_reshaped, padded, 128, 0, 1);
     ggml_set_name(cur, "stft");
     ggml_set_output(cur);
     return cur;
@@ -4489,19 +4490,20 @@ static ggml_tensor * whisper_vad_build_stft_layer(ggml_context* ctx0,
 static ggml_tensor * whisper_vad_build_encoder_layer(ggml_context* ctx0,
         const whisper_vad_model & model, ggml_tensor * cur) {
     WHISPER_LOG_INFO("%s: building encoder layer\n", __func__);
-    // Reshape from the STFT output which is [258, 1, 1, 1] where the first
+    // Take a view of the STFT output which is [4, 258, 1, 1] where the second
     // dimension are complex number pairs. I think we can ignore the imaginary
     // part and just use the real part here.
-    struct ggml_tensor * real_part = ggml_view_1d(ctx0, cur, 129, 0);
-    struct ggml_tensor * reshaped = ggml_reshape_3d(ctx0, real_part, 1, 129, 1);
+    struct ggml_tensor * real_part = ggml_view_2d(ctx0, cur, 4, 129,
+                                             cur->nb[0],  // stride for moving between frequency bins
+                                             0);          // offset = 0 to start from the beginning
 
     // First Conv1D: expands to 128 channels.
-    cur = ggml_conv_1d(ctx0, model.encoder_0_weight, reshaped, 1, 1, 1);
+    cur = ggml_conv_1d(ctx0, model.encoder_0_weight, real_part, 2, 1, 1);
     cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.encoder_0_bias, 1, 128, 1));
     cur = ggml_relu(ctx0, cur);
 
-    // First Conv1D: reduces to 64 channels.
-    cur = ggml_conv_1d(ctx0, model.encoder_1_weight, cur, 1, 1, 1);
+    // Second Conv1D: reduces to 64 channels.
+    cur = ggml_conv_1d(ctx0, model.encoder_1_weight, cur, 2, 1, 1);
     cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.encoder_1_bias, 1, 64, 1));
     cur = ggml_relu(ctx0, cur);
 
@@ -4523,8 +4525,6 @@ static ggml_tensor * whisper_vad_build_lstm_layer(ggml_context* ctx0,
     WHISPER_LOG_INFO("%s: building LSTM layer\n", __func__);
 
     const whisper_vad_model & model = vctx.model;
-    const int seq_length = cur->ne[0];
-    const int input_dim  = cur->ne[1];
     const int hdim = model.hparams.lstm_hidden_size;
     const int hdim_bytes = hdim * sizeof(float);
 
@@ -4597,23 +4597,27 @@ static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx,
 
     WHISPER_LOG_INFO("%s: n_window = %d\n", __func__, n_window);
     // We process one frame/segment at a time of size n_window.
-    struct ggml_tensor * frame = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_window);
+    struct ggml_tensor * frame = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_window, 1, 1);
     ggml_set_name(frame, "frame");
     ggml_set_input(frame);
 
     struct ggml_tensor * cur = nullptr;
     {
         cur = whisper_vad_build_stft_layer(ctx0, model, frame);
+        WHISPER_LOG_INFO("%s: stft output shape = [%d, %d, %d]\n", __func__, cur->ne[0], cur->ne[1], cur->ne[2]);
 
         cur = whisper_vad_build_encoder_layer(ctx0, model, cur);
+        WHISPER_LOG_INFO("%s: endoder output shape = [%d, %d, %d]\n", __func__, cur->ne[0], cur->ne[1], cur->ne[2]);
 
         cur = whisper_vad_build_lstm_layer(ctx0, vctx, cur);
+        WHISPER_LOG_INFO("%s: lstm output shape = [%d, %d, %d]\n", __func__, cur->ne[0], cur->ne[1], cur->ne[2]);
 
         cur = ggml_relu(ctx0, cur);
 
         // Final output layer
         cur = ggml_conv_1d(ctx0, model.final_conv_weight, cur, 1, 0, 1);
         cur = ggml_add(ctx0, cur, model.final_conv_bias);
+        WHISPER_LOG_INFO("%s: final decoder output shape = [%d, %d, %d]\n", __func__, cur->ne[0], cur->ne[1], cur->ne[2]);
 
         // Apply sigmoid to get probability between 0 and 1
         cur = ggml_sigmoid(ctx0, cur);