vad : add LSTM hidden state to VAD model

danbev · danbev · commit 1ad095d1427a · 2025-04-07T06:47:24.000+02:00
diff --git a/src/whisper.cpp b/src/whisper.cpp
@@ -4408,11 +4408,24 @@ struct whisper_vad_model {
     std::map<std::string, struct ggml_tensor *> tensors;
 };
 
+struct whisper_vad_state {
+    // LSTM recurrent state; both vectors are sized by the model loader
+    std::vector<float> h;  // Hidden state values
+    std::vector<float> c;  // Cell state values
+
+    // Zero the state in place, keeping its length so h/c stay indexable
+    void reset() {
+        h.assign(h.size(), 0.0f);
+        c.assign(c.size(), 0.0f);
+    }
+};
+
 struct whisper_vad_context {
     int64_t t_load_us  = 0;
     int64_t t_start_us = 0;
 
     whisper_vad_model model;
+    whisper_vad_state state;  // LSTM h/c state, sized from hparams.lstm_hidden_size at load
 
     std::string path_model;
 };
@@ -4428,18 +4441,6 @@ struct whisper_vad_params whisper_vad_default_params(void) {
     return result;
 }
 
-struct whisper_vad_state {
-    // Hidden state for LSTM
-    float h[128];  // Hidden state dimension
-    float c[128];  // Cell state dimension
-
-    // Reset state
-    void reset() {
-        memset(h, 0, sizeof(h));
-        memset(c, 0, sizeof(c));
-    }
-};
-
 struct whisper_vad_result {
     float probability;  // Speech probability (0-1)
 };
@@ -4526,10 +4527,14 @@ whisper_vad_context * whisper_vad_init_from_file_with_params(
             return nullptr;
         }
     }
+    whisper_vad_model model;
+    whisper_vad_state state;
+
     whisper_vad_context * vctx = new whisper_vad_context;
+    vctx->model = model;
+    vctx->state = state;
     vctx->path_model = path_model;
 
-    whisper_vad_model model;
     auto & hparams = model.hparams;
 
     // load model hyper params (hparams)
@@ -4572,6 +4577,9 @@ whisper_vad_context * whisper_vad_init_from_file_with_params(
         WHISPER_LOG_INFO("%s: final_conv_out = %d\n", __func__, hparams.final_conv_out);
     }
 
+    vctx->state.h.resize(hparams.lstm_hidden_size);
+    vctx->state.c.resize(hparams.lstm_hidden_size);
+
     // 1 STFT, 4*2 encoder tensors, 4 LSTM tensors, 2 final output tensors
     const size_t n_tensors = hparams.n_encoder_layers * 2 + 4 + 2 + 1;
 
@@ -4676,41 +4684,45 @@ whisper_vad_context * whisper_vad_init_from_file_with_params(
         model.encoder_3_bias = create_tensor(VAD_TENSOR_ENC_3_BIAS,
                 ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.encoder_out_channels[3]));
 
+        // Stacked gate dimension: 4 * hidden size (input, forget, cell, output gates)
+        const int hstate_dim = hparams.lstm_hidden_size * 4;
+
         // LSTM weights - input to hidden
         model.lstm_weight_ih = create_tensor(
             VAD_TENSOR_LSTM_WEIGHT_IH,
-            ggml_new_tensor_2d(ctx, type, 128, 512)
+            ggml_new_tensor_2d(ctx, type, hparams.lstm_hidden_size, hstate_dim)
         );
 
         // LSTM weights - hidden to hidden
         model.lstm_weight_hh = create_tensor(
             VAD_TENSOR_LSTM_WEIGHT_HH,
-            ggml_new_tensor_2d(ctx, type, 128, 512)
+            ggml_new_tensor_2d(ctx, type, hparams.lstm_hidden_size, hstate_dim)
         );
 
         // LSTM bias - input to hidden
         model.lstm_bias_ih = create_tensor(
             VAD_TENSOR_LSTM_BIAS_IH,
-            ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 512)
+            ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hstate_dim)
         );
 
         // LSTM bias - hidden to hidden
         model.lstm_bias_hh = create_tensor(
             VAD_TENSOR_LSTM_BIAS_HH,
-            ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 512)
+            ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hstate_dim)
         );
 
         // Final conv layer weight
         model.final_conv_weight = create_tensor(
             VAD_TENSOR_FINAL_CONV_WEIGHT,
-            ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 1)
+            ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hparams.final_conv_in, 1)
         );
 
         // Final conv layer bias
         model.final_conv_bias = create_tensor(
             VAD_TENSOR_FINAL_CONV_BIAS,
             ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1)
         );
+
         ggml_free(ctx);
     }