vad : add context to frames

danbev · danbev · commit cf46fc9710b3 · 2025-04-14T08:49:10.000+02:00
diff --git a/src/whisper.cpp b/src/whisper.cpp
@@ -4429,6 +4429,7 @@ struct whisper_vad_context {
     int64_t t_start_us = 0;
 
     int n_window;
+    int n_context;
     std::string path_model;
 
     whisper_vad_model model;
@@ -4573,7 +4574,7 @@ static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx,
         whisper_vad_state & vstate) {
     const auto & model   = vctx.model;
     const auto & hparams = model.hparams;
-    const int n_window = vctx.n_window;
+    const int frame_size = vctx.n_window + vctx.n_context;
 
     WHISPER_LOG_INFO("%s: Building VAD graph\n", __func__);
     struct ggml_init_params params = {
@@ -4586,9 +4587,8 @@ static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx,
 
     ggml_cgraph * gf = ggml_new_graph(ctx0);
 
-    WHISPER_LOG_INFO("%s: n_window = %d\n", __func__, n_window);
-    // We process one frame/segment at a time of size n_window.
-    struct ggml_tensor * frame = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_window, 1, 1);
+    WHISPER_LOG_INFO("%s: n_window = %d\n", __func__, frame_size);
+    struct ggml_tensor * frame = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, frame_size, 1, 1);
     ggml_set_name(frame, "frame");
     ggml_set_input(frame);
 
@@ -4725,7 +4725,9 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
 
     whisper_vad_context * vctx = new whisper_vad_context;
     vctx->path_model = path_model;
+    // TODO(danbev) Read these from the model since they are tied to the model.
     vctx->n_window = 512;
+    vctx->n_context = 64;
 
     auto & model = vctx->model;
     auto & hparams = model.hparams;
@@ -5196,6 +5198,8 @@ struct whisper_vad_segments whisper_vad_detect_speech(
     WHISPER_LOG_INFO("%s: detecting speech in %d samples\n", __func__, n_samples);
     auto & sched = vctx->state->sched.sched;
     const int hidden_dim = vctx->model.hparams.lstm_hidden_size;
+    const int n_context = vctx->n_context;
+    const int frame_size = vctx->n_window + n_context;
 
     struct whisper_vad_segments segments {
         /* n_segments = */ 0,
@@ -5212,8 +5216,9 @@ struct whisper_vad_segments whisper_vad_detect_speech(
         return segments;
     }
 
-    WHISPER_LOG_INFO("%s: n_window: %u\n", __func__, vctx->n_window);
-    std::vector<float> window_with_context(vctx->n_window);
+    WHISPER_LOG_INFO("%s: frame_size: %u\n", __func__, frame_size);
+    std::vector<float> previous_context(n_context, 0.0f);
+    std::vector<float> window_with_context(frame_size + previous_context.size());
 
     struct ggml_tensor * frame = ggml_graph_get_tensor(gf, "frame");
     struct ggml_tensor * c_out = ggml_graph_get_tensor(gf, "c_out");
@@ -5231,18 +5236,25 @@ struct whisper_vad_segments whisper_vad_detect_speech(
     std::vector<float> h_state(hidden_dim, 0.0f);
     std::vector<float> c_state(hidden_dim, 0.0f);
 
-    int n_frames = n_samples / vctx->n_window;
+    int n_frames = n_samples / window_with_context.size();
     std::vector<float> probs(n_frames, 0.0f);
 
     WHISPER_LOG_INFO("%s: frame tensor size: %ld\n", __func__, frame->ne[0]);
 
-    for (int i = 0; i < n_samples; i += vctx->n_window) {
+    for (int i = 0; i < n_samples; i += frame_size) {
         // Skip if we don't have enough samples for a full window
-        if (i + vctx->n_window > n_samples) {
+        if (i + frame_size > n_samples) {
             break;
         }
 
-        ggml_backend_tensor_set(frame, pcmf32 + i, 0, ggml_nelements(frame) * sizeof(float));
+        // Copy the previous context to the beginning of window_with_context.
+        std::copy(previous_context.begin(), previous_context.end(), window_with_context.begin());
+
+        // Copy current frame samples to after the context.
+        std::copy(pcmf32 + i, pcmf32 + i + frame_size, window_with_context.begin() + previous_context.size());
+
+        // Set the frame tensor data with the context + the samples.
+        ggml_backend_tensor_set(frame, window_with_context.data(), 0, ggml_nelements(frame) * sizeof(float));
 
         ggml_backend_tensor_set(h_in, h_state.data(), 0, hidden_dim * sizeof(float));
         ggml_backend_tensor_set(c_in, c_state.data(), 0, hidden_dim * sizeof(float));
@@ -5252,7 +5264,7 @@ struct whisper_vad_segments whisper_vad_detect_speech(
             break;
         }
 
-        // Print out some intermediate results
+        // Print out some intermediate results for debugging
         WHISPER_LOG_INFO("%s:###### Intermediate results #####\n", __func__);
         {
             struct ggml_tensor * tensor = ggml_graph_get_tensor(gf, "stft");
@@ -5262,6 +5274,7 @@ struct whisper_vad_segments whisper_vad_detect_speech(
                 WHISPER_LOG_INFO("%s: sftf: [%d]: %f\n", __func__, i, read_b[i]);
             }
         }
+        // Debug output: intermediate results of the "final_conv" tensor
         {
             struct ggml_tensor * tensor = ggml_graph_get_tensor(gf, "final_conv");
             std::vector<float> read_b(ggml_nbytes(tensor));
@@ -5271,16 +5284,18 @@ struct whisper_vad_segments whisper_vad_detect_speech(
             }
         }
 
+        // Update the LSTM states
         ggml_backend_tensor_get(h_out, h_state.data(), 0, hidden_dim * sizeof(float));
         ggml_backend_tensor_get(c_out, c_state.data(), 0, hidden_dim * sizeof(float));
 
-        WHISPER_LOG_INFO("%s: h_state first 3 values: %f, %f, %f\n",
-                __func__, h_state[0], h_state[1], h_state[2]);
-        WHISPER_LOG_INFO("%s: c_state first 3 values: %f, %f, %f\n",
-                __func__, c_state[0], c_state[1], c_state[2]);
+        WHISPER_LOG_INFO("%s: h_state first 3 values: %f, %f, %f\n", __func__, h_state[0], h_state[1], h_state[2]);
+        WHISPER_LOG_INFO("%s: c_state first 3 values: %f, %f, %f\n", __func__, c_state[0], c_state[1], c_state[2]);
 
-        ggml_backend_tensor_get(prob, &probs[i/vctx->n_window], 0, sizeof(float));
+        // Read this frame's probability (a single float) into probs.
+        ggml_backend_tensor_get(prob, &probs[i/frame_size], 0, sizeof(float));
 
+        // Save the last n_context samples of this frame; they are prepended to the next frame.
+        std::copy(window_with_context.end() - n_context, window_with_context.end(), previous_context.begin());
     }
     WHISPER_LOG_INFO("%s: finished processing %d samples\n", __func__, n_samples);
     for (size_t i = 0; i < probs.size(); i++) {