vad : remove sigmoid activation from VAD output

danbev · danbev · commit 5365dab53822 · 2025-04-08T09:45:59.000+02:00
diff --git a/src/whisper.cpp b/src/whisper.cpp
@@ -4608,10 +4608,7 @@ static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx,
         // Final output layer - linear transformation from LSTM output
         cur = ggml_mul_mat(ctx0, model.final_conv_weight, cur);
         cur = ggml_add(ctx0, cur, model.final_conv_bias);
-
-        // Apply sigmoid to get probability between 0 and 1
-        cur = ggml_sigmoid(ctx0, cur);
-        ggml_set_name(cur, "prob");
+        ggml_set_name(cur, "logits");
         ggml_set_output(cur);
     }
 
@@ -4773,7 +4770,6 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
         WHISPER_LOG_INFO("%s: final_conv_out = %d\n", __func__, hparams.final_conv_out);
     }
 
-
     // 1 STFT tensor, 4*2 encoder tensors, 4 LSTM tensors, 2 final output tensors
     const size_t n_tensors = hparams.n_encoder_layers * 2 + 4 + 2 + 1;
 
@@ -5221,7 +5217,7 @@ struct whisper_vad_segments whisper_vad_detect_speech(
     struct ggml_tensor * frame = ggml_graph_get_tensor(gf, "frame");
     struct ggml_tensor * c_out = ggml_graph_get_tensor(gf, "c_out");
     struct ggml_tensor * h_out = ggml_graph_get_tensor(gf, "h_out");
-    struct ggml_tensor * prob = ggml_graph_get_tensor(gf, "prob");
+    struct ggml_tensor * logits_tensor = ggml_graph_get_tensor(gf, "logits");
 
     struct ggml_tensor * c_in = ggml_graph_get_tensor(gf, "c_in");
     struct ggml_tensor * h_in = ggml_graph_get_tensor(gf, "h_in");
@@ -5232,7 +5228,7 @@ struct whisper_vad_segments whisper_vad_detect_speech(
     std::vector<float> c_state(hidden_dim, 0.0f);
 
     int n_frames = n_samples / vctx->window_size_samples;
-    std::vector<float> probs(n_frames, 0.0f);
+    std::vector<float> logits(n_frames, 0.0f);
 
     WHISPER_LOG_INFO("%s: frame tensor size: %ld\n", __func__, frame->ne[0]);
 
@@ -5267,13 +5263,13 @@ struct whisper_vad_segments whisper_vad_detect_speech(
 
         ggml_backend_tensor_get(h_out, h_state.data(), 0, hidden_dim * sizeof(float));
         ggml_backend_tensor_get(c_out, c_state.data(), 0, hidden_dim * sizeof(float));
-        ggml_backend_tensor_get(prob, &probs[i/vctx->window_size_samples], 0, sizeof(float));
+        ggml_backend_tensor_get(logits_tensor, &logits[i/vctx->window_size_samples], 0, sizeof(float));
 
         vctx->current_sample += vctx->window_size_samples;
     }
     WHISPER_LOG_INFO("%s: finished processing %d samples\n", __func__, n_samples);
-    for (int i = 0; i < probs.size(); i++) {
-        //WHISPER_LOG_INFO("%s: prob[%d]: %f\n", __func__, i, probs[i]);
+    for (int i = 0; i < logits.size(); i++) {
+        WHISPER_LOG_INFO("%s: logits[%d]: %f\n", __func__, i, logits[i]);
     }
 
     segments.n_segments = n_frames;