
Commit a19d9ae

vad : add relu and conv_1d to VAD graph
1 parent e97c7e8 commit a19d9ae


2 files changed: +17 -7 lines


models/convert-silero-vad-to-ggml.py

Lines changed: 3 additions & 0 deletions
@@ -141,6 +141,9 @@ def convert_silero_vad(output_path, print_tensors=True):
         if "encoder" in name and "weight" in name:
             print(f" This tensor will be forced to F16 for GGML im2col compatibility")
             force_f16 = True
+        if "_model.decoder.decoder.2.weight" in name:
+            print(f" This tensor will be forced to F16 for GGML im2col compatibility")
+            force_f16 = True
 
         # Set ftype based on the original dtype or force to F16 for certain tensors
         if force_f16:
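The log message above ties the F16 forcing to GGML's im2col compatibility. As a rough illustration of what that forcing amounts to at the storage level (this is not part of the converter; the buffer and values below are invented, only ggml_fp32_to_fp16 / ggml_fp16_to_fp32 are real ggml API), a small C++ program that converts an F32 weight buffer to F16 and round-trips it back:

// Sketch: convert an F32 weight buffer to F16 storage, then round-trip it
// back to F32 to see what precision F16 keeps. Data is invented.
#include "ggml.h"

#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> weights_f32 = { 0.25f, -1.5f, 0.0f, 3.14159f };

    // Convert each value to half precision (what "forcing F16" stores on disk).
    std::vector<ggml_fp16_t> weights_f16(weights_f32.size());
    for (size_t i = 0; i < weights_f32.size(); i++) {
        weights_f16[i] = ggml_fp32_to_fp16(weights_f32[i]);
    }

    // Print original value, raw F16 bits, and the value recovered from F16.
    for (size_t i = 0; i < weights_f16.size(); i++) {
        printf("f32 %f -> f16 0x%04x -> f32 %f\n",
               weights_f32[i], (unsigned) weights_f16[i],
               ggml_fp16_to_fp32(weights_f16[i]));
    }

    return 0;
}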

src/whisper.cpp

Lines changed: 14 additions & 7 deletions
@@ -4605,8 +4605,10 @@ static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx,
 
     cur = whisper_vad_build_lstm_layer(ctx0, vctx, cur);
 
+    cur = ggml_relu(ctx0, cur);
+
     // Final output layer - linear transformation from LSTM output
-    cur = ggml_mul_mat(ctx0, model.final_conv_weight, cur);
+    cur = ggml_conv_1d(ctx0, model.final_conv_weight, cur, 1, 1, 1);
     cur = ggml_add(ctx0, cur, model.final_conv_bias);
 
     // Apply sigmoid to get probability between 0 and 1
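For reference, a minimal standalone sketch of the reworked output head, built with the same ggml calls as the hunk above: relu, then conv_1d with stride 1, padding 1, dilation 1, then bias add and sigmoid. The tensor shapes (128 LSTM features, a single frame) and the stand-in variable names are assumptions for illustration only; this builds the graph but does not load or run the real model:

// Sketch: build (but do not compute) a graph equivalent to the new VAD head.
// Shapes and names are illustrative assumptions; the conv_1d arguments mirror
// the commit (stride = 1, padding = 1, dilation = 1) with an F16 kernel,
// matching the im2col-compatibility note in the converter.
#include "ggml.h"

#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx0 = ggml_init(params);

    // Stand-ins for the LSTM output and the final conv parameters.
    struct ggml_tensor * lstm_out          = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 128);
    struct ggml_tensor * final_conv_weight = ggml_new_tensor_2d(ctx0, GGML_TYPE_F16, 128, 1);
    struct ggml_tensor * final_conv_bias   = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);

    struct ggml_tensor * cur = ggml_relu(ctx0, lstm_out);
    cur = ggml_conv_1d(ctx0, final_conv_weight, cur, 1, 1, 1);
    cur = ggml_add(ctx0, cur, final_conv_bias);
    cur = ggml_sigmoid(ctx0, cur);

    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
    ggml_build_forward_expand(gf, cur);

    printf("VAD head output shape: [%lld, %lld]\n",
           (long long) cur->ne[0], (long long) cur->ne[1]);

    ggml_free(ctx0);
    return 0;
}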
@@ -4905,7 +4907,7 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
         // Final conv layer weight
         model.final_conv_weight = create_tensor(
                 VAD_TENSOR_FINAL_CONV_WEIGHT,
-                ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hparams.final_conv_in, 1)
+                ggml_new_tensor_2d(ctx, GGML_TYPE_F16, hparams.final_conv_in, 1)
         );
         model.final_conv_bias = create_tensor(
                 VAD_TENSOR_FINAL_CONV_BIAS,
@@ -5161,14 +5163,19 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
     }
 
     {
-        // Print as F32
+        // Print as F16
         struct ggml_tensor * tensor = model.final_conv_weight;
-        std::vector<float> read_b(ggml_nbytes(tensor));
-        ggml_backend_tensor_get(tensor, read_b.data(), 0, ggml_nbytes(tensor));
+        std::vector<uint16_t> raw_data(ggml_nbytes(tensor) / sizeof(uint16_t));
+        ggml_backend_tensor_get(tensor, raw_data.data(), 0, ggml_nbytes(tensor));
+
+        // Convert first 10 values from F16 to F32 for display
         for (int i = 0; i < 10; i++) {
-            WHISPER_LOG_INFO("%s: final_conv_weight: [%d]: %f\n", __func__, i, read_b[i]);
+            float converted_value = ggml_fp16_to_fp32(raw_data[i]);
+            WHISPER_LOG_INFO("%s: final_conv_weight: [%d]: %f (raw: 0x%04x)\n",
+                    __func__, i, converted_value, raw_data[i]);
         }
     }
+
     {
         // Print as F32
         struct ggml_tensor * tensor = model.final_conv_bias;
@@ -5273,7 +5280,7 @@ struct whisper_vad_segments whisper_vad_detect_speech(
     }
     WHISPER_LOG_INFO("%s: finished processing %d samples\n", __func__, n_samples);
     for (int i = 0; i < probs.size(); i++) {
-        //WHISPER_LOG_INFO("%s: prob[%d]: %f\n", __func__, i, probs[i]);
+        WHISPER_LOG_INFO("%s: prob[%d]: %f\n", __func__, i, probs[i]);
     }
 
     segments.n_segments = n_frames;
