
Commit a19d9ae

vad : add relu and conv_1d to VAD graph
1 parent e97c7e8 commit a19d9ae


2 files changed: +17 -7 lines


models/convert-silero-vad-to-ggml.py

Lines changed: 3 additions & 0 deletions
@@ -141,6 +141,9 @@ def convert_silero_vad(output_path, print_tensors=True):
         if "encoder" in name and "weight" in name:
             print(f" This tensor will be forced to F16 for GGML im2col compatibility")
             force_f16 = True
+        if "_model.decoder.decoder.2.weight" in name:
+            print(f" This tensor will be forced to F16 for GGML im2col compatibility")
+            force_f16 = True
 
         # Set ftype based on the original dtype or force to F16 for certain tensors
         if force_f16:
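The log message above ties the F16 forcing to GGML's im2col compatibility. As a rough illustration of what that forcing amounts to at the storage level (this is not part of the converter; the buffer and values below are invented, only ggml_fp32_to_fp16 / ggml_fp16_to_fp32 are real ggml API), a small C++ program that converts an F32 weight buffer to F16 and round-trips it back:

// Sketch: convert an F32 weight buffer to F16 storage, then round-trip it
// back to F32 to see what precision F16 keeps. Data is invented.
#include "ggml.h"

#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> weights_f32 = { 0.25f, -1.5f, 0.0f, 3.14159f };

    // Convert each value to half precision (what "forcing F16" stores on disk).
    std::vector<ggml_fp16_t> weights_f16(weights_f32.size());
    for (size_t i = 0; i < weights_f32.size(); i++) {
        weights_f16[i] = ggml_fp32_to_fp16(weights_f32[i]);
    }

    // Print original value, raw F16 bits, and the value recovered from F16.
    for (size_t i = 0; i < weights_f16.size(); i++) {
        printf("f32 %f -> f16 0x%04x -> f32 %f\n",
               weights_f32[i], (unsigned) weights_f16[i],
               ggml_fp16_to_fp32(weights_f16[i]));
    }

    return 0;
}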

src/whisper.cpp

Lines changed: 14 additions & 7 deletions
@@ -4605,8 +4605,10 @@ static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx,
 
     cur = whisper_vad_build_lstm_layer(ctx0, vctx, cur);
 
+    cur = ggml_relu(ctx0, cur);
+
     // Final output layer - linear transformation from LSTM output
-    cur = ggml_mul_mat(ctx0, model.final_conv_weight, cur);
+    cur = ggml_conv_1d(ctx0, model.final_conv_weight, cur, 1, 1, 1);
     cur = ggml_add(ctx0, cur, model.final_conv_bias);
 
     // Apply sigmoid to get probability between 0 and 1
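For reference, a minimal standalone sketch of the reworked output head, built with the same ggml calls as the hunk above: relu, then conv_1d with stride 1, padding 1, dilation 1, then bias add and sigmoid. The tensor shapes (128 LSTM features, a single frame) and the stand-in variable names are assumptions for illustration only; this builds the graph but does not load or run the real model:

// Sketch: build (but do not compute) a graph equivalent to the new VAD head.
// Shapes and names are illustrative assumptions; the conv_1d arguments mirror
// the commit (stride = 1, padding = 1, dilation = 1) with an F16 kernel,
// matching the im2col-compatibility note in the converter.
#include "ggml.h"

#include <cstdio>

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx0 = ggml_init(params);

    // Stand-ins for the LSTM output and the final conv parameters.
    struct ggml_tensor * lstm_out          = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 128);
    struct ggml_tensor * final_conv_weight = ggml_new_tensor_2d(ctx0, GGML_TYPE_F16, 128, 1);
    struct ggml_tensor * final_conv_bias   = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);

    struct ggml_tensor * cur = ggml_relu(ctx0, lstm_out);
    cur = ggml_conv_1d(ctx0, final_conv_weight, cur, 1, 1, 1);
    cur = ggml_add(ctx0, cur, final_conv_bias);
    cur = ggml_sigmoid(ctx0, cur);

    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
    ggml_build_forward_expand(gf, cur);

    printf("VAD head output shape: [%lld, %lld]\n",
           (long long) cur->ne[0], (long long) cur->ne[1]);

    ggml_free(ctx0);
    return 0;
}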
@@ -4905,7 +4907,7 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
         // Final conv layer weight
         model.final_conv_weight = create_tensor(
                 VAD_TENSOR_FINAL_CONV_WEIGHT,
-                ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hparams.final_conv_in, 1)
+                ggml_new_tensor_2d(ctx, GGML_TYPE_F16, hparams.final_conv_in, 1)
         );
         model.final_conv_bias = create_tensor(
                 VAD_TENSOR_FINAL_CONV_BIAS,
@@ -5161,14 +5163,19 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
     }
 
     {
-        // Print as F32
+        // Print as F16
         struct ggml_tensor * tensor = model.final_conv_weight;
-        std::vector<float> read_b(ggml_nbytes(tensor));
-        ggml_backend_tensor_get(tensor, read_b.data(), 0, ggml_nbytes(tensor));
+        std::vector<uint16_t> raw_data(ggml_nbytes(tensor) / sizeof(uint16_t));
+        ggml_backend_tensor_get(tensor, raw_data.data(), 0, ggml_nbytes(tensor));
+
+        // Convert first 10 values from F16 to F32 for display
         for (int i = 0; i < 10; i++) {
-            WHISPER_LOG_INFO("%s: final_conv_weight: [%d]: %f\n", __func__, i, read_b[i]);
+            float converted_value = ggml_fp16_to_fp32(raw_data[i]);
+            WHISPER_LOG_INFO("%s: final_conv_weight: [%d]: %f (raw: 0x%04x)\n",
+                    __func__, i, converted_value, raw_data[i]);
         }
     }
+
     {
         // Print as F32
         struct ggml_tensor * tensor = model.final_conv_bias;
@@ -5273,7 +5280,7 @@ struct whisper_vad_segments whisper_vad_detect_speech(
     }
     WHISPER_LOG_INFO("%s: finished processing %d samples\n", __func__, n_samples);
     for (int i = 0; i < probs.size(); i++) {
-        //WHISPER_LOG_INFO("%s: prob[%d]: %f\n", __func__, i, probs[i]);
+        WHISPER_LOG_INFO("%s: prob[%d]: %f\n", __func__, i, probs[i]);
     }
 
     segments.n_segments = n_frames;
