@@ -4605,8 +4605,10 @@ static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx,
46054605
46064606 cur = whisper_vad_build_lstm_layer (ctx0, vctx, cur);
46074607
4608+ cur = ggml_relu (ctx0, cur);
4609+
46084610 // Final output layer - linear transformation from LSTM output
4609- cur = ggml_mul_mat (ctx0, model.final_conv_weight , cur);
4611+ cur = ggml_conv_1d (ctx0, model.final_conv_weight , cur, 1 , 1 , 1 );
46104612 cur = ggml_add (ctx0, cur, model.final_conv_bias );
46114613
46124614 // Apply sigmoid to get probability between 0 and 1
@@ -4905,7 +4907,7 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
49054907 // Final conv layer weight
49064908 model.final_conv_weight = create_tensor (
49074909 VAD_TENSOR_FINAL_CONV_WEIGHT,
4908- ggml_new_tensor_2d (ctx, GGML_TYPE_F32 , hparams.final_conv_in , 1 )
4910+ ggml_new_tensor_2d (ctx, GGML_TYPE_F16 , hparams.final_conv_in , 1 )
49094911 );
49104912 model.final_conv_bias = create_tensor (
49114913 VAD_TENSOR_FINAL_CONV_BIAS,
@@ -5161,14 +5163,19 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
51615163 }
51625164
51635165 {
5164- // Print as F32
5166+ // Print as F16
51655167 struct ggml_tensor * tensor = model.final_conv_weight ;
5166- std::vector<float > read_b (ggml_nbytes (tensor));
5167- ggml_backend_tensor_get (tensor, read_b.data (), 0 , ggml_nbytes (tensor));
5168+ std::vector<uint16_t > raw_data (ggml_nbytes (tensor) / sizeof (uint16_t ));
5169+ ggml_backend_tensor_get (tensor, raw_data.data (), 0 , ggml_nbytes (tensor));
5170+
5171+ // Convert first 10 values from F16 to F32 for display
51685172 for (int i = 0 ; i < 10 ; i++) {
5169- WHISPER_LOG_INFO (" %s: final_conv_weight: [%d]: %f\n " , __func__, i, read_b[i]);
5173+ float converted_value = ggml_fp16_to_fp32 (raw_data[i]);
5174+ WHISPER_LOG_INFO (" %s: final_conv_weight: [%d]: %f (raw: 0x%04x)\n " ,
5175+ __func__, i, converted_value, raw_data[i]);
51705176 }
51715177 }
5178+
51725179 {
51735180 // Print as F32
51745181 struct ggml_tensor * tensor = model.final_conv_bias ;
@@ -5273,7 +5280,7 @@ struct whisper_vad_segments whisper_vad_detect_speech(
52735280 }
52745281 WHISPER_LOG_INFO (" %s: finished processing %d samples\n " , __func__, n_samples);
52755282 for (int i = 0 ; i < probs.size (); i++) {
5276- // WHISPER_LOG_INFO("%s: prob[%d]: %f\n", __func__, i, probs[i]);
5283+ WHISPER_LOG_INFO (" %s: prob[%d]: %f\n " , __func__, i, probs[i]);
52775284 }
52785285
52795286 segments.n_segments = n_frames;
0 commit comments