@@ -4438,7 +4438,6 @@ struct whisper_vad_context {
44384438 bool triggered;
44394439 std::vector<float > context_buffer;
44404440 unsigned int current_sample;
4441- unsigned int temp_end;
44424441
44434442 std::vector<whisper_vad_segment> detected_segments;
44444443
@@ -4478,13 +4477,10 @@ static ggml_backend_buffer_type_t select_weight_buft(const whisper_vad_hparams &
44784477
44794478static ggml_tensor * whisper_vad_build_stft_layer (ggml_context* ctx0,
44804479 const whisper_vad_model & model, ggml_tensor * cur) {
4481- struct ggml_tensor * padded_frame = ggml_pad (ctx0, cur, 64 , 0 , 0 , 0 );
4482- struct ggml_tensor * reshaped_frame = ggml_reshape_3d (ctx0, padded_frame, 640 , 1 , 1 );
4483-
44844480 // We need the stft tensor to be in {258, 1, 256},
44854481 // that is a kernel size of 258, 1 channel, and 256 frequency bins (output)
4486- struct ggml_tensor * reshaped_stft = ggml_reshape_3d (ctx0, model.stft_forward_basis , 258 , 1 , 256 );
4487- cur = ggml_conv_1d (ctx0, reshaped_stft, reshaped_frame , 1 , 1 , 1 );
4482+ struct ggml_tensor * stft_reshaped = ggml_reshape_3d (ctx0, model.stft_forward_basis , 258 , 1 , 256 );
4483+ cur = ggml_conv_1d (ctx0, stft_reshaped, cur , 1 , 1 , 1 );
44884484 ggml_set_name (cur, " stft" );
44894485 ggml_set_output (cur);
44904486 return cur;
@@ -4739,7 +4735,6 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
47394735 vctx->triggered = false ;
47404736 vctx->context_buffer .resize (vctx->context_samples , 0 .0f );
47414737 vctx->current_sample = 0 ;
4742- vctx->temp_end = 0 ;
47434738
47444739 auto & model = vctx->model ;
47454740 auto & hparams = model.hparams ;
@@ -5215,7 +5210,6 @@ struct whisper_vad_segments whisper_vad_detect_speech(
52155210 // Reset state for this detection
52165211 vctx->triggered = false ;
52175212 vctx->current_sample = 0 ;
5218- vctx->temp_end = 0 ;
52195213 std::fill (vctx->context_buffer .begin (), vctx->context_buffer .end (), 0 .0f );
52205214 vctx->detected_segments .clear ();
52215215
@@ -5227,12 +5221,11 @@ struct whisper_vad_segments whisper_vad_detect_speech(
52275221 }
52285222
52295223 std::vector<float > window_with_context (vctx->effective_window_size );
5230- WHISPER_LOG_INFO (" %s: window_with_context.size() = %zu\n " , __func__, window_with_context.size ());
52315224 WHISPER_LOG_INFO (" %s: window_sample_size: %u\n " , __func__, vctx->window_size_samples );
52325225 WHISPER_LOG_INFO (" %s: context_sample_size: %u\n " , __func__, vctx->context_samples );
5226+ WHISPER_LOG_INFO (" %s: window_with_context: %zu\n " , __func__, window_with_context.size ());
52335227 WHISPER_LOG_INFO (" %s: effective_window_size: %u\n " , __func__, vctx->effective_window_size );
52345228
5235- whisper_vad_segment current_segment = {-1 .0f , -1 .0f };
52365229 struct ggml_tensor * frame = ggml_graph_get_tensor (gf, " frame" );
52375230 struct ggml_tensor * c_out = ggml_graph_get_tensor (gf, " c_out" );
52385231 struct ggml_tensor * h_out = ggml_graph_get_tensor (gf, " h_out" );
@@ -5256,7 +5249,7 @@ struct whisper_vad_segments whisper_vad_detect_speech(
52565249 if (i + vctx->window_size_samples > n_samples) {
52575250 break ;
52585251 }
5259- // Copy the previous context buffer into the next window to be processed next
5252+ // Copy the previous context buffer into the next window to be processed next.
52605253 // context_buffer contains the 64 samples from the previous window and this is
52615254 // part of the overlapping windows to avoid spectral leakage.
52625255 std::copy (vctx->context_buffer .begin (), vctx->context_buffer .end (), window_with_context.begin ());
0 commit comments