@@ -4369,35 +4369,35 @@ struct whisper_vad_model {
43694369 e_vad_model type = VAD_MODEL_UNKNOWN;
43704370 whisper_vad_hparams hparams;
43714371
4372- struct ggml_tensor * stft_forward_basis; // [258, 256 ]
4372+ struct ggml_tensor * stft_forward_basis; // [256, 258 ]
43734373
43744374 // Encoder tensors - 4 convolutional layers
4375- struct ggml_tensor * encoder_0_weight; // [128 , 129, 3 ]
4375+ struct ggml_tensor * encoder_0_weight; // [3 , 129, 128 ]
43764376 struct ggml_tensor * encoder_0_bias; // [128]
43774377
43784378 // Second encoder layer
4379- struct ggml_tensor * encoder_1_weight; // [64 , 128, 3 ]
4379+ struct ggml_tensor * encoder_1_weight; // [3 , 128, 64 ]
43804380 struct ggml_tensor * encoder_1_bias; // [64]
43814381
43824382 // Third encoder layer
4383- struct ggml_tensor * encoder_2_weight; // [64 , 64, 3 ]
4383+ struct ggml_tensor * encoder_2_weight; // [3 , 64, 64 ]
43844384 struct ggml_tensor * encoder_2_bias; // [64]
43854385
43864386 // Fourth encoder layer
4387- struct ggml_tensor * encoder_3_weight; // [128 , 64, 3 ]
4387+ struct ggml_tensor * encoder_3_weight; // [3 , 64, 128 ]
43884388 struct ggml_tensor * encoder_3_bias; // [128]
43894389
43904390 // LSTM decoder tensors
4391- struct ggml_tensor * lstm_ih_weight; // [512, 128 ] input-to-hidden
4391+ struct ggml_tensor * lstm_ih_weight; // [128, 512 ] input-to-hidden
43924392 struct ggml_tensor * lstm_ih_bias; // [512]
4393- struct ggml_tensor * lstm_hh_weight; // [512, 128 ] hidden-to-hidden
4393+ struct ggml_tensor * lstm_hh_weight; // [128, 512 ] hidden-to-hidden
43944394 struct ggml_tensor * lstm_hh_bias; // [512]
43954395
43964396 // Final conv layer
4397- struct ggml_tensor * final_conv_weight; // [1, 128, 1 ]
4397+ struct ggml_tensor * final_conv_weight; // [128]
43984398 struct ggml_tensor * final_conv_bias; // [1]
43994399
4400- // ggml context
4400+ // ggml contexts
44014401 std::vector<ggml_context *> ctxs;
44024402
44034403 // buffer for the model tensors
@@ -4887,20 +4887,16 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
48874887 VAD_TENSOR_LSTM_WEIGHT_IH,
48884888 ggml_new_tensor_2d (ctx, GGML_TYPE_F32, hparams.lstm_hidden_size , hstate_dim)
48894889 );
4890+ model.lstm_ih_bias = create_tensor (
4891+ VAD_TENSOR_LSTM_BIAS_IH,
4892+ ggml_new_tensor_1d (ctx, GGML_TYPE_F32, hstate_dim)
4893+ );
48904894
48914895 // LSTM weights - hidden to hidden
48924896 model.lstm_hh_weight = create_tensor (
48934897 VAD_TENSOR_LSTM_WEIGHT_HH,
48944898 ggml_new_tensor_2d (ctx, GGML_TYPE_F32, hparams.lstm_hidden_size , hstate_dim)
48954899 );
4896-
4897- // LSTM bias - input to hidden
4898- model.lstm_ih_bias = create_tensor (
4899- VAD_TENSOR_LSTM_BIAS_IH,
4900- ggml_new_tensor_1d (ctx, GGML_TYPE_F32, hstate_dim)
4901- );
4902-
4903- // LSTM bias - hidden to hidden
49044900 model.lstm_hh_bias = create_tensor (
49054901 VAD_TENSOR_LSTM_BIAS_HH,
49064902 ggml_new_tensor_1d (ctx, GGML_TYPE_F32, hstate_dim)
@@ -4911,8 +4907,6 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
49114907 VAD_TENSOR_FINAL_CONV_WEIGHT,
49124908 ggml_new_tensor_2d (ctx, GGML_TYPE_F32, hparams.final_conv_in , 1 )
49134909 );
4914-
4915- // Final conv layer bias
49164910 model.final_conv_bias = create_tensor (
49174911 VAD_TENSOR_FINAL_CONV_BIAS,
49184912 ggml_new_tensor_1d (ctx, GGML_TYPE_F32, 1 )
0 commit comments