Commit 93883e7

vad : add initial Voice Activity Detection (VAD) support
wip
Parent commit: ada745f

6 files changed, +894 −0 lines changed

include/whisper.h

Lines changed: 41 additions & 0 deletions
@@ -652,6 +652,47 @@ extern "C" {
    WHISPER_API float whisper_full_get_token_p           (struct whisper_context * ctx, int i_segment, int i_token);
    WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);

    // Voice Activity Detection (VAD)
    struct whisper_vad_context;
    struct whisper_vad_state;

    struct whisper_vad_params {
        float threshold;               // Probability threshold for speech detection
        int   min_speech_duration_ms;  // Minimum speech segment duration
        int   min_silence_duration_ms; // Minimum silence segment duration
        int   window_size_samples;     // Window size for processing
        int   sample_rate;             // 16000
    };
    WHISPER_API struct whisper_vad_params whisper_vad_default_params(void);

    WHISPER_API struct whisper_vad_state * whisper_vad_init_state(struct whisper_vad_context * ctx);

    WHISPER_API struct whisper_vad_context * whisper_vad_init_from_file_with_params(
            const char * path_model,
            const whisper_vad_params params);

    WHISPER_API struct whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
            const char * path_model,
            const whisper_vad_params params);

    struct whisper_vad_segment {
        float start; // Start time in seconds
        float end;   // End time in seconds
    };

    struct whisper_vad_segments {
        int n_segments;
        whisper_vad_segment * segments;
    };

    WHISPER_API struct whisper_vad_segments whisper_vad_detect_speech(
            whisper_vad_context * vctx,
            const float * pcmf32, int n_samples);

    WHISPER_API void whisper_vad_free       (struct whisper_vad_context * ctx);
    WHISPER_API void whisper_vad_free_state (struct whisper_vad_state * state);
    WHISPER_API void whisper_vad_free_params(struct whisper_vad_params * params);

    ////////////////////////////////////////////////////////////////////////////

    // Temporary helpers needed for exposing ggml interface
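A minimal usage sketch of the API declared above (not part of this commit): the model path and the PCM buffer are placeholders, and it assumes the returned segments are owned by the VAD context, since no dedicated free function for them is declared here.

#include <cstdio>
#include <vector>
#include "whisper.h"

int main() {
    // Start from the defaults exposed by the new API
    whisper_vad_params vparams = whisper_vad_default_params();

    // Placeholder path; in practice the file produced by the conversion script
    whisper_vad_context * vctx =
        whisper_vad_init_from_file_with_params("silero-vad-ggml.bin", vparams);
    if (!vctx) {
        return 1;
    }

    // Placeholder audio: 1 second of 16 kHz mono silence in [-1, 1]
    std::vector<float> pcmf32(16000, 0.0f);

    whisper_vad_segments segs = whisper_vad_detect_speech(vctx, pcmf32.data(), (int) pcmf32.size());
    for (int i = 0; i < segs.n_segments; ++i) {
        printf("speech: %.2f s -> %.2f s\n", segs.segments[i].start, segs.segments[i].end);
    }

    whisper_vad_free(vctx);
    return 0;
}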
Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
import os
import struct
import argparse
import torch
import numpy as np
from silero_vad import load_silero_vad, __version__ as silero_version

def convert_silero_vad(output_path, use_f16=True):
    model = load_silero_vad()
    state_dict = model.state_dict()

    # Clean up state dict keys - filter out 8k model
    cleaned_dict = {}
    for key, value in state_dict.items():
        # Skip 8k model
        if "_8k" not in key:
            if not key.startswith("_model."):
                key = "_model." + key
            cleaned_dict[key] = value

    base, ext = os.path.splitext(output_path)
    output_file = f"{base}-v{silero_version}-ggml{ext}"
    print(f"Saving GGML Silero-VAD model to {output_file}")

    print("\nTensors to be written:")
    for key, tensor in cleaned_dict.items():
        print(f"{key}: {tensor.shape}")
    print()

    fout = open(output_file, "wb")

    # Write magic
    fout.write(struct.pack("i", 0x67676d6c)) # "ggml" in hex

    # Write the use_f16 flag
    fout.write(struct.pack("i", 1 if use_f16 else 0))

    n_encoder_layers = 4
    fout.write(struct.pack("i", n_encoder_layers))

    # Write encoder dimensions
    input_channels = 129
    encoder_in_channels = [input_channels, 128, 64, 64]
    encoder_out_channels = [128, 64, 64, 128]
    kernel_size = 3

    for i in range(n_encoder_layers):
        fout.write(struct.pack("i", encoder_in_channels[i]))
        fout.write(struct.pack("i", encoder_out_channels[i]))
        fout.write(struct.pack("i", kernel_size))

    # Write LSTM dimensions
    lstm_input_size = 128
    lstm_hidden_size = 128
    fout.write(struct.pack("i", lstm_input_size))
    fout.write(struct.pack("i", lstm_hidden_size))

    # Write final conv dimensions
    final_conv_in = 128
    final_conv_out = 1
    fout.write(struct.pack("i", final_conv_in))
    fout.write(struct.pack("i", final_conv_out))

    print("Writing model weights:")

    tensor_keys_to_write = []

    for i in range(n_encoder_layers):
        weight_key = f"_model.encoder.{i}.reparam_conv.weight"
        bias_key = f"_model.encoder.{i}.reparam_conv.bias"
        if weight_key in cleaned_dict and bias_key in cleaned_dict:
            tensor_keys_to_write.append(weight_key)
            tensor_keys_to_write.append(bias_key)

    lstm_keys = [
        "_model.decoder.rnn.weight_ih",
        "_model.decoder.rnn.weight_hh",
        "_model.decoder.rnn.bias_ih",
        "_model.decoder.rnn.bias_hh"
    ]
    tensor_keys_to_write.extend([k for k in lstm_keys if k in cleaned_dict])

    final_keys = [
        "_model.decoder.decoder.2.weight",
        "_model.decoder.decoder.2.bias"
    ]
    tensor_keys_to_write.extend([k for k in final_keys if k in cleaned_dict])

    stft_tensor = "_model.stft.forward_basis_buffer"
    tensor_keys_to_write.append(stft_tensor)

    for name in tensor_keys_to_write:
        if name not in cleaned_dict:
            print(f"Warning: Missing tensor {name}, skipping")
            continue

        tensor = cleaned_dict[name]
        data = tensor.squeeze().numpy()
        print(f"Processing variable: {name} with shape: {data.shape}")

        if name.endswith(".reparam_conv.weight") and len(data.shape) == 3:
            print(f"  Keeping original convolution weight shape: {data.shape}")

        # Determine if we should use float16 or float32
        ftype = 1 # default to float16
        if not use_f16 or len(data.shape) < 2 or name.endswith(".bias"):
            ftype = 0 # use float32
            if use_f16:
                print("  Converting to float32")
        else:
            data = data.astype(np.float16)

        n_dims = len(data.shape)

        # Write header
        str_bytes = name.encode('utf-8')
        fout.write(struct.pack("iii", n_dims, len(str_bytes), ftype))

        for i in range(n_dims):
            fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))

        fout.write(str_bytes)

        data.tofile(fout)

    fout.close()
    print(f"Done! Model has been converted to GGML format: {output_file}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert Silero-VAD PyTorch model to GGML format")
    parser.add_argument("--output", type=str, required=True, help="Path to output GGML model file")
    parser.add_argument("--use-f16", action="store_true", help="Use float16 precision", default=True)
    args = parser.parse_args()

    convert_silero_vad(args.output, args.use_f16)
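For reference, the fixed-size header written by the script can be read back in the same order. This is only an illustrative sketch assuming native little-endian 32-bit integers; it is not the loader whisper.cpp actually uses.

#include <cstdint>
#include <cstdio>

// Read one 32-bit int in the order the conversion script wrote it.
static int32_t read_i32(FILE * f) {
    int32_t v = 0;
    if (fread(&v, sizeof(v), 1, f) != 1) {
        fprintf(stderr, "unexpected end of file\n");
    }
    return v;
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        return 1;
    }
    FILE * f = fopen(argv[1], "rb");
    if (!f) {
        return 1;
    }

    const int32_t magic   = read_i32(f); // expected 0x67676d6c ("ggml")
    const int32_t use_f16 = read_i32(f); // 1 if 2D+ weights are stored as float16
    const int32_t n_enc   = read_i32(f); // number of encoder layers (4)
    printf("magic = %#x, use_f16 = %d, encoder layers = %d\n", (unsigned) magic, use_f16, n_enc);

    for (int32_t i = 0; i < n_enc; ++i) {
        const int32_t in_ch  = read_i32(f);
        const int32_t out_ch = read_i32(f);
        const int32_t kernel = read_i32(f);
        printf("encoder %d: %d -> %d channels, kernel %d\n", i, in_ch, out_ch, kernel);
    }

    const int32_t lstm_in     = read_i32(f);
    const int32_t lstm_hidden = read_i32(f);
    printf("lstm: input %d, hidden %d\n", lstm_in, lstm_hidden);

    const int32_t conv_in  = read_i32(f);
    const int32_t conv_out = read_i32(f);
    printf("final conv: %d -> %d\n", conv_in, conv_out);

    fclose(f);
    return 0;
}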

src/whisper-arch.h

Lines changed: 56 additions & 0 deletions
@@ -139,3 +139,59 @@ static const std::map<asr_tensor, ggml_op> ASR_TENSOR_INFO = {
    {ASR_TENSOR_ATTN_OUT_WEIGHT, GGML_OP_MUL_MAT},
    {ASR_TENSOR_ATTN_OUT_BIAS,   GGML_OP_ADD},
};

enum vad_tensor {
    VAD_TENSOR_STFT_BASIS,
    VAD_TENSOR_ENC_0_WEIGHT,
    VAD_TENSOR_ENC_0_BIAS,
    VAD_TENSOR_ENC_1_WEIGHT,
    VAD_TENSOR_ENC_1_BIAS,
    VAD_TENSOR_ENC_2_WEIGHT,
    VAD_TENSOR_ENC_2_BIAS,
    VAD_TENSOR_ENC_3_WEIGHT,
    VAD_TENSOR_ENC_3_BIAS,
    VAD_TENSOR_LSTM_WEIGHT_IH,
    VAD_TENSOR_LSTM_WEIGHT_HH,
    VAD_TENSOR_LSTM_BIAS_IH,
    VAD_TENSOR_LSTM_BIAS_HH,
    VAD_TENSOR_FINAL_CONV_WEIGHT,
    VAD_TENSOR_FINAL_CONV_BIAS,
};

static const std::map<vad_tensor, ggml_op> VAD_TENSOR_OPS = {
    {VAD_TENSOR_STFT_BASIS,        GGML_OP_MUL_MAT},
    {VAD_TENSOR_ENC_0_WEIGHT,      GGML_OP_MUL_MAT},
    {VAD_TENSOR_ENC_0_BIAS,        GGML_OP_ADD},
    {VAD_TENSOR_ENC_1_WEIGHT,      GGML_OP_MUL_MAT},
    {VAD_TENSOR_ENC_1_BIAS,        GGML_OP_ADD},
    {VAD_TENSOR_ENC_2_WEIGHT,      GGML_OP_MUL_MAT},
    {VAD_TENSOR_ENC_2_BIAS,        GGML_OP_ADD},
    {VAD_TENSOR_ENC_3_WEIGHT,      GGML_OP_MUL_MAT},
    {VAD_TENSOR_ENC_3_BIAS,        GGML_OP_ADD},

    {VAD_TENSOR_LSTM_WEIGHT_IH,    GGML_OP_MUL_MAT},
    {VAD_TENSOR_LSTM_WEIGHT_HH,    GGML_OP_MUL_MAT},
    {VAD_TENSOR_LSTM_BIAS_IH,      GGML_OP_ADD},
    {VAD_TENSOR_LSTM_BIAS_HH,      GGML_OP_ADD},

    {VAD_TENSOR_FINAL_CONV_WEIGHT, GGML_OP_MUL_MAT},
    {VAD_TENSOR_FINAL_CONV_BIAS,   GGML_OP_ADD}
};

static const std::map<vad_tensor, const char *> VAD_TENSOR_NAMES = {
    {VAD_TENSOR_STFT_BASIS,        "_model.stft.forward_basis_buffer"},
    {VAD_TENSOR_ENC_0_WEIGHT,      "_model.encoder.0.reparam_conv.weight"},
    {VAD_TENSOR_ENC_0_BIAS,        "_model.encoder.0.reparam_conv.bias"},
    {VAD_TENSOR_ENC_1_WEIGHT,      "_model.encoder.1.reparam_conv.weight"},
    {VAD_TENSOR_ENC_1_BIAS,        "_model.encoder.1.reparam_conv.bias"},
    {VAD_TENSOR_ENC_2_WEIGHT,      "_model.encoder.2.reparam_conv.weight"},
    {VAD_TENSOR_ENC_2_BIAS,        "_model.encoder.2.reparam_conv.bias"},
    {VAD_TENSOR_ENC_3_WEIGHT,      "_model.encoder.3.reparam_conv.weight"},
    {VAD_TENSOR_ENC_3_BIAS,        "_model.encoder.3.reparam_conv.bias"},
    {VAD_TENSOR_LSTM_WEIGHT_IH,    "_model.decoder.rnn.weight_ih"},
    {VAD_TENSOR_LSTM_WEIGHT_HH,    "_model.decoder.rnn.weight_hh"},
    {VAD_TENSOR_LSTM_BIAS_IH,      "_model.decoder.rnn.bias_ih"},
    {VAD_TENSOR_LSTM_BIAS_HH,      "_model.decoder.rnn.bias_hh"},
    {VAD_TENSOR_FINAL_CONV_WEIGHT, "_model.decoder.decoder.2.weight"},
    {VAD_TENSOR_FINAL_CONV_BIAS,   "_model.decoder.decoder.2.bias"}
};
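Once the GGML file has been loaded into a ggml context, the name map above gives one obvious way to check that every expected VAD tensor is present. A sketch only, assuming the tensors were created under these names (ggml_get_tensor performs the lookup); this is not code from this commit.

#include <cstdio>
#include "ggml.h"
#include "whisper-arch.h"

// Sketch: look up each VAD tensor by its exported name and report any that are missing.
static bool vad_check_tensors(struct ggml_context * ctx_w) {
    for (const auto & kv : VAD_TENSOR_NAMES) {
        struct ggml_tensor * t = ggml_get_tensor(ctx_w, kv.second);
        if (t == nullptr) {
            fprintf(stderr, "missing VAD tensor: %s\n", kv.second);
            return false;
        }
    }
    return true;
}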
