Commit 93883e7

vad : add initial Voice Activity Detection (VAD) support
wip
Parent commit: ada745f

6 files changed, +894 −0 lines changed

include/whisper.h

Lines changed: 41 additions & 0 deletions
@@ -652,6 +652,47 @@ extern "C" {
    WHISPER_API float whisper_full_get_token_p           (struct whisper_context * ctx, int i_segment, int i_token);
    WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);

    // Voice Activity Detection (VAD)
    struct whisper_vad_context;
    struct whisper_vad_state;

    struct whisper_vad_params {
        float threshold;               // Probability threshold for speech detection
        int   min_speech_duration_ms;  // Minimum speech segment duration
        int   min_silence_duration_ms; // Minimum silence segment duration
        int   window_size_samples;     // Window size for processing
        int   sample_rate;             // 16000
    };
    WHISPER_API struct whisper_vad_params whisper_vad_default_params(void);

    WHISPER_API struct whisper_vad_state * whisper_vad_init_state(struct whisper_vad_context * ctx);

    WHISPER_API struct whisper_vad_context * whisper_vad_init_from_file_with_params(
            const char * path_model,
            const whisper_vad_params params);

    WHISPER_API struct whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
            const char * path_model,
            const whisper_vad_params params);

    struct whisper_vad_segment {
        float start; // Start time in seconds
        float end;   // End time in seconds
    };

    struct whisper_vad_segments {
        int n_segments;
        whisper_vad_segment * segments;
    };

    WHISPER_API struct whisper_vad_segments whisper_vad_detect_speech(
            whisper_vad_context * vctx,
            const float * pcmf32, int n_samples);

    WHISPER_API void whisper_vad_free       (struct whisper_vad_context * ctx);
    WHISPER_API void whisper_vad_free_state (struct whisper_vad_state * state);
    WHISPER_API void whisper_vad_free_params(struct whisper_vad_params * params);

    ////////////////////////////////////////////////////////////////////////////

    // Temporary helpers needed for exposing ggml interface
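A minimal usage sketch of the API declared above (not part of this commit): the model path and the PCM buffer are placeholders, and it assumes the returned segments are owned by the VAD context, since no dedicated free function for them is declared here.

#include <cstdio>
#include <vector>
#include "whisper.h"

int main() {
    // Start from the defaults exposed by the new API
    whisper_vad_params vparams = whisper_vad_default_params();

    // Placeholder path; in practice the file produced by the conversion script
    whisper_vad_context * vctx =
        whisper_vad_init_from_file_with_params("silero-vad-ggml.bin", vparams);
    if (!vctx) {
        return 1;
    }

    // Placeholder audio: 1 second of 16 kHz mono silence in [-1, 1]
    std::vector<float> pcmf32(16000, 0.0f);

    whisper_vad_segments segs = whisper_vad_detect_speech(vctx, pcmf32.data(), (int) pcmf32.size());
    for (int i = 0; i < segs.n_segments; ++i) {
        printf("speech: %.2f s -> %.2f s\n", segs.segments[i].start, segs.segments[i].end);
    }

    whisper_vad_free(vctx);
    return 0;
}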
Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
import os
import struct
import argparse
import torch
import numpy as np
from silero_vad import load_silero_vad, __version__ as silero_version

def convert_silero_vad(output_path, use_f16=True):
    model = load_silero_vad()
    state_dict = model.state_dict()

    # Clean up state dict keys - filter out 8k model
    cleaned_dict = {}
    for key, value in state_dict.items():
        # Skip 8k model
        if "_8k" not in key:
            if not key.startswith("_model."):
                key = "_model." + key
            cleaned_dict[key] = value

    base, ext = os.path.splitext(output_path)
    output_file = f"{base}-v{silero_version}-ggml{ext}"
    print(f"Saving GGML Silero-VAD model to {output_file}")

    print("\nTensors to be written:")
    for key, tensor in cleaned_dict.items():
        print(f"{key}: {tensor.shape}")
    print()

    fout = open(output_file, "wb")

    # Write magic
    fout.write(struct.pack("i", 0x67676d6c)) # "ggml" in hex

    # Write the use_f16 flag
    fout.write(struct.pack("i", 1 if use_f16 else 0))

    n_encoder_layers = 4
    fout.write(struct.pack("i", n_encoder_layers))

    # Write encoder dimensions
    input_channels = 129
    encoder_in_channels = [input_channels, 128, 64, 64]
    encoder_out_channels = [128, 64, 64, 128]
    kernel_size = 3

    for i in range(n_encoder_layers):
        fout.write(struct.pack("i", encoder_in_channels[i]))
        fout.write(struct.pack("i", encoder_out_channels[i]))
        fout.write(struct.pack("i", kernel_size))

    # Write LSTM dimensions
    lstm_input_size = 128
    lstm_hidden_size = 128
    fout.write(struct.pack("i", lstm_input_size))
    fout.write(struct.pack("i", lstm_hidden_size))

    # Write final conv dimensions
    final_conv_in = 128
    final_conv_out = 1
    fout.write(struct.pack("i", final_conv_in))
    fout.write(struct.pack("i", final_conv_out))

    print("Writing model weights:")

    tensor_keys_to_write = []

    for i in range(n_encoder_layers):
        weight_key = f"_model.encoder.{i}.reparam_conv.weight"
        bias_key = f"_model.encoder.{i}.reparam_conv.bias"
        if weight_key in cleaned_dict and bias_key in cleaned_dict:
            tensor_keys_to_write.append(weight_key)
            tensor_keys_to_write.append(bias_key)

    lstm_keys = [
        "_model.decoder.rnn.weight_ih",
        "_model.decoder.rnn.weight_hh",
        "_model.decoder.rnn.bias_ih",
        "_model.decoder.rnn.bias_hh"
    ]
    tensor_keys_to_write.extend([k for k in lstm_keys if k in cleaned_dict])

    final_keys = [
        "_model.decoder.decoder.2.weight",
        "_model.decoder.decoder.2.bias"
    ]
    tensor_keys_to_write.extend([k for k in final_keys if k in cleaned_dict])

    stft_tensor = "_model.stft.forward_basis_buffer"
    tensor_keys_to_write.append(stft_tensor)

    for name in tensor_keys_to_write:
        if name not in cleaned_dict:
            print(f"Warning: Missing tensor {name}, skipping")
            continue

        tensor = cleaned_dict[name]
        data = tensor.squeeze().numpy()
        print(f"Processing variable: {name} with shape: {data.shape}")

        if name.endswith(".reparam_conv.weight") and len(data.shape) == 3:
            print(f"  Keeping original convolution weight shape: {data.shape}")

        # Determine if we should use float16 or float32
        ftype = 1 # default to float16
        if not use_f16 or len(data.shape) < 2 or name.endswith(".bias"):
            ftype = 0 # use float32
            if use_f16:
                print("  Converting to float32")
        else:
            data = data.astype(np.float16)

        n_dims = len(data.shape)

        # Write header
        str_bytes = name.encode('utf-8')
        fout.write(struct.pack("iii", n_dims, len(str_bytes), ftype))

        for i in range(n_dims):
            fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))

        fout.write(str_bytes)

        data.tofile(fout)

    fout.close()
    print(f"Done! Model has been converted to GGML format: {output_file}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert Silero-VAD PyTorch model to GGML format")
    parser.add_argument("--output", type=str, required=True, help="Path to output GGML model file")
    parser.add_argument("--use-f16", action="store_true", help="Use float16 precision", default=True)
    args = parser.parse_args()

    convert_silero_vad(args.output, args.use_f16)
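For reference, the fixed-size header written by the script can be read back in the same order. This is only an illustrative sketch assuming native little-endian 32-bit integers; it is not the loader whisper.cpp actually uses.

#include <cstdint>
#include <cstdio>

// Read one 32-bit int in the order the conversion script wrote it.
static int32_t read_i32(FILE * f) {
    int32_t v = 0;
    if (fread(&v, sizeof(v), 1, f) != 1) {
        fprintf(stderr, "unexpected end of file\n");
    }
    return v;
}

int main(int argc, char ** argv) {
    if (argc < 2) {
        return 1;
    }
    FILE * f = fopen(argv[1], "rb");
    if (!f) {
        return 1;
    }

    const int32_t magic   = read_i32(f); // expected 0x67676d6c ("ggml")
    const int32_t use_f16 = read_i32(f); // 1 if 2D+ weights are stored as float16
    const int32_t n_enc   = read_i32(f); // number of encoder layers (4)
    printf("magic = %#x, use_f16 = %d, encoder layers = %d\n", (unsigned) magic, use_f16, n_enc);

    for (int32_t i = 0; i < n_enc; ++i) {
        const int32_t in_ch  = read_i32(f);
        const int32_t out_ch = read_i32(f);
        const int32_t kernel = read_i32(f);
        printf("encoder %d: %d -> %d channels, kernel %d\n", i, in_ch, out_ch, kernel);
    }

    const int32_t lstm_in     = read_i32(f);
    const int32_t lstm_hidden = read_i32(f);
    printf("lstm: input %d, hidden %d\n", lstm_in, lstm_hidden);

    const int32_t conv_in  = read_i32(f);
    const int32_t conv_out = read_i32(f);
    printf("final conv: %d -> %d\n", conv_in, conv_out);

    fclose(f);
    return 0;
}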

src/whisper-arch.h

Lines changed: 56 additions & 0 deletions
@@ -139,3 +139,59 @@ static const std::map<asr_tensor, ggml_op> ASR_TENSOR_INFO = {
    {ASR_TENSOR_ATTN_OUT_WEIGHT, GGML_OP_MUL_MAT},
    {ASR_TENSOR_ATTN_OUT_BIAS,   GGML_OP_ADD},
};

enum vad_tensor {
    VAD_TENSOR_STFT_BASIS,
    VAD_TENSOR_ENC_0_WEIGHT,
    VAD_TENSOR_ENC_0_BIAS,
    VAD_TENSOR_ENC_1_WEIGHT,
    VAD_TENSOR_ENC_1_BIAS,
    VAD_TENSOR_ENC_2_WEIGHT,
    VAD_TENSOR_ENC_2_BIAS,
    VAD_TENSOR_ENC_3_WEIGHT,
    VAD_TENSOR_ENC_3_BIAS,
    VAD_TENSOR_LSTM_WEIGHT_IH,
    VAD_TENSOR_LSTM_WEIGHT_HH,
    VAD_TENSOR_LSTM_BIAS_IH,
    VAD_TENSOR_LSTM_BIAS_HH,
    VAD_TENSOR_FINAL_CONV_WEIGHT,
    VAD_TENSOR_FINAL_CONV_BIAS,
};

static const std::map<vad_tensor, ggml_op> VAD_TENSOR_OPS = {
    {VAD_TENSOR_STFT_BASIS,        GGML_OP_MUL_MAT},
    {VAD_TENSOR_ENC_0_WEIGHT,      GGML_OP_MUL_MAT},
    {VAD_TENSOR_ENC_0_BIAS,        GGML_OP_ADD},
    {VAD_TENSOR_ENC_1_WEIGHT,      GGML_OP_MUL_MAT},
    {VAD_TENSOR_ENC_1_BIAS,        GGML_OP_ADD},
    {VAD_TENSOR_ENC_2_WEIGHT,      GGML_OP_MUL_MAT},
    {VAD_TENSOR_ENC_2_BIAS,        GGML_OP_ADD},
    {VAD_TENSOR_ENC_3_WEIGHT,      GGML_OP_MUL_MAT},
    {VAD_TENSOR_ENC_3_BIAS,        GGML_OP_ADD},

    {VAD_TENSOR_LSTM_WEIGHT_IH,    GGML_OP_MUL_MAT},
    {VAD_TENSOR_LSTM_WEIGHT_HH,    GGML_OP_MUL_MAT},
    {VAD_TENSOR_LSTM_BIAS_IH,      GGML_OP_ADD},
    {VAD_TENSOR_LSTM_BIAS_HH,      GGML_OP_ADD},

    {VAD_TENSOR_FINAL_CONV_WEIGHT, GGML_OP_MUL_MAT},
    {VAD_TENSOR_FINAL_CONV_BIAS,   GGML_OP_ADD}
};

static const std::map<vad_tensor, const char *> VAD_TENSOR_NAMES = {
    {VAD_TENSOR_STFT_BASIS,        "_model.stft.forward_basis_buffer"},
    {VAD_TENSOR_ENC_0_WEIGHT,      "_model.encoder.0.reparam_conv.weight"},
    {VAD_TENSOR_ENC_0_BIAS,        "_model.encoder.0.reparam_conv.bias"},
    {VAD_TENSOR_ENC_1_WEIGHT,      "_model.encoder.1.reparam_conv.weight"},
    {VAD_TENSOR_ENC_1_BIAS,        "_model.encoder.1.reparam_conv.bias"},
    {VAD_TENSOR_ENC_2_WEIGHT,      "_model.encoder.2.reparam_conv.weight"},
    {VAD_TENSOR_ENC_2_BIAS,        "_model.encoder.2.reparam_conv.bias"},
    {VAD_TENSOR_ENC_3_WEIGHT,      "_model.encoder.3.reparam_conv.weight"},
    {VAD_TENSOR_ENC_3_BIAS,        "_model.encoder.3.reparam_conv.bias"},
    {VAD_TENSOR_LSTM_WEIGHT_IH,    "_model.decoder.rnn.weight_ih"},
    {VAD_TENSOR_LSTM_WEIGHT_HH,    "_model.decoder.rnn.weight_hh"},
    {VAD_TENSOR_LSTM_BIAS_IH,      "_model.decoder.rnn.bias_ih"},
    {VAD_TENSOR_LSTM_BIAS_HH,      "_model.decoder.rnn.bias_hh"},
    {VAD_TENSOR_FINAL_CONV_WEIGHT, "_model.decoder.decoder.2.weight"},
    {VAD_TENSOR_FINAL_CONV_BIAS,   "_model.decoder.decoder.2.bias"}
};
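Once the GGML file has been loaded into a ggml context, the name map above gives one obvious way to check that every expected VAD tensor is present. A sketch only, assuming the tensors were created under these names (ggml_get_tensor performs the lookup); this is not code from this commit.

#include <cstdio>
#include "ggml.h"
#include "whisper-arch.h"

// Sketch: look up each VAD tensor by its exported name and report any that are missing.
static bool vad_check_tensors(struct ggml_context * ctx_w) {
    for (const auto & kv : VAD_TENSOR_NAMES) {
        struct ggml_tensor * t = ggml_get_tensor(ctx_w, kv.second);
        if (t == nullptr) {
            fprintf(stderr, "missing VAD tensor: %s\n", kv.second);
            return false;
        }
    }
    return true;
}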
