vad : fix handling of stft tensor

danbev · danbev · commit 096afd6ec6ab · 2025-04-09T11:15:44.000+02:00
Still trying to figure out what the issue with the incorrect
probabilities is. I'll go through the conversion script later to clean
it up, as it contains leftover code from previous iterations.
diff --git a/models/convert-silero-vad-to-ggml.py b/models/convert-silero-vad-to-ggml.py
@@ -23,159 +23,155 @@ def convert_silero_vad(output_path, print_tensors=True):
     output_file = f"{base}-v{silero_version}-ggml{ext}"
     print(f"Saving GGML Silero-VAD model to {output_file}")
 
-    # Create a lookup to identify which tensors might need special handling
-    op_tensors = {
-        'encoder': [],
-        'decoder': [],
-        'stft': []
-    }
-
-    for key in cleaned_dict.keys():
-        if 'encoder' in key:
-            op_tensors['encoder'].append(key)
-        elif 'decoder' in key:
-            op_tensors['decoder'].append(key)
-        elif 'stft' in key:
-            op_tensors['stft'].append(key)
-
-    print("\nTensor groups for debugging:")
-    for group, tensors in op_tensors.items():
-        print(f"{group}: {len(tensors)} tensors")
-        for t in tensors:
-            print(f"  - {t}: {cleaned_dict[t].shape} ({cleaned_dict[t].dtype})")
+    print("\nTensor info for debugging:")
+    for key, tensor in cleaned_dict.items():
+        print(f"  - {key}: {tensor.shape} ({tensor.dtype})")
     print()
 
-    fout = open(output_file, "wb")
-
-    # Write magic and version
-    fout.write(struct.pack("i", 0x67676d6c))  # "ggml" in hex
-
-    # Write a flag to indicate we're preserving original tensor types
-    fout.write(struct.pack("i", 2))  # 2 = preserve original type
-
-    n_encoder_layers = 4
-    fout.write(struct.pack("i", n_encoder_layers))
-
-    # Write encoder dimensions
-    input_channels = 129
-    encoder_in_channels = [input_channels, 128, 64, 64]
-    encoder_out_channels = [128, 64, 64, 128]
-    kernel_size = 3
-
-    for i in range(n_encoder_layers):
-        fout.write(struct.pack("i", encoder_in_channels[i]))
-        fout.write(struct.pack("i", encoder_out_channels[i]))
-        fout.write(struct.pack("i", kernel_size))
-
-    # Write LSTM dimensions
-    lstm_input_size = 128
-    lstm_hidden_size = 128
-    fout.write(struct.pack("i", lstm_input_size))
-    fout.write(struct.pack("i", lstm_hidden_size))
-
-    # Write final conv dimensions
-    final_conv_in = 128
-    final_conv_out = 1
-    fout.write(struct.pack("i", final_conv_in))
-    fout.write(struct.pack("i", final_conv_out))
-
-    print("Writing model weights:")
-
-    tensor_keys_to_write = []
-
-    for i in range(n_encoder_layers):
-        weight_key = f"_model.encoder.{i}.reparam_conv.weight"
-        bias_key = f"_model.encoder.{i}.reparam_conv.bias"
-        if weight_key in cleaned_dict and bias_key in cleaned_dict:
-            tensor_keys_to_write.append(weight_key)
-            tensor_keys_to_write.append(bias_key)
-
-    lstm_keys = [
-        "_model.decoder.rnn.weight_ih",
-        "_model.decoder.rnn.weight_hh",
-        "_model.decoder.rnn.bias_ih",
-        "_model.decoder.rnn.bias_hh"
-    ]
-    tensor_keys_to_write.extend([k for k in lstm_keys if k in cleaned_dict])
-
-    final_keys = [
-        "_model.decoder.decoder.2.weight",
-        "_model.decoder.decoder.2.bias"
-    ]
-    tensor_keys_to_write.extend([k for k in final_keys if k in cleaned_dict])
-
-    stft_tensor = "_model.stft.forward_basis_buffer"
-    tensor_keys_to_write.extend([stft_tensor])
-
-    for name in tensor_keys_to_write:
-        if name not in cleaned_dict:
-            print(f"Warning: Missing tensor {name}, skipping")
-            continue
-
-        tensor = cleaned_dict[name]
-        data = tensor.squeeze().numpy()
-        print(f"Processing variable: {name} with shape: {data.shape}")
-
-        # Print values of the tensor (original values)
-        if print_tensors:
-            if name == "_model.stft.forward_basis_buffer":
-                first_values = tensor.flatten()[:258].tolist()
-                print(f"  First 258 values for {name}:")
-                for i, val in enumerate(first_values):
-                    print(f"    [{i}]: {val}")
+    with open(output_file, "wb") as fout:
+        # Write magic and version
+        fout.write(struct.pack("i", 0x67676d6c))
+
+        # Write model version - Try version 0 for simplicity
+        fout.write(struct.pack("i", 0))
+
+        # Write model architecture parameters
+        n_encoder_layers = 4
+        fout.write(struct.pack("i", n_encoder_layers))
+
+        # Write encoder dimensions
+        input_channels = 129
+        encoder_in_channels = [input_channels, 128, 64, 64]
+        encoder_out_channels = [128, 64, 64, 128]
+        kernel_size = 3
+
+        for i in range(n_encoder_layers):
+            fout.write(struct.pack("i", encoder_in_channels[i]))
+            fout.write(struct.pack("i", encoder_out_channels[i]))
+            fout.write(struct.pack("i", kernel_size))
+
+        # Write LSTM dimensions
+        lstm_input_size = 128
+        lstm_hidden_size = 128
+        fout.write(struct.pack("i", lstm_input_size))
+        fout.write(struct.pack("i", lstm_hidden_size))
+
+        # Write final conv dimensions
+        final_conv_in = 128
+        final_conv_out = 1
+        fout.write(struct.pack("i", final_conv_in))
+        fout.write(struct.pack("i", final_conv_out))
+
+        # Define tensor keys to write
+        tensor_keys = []
+
+        # Encoder weights
+        for i in range(n_encoder_layers):
+            weight_key = f"_model.encoder.{i}.reparam_conv.weight"
+            bias_key = f"_model.encoder.{i}.reparam_conv.bias"
+            if weight_key in cleaned_dict and bias_key in cleaned_dict:
+                tensor_keys.append(weight_key)
+                tensor_keys.append(bias_key)
+
+        # LSTM weights
+        lstm_keys = [
+            "_model.decoder.rnn.weight_ih",
+            "_model.decoder.rnn.weight_hh",
+            "_model.decoder.rnn.bias_ih",
+            "_model.decoder.rnn.bias_hh"
+        ]
+        tensor_keys.extend([k for k in lstm_keys if k in cleaned_dict])
+
+        # Final conv weights
+        final_keys = [
+            "_model.decoder.decoder.2.weight",
+            "_model.decoder.decoder.2.bias"
+        ]
+        tensor_keys.extend([k for k in final_keys if k in cleaned_dict])
+
+        # STFT basis - add this last
+        stft_tensor = "_model.stft.forward_basis_buffer"
+        tensor_keys.append(stft_tensor)
+
+        print(f"Writing {len(tensor_keys)} tensors:")
+        for key in tensor_keys:
+            if key in cleaned_dict:
+                print(f"  - {key}: {cleaned_dict[key].shape}")
             else:
-                first_values = tensor.flatten()[:10].tolist()
-                print(f"  First 10 values for {name}:")
-                for i, val in enumerate(first_values):
-                    print(f"    [{i}]: {val}")
-
-        if name.endswith(".reparam_conv.weight") and len(data.shape) == 3:
-            print(f"  Keeping original convolution weight shape: {data.shape}")
-
-        # Get original dtype
-        orig_dtype = tensor.dtype
-        print(f"  Original tensor dtype: {orig_dtype}")
-
-        # Check if this is an encoder convolution weight that needs to be F16
-        force_f16 = False
-        if "encoder" in name and "weight" in name:
-            print(f"  This tensor will be forced to F16 for GGML im2col compatibility")
-            force_f16 = True
-        if "_model.decoder.decoder.2.weight" in name:
-            print(f"  This tensor will be forced to F16 for GGML im2col compatibility")
-            force_f16 = True
-        if "_model.stft.forward_basis_buffer" in name:
-            print(f"  This tensor will be forced to F16 for GGML im2col compatibility")
-            force_f16 = True
-
-        # Set ftype based on the original dtype or force to F16 for certain tensors
-        if force_f16:
-            ftype = 1  # float16
-            data = data.astype(np.float16)
-        elif orig_dtype == torch.float16:
-            ftype = 1  # float16
-        else:
-            ftype = 0  # float32
-
-        # Ensure data has the same type as the original tensor
-        if ftype == 1 and not np.issubdtype(data.dtype, np.float16):
-            data = data.astype(np.float16)
-
-        n_dims = len(data.shape)
-
-        # Write header
-        str_bytes = name.encode('utf-8')
-        fout.write(struct.pack("iii", n_dims, len(str_bytes), ftype))
-
-        for i in range(n_dims):
-            fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-
-        fout.write(str_bytes)
-
-        data.tofile(fout)
-
-    fout.close()
-    print(f"Done! Model has been converted to GGML format: {output_file}")
+                print(f"  - {key}: MISSING")
+
+        # Process each tensor
+        for key in tensor_keys:
+            if key not in cleaned_dict:
+                print(f"Warning: Missing tensor {key}, skipping")
+                continue
+
+            tensor = cleaned_dict[key]
+
+            # Special handling for STFT tensor
+            if key == "_model.stft.forward_basis_buffer":
+                # Get the original numpy array without squeezing
+                data = tensor.detach().cpu().numpy()
+                # Ensure it has the expected shape
+                print(f"STFT tensor original shape: {data.shape}")
+                n_dims = 3
+                tensor_shape = [data.shape[0], data.shape[1], data.shape[2]]
+                is_conv_weight = True
+            else:
+                # For other tensors, we can use standard processing
+                data = tensor.detach().cpu().squeeze().numpy()
+                tensor_shape = list(data.shape)
+
+                # Ensure we have at most 4 dimensions for GGML
+                n_dims = min(len(tensor_shape), 4)
+
+                # Reverse dimensions for GGML
+                tensor_shape = tensor_shape[:n_dims]
+                tensor_shape.reverse()
+
+                # Check if this is a convolution weight tensor
+                is_conv_weight = "weight" in key and ("encoder" in key or "_model.decoder.decoder.2" in key)
+
+            # Convert to float16 for convolution weights
+            if is_conv_weight:
+                data = data.astype(np.float16)
+                ftype = 1  # float16
+            else:
+                ftype = 0  # float32
+
+            # Debug printing of tensor info
+            print(f"\nWriting tensor: {key}")
+            print(f"  Original shape: {tensor.shape}")
+            print(f"  Processed shape: {data.shape}")
+            print(f"  GGML dimensions: {n_dims}")
+            print(f"  GGML shape: {tensor_shape}")
+            print(f"  Type: {'float16' if ftype == 1 else 'float32'}")
+
+            # Convert tensor name to bytes
+            name_bytes = key.encode('utf-8')
+            name_length = len(name_bytes)
+
+            # Write tensor header
+            fout.write(struct.pack("i", n_dims))
+            fout.write(struct.pack("i", name_length))
+            fout.write(struct.pack("i", ftype))
+
+            # Write tensor dimensions
+            for i in range(n_dims):
+                size = tensor_shape[i] if i < len(tensor_shape) else 1
+                fout.write(struct.pack("i", size))
+                print(f"  Writing dimension {i}: {size}")
+
+            # Write tensor name
+            fout.write(name_bytes)
+
+            # Write tensor data
+            data.tofile(fout)
+
+            print(f"  Wrote {data.size * (2 if ftype==1 else 4)} bytes")
+
+    print(f"\nDone! Model has been converted to GGML format: {output_file}")
+    print(f"File size: {os.path.getsize(output_file)} bytes")
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Convert Silero-VAD PyTorch model to GGML format")
diff --git a/src/whisper.cpp b/src/whisper.cpp
@@ -4480,10 +4480,11 @@ static ggml_tensor * whisper_vad_build_stft_layer(ggml_context* ctx0,
         const whisper_vad_model & model, ggml_tensor * cur) {
     struct ggml_tensor * padded_frame = ggml_pad(ctx0, cur, 64, 0, 0, 0);
     struct ggml_tensor * reshaped_frame = ggml_reshape_3d(ctx0, padded_frame, 640, 1, 1);
-    struct ggml_tensor * reshaped_basis = ggml_reshape_3d(ctx0, model.stft_forward_basis, 256, 1, 258);
-    struct ggml_tensor * permuted_basis = ggml_permute(ctx0, reshaped_basis, 2, 1, 0, 3);
-    permuted_basis = ggml_cont(ctx0, permuted_basis);
-    cur = ggml_conv_1d(ctx0, permuted_basis, reshaped_frame, 1, 0, 1);
+
+    // We need the stft tensor to be in {258, 1, 256},
+    // that is a kernel size of 258, 1 channel, and 256 frequency bins (output)
+    struct ggml_tensor * reshaped_stft = ggml_reshape_3d(ctx0, model.stft_forward_basis, 258, 1, 256);
+    cur = ggml_conv_1d(ctx0, reshaped_stft, reshaped_frame, 1, 1, 1);
     ggml_set_name(cur, "stft");
     ggml_set_output(cur);
     return cur;
@@ -4842,7 +4843,7 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
 
         // SFTF precomputed basis matrix
         model.stft_forward_basis = create_tensor(VAD_TENSOR_STFT_BASIS,
-            ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 256, 258));
+            ggml_new_tensor_3d(ctx, GGML_TYPE_F16, 258, 1, 256));
 
         model.encoder_0_weight = create_tensor(VAD_TENSOR_ENC_0_WEIGHT,
             ggml_new_tensor_3d(