
Commit 705c00a
vad : add initial Voice Activity Detection (VAD) support
This commit adds support for Voice Activity Detection (VAD). When enabled, this feature processes the audio input and detects speech segments. This information is then used to reduce the number of samples that need to be processed by whisper_full.

This initial support is based on the Silero VAD model, which needs to be converted to GGML format:

```console
(venv) $ pip install silero-vad
(venv) $ python models/convert-silero-vad-to-ggml.py --output models/silero.bin
Saving GGML Silero-VAD model to models/silero-v5.1.2-ggml.bin
```

There is a test that exercises the VAD support in isolation:

```console
$ cmake --build build --target test-vad && \
  ctest -R ^test-vad$ --test-dir build -C Debug --output-on-failure -VV
```

And one that tests VAD in combination with whisper_full:

```console
$ cmake --build build --target test-vad-full && \
  ctest -R test-vad-full --test-dir build -C Debug --output-on-failure -VV
```

Resolves: ggml-org#3003
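For context, a minimal sketch of how the new per-call VAD options in whisper_full_params (added below in include/whisper.h) might be used; the model path and threshold are illustrative values, not library defaults:

```cpp
#include "whisper.h"

// Sketch: configure a whisper_full() call with VAD enabled. The field names
// come from the whisper.h changes in this commit; the path and threshold
// here are example values only.
static struct whisper_full_params make_vad_params() {
    struct whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    wparams.vad            = true;                            // enable VAD preprocessing
    wparams.vad_model_path = "models/silero-v5.1.2-ggml.bin"; // converted Silero model
    wparams.vad_threshold  = 0.5f;                            // speech probability cutoff

    return wparams;
}
```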
1 parent b7db9e7 commit 705c00a

8 files changed (+1787 −5 lines)

include/whisper.h

+82
```diff
@@ -570,6 +570,17 @@ extern "C" {
         size_t n_grammar_rules;
         size_t i_start_rule;
         float grammar_penalty;
+
+        // Voice Activity Detection (VAD) params
+        bool vad;                        // Enable VAD
+        const char * vad_model_path;     // Path to VAD model
+        float vad_threshold;             // Probability threshold to consider as speech.
+        int vad_min_speech_duration_ms;  // Min duration for a valid speech segment.
+        int vad_min_silence_duration_ms; // Min silence duration to consider speech as ended.
+        float vad_max_speech_duration_s; // Max duration of a speech segment before forcing a break.
+        int vad_speech_pad_ms;           // Padding added before and after speech segments.
+        int vad_window_size_samples;     // Number of audio samples in each probability window.
+        float vad_samples_overlap;       // Overlap in seconds when copying audio samples from speech segment.
     };

     // NOTE: this function allocates memory, and it is the responsibility of the caller to free the pointer - see whisper_free_context_params & whisper_free_params()
```
```diff
@@ -652,6 +663,77 @@ extern "C" {
     WHISPER_API float whisper_full_get_token_p           (struct whisper_context * ctx, int i_segment, int i_token);
     WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);

+    // Voice Activity Detection (VAD)
+    struct whisper_vad_context;
+    struct whisper_vad_state;
+
+    struct whisper_vad_params {
+        float threshold;               // Probability threshold to consider as speech.
+        int   min_speech_duration_ms;  // Min duration for a valid speech segment.
+        int   min_silence_duration_ms; // Min silence duration to consider speech as ended.
+        float max_speech_duration_s;   // Max duration of a speech segment before forcing a new segment.
+        int   speech_pad_ms;           // Padding added before and after speech segments.
+        int   window_size_samples;     // Number of audio samples in each probability window.
+        float samples_overlap;         // Overlap in seconds when copying audio samples from speech segment.
+    };
+    WHISPER_API struct whisper_vad_params whisper_vad_default_params(void);
+    WHISPER_API struct whisper_vad_params whisper_vad_params_from(struct whisper_full_params wparams);
+
+    struct whisper_vad_context_params {
+        int  n_threads;  // The number of threads to use for processing.
+        bool use_gpu;
+        int  gpu_device; // CUDA device
+    };
+    WHISPER_API struct whisper_vad_context_params whisper_vad_default_context_params(void);
+
+    WHISPER_API struct whisper_vad_state * whisper_vad_init_state(struct whisper_vad_context * ctx);
+
+    WHISPER_API struct whisper_vad_context * whisper_vad_init_from_file_with_params(
+        const char * path_model,
+        const struct whisper_vad_context_params params);
+
+    WHISPER_API struct whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
+        const char * path_model,
+        const struct whisper_vad_context_params params);
+
+    WHISPER_API struct whisper_vad_context * whisper_vad_init_with_params_no_state(struct whisper_model_loader * loader,
+        struct whisper_vad_context_params params);
+
+    struct whisper_vad_speech {
+        int     n_probs;
+        float * probs;
+    };
+
+    WHISPER_API struct whisper_vad_speech whisper_vad_detect_speech(
+        struct whisper_vad_context * vctx,
+        const float * samples, int n_samples);
+
+    struct whisper_vad_segment {
+        float start; // Start time in seconds
+        float end;   // End time in seconds
+    };
+
+    struct whisper_vad_timestamps {
+        int n_segments;
+        struct whisper_vad_segment * segments;
+    };
+
+    WHISPER_API struct whisper_vad_timestamps whisper_vad_detect_speech_timestamps(
+        struct whisper_vad_context * vctx,
+        struct whisper_vad_params params,
+        const float * samples, int n_samples);
+
+    WHISPER_API struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(
+        struct whisper_vad_context * vctx,
+        struct whisper_vad_params params,
+        struct whisper_vad_speech * probs);
+
+    WHISPER_API void whisper_vad_free           (struct whisper_vad_context * ctx);
+    WHISPER_API void whisper_vad_free_state    (struct whisper_vad_state * state);
+    WHISPER_API void whisper_vad_free_params   (struct whisper_vad_params * params);
+    WHISPER_API void whisper_vad_free_speech   (struct whisper_vad_speech * speech);
+    WHISPER_API void whisper_vad_free_timestamps(struct whisper_vad_timestamps * timestamps);
+
     ////////////////////////////////////////////////////////////////////////////

     // Temporary helpers needed for exposing ggml interface
```
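Taken together, a minimal sketch of the standalone VAD flow exposed by the API above; the model path is illustrative and the sample buffer stands in for real 16 kHz audio:

```cpp
#include "whisper.h"

#include <cstdio>
#include <vector>

int main() {
    // Load the converted Silero model (path is an example, not a default).
    struct whisper_vad_context_params cparams = whisper_vad_default_context_params();
    struct whisper_vad_context * vctx =
        whisper_vad_init_from_file_with_params("models/silero-v5.1.2-ggml.bin", cparams);
    if (!vctx) {
        return 1;
    }

    std::vector<float> samples(16000, 0.0f); // 1 s of silence at 16 kHz, stand-in for real audio

    // Run detection and obtain speech segments with start/end times in seconds.
    struct whisper_vad_params vparams = whisper_vad_default_params();
    struct whisper_vad_timestamps ts =
        whisper_vad_detect_speech_timestamps(vctx, vparams, samples.data(), (int) samples.size());

    for (int i = 0; i < ts.n_segments; ++i) {
        printf("speech: %.2f s -> %.2f s\n", ts.segments[i].start, ts.segments[i].end);
    }

    whisper_vad_free_timestamps(&ts);
    whisper_vad_free(vctx);
    return 0;
}
```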

models/convert-silero-vad-to-ggml.py

+196
@@ -0,0 +1,196 @@ (new file)

```python
import os
import struct
import argparse
import torch
import numpy as np
from silero_vad import load_silero_vad, __version__ as silero_version

def convert_silero_vad(output_path, print_tensors=True):
    model = load_silero_vad()
    state_dict = model.state_dict()

    # Clean up state dict keys - filter out 8k model
    cleaned_dict = {}
    for key, value in state_dict.items():
        # Skip 8k model
        if "_8k" not in key:
            clean_key = key
            if not key.startswith("_model."):
                clean_key = "_model." + key
            cleaned_dict[clean_key] = value

    base, ext = os.path.splitext(output_path)
    output_file = f"{base}-v{silero_version}-ggml{ext}"
    print(f"Saving GGML Silero-VAD model to {output_file}")

    print("\nTensor info for debugging:")
    for key, tensor in cleaned_dict.items():
        print(f"  - {key}: {tensor.shape} ({tensor.dtype})")
    print()

    with open(output_file, "wb") as fout:
        # Write magic and version
        fout.write(struct.pack("i", 0x67676d6c))

        model_type = "silero-16k"
        str_len = len(model_type)
        fout.write(struct.pack("i", str_len))
        fout.write(model_type.encode('utf-8'))

        version_parts = silero_version.split('.')
        major, minor, patch = map(int, version_parts)
        print(f"Version: {major}.{minor}.{patch}")
        fout.write(struct.pack("i", major))
        fout.write(struct.pack("i", minor))
        fout.write(struct.pack("i", patch))

        # Write model architecture parameters
        window_size = 512
        fout.write(struct.pack("i", window_size))
        context_size = 64
        fout.write(struct.pack("i", context_size))

        n_encoder_layers = 4
        fout.write(struct.pack("i", n_encoder_layers))

        # Write encoder dimensions
        input_channels = 129
        encoder_in_channels = [input_channels, 128, 64, 64]
        encoder_out_channels = [128, 64, 64, 128]
        kernel_size = 3

        for i in range(n_encoder_layers):
            fout.write(struct.pack("i", encoder_in_channels[i]))
            fout.write(struct.pack("i", encoder_out_channels[i]))
            fout.write(struct.pack("i", kernel_size))

        # Write LSTM dimensions
        lstm_input_size = 128
        lstm_hidden_size = 128
        fout.write(struct.pack("i", lstm_input_size))
        fout.write(struct.pack("i", lstm_hidden_size))

        # Write final conv dimensions
        final_conv_in = 128
        final_conv_out = 1
        fout.write(struct.pack("i", final_conv_in))
        fout.write(struct.pack("i", final_conv_out))

        # Define tensor keys to write
        tensor_keys = []

        # Encoder weights
        for i in range(n_encoder_layers):
            weight_key = f"_model.encoder.{i}.reparam_conv.weight"
            bias_key = f"_model.encoder.{i}.reparam_conv.bias"
            if weight_key in cleaned_dict and bias_key in cleaned_dict:
                tensor_keys.append(weight_key)
                tensor_keys.append(bias_key)

        # LSTM weights
        lstm_keys = [
            "_model.decoder.rnn.weight_ih",
            "_model.decoder.rnn.weight_hh",
            "_model.decoder.rnn.bias_ih",
            "_model.decoder.rnn.bias_hh"
        ]
        tensor_keys.extend([k for k in lstm_keys if k in cleaned_dict])

        # Final conv weights
        final_keys = [
            "_model.decoder.decoder.2.weight",
            "_model.decoder.decoder.2.bias"
        ]
        tensor_keys.extend([k for k in final_keys if k in cleaned_dict])

        # STFT basis - add this last
        stft_tensor = "_model.stft.forward_basis_buffer"
        tensor_keys.append(stft_tensor)

        print(f"Writing {len(tensor_keys)} tensors:")
        for key in tensor_keys:
            if key in cleaned_dict:
                print(f"  - {key}: {cleaned_dict[key].shape}")
            else:
                print(f"  - {key}: MISSING")

        # Process each tensor
        for key in tensor_keys:
            if key not in cleaned_dict:
                print(f"Warning: Missing tensor {key}, skipping")
                continue

            tensor = cleaned_dict[key]

            # Special handling for STFT tensor
            if key == "_model.stft.forward_basis_buffer":
                # Get the original numpy array without squeezing
                data = tensor.detach().cpu().numpy()
                # Ensure it has the expected shape
                print(f"STFT tensor original shape: {data.shape}")
                n_dims = 3
                tensor_shape = [data.shape[0], data.shape[1], data.shape[2]]
                is_conv_weight = True
            else:
                # For other tensors, we can use standard processing
                data = tensor.detach().cpu().squeeze().numpy()
                tensor_shape = list(data.shape)

                # Ensure we have at most 4 dimensions for GGML
                n_dims = min(len(tensor_shape), 4)

                # Reverse dimensions for GGML
                tensor_shape = tensor_shape[:n_dims]
                tensor_shape.reverse()

                # Check if this is a convolution weight tensor
                is_conv_weight = "weight" in key and ("encoder" in key or "_model.decoder.decoder.2" in key)

            # Convert to float16 for convolution weights
            if is_conv_weight:
                data = data.astype(np.float16)
                ftype = 1  # float16
            else:
                ftype = 0  # float32

            # Debug printing of tensor info
            print(f"\nWriting tensor: {key}")
            print(f"  Original shape: {tensor.shape}")
            print(f"  Processed shape: {data.shape}")
            print(f"  GGML dimensions: {n_dims}")
            print(f"  GGML shape: {tensor_shape}")
            print(f"  Type: {'float16' if ftype == 1 else 'float32'}")

            # Convert tensor name to bytes
            name_bytes = key.encode('utf-8')
            name_length = len(name_bytes)

            # Write tensor header
            fout.write(struct.pack("i", n_dims))
            fout.write(struct.pack("i", name_length))
            fout.write(struct.pack("i", ftype))

            # Write tensor dimensions
            for i in range(n_dims):
                size = tensor_shape[i] if i < len(tensor_shape) else 1
                fout.write(struct.pack("i", size))
                print(f"  Writing dimension {i}: {size}")

            # Write tensor name
            fout.write(name_bytes)

            # Write tensor data
            data.tofile(fout)

            print(f"  Wrote {data.size * (2 if ftype==1 else 4)} bytes")

    print(f"\nDone! Model has been converted to GGML format: {output_file}")
    print(f"File size: {os.path.getsize(output_file)} bytes")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert Silero-VAD PyTorch model to GGML format")
    parser.add_argument("--output", type=str, required=True, help="Path to output GGML model file")
    parser.add_argument("--print-tensors", action="store_true", help="Print tensor values", default=True)
    args = parser.parse_args()

    convert_silero_vad(args.output, args.print_tensors)
```
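For reference, a sketch of how the header written by the script above could be read back as a quick sanity check. The field order mirrors the struct.pack("i", ...) sequence in the converter; this is not the loader used by whisper.cpp itself:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

// Hypothetical verifier: reads back the fixed-size header fields that
// convert-silero-vad-to-ggml.py writes, in the same order.
int main(int argc, char ** argv) {
    if (argc < 2) { fprintf(stderr, "usage: %s model.bin\n", argv[0]); return 1; }

    FILE * f = fopen(argv[1], "rb");
    if (!f) { return 1; }

    int32_t magic = 0;
    fread(&magic, sizeof(magic), 1, f);
    if (magic != 0x67676d6c) { fprintf(stderr, "bad magic\n"); fclose(f); return 1; }

    int32_t len = 0;
    fread(&len, sizeof(len), 1, f);
    std::string model_type(len, '\0');
    fread(&model_type[0], 1, len, f); // e.g. "silero-16k"

    int32_t major = 0, minor = 0, patch = 0;
    fread(&major, sizeof(major), 1, f);
    fread(&minor, sizeof(minor), 1, f);
    fread(&patch, sizeof(patch), 1, f);

    int32_t window_size = 0, context_size = 0;
    fread(&window_size,  sizeof(window_size),  1, f); // expected: 512
    fread(&context_size, sizeof(context_size), 1, f); // expected: 64

    printf("%s v%d.%d.%d, window=%d, context=%d\n",
           model_type.c_str(), major, minor, patch, window_size, context_size);
    fclose(f);
    return 0;
}
```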
864 KB (binary file not shown)

src/whisper-arch.h

+56
```diff
@@ -139,3 +139,59 @@ static const std::map<asr_tensor, ggml_op> ASR_TENSOR_INFO = {
     {ASR_TENSOR_ATTN_OUT_WEIGHT, GGML_OP_MUL_MAT},
     {ASR_TENSOR_ATTN_OUT_BIAS,   GGML_OP_ADD},
 };
+
+enum vad_tensor {
+    VAD_TENSOR_STFT_BASIS,
+    VAD_TENSOR_ENC_0_WEIGHT,
+    VAD_TENSOR_ENC_0_BIAS,
+    VAD_TENSOR_ENC_1_WEIGHT,
+    VAD_TENSOR_ENC_1_BIAS,
+    VAD_TENSOR_ENC_2_WEIGHT,
+    VAD_TENSOR_ENC_2_BIAS,
+    VAD_TENSOR_ENC_3_WEIGHT,
+    VAD_TENSOR_ENC_3_BIAS,
+    VAD_TENSOR_LSTM_WEIGHT_IH,
+    VAD_TENSOR_LSTM_WEIGHT_HH,
+    VAD_TENSOR_LSTM_BIAS_IH,
+    VAD_TENSOR_LSTM_BIAS_HH,
+    VAD_TENSOR_FINAL_CONV_WEIGHT,
+    VAD_TENSOR_FINAL_CONV_BIAS,
+};
+
+static const std::map<vad_tensor, ggml_op> VAD_TENSOR_OPS = {
+    {VAD_TENSOR_STFT_BASIS,        GGML_OP_MUL_MAT},
+    {VAD_TENSOR_ENC_0_WEIGHT,      GGML_OP_MUL_MAT},
+    {VAD_TENSOR_ENC_0_BIAS,        GGML_OP_ADD},
+    {VAD_TENSOR_ENC_1_WEIGHT,      GGML_OP_MUL_MAT},
+    {VAD_TENSOR_ENC_1_BIAS,        GGML_OP_ADD},
+    {VAD_TENSOR_ENC_2_WEIGHT,      GGML_OP_MUL_MAT},
+    {VAD_TENSOR_ENC_2_BIAS,        GGML_OP_ADD},
+    {VAD_TENSOR_ENC_3_WEIGHT,      GGML_OP_MUL_MAT},
+    {VAD_TENSOR_ENC_3_BIAS,        GGML_OP_ADD},
+
+    {VAD_TENSOR_LSTM_WEIGHT_IH,    GGML_OP_MUL_MAT},
+    {VAD_TENSOR_LSTM_WEIGHT_HH,    GGML_OP_MUL_MAT},
+    {VAD_TENSOR_LSTM_BIAS_IH,      GGML_OP_ADD},
+    {VAD_TENSOR_LSTM_BIAS_HH,      GGML_OP_ADD},
+
+    {VAD_TENSOR_FINAL_CONV_WEIGHT, GGML_OP_MUL_MAT},
+    {VAD_TENSOR_FINAL_CONV_BIAS,   GGML_OP_ADD}
+};
+
+static const std::map<vad_tensor, const char *> VAD_TENSOR_NAMES = {
+    {VAD_TENSOR_STFT_BASIS,        "_model.stft.forward_basis_buffer"},
+    {VAD_TENSOR_ENC_0_WEIGHT,      "_model.encoder.0.reparam_conv.weight"},
+    {VAD_TENSOR_ENC_0_BIAS,        "_model.encoder.0.reparam_conv.bias"},
+    {VAD_TENSOR_ENC_1_WEIGHT,      "_model.encoder.1.reparam_conv.weight"},
+    {VAD_TENSOR_ENC_1_BIAS,        "_model.encoder.1.reparam_conv.bias"},
+    {VAD_TENSOR_ENC_2_WEIGHT,      "_model.encoder.2.reparam_conv.weight"},
+    {VAD_TENSOR_ENC_2_BIAS,        "_model.encoder.2.reparam_conv.bias"},
+    {VAD_TENSOR_ENC_3_WEIGHT,      "_model.encoder.3.reparam_conv.weight"},
+    {VAD_TENSOR_ENC_3_BIAS,        "_model.encoder.3.reparam_conv.bias"},
+    {VAD_TENSOR_LSTM_WEIGHT_IH,    "_model.decoder.rnn.weight_ih"},
+    {VAD_TENSOR_LSTM_WEIGHT_HH,    "_model.decoder.rnn.weight_hh"},
+    {VAD_TENSOR_LSTM_BIAS_IH,      "_model.decoder.rnn.bias_ih"},
+    {VAD_TENSOR_LSTM_BIAS_HH,      "_model.decoder.rnn.bias_hh"},
+    {VAD_TENSOR_FINAL_CONV_WEIGHT, "_model.decoder.decoder.2.weight"},
+    {VAD_TENSOR_FINAL_CONV_BIAS,   "_model.decoder.decoder.2.bias"}
+};
```
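As a side note, a small sketch of how these tables might be consumed by a loader: iterate VAD_TENSOR_NAMES to look up each tensor by its original Silero name, and consult VAD_TENSOR_OPS for the op each tensor participates in. The model_tensors map here is a hypothetical stand-in for the loader's state, not code from this commit:

```cpp
#include <cstdio>
#include <map>
#include <string>

#include "ggml.h"
#include "whisper-arch.h"

// Hypothetical loader fragment: verify that every expected VAD tensor was
// read from the GGML file, and report the op it participates in.
static bool check_vad_tensors(const std::map<std::string, ggml_tensor *> & model_tensors) {
    for (const auto & [tensor, name] : VAD_TENSOR_NAMES) {
        if (model_tensors.find(name) == model_tensors.end()) {
            fprintf(stderr, "missing VAD tensor: %s\n", name);
            return false;
        }
        // Primary op for this tensor, e.g. GGML_OP_MUL_MAT for weights.
        const ggml_op op = VAD_TENSOR_OPS.at(tensor);
        printf("%-40s op=%s\n", name, ggml_op_name(op));
    }
    return true;
}
```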
