ggml-org · alubbe · Oct 12, 2025
diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
@@ -387,4 +387,126 @@ public interface WhisperCppJnaLibrary extends Library {
      * @return The result of the benchmark as a string.
      */
     String whisper_bench_ggml_mul_mat_str(int nThreads);
+
+    // ============================================================================
+    // Voice Activity Detection (VAD) Functions
+    // ============================================================================
+
+    /**
+     * Returns the default VAD parameters supplied by the native library.
+     * NOTE(review): mapped as a raw Pointer; confirm the native function does not return the struct by value.
+     * @return Default VAD parameters (presumably laid out like {@code WhisperVADParams} — TODO confirm)
+     */
+    Pointer whisper_vad_default_params();
+
+    /**
+     * Returns the default VAD context parameters supplied by the native library.
+     * NOTE(review): mapped as a raw Pointer; confirm the native function does not return the struct by value.
+     * @return Default VAD context parameters (presumably laid out like {@code WhisperVADContextParams} — TODO confirm)
+     */
+    Pointer whisper_vad_default_context_params();
+
+    /**
+     * Creates a VAD context by loading the model found at the given path.
+     * NOTE(review): {@code params} is mapped as a raw Pointer; confirm the native side does not take the struct by value.
+     * @param path_model Path to the VAD model file
+     * @param params VAD context parameters
+     * @return VAD context pointer on success, null on failure (presumably released with {@code whisper_vad_free} — TODO confirm)
+     */
+    Pointer whisper_vad_init_from_file_with_params(String path_model, Pointer params);
+
+    /**
+     * Creates a VAD context, obtaining the model through a caller-supplied loader rather than a file path.
+     * NOTE(review): {@code params} is mapped as a raw Pointer; confirm the native side does not take the struct by value.
+     * @param loader Model loader
+     * @param params VAD context parameters
+     * @return VAD context pointer on success, null on failure (presumably released with {@code whisper_vad_free} — TODO confirm)
+     */
+    Pointer whisper_vad_init_with_params(WhisperModelLoader loader, Pointer params);
+
+    /**
+     * Runs speech detection over a buffer of audio samples.
+     * NOTE(review): presumably this also fills the probabilities exposed by {@code whisper_vad_probs} — confirm.
+     * @param vctx VAD context
+     * @param samples Audio samples (float array); expected sample rate and channel layout are not stated here — TODO confirm
+     * @param n_samples Number of samples (presumably no more than {@code samples.length} — verify at call sites)
+     * @return true if speech detected, false otherwise (NOTE(review): confirm this is not merely a success flag)
+     */
+    boolean whisper_vad_detect_speech(Pointer vctx, float[] samples, int n_samples);
+
+    /**
+     * Returns how many probability values the VAD context holds.
+     * Presumably the length of the array exposed by {@code whisper_vad_probs} — TODO confirm.
+     * @param vctx VAD context
+     * @return Number of probability values
+     */
+    int whisper_vad_n_probs(Pointer vctx);
+
+    /**
+     * Returns the probability array held by the VAD context.
+     * NOTE(review): presumably native floats owned by the context and sized by {@code whisper_vad_n_probs} — confirm before reading.
+     * @param vctx VAD context
+     * @return Pointer to probability array
+     */
+    Pointer whisper_vad_probs(Pointer vctx);
+
+    /**
+     * Builds VAD segments from probabilities that were computed beforehand.
+     * NOTE(review): {@code params} is mapped as a raw Pointer; confirm the native side does not take the struct by value.
+     * @param vctx VAD context
+     * @param params VAD parameters
+     * @return Pointer to VAD segments (presumably released with {@code whisper_vad_free_segments} — TODO confirm)
+     */
+    Pointer whisper_vad_segments_from_probs(Pointer vctx, Pointer params);
+
+    /**
+     * Builds VAD segments directly from raw audio samples.
+     * NOTE(review): {@code params} is mapped as a raw Pointer; confirm the native side does not take the struct by value.
+     * @param vctx VAD context
+     * @param params VAD parameters
+     * @param samples Audio samples (float array); expected sample rate and channel layout are not stated here — TODO confirm
+     * @param n_samples Number of samples (presumably no more than {@code samples.length} — verify at call sites)
+     * @return Pointer to VAD segments (presumably released with {@code whisper_vad_free_segments} — TODO confirm)
+     */
+    Pointer whisper_vad_segments_from_samples(Pointer vctx, Pointer params, float[] samples, int n_samples);
+
+    /**
+     * Returns how many segments a VAD segments result contains.
+     * Presumably the exclusive upper bound for the {@code i_segment} index of the t0/t1 accessors — TODO confirm.
+     * @param segments VAD segments pointer
+     * @return Number of segments
+     */
+    int whisper_vad_segments_n_segments(Pointer segments);
+
+    /**
+     * Returns the start time of one segment of a VAD segments result.
+     * NOTE(review): the unit below is carried over from the original comment; verify it against the native header.
+     * @param segments VAD segments pointer
+     * @param i_segment Segment index (presumably zero-based and below {@code whisper_vad_segments_n_segments} — confirm)
+     * @return Start time in seconds
+     */
+    float whisper_vad_segments_get_segment_t0(Pointer segments, int i_segment);
+
+    /**
+     * Returns the end time of one segment of a VAD segments result.
+     * NOTE(review): the unit below is carried over from the original comment; verify it against the native header.
+     * @param segments VAD segments pointer
+     * @param i_segment Segment index (presumably zero-based and below {@code whisper_vad_segments_n_segments} — confirm)
+     * @return End time in seconds
+     */
+    float whisper_vad_segments_get_segment_t1(Pointer segments, int i_segment);
+
+    /**
+     * Frees the native memory of a VAD segments result.
+     * The pointer should presumably not be used after this call — TODO confirm null-tolerance of the native function.
+     * @param segments VAD segments pointer to free
+     */
+    void whisper_vad_free_segments(Pointer segments);
+
+    /**
+     * Frees the native memory of a VAD context.
+     * The pointer should presumably not be used after this call — TODO confirm null-tolerance of the native function.
+     * @param ctx VAD context pointer to free
+     */
+    void whisper_vad_free(Pointer ctx);
 }
diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
@@ -331,6 +331,38 @@ public void setLogitsFilterCallback(WhisperLogitsFilterCallback callback) {
     public long i_start_rule;
     public float grammar_penalty;
 
+    // Voice Activity Detection (VAD) parameters
+
+    /** Whether VAD is enabled (default = false per the original note); written by {@link #enableVAD(boolean)}. */
+    public CBool vad;
+
+    /** Sets {@link #vad} to {@code CBool.TRUE} when {@code enable} is true, otherwise to {@code CBool.FALSE}. */
+    public void enableVAD(boolean enable) {
+        vad = enable ? CBool.TRUE : CBool.FALSE;
+    }
+
+    /** Path to VAD model file; written by {@link #setVADModelPath(String)}. */
+    public String vad_model_path;
+
+    /** Stores {@code path} in {@link #vad_model_path} as given; no validation is performed, so null is stored as null. */
+    public void setVADModelPath(String path) {
+        this.vad_model_path = path;
+    }
+
+    /** VAD parameters, held as a by-value struct; replaced wholesale by {@link #setVADParams(WhisperVADParams)}. */
+    public WhisperVADParams.ByValue vad_params;
+
+    /** Copies the six fields of {@code params} into a newly created {@code WhisperVADParams.ByValue}; {@code params} must be non-null. */
+    public void setVADParams(WhisperVADParams params) {
+        this.vad_params = new WhisperVADParams.ByValue(); // assigned before params is read, so a null params leaves this new instance in place
+        this.vad_params.threshold = params.threshold;
+        this.vad_params.min_speech_duration_ms = params.min_speech_duration_ms;
+        this.vad_params.min_silence_duration_ms = params.min_silence_duration_ms;
+        this.vad_params.max_speech_duration_s = params.max_speech_duration_s;
+        this.vad_params.speech_pad_ms = params.speech_pad_ms;
+        this.vad_params.samples_overlap = params.samples_overlap;
+    }
+
     @Override
     protected List<String> getFieldOrder() {
         return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
@@ -349,7 +381,8 @@ protected List<String> getFieldOrder() {
                 "encoder_begin_callback", "encoder_begin_callback_user_data",
                 "abort_callback", "abort_callback_user_data",
                 "logits_filter_callback", "logits_filter_callback_user_data",
-                "grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty");
+                "grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty",
+                "vad", "vad_model_path", "vad_params");
     }
 
     public static class ByValue extends WhisperFullParams implements Structure.ByValue {

diff --git a/...ngs/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperVADContextParams.java b/...ngs/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperVADContextParams.java
@@ -0,0 +1,66 @@
+package io.github.ggerganov.whispercpp.params;
+
+import com.sun.jna.*;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * JNA structure holding the parameters for initializing a VAD context: thread count, GPU switch and GPU device.
+ */
+public class WhisperVADContextParams extends Structure {
+
+    public WhisperVADContextParams() {
+        super();
+    }
+
+    public WhisperVADContextParams(Pointer p) {
+        super(p);
+    }
+
+    /** Number of threads to use for VAD processing (documented default = 4; not assigned by this class, so a new Java instance holds 0 until set) */
+    public int n_threads;
+
+    /** Use GPU for VAD (documented default = true; not assigned by this class — NOTE(review): verify against the native defaults) */
+    public CBool use_gpu;
+
+    /** CUDA device to use (default = 0) */
+    public int gpu_device;
+
+    /**
+     * Stores the thread count in {@link #n_threads}; the value is not range-checked.
+     * @param threads Number of threads
+     */
+    public void setThreads(int threads) {
+        this.n_threads = threads;
+    }
+
+    /**
+     * Sets {@link #use_gpu} to {@code CBool.TRUE} or {@code CBool.FALSE}.
+     * @param enable Whether to use GPU
+     */
+    public void useGpu(boolean enable) {
+        use_gpu = enable ? CBool.TRUE : CBool.FALSE;
+    }
+
+    /**
+     * Stores the device ID in {@link #gpu_device}; the value is not range-checked.
+     * @param device CUDA device ID
+     */
+    public void setGpuDevice(int device) {
+        this.gpu_device = device;
+    }
+
+    @Override
+    protected List<String> getFieldOrder() {
+        return Arrays.asList( // JNA uses this order for the struct layout; presumably mirrors the native struct — TODO confirm
+            "n_threads",
+            "use_gpu",
+            "gpu_device"
+        );
+    }
+
+    public static class ByValue extends WhisperVADContextParams implements Structure.ByValue {
+        public ByValue() { super(); }
+        public ByValue(Pointer p) { super(p); }
+    }
+}
diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperVADParams.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperVADParams.java
@@ -0,0 +1,103 @@
+package io.github.ggerganov.whispercpp.params;
+
+import com.sun.jna.*;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * JNA structure holding Voice Activity Detection (VAD) parameters.
+ * Used for detecting speech segments in audio. This class assigns no defaults: a new Java instance holds 0 in every field until set.
+ */
+public class WhisperVADParams extends Structure {
+
+    public WhisperVADParams() {
+        super();
+    }
+
+    public WhisperVADParams(Pointer p) {
+        super(p);
+    }
+
+    /** Probability threshold to consider as speech (documented default = 0.5; not assigned by this class) */
+    public float threshold;
+
+    /** Minimum duration for a valid speech segment in milliseconds (documented default = 250; not assigned by this class) */
+    public int min_speech_duration_ms;
+
+    /** Minimum silence duration to consider speech as ended in milliseconds (documented default = 2000 — NOTE(review): verify against the native defaults) */
+    public int min_silence_duration_ms;
+
+    /** Maximum duration of a speech segment before forcing a new segment in seconds (documented default = Float.MAX_VALUE; not assigned by this class) */
+    public float max_speech_duration_s;
+
+    /** Padding added before and after speech segments in milliseconds (documented default = 400 — NOTE(review): verify against the native defaults) */
+    public int speech_pad_ms;
+
+    /** Overlap in seconds when copying audio samples from speech segment (documented default = 1.0 — NOTE(review): verify against the native defaults) */
+    public float samples_overlap;
+
+    /**
+     * Stores the speech-detection probability threshold in {@link #threshold}; the value is not range-checked here.
+     * @param threshold Probability threshold (0.0 to 1.0)
+     */
+    public void setThreshold(float threshold) {
+        this.threshold = threshold;
+    }
+
+    /**
+     * Stores the minimum speech duration in {@link #min_speech_duration_ms}; the value is not range-checked here.
+     * @param durationMs Duration in milliseconds
+     */
+    public void setMinSpeechDuration(int durationMs) {
+        this.min_speech_duration_ms = durationMs;
+    }
+
+    /**
+     * Stores the minimum silence duration in {@link #min_silence_duration_ms}; the value is not range-checked here.
+     * @param durationMs Duration in milliseconds
+     */
+    public void setMinSilenceDuration(int durationMs) {
+        this.min_silence_duration_ms = durationMs;
+    }
+
+    /**
+     * Stores the maximum speech duration in {@link #max_speech_duration_s}; the value is not range-checked here.
+     * @param durationS Duration in seconds
+     */
+    public void setMaxSpeechDuration(float durationS) {
+        this.max_speech_duration_s = durationS;
+    }
+
+    /**
+     * Stores the speech padding in {@link #speech_pad_ms}; the value is not range-checked here.
+     * @param paddingMs Padding in milliseconds
+     */
+    public void setSpeechPadding(int paddingMs) {
+        this.speech_pad_ms = paddingMs;
+    }
+
+    /**
+     * Stores the samples overlap in {@link #samples_overlap}; the value is not range-checked here.
+     * @param overlapS Overlap in seconds
+     */
+    public void setSamplesOverlap(float overlapS) {
+        this.samples_overlap = overlapS;
+    }
+
+    @Override
+    protected List<String> getFieldOrder() {
+        return Arrays.asList( // JNA uses this order for the struct layout; presumably mirrors the native struct — TODO confirm
+            "threshold",
+            "min_speech_duration_ms",
+            "min_silence_duration_ms",
+            "max_speech_duration_s",
+            "speech_pad_ms",
+            "samples_overlap"
+        );
+    }
+
+    public static class ByValue extends WhisperVADParams implements Structure.ByValue {
+        public ByValue() { super(); }
+        public ByValue(Pointer p) { super(p); }
+    }
+}