Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -387,4 +387,126 @@ public interface WhisperCppJnaLibrary extends Library {
* @return The result of the benchmark as a string.
*/
String whisper_bench_ggml_mul_mat_str(int nThreads);

// ============================================================================
// Voice Activity Detection (VAD) Functions
// ============================================================================

/**
 * Get default VAD parameters.
 *
 * <p>NOTE(review): this mapping returns an opaque {@code Pointer}. If the native
 * function returns {@code struct whisper_vad_params} by value rather than a pointer,
 * the return type would have to be {@code WhisperVADParams.ByValue} for the call to
 * match the C ABI. Confirm against the native header.
 *
 * @return Default VAD parameters (as an opaque pointer, per this declaration)
 */
Pointer whisper_vad_default_params();

/**
 * Get default VAD context parameters.
 *
 * <p>NOTE(review): this mapping returns an opaque {@code Pointer}. If the native
 * function returns {@code struct whisper_vad_context_params} by value rather than a
 * pointer, the return type would have to be {@code WhisperVADContextParams.ByValue}.
 * Confirm against the native header.
 *
 * @return Default VAD context parameters (as an opaque pointer, per this declaration)
 */
Pointer whisper_vad_default_context_params();

/**
 * Initialize VAD context from file with parameters.
 *
 * <p>NOTE(review): {@code params} is passed as a {@code Pointer}. If the native
 * signature takes the context-params struct by value, this argument should be a
 * {@code WhisperVADContextParams.ByValue} instead. Confirm against the native header.
 *
 * @param path_model Path to the VAD model file
 * @param params VAD context parameters
 * @return VAD context pointer on success, null on failure; presumably released with
 *         {@code whisper_vad_free(Pointer)} (verify ownership rules)
 */
Pointer whisper_vad_init_from_file_with_params(String path_model, Pointer params);

/**
 * Initialize VAD context with model loader and parameters.
 *
 * <p>NOTE(review): {@code params} is passed as a {@code Pointer}. If the native
 * signature takes the context-params struct by value, this argument should be a
 * {@code WhisperVADContextParams.ByValue} instead. Confirm against the native header.
 *
 * @param loader Model loader
 * @param params VAD context parameters
 * @return VAD context pointer on success, null on failure; presumably released with
 *         {@code whisper_vad_free(Pointer)} (verify ownership rules)
 */
Pointer whisper_vad_init_with_params(WhisperModelLoader loader, Pointer params);

/**
 * Detect speech in audio samples.
 *
 * <p>NOTE(review): JNA's default conversion treats a Java {@code boolean} as a native
 * {@code int}. If the native function returns a one-byte C {@code bool}, confirm that
 * the result is decoded correctly on all target platforms.
 *
 * @param vctx VAD context
 * @param samples Audio samples (float array)
 * @param n_samples Number of samples; presumably the count of valid entries in
 *                  {@code samples} (TODO confirm)
 * @return true if speech detected, false otherwise
 */
boolean whisper_vad_detect_speech(Pointer vctx, float[] samples, int n_samples);

/**
 * Get number of probability values in VAD context.
 *
 * <p>Presumably this is the length of the array exposed by
 * {@code whisper_vad_probs(Pointer)} (TODO confirm).
 *
 * @param vctx VAD context
 * @return Number of probability values
 */
int whisper_vad_n_probs(Pointer vctx);

/**
 * Get probability array from VAD context.
 *
 * <p>The result is a raw native pointer. The element type and the lifetime of the
 * underlying memory are not visible from this declaration. Presumably the elements are
 * floats owned by the VAD context, with the count given by
 * {@code whisper_vad_n_probs(Pointer)} (verify before reading).
 *
 * @param vctx VAD context
 * @return Pointer to probability array
 */
Pointer whisper_vad_probs(Pointer vctx);

/**
 * Get VAD segments from pre-computed probabilities.
 *
 * <p>NOTE(review): {@code params} is passed as a {@code Pointer}. If the native
 * signature takes {@code struct whisper_vad_params} by value, this argument should be
 * a {@code WhisperVADParams.ByValue} instead. Confirm against the native header.
 *
 * @param vctx VAD context
 * @param params VAD parameters
 * @return Pointer to VAD segments; presumably released with
 *         {@code whisper_vad_free_segments(Pointer)} (verify ownership rules)
 */
Pointer whisper_vad_segments_from_probs(Pointer vctx, Pointer params);

/**
 * Get VAD segments directly from audio samples.
 *
 * <p>NOTE(review): {@code params} is passed as a {@code Pointer}. If the native
 * signature takes {@code struct whisper_vad_params} by value, this argument should be
 * a {@code WhisperVADParams.ByValue} instead. Confirm against the native header.
 *
 * @param vctx VAD context
 * @param params VAD parameters
 * @param samples Audio samples (float array)
 * @param n_samples Number of samples; presumably the count of valid entries in
 *                  {@code samples} (TODO confirm)
 * @return Pointer to VAD segments; presumably released with
 *         {@code whisper_vad_free_segments(Pointer)} (verify ownership rules)
 */
Pointer whisper_vad_segments_from_samples(Pointer vctx, Pointer params, float[] samples, int n_samples);

/**
 * Get number of segments in VAD segments result.
 *
 * <p>Presumably valid segment indices for the {@code get_segment_t0}/{@code t1}
 * accessors run from 0 up to this value, exclusive (TODO confirm).
 *
 * @param segments VAD segments pointer
 * @return Number of segments
 */
int whisper_vad_segments_n_segments(Pointer segments);

/**
 * Get start time of a specific segment.
 *
 * <p>NOTE(review): the unit below is taken from the existing documentation. Confirm
 * against the native header that the value really is in seconds.
 *
 * @param segments VAD segments pointer
 * @param i_segment Segment index
 * @return Start time in seconds
 */
float whisper_vad_segments_get_segment_t0(Pointer segments, int i_segment);

/**
 * Get end time of a specific segment.
 *
 * <p>NOTE(review): the unit below is taken from the existing documentation. Confirm
 * against the native header that the value really is in seconds.
 *
 * @param segments VAD segments pointer
 * @param i_segment Segment index
 * @return End time in seconds
 */
float whisper_vad_segments_get_segment_t1(Pointer segments, int i_segment);

/**
 * Free VAD segments memory.
 *
 * <p>Presumably the pointer must not be used after this call, and must have come from
 * one of the {@code whisper_vad_segments_from_*} functions (verify).
 *
 * @param segments VAD segments pointer to free
 */
void whisper_vad_free_segments(Pointer segments);

/**
 * Free VAD context memory.
 *
 * <p>Presumably the pointer must not be used after this call, and must have come from
 * one of the {@code whisper_vad_init_*} functions (verify).
 *
 * @param ctx VAD context pointer to free
 */
void whisper_vad_free(Pointer ctx);
}
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,38 @@ public void setLogitsFilterCallback(WhisperLogitsFilterCallback callback) {
public long i_start_rule;
public float grammar_penalty;

// Voice Activity Detection (VAD) parameters

/** VAD on/off flag (documented default: false). */
public CBool vad;

/**
 * Switches VAD on or off by storing the matching {@code CBool} constant in {@link #vad}.
 *
 * @param enable {@code true} stores {@code CBool.TRUE}; {@code false} stores {@code CBool.FALSE}
 */
public void enableVAD(boolean enable) {
    if (enable) {
        vad = CBool.TRUE;
    } else {
        vad = CBool.FALSE;
    }
}

/** Location of the VAD model file. */
public String vad_model_path;

/**
 * Stores the given path in {@link #vad_model_path} without any validation.
 *
 * @param path path to the VAD model file
 */
public void setVADModelPath(String path) {
    vad_model_path = path;
}

/** VAD parameters */
public WhisperVADParams.ByValue vad_params;

/**
 * Replaces {@link #vad_params} with a fresh by-value copy of {@code params}.
 *
 * <p>The six fields {@code threshold}, {@code min_speech_duration_ms},
 * {@code min_silence_duration_ms}, {@code max_speech_duration_s},
 * {@code speech_pad_ms} and {@code samples_overlap} are copied. The argument is
 * checked and the copy fully populated before {@code vad_params} is reassigned,
 * so a rejected call leaves the previously configured value untouched.
 *
 * @param params source of the values to copy; must not be {@code null}
 * @throws NullPointerException if {@code params} is {@code null}
 */
public void setVADParams(WhisperVADParams params) {
    // Fail before touching state: a null argument must not wipe the current settings.
    if (params == null) {
        throw new NullPointerException("params must not be null");
    }
    WhisperVADParams.ByValue copy = new WhisperVADParams.ByValue();
    copy.threshold = params.threshold;
    copy.min_speech_duration_ms = params.min_speech_duration_ms;
    copy.min_silence_duration_ms = params.min_silence_duration_ms;
    copy.max_speech_duration_s = params.max_speech_duration_s;
    copy.speech_pad_ms = params.speech_pad_ms;
    copy.samples_overlap = params.samples_overlap;
    this.vad_params = copy;
}

@Override
protected List<String> getFieldOrder() {
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
Expand All @@ -349,7 +381,8 @@ protected List<String> getFieldOrder() {
"encoder_begin_callback", "encoder_begin_callback_user_data",
"abort_callback", "abort_callback_user_data",
"logits_filter_callback", "logits_filter_callback_user_data",
"grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty");
"grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty",
"vad", "vad_model_path", "vad_params");
}

public static class ByValue extends WhisperFullParams implements Structure.ByValue {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package io.github.ggerganov.whispercpp.params;

import com.sun.jna.*;
import java.util.Arrays;
import java.util.List;

/**
* Parameters for initializing a VAD context.
*/
public class WhisperVADContextParams extends Structure {

    // The defaults quoted below come from the field documentation; this class does not
    // assign them itself. Presumably they are what the native default-params call yields.

    /** Thread count used for VAD processing (documented default: 4). */
    public int n_threads;

    /** Whether VAD runs on the GPU (documented default: true). */
    public CBool use_gpu;

    /** CUDA device to use (documented default: 0). */
    public int gpu_device;

    /** Creates an instance through the no-argument {@code Structure} constructor. */
    public WhisperVADContextParams() {
    }

    /**
     * Creates an instance by handing {@code p} to the {@code Structure} constructor.
     *
     * @param p pointer forwarded to the superclass
     */
    public WhisperVADContextParams(Pointer p) {
        super(p);
    }

    /**
     * Stores the thread count in {@link #n_threads}.
     *
     * @param threads Number of threads
     */
    public void setThreads(int threads) {
        n_threads = threads;
    }

    /**
     * Stores the matching {@code CBool} constant in {@link #use_gpu}.
     *
     * @param enable Whether to use GPU
     */
    public void useGpu(boolean enable) {
        if (enable) {
            use_gpu = CBool.TRUE;
        } else {
            use_gpu = CBool.FALSE;
        }
    }

    /**
     * Stores the CUDA device ID in {@link #gpu_device}.
     *
     * @param device CUDA device ID
     */
    public void setGpuDevice(int device) {
        gpu_device = device;
    }

    /**
     * Lists the structure's field names in order.
     *
     * @return {@code n_threads}, {@code use_gpu}, {@code gpu_device}
     */
    @Override
    protected List<String> getFieldOrder() {
        return Arrays.asList("n_threads", "use_gpu", "gpu_device");
    }

    /** Variant of the enclosing structure that implements {@code Structure.ByValue}. */
    public static class ByValue extends WhisperVADContextParams implements Structure.ByValue {
        public ByValue() {
        }

        public ByValue(Pointer p) {
            super(p);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
package io.github.ggerganov.whispercpp.params;

import com.sun.jna.*;
import java.util.Arrays;
import java.util.List;

/**
* Voice Activity Detection (VAD) parameters.
* Used for detecting speech segments in audio.
*/
public class WhisperVADParams extends Structure {

    // The defaults quoted below come from the field documentation; this class does not
    // assign them itself. NOTE(review): verify each against the native default-params call.

    /** Probability above which audio counts as speech (documented default: 0.5). */
    public float threshold;

    /** Shortest valid speech segment, in milliseconds (documented default: 250). */
    public int min_speech_duration_ms;

    /** Silence needed before speech is considered ended, in milliseconds (documented default: 2000). */
    public int min_silence_duration_ms;

    /** Longest speech segment before a new one is forced, in seconds (documented default: Float.MAX_VALUE). */
    public float max_speech_duration_s;

    /** Padding added before and after speech segments, in milliseconds (documented default: 400). */
    public int speech_pad_ms;

    /** Overlap, in seconds, when copying audio samples from a speech segment (documented default: 1.0). */
    public float samples_overlap;

    /** Creates an instance through the no-argument {@code Structure} constructor. */
    public WhisperVADParams() {
    }

    /**
     * Creates an instance by handing {@code p} to the {@code Structure} constructor.
     *
     * @param p pointer forwarded to the superclass
     */
    public WhisperVADParams(Pointer p) {
        super(p);
    }

    /**
     * Stores the speech-probability threshold; the value is not range-checked.
     *
     * @param threshold Probability threshold (0.0 to 1.0)
     */
    public void setThreshold(float threshold) {
        this.threshold = threshold;
    }

    /**
     * Stores the minimum speech duration in {@link #min_speech_duration_ms}.
     *
     * @param durationMs Duration in milliseconds
     */
    public void setMinSpeechDuration(int durationMs) {
        min_speech_duration_ms = durationMs;
    }

    /**
     * Stores the minimum silence duration in {@link #min_silence_duration_ms}.
     *
     * @param durationMs Duration in milliseconds
     */
    public void setMinSilenceDuration(int durationMs) {
        min_silence_duration_ms = durationMs;
    }

    /**
     * Stores the maximum speech duration in {@link #max_speech_duration_s}.
     *
     * @param durationS Duration in seconds
     */
    public void setMaxSpeechDuration(float durationS) {
        max_speech_duration_s = durationS;
    }

    /**
     * Stores the speech padding in {@link #speech_pad_ms}.
     *
     * @param paddingMs Padding in milliseconds
     */
    public void setSpeechPadding(int paddingMs) {
        speech_pad_ms = paddingMs;
    }

    /**
     * Stores the samples overlap in {@link #samples_overlap}.
     *
     * @param overlapS Overlap in seconds
     */
    public void setSamplesOverlap(float overlapS) {
        samples_overlap = overlapS;
    }

    /**
     * Lists the structure's field names in order.
     *
     * @return the six field names, from {@code threshold} through {@code samples_overlap}
     */
    @Override
    protected List<String> getFieldOrder() {
        return Arrays.asList(
                "threshold",
                "min_speech_duration_ms",
                "min_silence_duration_ms",
                "max_speech_duration_s",
                "speech_pad_ms",
                "samples_overlap");
    }

    /** Variant of the enclosing structure that implements {@code Structure.ByValue}. */
    public static class ByValue extends WhisperVADParams implements Structure.ByValue {
        public ByValue() {
        }

        public ByValue(Pointer p) {
            super(p);
        }
    }
}