@@ -4431,6 +4431,17 @@ struct whisper_vad_context {
    whisper_vad_model model;
    whisper_vad_state * state = nullptr;

+    int window_size_samples;
+    int context_samples;
+    int effective_window_size;
+
+    bool triggered;
+    std::vector<float> context_buffer;
+    unsigned int current_sample;
+    unsigned int temp_end;
+
+    std::vector<whisper_vad_segment> detected_segments;
+
    whisper_context_params params;

    std::string path_model;
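The new fields hold the sliding-window state for detection: each step feeds the graph `effective_window_size` samples, made up of `context_samples` samples carried over from the previous step plus `window_size_samples` fresh samples (the values 64, 192 and 256 are assigned in the init function below, and 192 + 64 = 256 matches the frame size the graph builder uses). A rough sketch of the resulting step size and window count, assuming whisper's 16 kHz sample rate, which this hunk does not state:

```cpp
#include <cstdio>

int main() {
    const int sample_rate         = 16000; // assumed: whisper's WHISPER_SAMPLE_RATE
    const int window_size_samples = 192;   // fresh samples consumed per step
    const int context_samples     = 64;    // overlap carried over from the previous step

    const int   n_samples = 5 * sample_rate;                 // e.g. 5 seconds of audio
    const int   n_windows = n_samples / window_size_samples; // full windows processed
    const float step_ms   = 1000.0f * window_size_samples / sample_rate;

    printf("frame fed to the graph: %d samples\n", window_size_samples + context_samples);
    printf("hop per step          : %.1f ms\n", step_ms);
    printf("full windows in 5 s   : %d\n", n_windows);
    return 0;
}
```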
@@ -4470,11 +4481,50 @@ static ggml_backend_buffer_type_t select_weight_buft(const whisper_vad_hparams &
    return nullptr;
}

+static ggml_tensor * whisper_vad_build_encoder_layer(ggml_context * ctx0,
+        const whisper_vad_model & model, ggml_tensor * cur) {
+    WHISPER_LOG_INFO("%s: building encoder layer\n", __func__);
+    // Reshape from the STFT output, which is [258, 1, 1, 1] and holds complex
+    // number pairs. I think we can ignore the imaginary part and just use the
+    // real part here.
+    struct ggml_tensor * real_part = ggml_view_1d(ctx0, cur, 129, 0);
+    struct ggml_tensor * reshaped  = ggml_reshape_3d(ctx0, real_part, 1, 129, 1);
+
+    // First Conv1D: expands to 128 channels.
+    cur = ggml_conv_1d(ctx0, model.encoder_0_weight, reshaped, 1, 1, 1);
+    cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.encoder_0_bias, 1, 128, 1));
+    cur = ggml_relu(ctx0, cur);
+
+    // Second Conv1D: reduces to 64 channels.
+    cur = ggml_conv_1d(ctx0, model.encoder_1_weight, cur, 1, 1, 1);
+    cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.encoder_1_bias, 1, 64, 1));
+    cur = ggml_relu(ctx0, cur);
+
+    // Third Conv1D: maintains 64 channels.
+    cur = ggml_conv_1d(ctx0, model.encoder_2_weight, cur, 1, 1, 1);
+    cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.encoder_2_bias, 1, 64, 1));
+    cur = ggml_relu(ctx0, cur);
+
+    // Fourth Conv1D: expands to 128 channels.
+    cur = ggml_conv_1d(ctx0, model.encoder_3_weight, cur, 1, 1, 1);
+    cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.encoder_3_bias, 1, 128, 1));
+    cur = ggml_relu(ctx0, cur);
+
+    return cur;
+}
+
+static ggml_tensor * whisper_vad_lstm_layer(ggml_context * ctx0,
+        const whisper_vad_context & vctx, ggml_tensor * cur) {
+    WHISPER_LOG_INFO("%s: building LSTM layer\n", __func__);
+
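+    // Note: the LSTM cell is not implemented yet; the encoder output is
+    // currently returned unchanged.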
+    return cur;
+}
+
static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx,
                                                    whisper_vad_state & vstate) {
    const auto & model = vctx.model;
    const auto & hparams = model.hparams;
-    const int n_window = 256;
+    const int n_window = vctx.effective_window_size;

    WHISPER_LOG_INFO("%s: Building VAD graph\n", __func__);
    struct ggml_init_params params = {
@@ -4487,18 +4537,25 @@ static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx,
    ggml_cgraph * gf = ggml_new_graph(ctx0);

-    struct ggml_tensor * samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_window);
-    ggml_set_name(samples, "samples");
-    ggml_set_input(samples);
+    // We process one frame/segment at a time of size n_window.
+    struct ggml_tensor * frame = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_window);
+    ggml_set_name(frame, "frame");
+    ggml_set_input(frame);

    struct ggml_tensor * cur = nullptr;
    {
-        cur = ggml_mul_mat(ctx0, model.stft_forward_basis, samples);
+        cur = ggml_mul_mat(ctx0, model.stft_forward_basis, frame);
+        ggml_set_name(cur, "stft");
+        ggml_set_output(cur);
+
+        cur = whisper_vad_build_encoder_layer(ctx0, model, cur);
+
+        cur = whisper_vad_lstm_layer(ctx0, vctx, cur);
    }

-    // ggml_build_forward_expand(gf, cur);
+    ggml_build_forward_expand(gf, cur);

-    // ggml_free(ctx0);
+    ggml_free(ctx0);

    return gf;
}
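A note on the sizes flowing through this graph: the 256-sample frame has 256 / 2 + 1 = 129 unique frequency bins, and the 258-element result of the `stft_forward_basis` multiplication appears to hold the 129 real components followed by the 129 imaginary ones, which is what the `ggml_view_1d(ctx0, cur, 129, 0)` call in `whisper_vad_build_encoder_layer` relies on. That layout is my reading of the code, not something the patch states. A small sketch of the arithmetic:

```cpp
#include <cstdio>

int main() {
    const int window_size_samples = 192;                                   // new samples per step
    const int context_samples     = 64;                                    // carried-over samples
    const int n_fft               = window_size_samples + context_samples; // 256, the frame size
    const int n_bins              = n_fft / 2 + 1;                         // 129 unique bins of a real FFT
    const int n_stft              = 2 * n_bins;                            // 258 values: real part, then imaginary part

    printf("frame: %d samples -> %d bins -> %d STFT values\n", n_fft, n_bins, n_stft);
    return 0;
}
```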
@@ -4604,6 +4661,14 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
    whisper_vad_context * vctx = new whisper_vad_context;
    vctx->path_model = path_model;

+    vctx->window_size_samples = 192;
+    vctx->context_samples = 64;
+    vctx->effective_window_size = vctx->window_size_samples + vctx->context_samples;
+    vctx->triggered = false;
+    vctx->context_buffer.resize(vctx->context_samples, 0.0f);
+    vctx->current_sample = 0;
+    vctx->temp_end = 0;
+
    auto & model = vctx->model;
    auto & hparams = model.hparams;
@@ -4899,19 +4964,91 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
    return vctx;
}

-
struct whisper_vad_segments whisper_vad_detect_speech(
        struct whisper_vad_context * vctx,
        const float * pcmf32,
-        int n_samples) {
+        int n_samples,
+        int n_threads) {
    WHISPER_LOG_INFO("%s: detecting speech in %d samples\n", __func__, n_samples);
+    auto & sched = vctx->state->sched.sched;

    struct whisper_vad_segments segments {
        /* n_segments = */ 0,
        /* segments = */ nullptr,
    };

-    const ggml_cgraph * gf = whisper_vad_build_graph(*vctx, *vctx->state);
+    // Reset state for this detection
+    vctx->triggered      = false;
+    vctx->current_sample = 0;
+    vctx->temp_end       = 0;
+    std::fill(vctx->context_buffer.begin(), vctx->context_buffer.end(), 0.0f);
+    vctx->detected_segments.clear();
+
+    ggml_cgraph * gf = whisper_vad_build_graph(*vctx, *vctx->state);
+
+    if (!ggml_backend_sched_alloc_graph(sched, gf)) {
+        // TODO(danbev) Add error handling
+        return segments;
+    }
+
+    std::vector<float> window_with_context(vctx->effective_window_size);
+    WHISPER_LOG_INFO("%s: window_with_context.size() = %zu\n", __func__, window_with_context.size());
+    WHISPER_LOG_INFO("%s: window_size_samples: %d\n", __func__, vctx->window_size_samples);
+    WHISPER_LOG_INFO("%s: context_samples: %d\n", __func__, vctx->context_samples);
+    WHISPER_LOG_INFO("%s: effective_window_size: %d\n", __func__, vctx->effective_window_size);
+
+    whisper_vad_segment current_segment = {-1.0f, -1.0f};
+    struct ggml_tensor * frame = ggml_graph_get_tensor(gf, "frame");
+
+    WHISPER_LOG_INFO("%s: frame tensor size: %ld\n", __func__, (long) frame->ne[0]);
+
+    for (int i = 0; i < n_samples; i += vctx->window_size_samples) {
+        // Skip if we don't have enough samples for a full window.
+        if (i + vctx->window_size_samples > n_samples) {
+            break;
+        }
+        // WHISPER_LOG_INFO("%s: processing window %d\n", __func__, i / vctx->window_size_samples);
+
+        // Copy the previous context into the window to be processed. The
+        // context_buffer holds the last 64 samples of the previous window;
+        // this overlap between windows helps avoid spectral leakage.
+        std::copy(vctx->context_buffer.begin(), vctx->context_buffer.end(), window_with_context.begin());
+
+        // Copy the current samples from pcmf32 into window_with_context,
+        // starting after the context samples copied above.
+        std::copy(&pcmf32[i], &pcmf32[i + vctx->window_size_samples], window_with_context.begin() + vctx->context_samples);
+
+        ggml_backend_tensor_set(frame, window_with_context.data(), 0, vctx->effective_window_size * sizeof(float));
+
+        if (!ggml_graph_compute_helper(sched, gf, n_threads)) {
+            WHISPER_LOG_ERROR("%s: failed to compute VAD graph\n", __func__);
+            break;
+        }
+
+        // TODO(danbev): get the speech probability once it is implemented
+
+        // Update the context buffer for the next iteration
+        std::copy(&pcmf32[i + vctx->window_size_samples - vctx->context_samples],
+                  &pcmf32[i + vctx->window_size_samples],
+                  vctx->context_buffer.begin());
+
+        vctx->current_sample += vctx->window_size_samples;
+    }
+    WHISPER_LOG_INFO("%s: finished processing %d samples\n", __func__, n_samples);
+
+    // Print out the result of one STFT operation
+    /*
+    {
+        struct ggml_tensor * stft = ggml_graph_get_tensor(gf, "stft");
+        std::vector<float> output;
+        output.resize(ggml_nelements(stft));
+        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched, stft);
+        ggml_backend_tensor_get(stft, output.data(), 0, ggml_nbytes(stft));
+        for (int i = 0; i < 10; i++) {
+            WHISPER_LOG_INFO("%s: output[%d]: %f\n", __func__, i, output[i]);
+        }
+    }
+    */

    return segments;
}
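The `triggered`, `temp_end`, `detected_segments` and `current_segment` values are initialised here but not yet used, since the speech probability output is still a TODO. Below is a minimal sketch of the Silero-style hysteresis these fields appear to be preparing for; it is illustrative only, and every name in it (`speech_prob`, `threshold`, `min_silence_samples`, the `start`/`end` field names) is an assumption rather than something this patch defines:

```cpp
#include <cstdint>
#include <vector>

// Hypothetical segment type: two floats, assumed to be start/end in seconds.
struct vad_segment {
    float start = -1.0f;
    float end   = -1.0f;
};

struct vad_hysteresis {
    bool     triggered = false; // currently inside a speech segment
    uint32_t temp_end  = 0;     // sample index where a possible segment end began
    std::vector<vad_segment> segments;
};

// Called once per processed window with that window's speech probability.
static void vad_on_window(vad_hysteresis & st, vad_segment & cur,
                          float speech_prob, float threshold,
                          uint32_t current_sample, uint32_t min_silence_samples,
                          int sample_rate) {
    if (speech_prob >= threshold) {
        st.temp_end = 0; // speech continues or resumes, cancel any pending end
        if (!st.triggered) {
            st.triggered = true;
            cur.start = (float) current_sample / sample_rate;
        }
        return;
    }
    if (st.triggered) {
        if (st.temp_end == 0) {
            st.temp_end = current_sample; // remember where the silence began
        }
        if (current_sample - st.temp_end >= min_silence_samples) {
            cur.end = (float) st.temp_end / sample_rate;
            st.segments.push_back(cur); // close the segment after enough silence
            cur          = vad_segment{};
            st.triggered = false;
            st.temp_end  = 0;
        }
    }
}
```

The idea is that a segment opens the first time the probability crosses the threshold and only closes once the probability has stayed below it for `min_silence_samples`, so short pauses stay inside one segment.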