vad : extract speeches resize into lambda [no ci]

danbev · danbev · commit 5758650ffeeb · 2025-04-24T15:14:24.000+02:00
diff --git a/include/whisper.h b/include/whisper.h
@@ -682,7 +682,7 @@ extern "C" {
         bool  use_gpu;
         int   gpu_device;  // CUDA device
     };
-    WHISPER_API struct whisper_vad_context_params  whisper_vad_default_context_params(void);
+    WHISPER_API struct whisper_vad_context_params whisper_vad_default_context_params(void);
 
     WHISPER_API struct whisper_vad_state * whisper_vad_init_state(struct whisper_vad_context * ctx);
 
diff --git a/src/whisper.cpp b/src/whisper.cpp
@@ -4649,7 +4649,6 @@ static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx,
         whisper_vad_state & vstate) {
     const auto & model   = vctx.model;
 
-    WHISPER_LOG_INFO("%s: Building VAD graph\n", __func__);
     struct ggml_init_params params = {
         /*.mem_size   =*/ vstate.sched.meta.size(),
         /*.mem_buffer =*/ vstate.sched.meta.data(),
@@ -4828,16 +4827,12 @@ struct whisper_vad_context * whisper_vad_init_with_params_no_state(struct whispe
         for (int32_t i = 0; i < hparams.n_encoder_layers; i++) {
             WHISPER_LOG_INFO("%s: encoder_out_channels[%d] = %d\n", __func__, i, hparams.encoder_out_channels[i]);
         }
-        for (int32_t i = 0; i < hparams.n_encoder_layers; i++) {
-            WHISPER_LOG_INFO("%s: kernel_sizes[%d] = %d\n", __func__, i, hparams.kernel_sizes[i]);
-        }
         WHISPER_LOG_INFO("%s: lstm_input_size = %d\n", __func__, hparams.lstm_input_size);
         WHISPER_LOG_INFO("%s: lstm_hidden_size = %d\n", __func__, hparams.lstm_hidden_size);
         WHISPER_LOG_INFO("%s: final_conv_in = %d\n", __func__, hparams.final_conv_in);
         WHISPER_LOG_INFO("%s: final_conv_out = %d\n", __func__, hparams.final_conv_out);
     }
 
-
     // 1 STFT tensor, 4*2 encoder tensors, 4 LSTM tensors, 2 final output tensors
     const size_t n_tensors = hparams.n_encoder_layers * 2 + 4 + 2 + 1;
 
@@ -4884,7 +4879,7 @@ struct whisper_vad_context * whisper_vad_init_with_params_no_state(struct whispe
         return tensor;
     };
 
-    // prepare tensors for the weights
+    // create tensors
     {
         ggml_init_params params = {
             /*.mem_size   =*/ n_tensors * ggml_tensor_overhead(),
@@ -4995,9 +4990,7 @@ struct whisper_vad_context * whisper_vad_init_with_params_no_state(struct whispe
     // load weights
     {
         size_t total_size = 0;
-
         model.n_loaded = 0;
-
         std::vector<char> read_buf;
 
         while (true) {
@@ -5021,8 +5014,8 @@ struct whisper_vad_context * whisper_vad_init_with_params_no_state(struct whispe
             }
 
             std::string name;
-            std::vector<char> tmp(length); // create a buffer
-            loader->read(loader->context, &tmp[0], tmp.size()); // read to buffer
+            std::vector<char> tmp(length);
+            loader->read(loader->context, &tmp[0], tmp.size());
             name.assign(&tmp[0], tmp.size());
 
             if (model.tensors.find(name) == model.tensors.end()) {
@@ -5123,7 +5116,7 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
     float * probs= new float[n_chunks];
     WHISPER_LOG_INFO("%s: props size: %u\n", __func__, n_chunks);
 
-    std::vector<float> window_with_context(vctx->n_window, 0.0f);
+    std::vector<float> window(vctx->n_window, 0.0f);
     for (int i = 0; i < n_chunks; i++) {
         int start_idx = i * vctx->n_window;
         int end_idx = std::min(start_idx + vctx->n_window, n_samples);
@@ -5134,22 +5127,22 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
             std::vector<float> partial_chunk(vctx->n_window, 0.0f);
             std::copy(pcmf32 + start_idx, pcmf32 + end_idx, partial_chunk.begin());
 
-            // Copy the zero-padded chunk after the context
+            // Copy the zero-padded chunk to the window.
             int max_samples_to_copy = vctx->n_window;
             int actual_samples_to_copy = std::min(max_samples_to_copy, (int)partial_chunk.size());
-            std::copy(partial_chunk.begin(), partial_chunk.begin() + actual_samples_to_copy, window_with_context.begin());
+            std::copy(partial_chunk.begin(), partial_chunk.begin() + actual_samples_to_copy, window.begin());
             if (actual_samples_to_copy < max_samples_to_copy) {
-                std::fill(window_with_context.begin() + actual_samples_to_copy, window_with_context.end(), 0.0f);
+                std::fill(window.begin() + actual_samples_to_copy, window.end(), 0.0f);
             }
         } else {
-            // Copy current frame samples to after the context.
-            int samples_to_copy = std::min(end_idx - start_idx, 512);
+            // Copy current frame samples to the window.
+            int samples_to_copy = std::min(end_idx - start_idx, vctx->n_window);
             std::copy(pcmf32 + start_idx, pcmf32 + start_idx + samples_to_copy,
-                 window_with_context.begin());
+                 window.begin());
         }
 
-        // Set the frame tensor data with the context + the samples.
-        ggml_backend_tensor_set(frame, window_with_context.data(), 0, ggml_nelements(frame) * sizeof(float));
+        // Set the frame tensor data with the samples.
+        ggml_backend_tensor_set(frame, window.data(), 0, ggml_nelements(frame) * sizeof(float));
 
         ggml_backend_tensor_set(h_in, h_state.data(), 0, hidden_dim * sizeof(float));
         ggml_backend_tensor_set(c_in, c_state.data(), 0, hidden_dim * sizeof(float));
@@ -5229,7 +5222,7 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
     } speech_segment_t;
 
     // Allocate initial memory for speech segments.
-    int  speech_capacity   = 16;
+    int speech_capacity = 16;
     speech_segment_t * speeches = (speech_segment_t*)malloc(speech_capacity * sizeof(speech_segment_t));
     if (!speeches) {
         WHISPER_LOG_ERROR("%s: failed to allocate memory for temporary segments\n", __func__);
@@ -5246,8 +5239,29 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
     int  temp_end             = 0;
     int  prev_end             = 0;
     int  next_start           = 0;
-    int  curr_speech_start = 0;
-    bool has_curr_speech   = false;
+    int  curr_speech_start    = 0;
+    bool has_curr_speech      = false;
+
+    auto resize_speeches = [&]() -> bool { // Grows `speeches` when it is full; returns false if realloc failed.
+        if (speech_count >= speech_capacity) { // No free slot is left for the next segment.
+            speech_capacity *= 2; // Double the capacity.
+            speech_segment_t* new_speeches = (speech_segment_t*)realloc(speeches,
+                                              speech_capacity * sizeof(speech_segment_t));
+            if (!new_speeches) { // __func__ in a lambda body names operator(), so the enclosing function is spelled out.
+                WHISPER_LOG_ERROR("%s: failed to reallocate memory for speech segments\n", "whisper_vad_timestamps_from_probs");
+                free(speeches); // The old buffer is released here; `speeches` must not be used afterwards.
+                return false;
+            }
+            speeches = new_speeches;
+
+            // Zero the slots from speech_count up to the new capacity.
+            for (int j = speech_count; j < speech_capacity; j++) {
+                speeches[j].start = 0;
+                speeches[j].end = 0;
+            }
+        }
+        return true; // Also the result when no growth was needed.
+    };
 
     for (int i = 0; i < n_probs; i++) {
         float curr_prob   = probs[i];
@@ -5273,21 +5287,8 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
         if (is_speech_segment && (curr_sample - curr_speech_start) > max_speech_samples) {
             if (prev_end) {
                 // Check if we need to increase capacity
-                if (speech_count >= speech_capacity) {
-                    speech_capacity *= 2;
-                    speech_segment_t * new_speeches = (speech_segment_t*)realloc(speeches, speech_capacity * sizeof(speech_segment_t));
-                    if (!new_speeches) {
-                        WHISPER_LOG_ERROR("%s: failed to reallocate memory for speech segments\n", __func__);
-                        free(speeches);
-                        return { 0, nullptr };
-                    }
-                    speeches = new_speeches;
-
-                    // Initialize new memory
-                    for (int j = speech_count; j < speech_capacity; j++) {
-                        speeches[j].start = 0;
-                        speeches[j].end = 0;
-                    }
+                if (!resize_speeches()) {
+                    return { 0, nullptr };
                 }
 
                 // Add segment ending at previously detected silence
@@ -5305,21 +5306,8 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
                 prev_end = next_start = temp_end = 0;
             } else {
                 // No silence detected, force end the segment
-                if (speech_count >= speech_capacity) {
-                    speech_capacity *= 2;
-                    speech_segment_t * new_speeches = (speech_segment_t*)realloc(speeches, speech_capacity * sizeof(speech_segment_t));
-                    if (!new_speeches) {
-                        WHISPER_LOG_ERROR("%s: failed to reallocate memory for speech segments\n", __func__);
-                        free(speeches);
-                        return { 0, nullptr };
-                    }
-                    speeches = new_speeches;
-
-                    // Initialize new memory
-                    for (int j = speech_count; j < speech_capacity; j++) {
-                        speeches[j].start = 0;
-                        speeches[j].end = 0;
-                    }
+                if (!resize_speeches()) {
+                    return { 0, nullptr };
                 }
 
                 speeches[speech_count].start = curr_speech_start;
@@ -5351,21 +5339,8 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
                 // End the segment if it's long enough
                 if ((temp_end - curr_speech_start) > min_speech_samples) {
                     // Check if we need to increase capacity
-                    if (speech_count >= speech_capacity) {
-                        speech_capacity *= 2;
-                        speech_segment_t * new_speeches = (speech_segment_t*)realloc(speeches, speech_capacity * sizeof(speech_segment_t));
-                        if (!new_speeches) {
-                            WHISPER_LOG_ERROR("%s: failed to reallocate memory for speech segments\n", __func__);
-                            free(speeches);
-                            return { 0, nullptr };
-                        }
-                        speeches = new_speeches;
-
-                        // Initialize new memory
-                        for (int j = speech_count; j < speech_capacity; j++) {
-                            speeches[j].start = 0;
-                            speeches[j].end = 0;
-                        }
+                    if (!resize_speeches()) {
+                        return { 0, nullptr };
                     }
 
                     speeches[speech_count].start = curr_speech_start;
@@ -5384,21 +5359,8 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
     // Handle the case if we're still in a speech segment at the end
     if (has_curr_speech && (audio_length_samples - curr_speech_start) > min_speech_samples) {
         // Check if we need to increase capacity
-        if (speech_count >= speech_capacity) {
-            speech_capacity *= 2;
-            speech_segment_t * new_speeches = (speech_segment_t*)realloc(speeches, speech_capacity * sizeof(speech_segment_t));
-            if (!new_speeches) {
-                WHISPER_LOG_ERROR("%s: failed to reallocate memory for speech segments\n", __func__);
-                free(speeches);
-                return { 0, nullptr };
-            }
-            speeches = new_speeches;
-
-            // Initialize new memory
-            for (int j = speech_count; j < speech_capacity; j++) {
-                speeches[j].start = 0;
-                speeches[j].end = 0;
-            }
+        if (!resize_speeches()) {
+            return { 0, nullptr };
         }
 
         speeches[speech_count].start = curr_speech_start;