Skip to content

Commit 5758650

Browse files
committed
vad : extract speeches resize into lambda [no ci]
1 parent 55ca3f2 commit 5758650

File tree

2 files changed

+45
-83
lines changed

2 files changed

+45
-83
lines changed

include/whisper.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -682,7 +682,7 @@ extern "C" {
682682
bool use_gpu;
683683
int gpu_device; // CUDA device
684684
};
685-
WHISPER_API struct whisper_vad_context_params whisper_vad_default_context_params(void);
685+
WHISPER_API struct whisper_vad_context_params whisper_vad_default_context_params(void);
686686

687687
WHISPER_API struct whisper_vad_state * whisper_vad_init_state(struct whisper_vad_context * ctx);
688688

src/whisper.cpp

+44-82
Original file line numberDiff line numberDiff line change
@@ -4649,7 +4649,6 @@ static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx,
46494649
whisper_vad_state & vstate) {
46504650
const auto & model = vctx.model;
46514651

4652-
WHISPER_LOG_INFO("%s: Building VAD graph\n", __func__);
46534652
struct ggml_init_params params = {
46544653
/*.mem_size =*/ vstate.sched.meta.size(),
46554654
/*.mem_buffer =*/ vstate.sched.meta.data(),
@@ -4828,16 +4827,12 @@ struct whisper_vad_context * whisper_vad_init_with_params_no_state(struct whispe
48284827
for (int32_t i = 0; i < hparams.n_encoder_layers; i++) {
48294828
WHISPER_LOG_INFO("%s: encoder_out_channels[%d] = %d\n", __func__, i, hparams.encoder_out_channels[i]);
48304829
}
4831-
for (int32_t i = 0; i < hparams.n_encoder_layers; i++) {
4832-
WHISPER_LOG_INFO("%s: kernel_sizes[%d] = %d\n", __func__, i, hparams.kernel_sizes[i]);
4833-
}
48344830
WHISPER_LOG_INFO("%s: lstm_input_size = %d\n", __func__, hparams.lstm_input_size);
48354831
WHISPER_LOG_INFO("%s: lstm_hidden_size = %d\n", __func__, hparams.lstm_hidden_size);
48364832
WHISPER_LOG_INFO("%s: final_conv_in = %d\n", __func__, hparams.final_conv_in);
48374833
WHISPER_LOG_INFO("%s: final_conv_out = %d\n", __func__, hparams.final_conv_out);
48384834
}
48394835

4840-
48414836
// 1 STFT tensor, 4*2 encoder tensors, 4 LSTM tensors, 2 final output tensors
48424837
const size_t n_tensors = hparams.n_encoder_layers * 2 + 4 + 2 + 1;
48434838

@@ -4884,7 +4879,7 @@ struct whisper_vad_context * whisper_vad_init_with_params_no_state(struct whispe
48844879
return tensor;
48854880
};
48864881

4887-
// prepare tensors for the weights
4882+
// create tensors
48884883
{
48894884
ggml_init_params params = {
48904885
/*.mem_size =*/ n_tensors * ggml_tensor_overhead(),
@@ -4995,9 +4990,7 @@ struct whisper_vad_context * whisper_vad_init_with_params_no_state(struct whispe
49954990
// load weights
49964991
{
49974992
size_t total_size = 0;
4998-
49994993
model.n_loaded = 0;
5000-
50014994
std::vector<char> read_buf;
50024995

50034996
while (true) {
@@ -5021,8 +5014,8 @@ struct whisper_vad_context * whisper_vad_init_with_params_no_state(struct whispe
50215014
}
50225015

50235016
std::string name;
5024-
std::vector<char> tmp(length); // create a buffer
5025-
loader->read(loader->context, &tmp[0], tmp.size()); // read to buffer
5017+
std::vector<char> tmp(length);
5018+
loader->read(loader->context, &tmp[0], tmp.size());
50265019
name.assign(&tmp[0], tmp.size());
50275020

50285021
if (model.tensors.find(name) == model.tensors.end()) {
@@ -5123,7 +5116,7 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
51235116
float * probs= new float[n_chunks];
51245117
WHISPER_LOG_INFO("%s: props size: %u\n", __func__, n_chunks);
51255118

5126-
std::vector<float> window_with_context(vctx->n_window, 0.0f);
5119+
std::vector<float> window(vctx->n_window, 0.0f);
51275120
for (int i = 0; i < n_chunks; i++) {
51285121
int start_idx = i * vctx->n_window;
51295122
int end_idx = std::min(start_idx + vctx->n_window, n_samples);
@@ -5134,22 +5127,22 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
51345127
std::vector<float> partial_chunk(vctx->n_window, 0.0f);
51355128
std::copy(pcmf32 + start_idx, pcmf32 + end_idx, partial_chunk.begin());
51365129

5137-
// Copy the zero-padded chunk after the context
5130+
// Copy the zero-padded chunk to the window.
51385131
int max_samples_to_copy = vctx->n_window;
51395132
int actual_samples_to_copy = std::min(max_samples_to_copy, (int)partial_chunk.size());
5140-
std::copy(partial_chunk.begin(), partial_chunk.begin() + actual_samples_to_copy, window_with_context.begin());
5133+
std::copy(partial_chunk.begin(), partial_chunk.begin() + actual_samples_to_copy, window.begin());
51415134
if (actual_samples_to_copy < max_samples_to_copy) {
5142-
std::fill(window_with_context.begin() + actual_samples_to_copy, window_with_context.end(), 0.0f);
5135+
std::fill(window.begin() + actual_samples_to_copy, window.end(), 0.0f);
51435136
}
51445137
} else {
5145-
// Copy current frame samples to after the context.
5146-
int samples_to_copy = std::min(end_idx - start_idx, 512);
5138+
// Copy current frame samples to the window.
5139+
int samples_to_copy = std::min(end_idx - start_idx, vctx->n_window);
51475140
std::copy(pcmf32 + start_idx, pcmf32 + start_idx + samples_to_copy,
5148-
window_with_context.begin());
5141+
window.begin());
51495142
}
51505143

5151-
// Set the frame tensor data with the context + the samples.
5152-
ggml_backend_tensor_set(frame, window_with_context.data(), 0, ggml_nelements(frame) * sizeof(float));
5144+
// Set the frame tensor data with the samples.
5145+
ggml_backend_tensor_set(frame, window.data(), 0, ggml_nelements(frame) * sizeof(float));
51535146

51545147
ggml_backend_tensor_set(h_in, h_state.data(), 0, hidden_dim * sizeof(float));
51555148
ggml_backend_tensor_set(c_in, c_state.data(), 0, hidden_dim * sizeof(float));
@@ -5229,7 +5222,7 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
52295222
} speech_segment_t;
52305223

52315224
// Allocate initial memory for speech segments.
5232-
int speech_capacity = 16;
5225+
int speech_capacity = 16;
52335226
speech_segment_t * speeches = (speech_segment_t*)malloc(speech_capacity * sizeof(speech_segment_t));
52345227
if (!speeches) {
52355228
WHISPER_LOG_ERROR("%s: failed to allocate memory for temporary segments\n", __func__);
@@ -5246,8 +5239,29 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
52465239
int temp_end = 0;
52475240
int prev_end = 0;
52485241
int next_start = 0;
5249-
int curr_speech_start = 0;
5250-
bool has_curr_speech = false;
5242+
int curr_speech_start = 0;
5243+
bool has_curr_speech = false;
5244+
5245+
// Grows the `speeches` buffer when it is full so that index `speech_count`
// can be written by the caller.
//
// Captures everything by reference and mutates `speeches` and
// `speech_capacity` of the enclosing function in place.
//
// Returns true when there is room for one more segment (either the buffer
// already had space, or it was successfully doubled). Returns false when
// realloc fails; in that case the old buffer has already been freed here,
// so the caller must not touch or free `speeches` again. The call sites
// visible in this diff respond by returning `{ 0, nullptr }`.
//
// NOTE(review): on the failure path `speech_capacity` stays doubled and
// `speeches` is left pointing at freed memory (it is not reset to nullptr).
auto resize_speeches = [&]() -> bool {
5246+
if (speech_count >= speech_capacity) {
5247+
speech_capacity *= 2; // doubled before the realloc result is known
5248+
speech_segment_t* new_speeches = (speech_segment_t*)realloc(speeches,
5249+
speech_capacity * sizeof(speech_segment_t));
5250+
if (!new_speeches) {
5251+
// NOTE(review): __func__ is now evaluated inside the lambda body, so the log prefix is likely the closure's call operator rather than the enclosing function name - confirm.
WHISPER_LOG_ERROR("%s: failed to reallocate memory for speech segments\n", __func__);
5252+
free(speeches); // a failed realloc leaves the old block allocated, so release it here
5253+
return false;
5254+
}
5255+
speeches = new_speeches;
5256+
5257+
// Zero the start/end fields of every slot from speech_count up to the new capacity
5258+
for (int j = speech_count; j < speech_capacity; j++) {
5259+
speeches[j].start = 0;
5260+
speeches[j].end = 0;
5261+
}
5262+
}
5263+
return true;
5264+
};
52515265

52525266
for (int i = 0; i < n_probs; i++) {
52535267
float curr_prob = probs[i];
@@ -5273,21 +5287,8 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
52735287
if (is_speech_segment && (curr_sample - curr_speech_start) > max_speech_samples) {
52745288
if (prev_end) {
52755289
// Check if we need to increase capacity
5276-
if (speech_count >= speech_capacity) {
5277-
speech_capacity *= 2;
5278-
speech_segment_t * new_speeches = (speech_segment_t*)realloc(speeches, speech_capacity * sizeof(speech_segment_t));
5279-
if (!new_speeches) {
5280-
WHISPER_LOG_ERROR("%s: failed to reallocate memory for speech segments\n", __func__);
5281-
free(speeches);
5282-
return { 0, nullptr };
5283-
}
5284-
speeches = new_speeches;
5285-
5286-
// Initialize new memory
5287-
for (int j = speech_count; j < speech_capacity; j++) {
5288-
speeches[j].start = 0;
5289-
speeches[j].end = 0;
5290-
}
5290+
if (!resize_speeches()) {
5291+
return { 0, nullptr };
52915292
}
52925293

52935294
// Add segment ending at previously detected silence
@@ -5305,21 +5306,8 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
53055306
prev_end = next_start = temp_end = 0;
53065307
} else {
53075308
// No silence detected, force end the segment
5308-
if (speech_count >= speech_capacity) {
5309-
speech_capacity *= 2;
5310-
speech_segment_t * new_speeches = (speech_segment_t*)realloc(speeches, speech_capacity * sizeof(speech_segment_t));
5311-
if (!new_speeches) {
5312-
WHISPER_LOG_ERROR("%s: failed to reallocate memory for speech segments\n", __func__);
5313-
free(speeches);
5314-
return { 0, nullptr };
5315-
}
5316-
speeches = new_speeches;
5317-
5318-
// Initialize new memory
5319-
for (int j = speech_count; j < speech_capacity; j++) {
5320-
speeches[j].start = 0;
5321-
speeches[j].end = 0;
5322-
}
5309+
if (!resize_speeches()) {
5310+
return { 0, nullptr };
53235311
}
53245312

53255313
speeches[speech_count].start = curr_speech_start;
@@ -5351,21 +5339,8 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
53515339
// End the segment if it's long enough
53525340
if ((temp_end - curr_speech_start) > min_speech_samples) {
53535341
// Check if we need to increase capacity
5354-
if (speech_count >= speech_capacity) {
5355-
speech_capacity *= 2;
5356-
speech_segment_t * new_speeches = (speech_segment_t*)realloc(speeches, speech_capacity * sizeof(speech_segment_t));
5357-
if (!new_speeches) {
5358-
WHISPER_LOG_ERROR("%s: failed to reallocate memory for speech segments\n", __func__);
5359-
free(speeches);
5360-
return { 0, nullptr };
5361-
}
5362-
speeches = new_speeches;
5363-
5364-
// Initialize new memory
5365-
for (int j = speech_count; j < speech_capacity; j++) {
5366-
speeches[j].start = 0;
5367-
speeches[j].end = 0;
5368-
}
5342+
if (!resize_speeches()) {
5343+
return { 0, nullptr };
53695344
}
53705345

53715346
speeches[speech_count].start = curr_speech_start;
@@ -5384,21 +5359,8 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
53845359
// Handle the case if we're still in a speech segment at the end
53855360
if (has_curr_speech && (audio_length_samples - curr_speech_start) > min_speech_samples) {
53865361
// Check if we need to increase capacity
5387-
if (speech_count >= speech_capacity) {
5388-
speech_capacity *= 2;
5389-
speech_segment_t * new_speeches = (speech_segment_t*)realloc(speeches, speech_capacity * sizeof(speech_segment_t));
5390-
if (!new_speeches) {
5391-
WHISPER_LOG_ERROR("%s: failed to reallocate memory for speech segments\n", __func__);
5392-
free(speeches);
5393-
return { 0, nullptr };
5394-
}
5395-
speeches = new_speeches;
5396-
5397-
// Initialize new memory
5398-
for (int j = speech_count; j < speech_capacity; j++) {
5399-
speeches[j].start = 0;
5400-
speeches[j].end = 0;
5401-
}
5362+
if (!resize_speeches()) {
5363+
return { 0, nullptr };
54025364
}
54035365

54045366
speeches[speech_count].start = curr_speech_start;

0 commit comments

Comments
 (0)