@@ -4649,7 +4649,6 @@ static struct ggml_cgraph * whisper_vad_build_graph(whisper_vad_context & vctx,
4649
4649
whisper_vad_state & vstate) {
4650
4650
const auto & model = vctx.model ;
4651
4651
4652
- WHISPER_LOG_INFO (" %s: Building VAD graph\n " , __func__);
4653
4652
struct ggml_init_params params = {
4654
4653
/* .mem_size =*/ vstate.sched .meta .size (),
4655
4654
/* .mem_buffer =*/ vstate.sched .meta .data (),
@@ -4828,16 +4827,12 @@ struct whisper_vad_context * whisper_vad_init_with_params_no_state(struct whispe
4828
4827
for (int32_t i = 0 ; i < hparams.n_encoder_layers ; i++) {
4829
4828
WHISPER_LOG_INFO (" %s: encoder_out_channels[%d] = %d\n " , __func__, i, hparams.encoder_out_channels [i]);
4830
4829
}
4831
- for (int32_t i = 0 ; i < hparams.n_encoder_layers ; i++) {
4832
- WHISPER_LOG_INFO (" %s: kernel_sizes[%d] = %d\n " , __func__, i, hparams.kernel_sizes [i]);
4833
- }
4834
4830
WHISPER_LOG_INFO (" %s: lstm_input_size = %d\n " , __func__, hparams.lstm_input_size );
4835
4831
WHISPER_LOG_INFO (" %s: lstm_hidden_size = %d\n " , __func__, hparams.lstm_hidden_size );
4836
4832
WHISPER_LOG_INFO (" %s: final_conv_in = %d\n " , __func__, hparams.final_conv_in );
4837
4833
WHISPER_LOG_INFO (" %s: final_conv_out = %d\n " , __func__, hparams.final_conv_out );
4838
4834
}
4839
4835
4840
-
4841
4836
// 1 STFT tensor, 4*2 encoder tensors, 4 LSTM tensors, 2 final output tensors
4842
4837
const size_t n_tensors = hparams.n_encoder_layers * 2 + 4 + 2 + 1 ;
4843
4838
@@ -4884,7 +4879,7 @@ struct whisper_vad_context * whisper_vad_init_with_params_no_state(struct whispe
4884
4879
return tensor;
4885
4880
};
4886
4881
4887
- // prepare tensors for the weights
4882
+ // create tensors
4888
4883
{
4889
4884
ggml_init_params params = {
4890
4885
/* .mem_size =*/ n_tensors * ggml_tensor_overhead (),
@@ -4995,9 +4990,7 @@ struct whisper_vad_context * whisper_vad_init_with_params_no_state(struct whispe
4995
4990
// load weights
4996
4991
{
4997
4992
size_t total_size = 0 ;
4998
-
4999
4993
model.n_loaded = 0 ;
5000
-
5001
4994
std::vector<char > read_buf;
5002
4995
5003
4996
while (true ) {
@@ -5021,8 +5014,8 @@ struct whisper_vad_context * whisper_vad_init_with_params_no_state(struct whispe
5021
5014
}
5022
5015
5023
5016
std::string name;
5024
- std::vector<char > tmp (length); // create a buffer
5025
- loader->read (loader->context , &tmp[0 ], tmp.size ()); // read to buffer
5017
+ std::vector<char > tmp (length);
5018
+ loader->read (loader->context , &tmp[0 ], tmp.size ());
5026
5019
name.assign (&tmp[0 ], tmp.size ());
5027
5020
5028
5021
if (model.tensors .find (name) == model.tensors .end ()) {
@@ -5123,7 +5116,7 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
5123
5116
float * probs= new float [n_chunks];
5124
5117
WHISPER_LOG_INFO (" %s: props size: %u\n " , __func__, n_chunks);
5125
5118
5126
- std::vector<float > window_with_context (vctx->n_window , 0 .0f );
5119
+ std::vector<float > window (vctx->n_window , 0 .0f );
5127
5120
for (int i = 0 ; i < n_chunks; i++) {
5128
5121
int start_idx = i * vctx->n_window ;
5129
5122
int end_idx = std::min (start_idx + vctx->n_window , n_samples);
@@ -5134,22 +5127,22 @@ struct whisper_vad_speech whisper_vad_detect_speech(struct whisper_vad_context *
5134
5127
std::vector<float > partial_chunk (vctx->n_window , 0 .0f );
5135
5128
std::copy (pcmf32 + start_idx, pcmf32 + end_idx, partial_chunk.begin ());
5136
5129
5137
- // Copy the zero-padded chunk after the context
5130
+ // Copy the zero-padded chunk to the window.
5138
5131
int max_samples_to_copy = vctx->n_window ;
5139
5132
int actual_samples_to_copy = std::min (max_samples_to_copy, (int )partial_chunk.size ());
5140
- std::copy (partial_chunk.begin (), partial_chunk.begin () + actual_samples_to_copy, window_with_context .begin ());
5133
+ std::copy (partial_chunk.begin (), partial_chunk.begin () + actual_samples_to_copy, window .begin ());
5141
5134
if (actual_samples_to_copy < max_samples_to_copy) {
5142
- std::fill (window_with_context .begin () + actual_samples_to_copy, window_with_context .end (), 0 .0f );
5135
+ std::fill (window .begin () + actual_samples_to_copy, window .end (), 0 .0f );
5143
5136
}
5144
5137
} else {
5145
- // Copy current frame samples to after the context .
5146
- int samples_to_copy = std::min (end_idx - start_idx, 512 );
5138
+ // Copy current frame samples to the window .
5139
+ int samples_to_copy = std::min (end_idx - start_idx, vctx-> n_window );
5147
5140
std::copy (pcmf32 + start_idx, pcmf32 + start_idx + samples_to_copy,
5148
- window_with_context .begin ());
5141
+ window .begin ());
5149
5142
}
5150
5143
5151
- // Set the frame tensor data with the context + the samples.
5152
- ggml_backend_tensor_set (frame, window_with_context .data (), 0 , ggml_nelements (frame) * sizeof (float ));
5144
+ // Set the frame tensor data with the samples.
5145
+ ggml_backend_tensor_set (frame, window .data (), 0 , ggml_nelements (frame) * sizeof (float ));
5153
5146
5154
5147
ggml_backend_tensor_set (h_in, h_state.data (), 0 , hidden_dim * sizeof (float ));
5155
5148
ggml_backend_tensor_set (c_in, c_state.data (), 0 , hidden_dim * sizeof (float ));
@@ -5229,7 +5222,7 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
5229
5222
} speech_segment_t ;
5230
5223
5231
5224
// Allocate initial memory for speech segments.
5232
- int speech_capacity = 16 ;
5225
+ int speech_capacity = 16 ;
5233
5226
speech_segment_t * speeches = (speech_segment_t *)malloc (speech_capacity * sizeof (speech_segment_t ));
5234
5227
if (!speeches) {
5235
5228
WHISPER_LOG_ERROR (" %s: failed to allocate memory for temporary segments\n " , __func__);
@@ -5246,8 +5239,29 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
5246
5239
int temp_end = 0 ;
5247
5240
int prev_end = 0 ;
5248
5241
int next_start = 0 ;
5249
- int curr_speech_start = 0 ;
5250
- bool has_curr_speech = false ;
5242
+ int curr_speech_start = 0 ;
5243
+ bool has_curr_speech = false ;
5244
+
5245
+ auto resize_speeches = [&]() -> bool { // Grows the by-reference-captured `speeches` buffer when it is full; returns false if the reallocation fails, true otherwise (including when no growth was needed).
5246
+ if (speech_count >= speech_capacity) { // No free slot left for the next segment.
5247
+ speech_capacity *= 2 ; // Double the capacity on each growth.
5248
+ speech_segment_t * new_speeches = (speech_segment_t *)realloc (speeches, // Result goes into a temporary so the old pointer is still available if realloc fails.
5249
+ speech_capacity * sizeof (speech_segment_t ));
5250
+ if (!new_speeches) {
5251
+ WHISPER_LOG_ERROR (" %s: failed to reallocate memory for speech segments\n " , __func__);
5252
+ free (speeches); // Releases the old buffer; `speeches` is not reset here, so callers must not use it after a false return.
5253
+ return false ;
5254
+ }
5255
+ speeches = new_speeches;
5256
+
5257
+ // Zero start/end for every slot from speech_count up to the new capacity.
5258
+ for (int j = speech_count; j < speech_capacity; j++) {
5259
+ speeches[j].start = 0 ;
5260
+ speeches[j].end = 0 ;
5261
+ }
5262
+ }
5263
+ return true ; // Either there was already room or the buffer was grown successfully.
5264
+ };
5251
5265
5252
5266
for (int i = 0 ; i < n_probs; i++) {
5253
5267
float curr_prob = probs[i];
@@ -5273,21 +5287,8 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
5273
5287
if (is_speech_segment && (curr_sample - curr_speech_start) > max_speech_samples) {
5274
5288
if (prev_end) {
5275
5289
// Check if we need to increase capacity
5276
- if (speech_count >= speech_capacity) {
5277
- speech_capacity *= 2 ;
5278
- speech_segment_t * new_speeches = (speech_segment_t *)realloc (speeches, speech_capacity * sizeof (speech_segment_t ));
5279
- if (!new_speeches) {
5280
- WHISPER_LOG_ERROR (" %s: failed to reallocate memory for speech segments\n " , __func__);
5281
- free (speeches);
5282
- return { 0 , nullptr };
5283
- }
5284
- speeches = new_speeches;
5285
-
5286
- // Initialize new memory
5287
- for (int j = speech_count; j < speech_capacity; j++) {
5288
- speeches[j].start = 0 ;
5289
- speeches[j].end = 0 ;
5290
- }
5290
+ if (!resize_speeches ()) {
5291
+ return { 0 , nullptr };
5291
5292
}
5292
5293
5293
5294
// Add segment ending at previously detected silence
@@ -5305,21 +5306,8 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
5305
5306
prev_end = next_start = temp_end = 0 ;
5306
5307
} else {
5307
5308
// No silence detected, force end the segment
5308
- if (speech_count >= speech_capacity) {
5309
- speech_capacity *= 2 ;
5310
- speech_segment_t * new_speeches = (speech_segment_t *)realloc (speeches, speech_capacity * sizeof (speech_segment_t ));
5311
- if (!new_speeches) {
5312
- WHISPER_LOG_ERROR (" %s: failed to reallocate memory for speech segments\n " , __func__);
5313
- free (speeches);
5314
- return { 0 , nullptr };
5315
- }
5316
- speeches = new_speeches;
5317
-
5318
- // Initialize new memory
5319
- for (int j = speech_count; j < speech_capacity; j++) {
5320
- speeches[j].start = 0 ;
5321
- speeches[j].end = 0 ;
5322
- }
5309
+ if (!resize_speeches ()) {
5310
+ return { 0 , nullptr };
5323
5311
}
5324
5312
5325
5313
speeches[speech_count].start = curr_speech_start;
@@ -5351,21 +5339,8 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
5351
5339
// End the segment if it's long enough
5352
5340
if ((temp_end - curr_speech_start) > min_speech_samples) {
5353
5341
// Check if we need to increase capacity
5354
- if (speech_count >= speech_capacity) {
5355
- speech_capacity *= 2 ;
5356
- speech_segment_t * new_speeches = (speech_segment_t *)realloc (speeches, speech_capacity * sizeof (speech_segment_t ));
5357
- if (!new_speeches) {
5358
- WHISPER_LOG_ERROR (" %s: failed to reallocate memory for speech segments\n " , __func__);
5359
- free (speeches);
5360
- return { 0 , nullptr };
5361
- }
5362
- speeches = new_speeches;
5363
-
5364
- // Initialize new memory
5365
- for (int j = speech_count; j < speech_capacity; j++) {
5366
- speeches[j].start = 0 ;
5367
- speeches[j].end = 0 ;
5368
- }
5342
+ if (!resize_speeches ()) {
5343
+ return { 0 , nullptr };
5369
5344
}
5370
5345
5371
5346
speeches[speech_count].start = curr_speech_start;
@@ -5384,21 +5359,8 @@ struct whisper_vad_timestamps whisper_vad_timestamps_from_probs(whisper_vad_cont
5384
5359
// Handle the case if we're still in a speech segment at the end
5385
5360
if (has_curr_speech && (audio_length_samples - curr_speech_start) > min_speech_samples) {
5386
5361
// Check if we need to increase capacity
5387
- if (speech_count >= speech_capacity) {
5388
- speech_capacity *= 2 ;
5389
- speech_segment_t * new_speeches = (speech_segment_t *)realloc (speeches, speech_capacity * sizeof (speech_segment_t ));
5390
- if (!new_speeches) {
5391
- WHISPER_LOG_ERROR (" %s: failed to reallocate memory for speech segments\n " , __func__);
5392
- free (speeches);
5393
- return { 0 , nullptr };
5394
- }
5395
- speeches = new_speeches;
5396
-
5397
- // Initialize new memory
5398
- for (int j = speech_count; j < speech_capacity; j++) {
5399
- speeches[j].start = 0 ;
5400
- speeches[j].end = 0 ;
5401
- }
5362
+ if (!resize_speeches ()) {
5363
+ return { 0 , nullptr };
5402
5364
}
5403
5365
5404
5366
speeches[speech_count].start = curr_speech_start;
0 commit comments