@@ -4369,35 +4369,35 @@ struct whisper_vad_model {
4369
4369
e_vad_model type = VAD_MODEL_UNKNOWN;
4370
4370
whisper_vad_hparams hparams;
4371
4371
4372
- struct ggml_tensor * stft_forward_basis; // [258, 256 ]
4372
+ struct ggml_tensor * stft_forward_basis; // [256, 258 ]
4373
4373
4374
4374
// Encoder tensors - 4 convolutional layers
4375
- struct ggml_tensor * encoder_0_weight; // [128 , 129, 3 ]
4375
+ struct ggml_tensor * encoder_0_weight; // [3 , 129, 128 ]
4376
4376
struct ggml_tensor * encoder_0_bias; // [128]
4377
4377
4378
4378
// Second encoder layer
4379
- struct ggml_tensor * encoder_1_weight; // [64 , 128, 3 ]
4379
+ struct ggml_tensor * encoder_1_weight; // [3 , 128, 64 ]
4380
4380
struct ggml_tensor * encoder_1_bias; // [64]
4381
4381
4382
4382
// Third encoder layer
4383
- struct ggml_tensor * encoder_2_weight; // [64 , 64, 3 ]
4383
+ struct ggml_tensor * encoder_2_weight; // [3 , 64, 64 ]
4384
4384
struct ggml_tensor * encoder_2_bias; // [64]
4385
4385
4386
4386
// Fourth encoder layer
4387
- struct ggml_tensor * encoder_3_weight; // [128 , 64, 3 ]
4387
+ struct ggml_tensor * encoder_3_weight; // [3 , 64, 128 ]
4388
4388
struct ggml_tensor * encoder_3_bias; // [128]
4389
4389
4390
4390
// LSTM decoder tensors
4391
- struct ggml_tensor * lstm_ih_weight; // [512, 128 ] input-to-hidden
4391
+ struct ggml_tensor * lstm_ih_weight; // [128, 512 ] input-to-hidden
4392
4392
struct ggml_tensor * lstm_ih_bias; // [512]
4393
- struct ggml_tensor * lstm_hh_weight; // [512, 128 ] hidden-to-hidden
4393
+ struct ggml_tensor * lstm_hh_weight; // [128, 512 ] hidden-to-hidden
4394
4394
struct ggml_tensor * lstm_hh_bias; // [512]
4395
4395
4396
4396
// Final conv layer
4397
- struct ggml_tensor * final_conv_weight; // [1, 128, 1 ]
4397
+ struct ggml_tensor * final_conv_weight; // [128]
4398
4398
struct ggml_tensor * final_conv_bias; // [1]
4399
4399
4400
- // ggml context
4400
+ // ggml contexts
4401
4401
std::vector<ggml_context *> ctxs;
4402
4402
4403
4403
// buffer for the model tensors
@@ -4887,20 +4887,16 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
4887
4887
VAD_TENSOR_LSTM_WEIGHT_IH,
4888
4888
ggml_new_tensor_2d (ctx, GGML_TYPE_F32, hparams.lstm_hidden_size , hstate_dim)
4889
4889
);
4890
+ model.lstm_ih_bias = create_tensor (
4891
+ VAD_TENSOR_LSTM_BIAS_IH,
4892
+ ggml_new_tensor_1d (ctx, GGML_TYPE_F32, hstate_dim)
4893
+ );
4890
4894
4891
4895
// LSTM weights - hidden to hidden
4892
4896
model.lstm_hh_weight = create_tensor (
4893
4897
VAD_TENSOR_LSTM_WEIGHT_HH,
4894
4898
ggml_new_tensor_2d (ctx, GGML_TYPE_F32, hparams.lstm_hidden_size , hstate_dim)
4895
4899
);
4896
-
4897
- // LSTM bias - input to hidden
4898
- model.lstm_ih_bias = create_tensor (
4899
- VAD_TENSOR_LSTM_BIAS_IH,
4900
- ggml_new_tensor_1d (ctx, GGML_TYPE_F32, hstate_dim)
4901
- );
4902
-
4903
- // LSTM bias - hidden to hidden
4904
4900
model.lstm_hh_bias = create_tensor (
4905
4901
VAD_TENSOR_LSTM_BIAS_HH,
4906
4902
ggml_new_tensor_1d (ctx, GGML_TYPE_F32, hstate_dim)
@@ -4911,8 +4907,6 @@ whisper_vad_context * whisper_vad_init_from_file_with_params_no_state(
4911
4907
VAD_TENSOR_FINAL_CONV_WEIGHT,
4912
4908
ggml_new_tensor_2d (ctx, GGML_TYPE_F32, hparams.final_conv_in , 1 )
4913
4909
);
4914
-
4915
- // Final conv layer bias
4916
4910
model.final_conv_bias = create_tensor (
4917
4911
VAD_TENSOR_FINAL_CONV_BIAS,
4918
4912
ggml_new_tensor_1d (ctx, GGML_TYPE_F32, 1 )
0 commit comments