@@ -88,42 +88,45 @@ static struct ggml_tensor * build_ada_residual_conv(ggml_context * ctx, struct g

gamma = ggml_add(ctx, ggml_mul_mat(ctx, block->norm1_gamma, style), block->norm1_gamma_bias);
beta = ggml_add(ctx, ggml_mul_mat(ctx, block->norm1_beta, style), block->norm1_beta_bias);
- cur = ggml_cont(ctx, ggml_transpose(ctx, ggml_norm(ctx, ggml_cont(ctx, ggml_transpose(ctx, cur)), 0.00001)));
+ cur = ggml_norm(ctx, x, 0.00001);

// The addition between gamma * x and x is performed here because ggml doesn't support scalar multiplication without initializing the scalars in advance.
// An optimal remedy to this would be to increment the gamma bias above by one when preparing the gguf file for the model.
- cur = ggml_add(ctx, ggml_add(ctx, cur, ggml_mul(ctx, cur, gamma)), beta);
- cur = ggml_leaky_relu(ctx, cur, 0.2f, false);
+ cur = ggml_add(ctx, cur, ggml_mul(ctx, cur, ggml_transpose(ctx, gamma)));
+ cur = ggml_add(ctx, cur, ggml_transpose(ctx, beta));
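+ // net effect: cur = (1 + gamma) * cur + beta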
+ cur = ggml_leaky_relu(ctx, cur, 0.2f, false);

if (block->pool) {
- cur = ggml_conv_transpose_1d(ctx, block->pool, ggml_cont(ctx, ggml_transpose(ctx, cur)), 2, 1, 1, 1, cur->ne[0]);
+ cur = ggml_conv_transpose_1d(ctx, block->pool, cur, 2, 1, 1, 1, cur->ne[1]);
cur = ggml_add(ctx, cur, block->pool_bias);
- cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
}

- cur = ggml_conv_1d(ctx, block->conv1, ggml_cont(ctx, ggml_transpose(ctx, cur)), 1, 1, 1);
+ cur = ggml_conv_1d(ctx, block->conv1, cur, 1, 1, 1);

cur = ggml_add(ctx, cur, block->conv1_bias);
gamma = ggml_add(ctx, ggml_mul_mat(ctx, block->norm2_gamma, style), block->norm2_gamma_bias);
beta = ggml_add(ctx, ggml_mul_mat(ctx, block->norm2_beta, style), block->norm2_beta_bias);
- cur = ggml_cont(ctx, ggml_transpose(ctx, ggml_norm(ctx, cur, 0.00001)));
+ cur = ggml_norm(ctx, cur, 0.00001);

// The addition between gamma * x and x is performed here because ggml doesn't support scalar multiplication without initializing the scalars in advance.
// An optimal remedy to this would be to increment the gamma bias above by one when preparing the gguf file for the model.
- cur = ggml_add(ctx, ggml_add(ctx, cur, ggml_mul(ctx, cur, gamma)), beta);
- cur = ggml_leaky_relu(ctx, cur, 0.2f, false);
- cur = ggml_add(ctx, ggml_conv_1d(ctx, block->conv2, ggml_cont(ctx, ggml_transpose(ctx, cur)), 1, 1, 1), block->conv2_bias);
+ cur = ggml_add(ctx, cur, ggml_mul(ctx, cur, ggml_transpose(ctx, gamma)));
+ cur = ggml_add(ctx, cur, ggml_transpose(ctx, beta));
+ cur = ggml_leaky_relu(ctx, cur, 0.2f, false);
+ cur = ggml_add(ctx, ggml_conv_1d(ctx, block->conv2, cur, 1, 1, 1), block->conv2_bias);

- struct ggml_tensor * res = cur;
- cur = ggml_cont(ctx, ggml_transpose(ctx, x));
+ struct ggml_tensor * res = cur;
+ cur = x;
if (block->upsample) {
+ cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
if (block->pool) {
- cur = ggml_upscale_ext(ctx, cur, cur->ne[0] * 2, cur->ne[1], cur->ne[2], cur->ne[3]);
+ cur = ggml_upscale_ext(ctx, cur, cur->ne[0], cur->ne[1] * 2, cur->ne[2], cur->ne[3]);
}
- cur = ggml_conv_1d(ctx, block->upsample, cur, 1, 0, 1);
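+ // block->upsample holds the conv1x1 kernel squeezed to 2D at load time (see squeeze_3d_2d_e0), so the 1x1 conv can be done as a plain mat-mul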
+ cur = ggml_mul_mat(ctx, block->upsample, cur);
+ cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
}
-
- return ggml_cont(ctx, ggml_transpose(ctx, ggml_div(ctx, ggml_add(ctx, res, cur), sqrt_tensor)));
+ cur = ggml_div(ctx, ggml_add(ctx, res, cur), sqrt_tensor);
+ return cur;
}

static struct ggml_tensor * build_kokoro_generator_res_block(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * style, kokoro_generator_residual_block * block) {
@@ -158,6 +161,7 @@ static struct ggml_tensor * build_kokoro_generator_res_block(ggml_context * ctx,
}

static struct ggml_tensor * build_noise_block(ggml_context * ctx, kokoro_noise_residual_block * block, struct ggml_tensor * x, struct ggml_tensor * style) {
+ // This conv_1d seems replaceable with a squeezed and transposed ggml_mul_mat, but s0 and p0 are dynamic
ggml_tensor * cur = ggml_add(ctx, ggml_conv_1d(ctx, block->input_conv, x, block->input_conv_stride, block->input_conv_padding, 1), block->input_conv_bias);
return build_kokoro_generator_res_block(ctx, cur, style, block->res_block);
}
@@ -510,6 +514,16 @@ void kokoro_model::assign_gen_resblock(kokoro_generator_residual_block * block,
}
}

+ /**
+  * Removes the last axis, for cases where it's redundantly of length 1.
+  * assert x.ndim == 3; numpy.squeeze(x, axis=-1)
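+  * e.g. ne = [1, N, M] -> [N, M], so 1x1 conv kernels can be passed straight to ggml_mul_mat.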
+  */
+ static ggml_tensor * squeeze_3d_2d_e0(ggml_context * ctx, ggml_tensor * x) {
+ TTS_ASSERT(x->ne[0] == 1);
+ TTS_ASSERT(ggml_is_contiguous(x));
+ return ggml_reshape_2d(ctx, x, x->ne[1], x->ne[2]);
+ }
+
void kokoro_model::assign_ada_res_block(ada_residual_conv_block * block, std::string name, ggml_tensor * tensor) {
if (name == "norm1_gamma_weight") {
block->norm1_gamma = ggml_dup_tensor(ctx, tensor);
@@ -554,6 +568,7 @@ void kokoro_model::assign_ada_res_block(ada_residual_conv_block * block, std::st
block->pool_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
set_tensor(block->pool_bias, tensor);
} else if (name == "conv1x1_weight") {
+ tensor = squeeze_3d_2d_e0(ctx, tensor);
block->upsample = ggml_dup_tensor(ctx, tensor);
set_tensor(block->upsample, tensor);
} else if (name == "conv1x1_bias") {
@@ -576,6 +591,7 @@ void kokoro_model::assign_decoder_weight(std::string name, ggml_tensor * tensor)
decoder->n_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
set_tensor(decoder->n_conv_bias, tensor);
} else if (name == "asr_conv_weight") {
+ tensor = squeeze_3d_2d_e0(ctx, tensor);
decoder->asr_conv = ggml_dup_tensor(ctx, tensor);
set_tensor(decoder->asr_conv, tensor);
} else if (name == "asr_conv_bias") {
@@ -607,12 +623,14 @@ void kokoro_model::assign_duration_weight(std::string name, ggml_tensor * tensor
prosody_pred->duration_proj_bias = ggml_dup_tensor(ctx, tensor);
set_tensor(prosody_pred->duration_proj_bias, tensor);
} else if (name == "n_proj_kernel") {
+ tensor = squeeze_3d_2d_e0(ctx, tensor);
prosody_pred->n_proj_kernel = ggml_dup_tensor(ctx, tensor);
set_tensor(prosody_pred->n_proj_kernel, tensor);
} else if (name == "n_proj_bias") {
prosody_pred->n_proj_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
set_tensor(prosody_pred->n_proj_bias, tensor);
} else if (name == "f0_proj_kernel") {
+ tensor = squeeze_3d_2d_e0(ctx, tensor);
prosody_pred->f0_proj_kernel = ggml_dup_tensor(ctx, tensor);
set_tensor(prosody_pred->f0_proj_kernel, tensor);
} else if (name == "f0_proj_bias") {
@@ -1147,20 +1165,27 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {

cur = build_lstm(ctx, cur, model->prosody_pred->shared_lstm, cur->ne[1]);

- ggml_build_forward_expand(gf, cur);

struct ggml_tensor * f0_curve = cur;
+ f0_curve = ggml_cont(ctx, ggml_transpose(ctx, f0_curve));
for (auto block : model->prosody_pred->f0_blocks) {
f0_curve = build_ada_residual_conv(ctx, f0_curve, block, style_half, model->sqrt_tensor);
}
- f0_curve = ggml_add(ctx, ggml_conv_1d(ctx, model->prosody_pred->f0_proj_kernel, ggml_cont(ctx, ggml_transpose(ctx, f0_curve)), 1, 0, 1), model->prosody_pred->f0_proj_bias);
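+ // f0_proj_kernel is squeezed to 2D when the weights are assigned, so the 1x1 conv projection reduces to ggml_mul_mat plus a bias add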
+ f0_curve = ggml_cont(ctx, ggml_transpose(ctx, f0_curve));
+ f0_curve = ggml_mul_mat(ctx, model->prosody_pred->f0_proj_kernel, f0_curve);
+ f0_curve = squeeze_3d_2d_e0(ctx, f0_curve);
+ f0_curve = ggml_add(ctx, f0_curve, model->prosody_pred->f0_proj_bias);
ggml_set_name(f0_curve, "f0_out");

struct ggml_tensor * n = cur;
+ n = ggml_cont(ctx, ggml_transpose(ctx, n));
for (auto block : model->prosody_pred->n_blocks) {
n = build_ada_residual_conv(ctx, n, block, style_half, model->sqrt_tensor);
}
- n = ggml_add(ctx, ggml_conv_1d(ctx, model->prosody_pred->n_proj_kernel, ggml_cont(ctx, ggml_transpose(ctx, n)), 1, 0, 1), model->prosody_pred->n_proj_bias);
+ n = ggml_cont(ctx, ggml_transpose(ctx, n));
+ n = ggml_mul_mat(ctx, model->prosody_pred->n_proj_kernel, n);
+ n = squeeze_3d_2d_e0(ctx, n);
+ n = ggml_add(ctx, n, model->prosody_pred->n_proj_bias);
ggml_set_name(n, "n_out");
ggml_build_forward_expand(gf, n);

@@ -1188,17 +1213,20 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
struct ggml_tensor * style_half2 = ggml_view_1d(ctx, voice, voice->ne[0]/2, (batch.n_tokens - 3) * voice->nb[1]);

{
- f0 = ggml_cont(ctx, ggml_transpose(ctx, ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->f0_conv, f0_curve, 2, 1, 1), model->decoder->f0_conv_bias)));
- n_base = ggml_cont(ctx, ggml_transpose(ctx, ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->n_conv, n, 2, 1, 1), model->decoder->n_conv_bias)));
- cur = ggml_concat(ctx, ggml_concat(ctx, asr, f0, 0), n_base, 0);
-
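+ // f0 and n_base are left in the conv_1d output layout below, so the features are concatenated along dim 1 rather than dim 0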
+ f0 = ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->f0_conv, f0_curve, 2, 1, 1), model->decoder->f0_conv_bias);
+ n_base = ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->n_conv, n, 2, 1, 1), model->decoder->n_conv_bias);
+ cur = ggml_concat(ctx, ggml_concat(ctx, ggml_cont(ctx, ggml_transpose(ctx, asr)), f0, 1), n_base, 1);
cur = build_ada_residual_conv(ctx, cur, model->decoder->encoder_block, style_half2, model->sqrt_tensor);
- asr_res = ggml_cont(ctx, ggml_transpose(ctx, ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->asr_conv, ggml_cont(ctx, ggml_transpose(ctx, asr)), 1, 0, 1), model->decoder->asr_conv_bias)));

+ asr_res = ggml_mul_mat(ctx, model->decoder->asr_conv, asr);
+ asr_res = ggml_add(ctx, asr_res, ggml_transpose(ctx, model->decoder->asr_conv_bias));
+
+ asr_res = ggml_cont(ctx, ggml_transpose(ctx, asr_res));
for (auto l : model->decoder->decoder_blocks) {
- cur = ggml_concat(ctx, ggml_concat(ctx, ggml_concat(ctx, cur, asr_res, 0), f0, 0), n_base, 0);
+ cur = ggml_concat(ctx, ggml_concat(ctx, ggml_concat(ctx, cur, asr_res, 1), f0, 1), n_base, 1);
cur = build_ada_residual_conv(ctx, cur, l, style_half2, model->sqrt_tensor);
}
+ cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
}

kctx->window_sq_sum = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, kctx->total_duration * model->up_sampling_factor);