Commit 8e50738

Merge pull request #64 from danielzgtg/optimize/k1conv1d
kokoro: Replace K=1 conv_1d with mul_mat
2 parents: d3d8c81 + 0bcb0a6
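The change in one sentence: a conv_1d whose kernel size is 1 applies the same linear map to every frame, so it can be computed as a single ggml_mul_mat over the squeezed kernel, skipping conv_1d's im2col staging. A minimal sketch of the equivalence, assuming a contiguous kernel w with ne = [1, C_in, C_out] and activations x with ne = [C_in, T]; the helper name and shapes are illustrative, not part of this commit:

    #include "ggml.h"

    // Sketch: a pointwise (K = 1) conv1d expressed as a matrix multiply.
    static struct ggml_tensor * k1_conv_as_mul_mat(struct ggml_context * ctx,
                                                   struct ggml_tensor * w,   // ne = [1, C_in, C_out]
                                                   struct ggml_tensor * x) { // ne = [C_in, T]
        // Drop the redundant kernel axis: [1, C_in, C_out] -> [C_in, C_out].
        struct ggml_tensor * w2 = ggml_reshape_2d(ctx, w, w->ne[1], w->ne[2]);
        // ggml_mul_mat([C_in, C_out], [C_in, T]) -> [C_out, T]: every frame gets
        // the same linear map, which is what a stride-1, unpadded K=1 conv_1d
        // computes, up to the [T, C] vs [C, T] layout difference.
        return ggml_mul_mat(ctx, w2, x);
    }

Because ggml_mul_mat wants channels on ne[0] while conv_1d wants time on ne[0], the diff below also moves the ggml_cont(ggml_transpose(...)) round-trips out to the boundaries where the two layouts meet.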

1 file changed: +53, -25 lines

src/kokoro_model.cpp

@@ -88,42 +88,45 @@ static struct ggml_tensor * build_ada_residual_conv(ggml_context * ctx, struct g
 
     gamma = ggml_add(ctx, ggml_mul_mat(ctx, block->norm1_gamma, style), block->norm1_gamma_bias);
     beta = ggml_add(ctx, ggml_mul_mat(ctx, block->norm1_beta, style), block->norm1_beta_bias);
-    cur = ggml_cont(ctx, ggml_transpose(ctx, ggml_norm(ctx, ggml_cont(ctx, ggml_transpose(ctx, cur)), 0.00001)));
+    cur = ggml_norm(ctx, x, 0.00001);
 
     // The addition between gamma * x and x is performed here because ggml doesn't support scalar multiplication without initializing the scalars in advance.
     // An optimal remedy to this would be to increment the gamma bias above by one when preparing the gguf file for the model.
-    cur = ggml_add(ctx, ggml_add(ctx, cur, ggml_mul(ctx, cur, gamma)), beta);
-    cur = ggml_leaky_relu(ctx, cur, 0.2f, false);
+    cur = ggml_add(ctx, cur, ggml_mul(ctx, cur, ggml_transpose(ctx, gamma)));
+    cur = ggml_add(ctx, cur, ggml_transpose(ctx, beta));
+    cur = ggml_leaky_relu(ctx, cur, 0.2f, false);
 
     if (block->pool) {
-        cur = ggml_conv_transpose_1d(ctx, block->pool, ggml_cont(ctx, ggml_transpose(ctx, cur)), 2, 1, 1, 1, cur->ne[0]);
+        cur = ggml_conv_transpose_1d(ctx, block->pool, cur, 2, 1, 1, 1, cur->ne[1]);
         cur = ggml_add(ctx, cur, block->pool_bias);
-        cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
     }
 
-    cur = ggml_conv_1d(ctx, block->conv1, ggml_cont(ctx, ggml_transpose(ctx, cur)), 1, 1, 1);
+    cur = ggml_conv_1d(ctx, block->conv1, cur, 1, 1, 1);
 
     cur = ggml_add(ctx, cur, block->conv1_bias);
     gamma = ggml_add(ctx, ggml_mul_mat(ctx, block->norm2_gamma, style), block->norm2_gamma_bias);
     beta = ggml_add(ctx, ggml_mul_mat(ctx, block->norm2_beta, style), block->norm2_beta_bias);
-    cur = ggml_cont(ctx, ggml_transpose(ctx, ggml_norm(ctx, cur, 0.00001)));
+    cur = ggml_norm(ctx, cur, 0.00001);
 
     // The addition between gamma * x and x is performed here because ggml doesn't support scalar multiplication without initializing the scalars in advance.
     // An optimal remedy to this would be to increment the gamma bias above by one when preparing the gguf file for the model.
-    cur = ggml_add(ctx, ggml_add(ctx, cur, ggml_mul(ctx, cur, gamma)), beta);
-    cur = ggml_leaky_relu(ctx, cur, 0.2f, false);
-    cur = ggml_add(ctx, ggml_conv_1d(ctx, block->conv2, ggml_cont(ctx, ggml_transpose(ctx, cur)), 1, 1, 1), block->conv2_bias);
+    cur = ggml_add(ctx, cur, ggml_mul(ctx, cur, ggml_transpose(ctx, gamma)));
+    cur = ggml_add(ctx, cur, ggml_transpose(ctx, beta));
+    cur = ggml_leaky_relu(ctx, cur, 0.2f, false);
+    cur = ggml_add(ctx, ggml_conv_1d(ctx, block->conv2, cur, 1, 1, 1), block->conv2_bias);
 
-    struct ggml_tensor * res = cur;
-    cur = ggml_cont(ctx, ggml_transpose(ctx, x));
+    struct ggml_tensor * res = cur;
+    cur = x;
     if (block->upsample) {
+        cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
        if (block->pool) {
-            cur = ggml_upscale_ext(ctx, cur, cur->ne[0]*2, cur->ne[1], cur->ne[2], cur->ne[3]);
+            cur = ggml_upscale_ext(ctx, cur, cur->ne[0], cur->ne[1]*2, cur->ne[2], cur->ne[3]);
        }
-        cur = ggml_conv_1d(ctx, block->upsample, cur, 1, 0, 1);
+        cur = ggml_mul_mat(ctx, block->upsample, cur);
+        cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
     }
-
-    return ggml_cont(ctx, ggml_transpose(ctx, ggml_div(ctx, ggml_add(ctx, res, cur), sqrt_tensor)));
+    cur = ggml_div(ctx, ggml_add(ctx, res, cur), sqrt_tensor);
+    return cur;
 }
 
 static struct ggml_tensor * build_kokoro_generator_res_block(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * style, kokoro_generator_residual_block * block) {
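On the comment in the hunk above: ggml cannot scale by the scalar (1 + gamma) without materializing it as a tensor, so the graph spends an extra ggml_add on cur + cur * gamma. A minimal sketch of the remedy the comment proposes, assuming the gguf conversion step adds 1.0f to each norm gamma bias offline; the *_plus_one tensor name is hypothetical:

    // With the bias pre-incremented in the gguf, gamma already equals 1 + g,
    // and the AdaIN-style modulation shrinks to one mul and one add:
    gamma = ggml_add(ctx, ggml_mul_mat(ctx, block->norm1_gamma, style), block->norm1_gamma_bias_plus_one);
    cur = ggml_add(ctx, ggml_mul(ctx, cur, ggml_transpose(ctx, gamma)), ggml_transpose(ctx, beta));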
@@ -158,6 +161,7 @@ static struct ggml_tensor * build_kokoro_generator_res_block(ggml_context * ctx,
 }
 
 static struct ggml_tensor * build_noise_block(ggml_context * ctx, kokoro_noise_residual_block * block, struct ggml_tensor * x, struct ggml_tensor * style) {
+    // This conv_1d seems replaceable with a squeezed and transposed ggml_mul_mat, but s0 and p0 are dynamic
     ggml_tensor * cur = ggml_add(ctx, ggml_conv_1d(ctx, block->input_conv, x, block->input_conv_stride, block->input_conv_padding, 1), block->input_conv_bias);
     return build_kokoro_generator_res_block(ctx, cur, style, block->res_block);
 }
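Why this conv_1d stays for now: with kernel size 1 the op is still a per-frame linear map, but a non-unit stride keeps only every s0-th frame and non-zero padding prepends zero frames, so a plain mul_mat over all frames would need extra slicing to match. A sketch of the distinction, with illustrative arguments:

    // s0 = 1, p0 = 0: equivalent to mul_mat on the squeezed kernel.
    // s0 = 2, p0 = 1: with K = 1 the output length becomes (T + 2*p0 - 1)/s0 + 1,
    // so a mul_mat result would still need a strided selection of frames.
    struct ggml_tensor * y = ggml_conv_1d(ctx, w, x, /*s0*/ 2, /*p0*/ 1, /*d0*/ 1);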
@@ -510,6 +514,16 @@ void kokoro_model::assign_gen_resblock(kokoro_generator_residual_block * block,
     }
 }
 
+/**
+ * Removes the last axis, for cases where it's redundantly of length 1.
+ * assert x.ndim == 3; numpy.squeeze(x, axis=-1)
+ */
+static ggml_tensor * squeeze_3d_2d_e0(ggml_context * ctx, ggml_tensor * x) {
+    TTS_ASSERT(x->ne[0] == 1);
+    TTS_ASSERT(ggml_is_contiguous(x));
+    return ggml_reshape_2d(ctx, x, x->ne[1], x->ne[2]);
+}
+
 void kokoro_model::assign_ada_res_block(ada_residual_conv_block * block, std::string name, ggml_tensor * tensor) {
     if (name == "norm1_gamma_weight") {
         block->norm1_gamma = ggml_dup_tensor(ctx, tensor);
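The helper added above is the load-time half of the optimization: a 1x1 conv kernel arrives from the gguf as a 3-D tensor with a redundant length-1 leading axis, and squeeze_3d_2d_e0 turns it into the 2-D matrix that ggml_mul_mat expects. A hedged usage sketch; the 512-channel shape is an assumption for illustration, not from the commit:

    // A 1x1 conv kernel stored as ne = [1, C_in, C_out] ...
    struct ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, 512, 512);
    // ... becomes a plain [C_in, C_out] matrix. Only the view changes, not the
    // data, which is why the helper asserts contiguity before reshaping.
    struct ggml_tensor * m = squeeze_3d_2d_e0(ctx, k); // m->ne = [512, 512]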
@@ -554,6 +568,7 @@ void kokoro_model::assign_ada_res_block(ada_residual_conv_block * block, std::st
         block->pool_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
         set_tensor(block->pool_bias, tensor);
     } else if (name == "conv1x1_weight") {
+        tensor = squeeze_3d_2d_e0(ctx, tensor);
         block->upsample = ggml_dup_tensor(ctx, tensor);
         set_tensor(block->upsample, tensor);
     } else if (name == "conv1x1_bias") {
@@ -576,6 +591,7 @@ void kokoro_model::assign_decoder_weight(std::string name, ggml_tensor * tensor)
         decoder->n_conv_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
         set_tensor(decoder->n_conv_bias, tensor);
     } else if (name == "asr_conv_weight") {
+        tensor = squeeze_3d_2d_e0(ctx, tensor);
         decoder->asr_conv = ggml_dup_tensor(ctx, tensor);
         set_tensor(decoder->asr_conv, tensor);
     } else if (name == "asr_conv_bias") {
@@ -607,12 +623,14 @@ void kokoro_model::assign_duration_weight(std::string name, ggml_tensor * tensor
         prosody_pred->duration_proj_bias = ggml_dup_tensor(ctx, tensor);
         set_tensor(prosody_pred->duration_proj_bias, tensor);
     } else if (name == "n_proj_kernel") {
+        tensor = squeeze_3d_2d_e0(ctx, tensor);
         prosody_pred->n_proj_kernel = ggml_dup_tensor(ctx, tensor);
         set_tensor(prosody_pred->n_proj_kernel, tensor);
     } else if (name == "n_proj_bias") {
         prosody_pred->n_proj_bias = ggml_dup_tensor(ctx, ggml_transpose(ctx, tensor));
         set_tensor(prosody_pred->n_proj_bias, tensor);
     } else if (name == "f0_proj_kernel") {
+        tensor = squeeze_3d_2d_e0(ctx, tensor);
         prosody_pred->f0_proj_kernel = ggml_dup_tensor(ctx, tensor);
         set_tensor(prosody_pred->f0_proj_kernel, tensor);
     } else if (name == "f0_proj_bias") {
@@ -1147,20 +1165,27 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
 
     cur = build_lstm(ctx, cur, model->prosody_pred->shared_lstm, cur->ne[1]);
 
-    ggml_build_forward_expand(gf, cur);
 
     struct ggml_tensor * f0_curve = cur;
+    f0_curve = ggml_cont(ctx, ggml_transpose(ctx, f0_curve));
     for (auto block : model->prosody_pred->f0_blocks) {
         f0_curve = build_ada_residual_conv(ctx, f0_curve, block, style_half, model->sqrt_tensor);
     }
-    f0_curve = ggml_add(ctx, ggml_conv_1d(ctx, model->prosody_pred->f0_proj_kernel, ggml_cont(ctx, ggml_transpose(ctx, f0_curve)), 1, 0, 1), model->prosody_pred->f0_proj_bias);
+    f0_curve = ggml_cont(ctx, ggml_transpose(ctx, f0_curve));
+    f0_curve = ggml_mul_mat(ctx, model->prosody_pred->f0_proj_kernel, f0_curve);
+    f0_curve = squeeze_3d_2d_e0(ctx, f0_curve);
+    f0_curve = ggml_add(ctx, f0_curve, model->prosody_pred->f0_proj_bias);
     ggml_set_name(f0_curve, "f0_out");
 
     struct ggml_tensor * n = cur;
+    n = ggml_cont(ctx, ggml_transpose(ctx, n));
     for (auto block : model->prosody_pred->n_blocks) {
         n = build_ada_residual_conv(ctx, n, block, style_half, model->sqrt_tensor);
     }
-    n = ggml_add(ctx, ggml_conv_1d(ctx, model->prosody_pred->n_proj_kernel, ggml_cont(ctx, ggml_transpose(ctx, n)), 1, 0, 1), model->prosody_pred->n_proj_bias);
+    n = ggml_cont(ctx, ggml_transpose(ctx, n));
+    n = ggml_mul_mat(ctx, model->prosody_pred->n_proj_kernel, n);
+    n = squeeze_3d_2d_e0(ctx, n);
+    n = ggml_add(ctx, n, model->prosody_pred->n_proj_bias);
     ggml_set_name(n, "n_out");
     ggml_build_forward_expand(gf, n);
 
@@ -1188,17 +1213,20 @@ struct ggml_cgraph * kokoro_runner::build_kokoro_graph(kokoro_ubatch & batch) {
     struct ggml_tensor * style_half2 = ggml_view_1d(ctx, voice, voice->ne[0]/2, (batch.n_tokens - 3) * voice->nb[1]);
 
     {
-        f0 = ggml_cont(ctx, ggml_transpose(ctx, ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->f0_conv, f0_curve, 2, 1, 1), model->decoder->f0_conv_bias)));
-        n_base = ggml_cont(ctx, ggml_transpose(ctx, ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->n_conv, n, 2, 1, 1), model->decoder->n_conv_bias)));
-        cur = ggml_concat(ctx, ggml_concat(ctx, asr, f0, 0), n_base, 0);
-
+        f0 = ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->f0_conv, f0_curve, 2, 1, 1), model->decoder->f0_conv_bias);
+        n_base = ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->n_conv, n, 2, 1, 1), model->decoder->n_conv_bias);
+        cur = ggml_concat(ctx, ggml_concat(ctx, ggml_cont(ctx, ggml_transpose(ctx, asr)), f0, 1), n_base, 1);
         cur = build_ada_residual_conv(ctx, cur, model->decoder->encoder_block, style_half2, model->sqrt_tensor);
-        asr_res = ggml_cont(ctx, ggml_transpose(ctx, ggml_add(ctx, ggml_conv_1d(ctx, model->decoder->asr_conv, ggml_cont(ctx, ggml_transpose(ctx, asr)), 1, 0, 1), model->decoder->asr_conv_bias)));
 
+        asr_res = ggml_mul_mat(ctx, model->decoder->asr_conv, asr);
+        asr_res = ggml_add(ctx, asr_res, ggml_transpose(ctx, model->decoder->asr_conv_bias));
+
+        asr_res = ggml_cont(ctx, ggml_transpose(ctx, asr_res));
         for (auto l : model->decoder->decoder_blocks) {
-            cur = ggml_concat(ctx, ggml_concat(ctx, ggml_concat(ctx, cur, asr_res, 0), f0, 0), n_base, 0);
+            cur = ggml_concat(ctx, ggml_concat(ctx, ggml_concat(ctx, cur, asr_res, 1), f0, 1), n_base, 1);
             cur = build_ada_residual_conv(ctx, cur, l, style_half2, model->sqrt_tensor);
         }
+        cur = ggml_cont(ctx, ggml_transpose(ctx, cur));
     }
 
     kctx->window_sq_sum = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, kctx->total_duration*model->up_sampling_factor);
