Skip to content

Commit d7c0d87

Browse files
committed
Small changes.
1 parent eb6f3a0 commit d7c0d87

File tree

3 files changed

+52
-36
lines changed

3 files changed

+52
-36
lines changed

kernel/power/sbgemv_common_power10.c

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,42 @@ FORCEINLINE void vec_store4_pair(vec_f32 *v_y, vec_f32 *vy0)
525525
vec_store_pair(v_y + 6, vy0 + 6);
526526
}
527527

528+
/*
 * Clear two consecutive MMA accumulators, temp0[0] and temp0[1], by issuing
 * __builtin_mma_xxsetaccz on each.
 *
 * temp0: pointer to at least two __vector_quad accumulators.
 */
FORCEINLINE void vec_setzero_2(__vector_quad *temp0)
529+
{
530+
__builtin_mma_xxsetaccz(&temp0[0]);
531+
__builtin_mma_xxsetaccz(&temp0[1]);
532+
}
533+
534+
/*
 * Clear four consecutive MMA accumulators, temp0[0..3], as two back-to-back
 * vec_setzero_2 calls (elements 0-1, then elements 2-3).
 *
 * temp0: pointer to at least four __vector_quad accumulators.
 */
FORCEINLINE void vec_setzero_4(__vector_quad *temp0)
535+
{
536+
vec_setzero_2(temp0 + 0);
537+
vec_setzero_2(temp0 + 2);
538+
}
539+
540+
/*
 * Clear eight consecutive MMA accumulators, temp0[0..7], as two back-to-back
 * vec_setzero_4 calls (elements 0-3, then elements 4-7).
 *
 * temp0: pointer to at least eight __vector_quad accumulators.
 */
FORCEINLINE void vec_setzero_8(__vector_quad *temp0)
541+
{
542+
vec_setzero_4(temp0 + 0);
543+
vec_setzero_4(temp0 + 4);
544+
}
545+
546+
/*
 * Unpack two MMA accumulators into plain vectors with
 * __builtin_mma_disassemble_acc: temp0[0] is written to temp00[0..3] and
 * temp0[1] to temp00[4..7] (each accumulator takes four vec_f32 slots).
 *
 * temp00: destination, at least 8 vec_f32 elements.
 * temp0:  source, at least two __vector_quad accumulators.
 *
 * NOTE(review): despite the name, no summation is done here; the caller is
 * expected to combine the unpacked lanes afterwards.
 */
FORCEINLINE void vec_reduce_2(vec_f32 *temp00, __vector_quad *temp0)
547+
{
548+
__builtin_mma_disassemble_acc((void*)(temp00 + 0), &temp0[0]);
549+
__builtin_mma_disassemble_acc((void*)(temp00 + 4), &temp0[1]);
550+
}
551+
552+
/*
 * Unpack four MMA accumulators into plain vectors via two vec_reduce_2
 * calls: temp0[0..1] go to temp00[0..7] and temp0[2..3] to temp00[8..15].
 *
 * temp00: destination, at least 16 vec_f32 elements.
 * temp0:  source, at least four __vector_quad accumulators.
 */
FORCEINLINE void vec_reduce_4(vec_f32 *temp00, __vector_quad *temp0)
553+
{
554+
vec_reduce_2(temp00 + 0, temp0 + 0);
555+
vec_reduce_2(temp00 + 8, temp0 + 2);
556+
}
557+
558+
/*
 * Unpack eight MMA accumulators into plain vectors via two vec_reduce_4
 * calls: temp0[0..3] go to temp00[0..15] and temp0[4..7] to temp00[16..31].
 *
 * temp00: destination, at least 32 vec_f32 elements.
 * temp0:  source, at least eight __vector_quad accumulators.
 */
FORCEINLINE void vec_reduce_8(vec_f32 *temp00, __vector_quad *temp0)
559+
{
560+
vec_reduce_4(temp00 + 0, temp0 + 0);
561+
vec_reduce_4(temp00 + 16, temp0 + 4);
562+
}
563+
528564
#ifdef USE_MERGE_MMA
529565
FORCEINLINE void vec_load8_pair(vec_f32 *vy0, vec_f32 *v_y)
530566
{

kernel/power/sbgemv_t_power10.c

Lines changed: 11 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,7 @@ static void BF16GEMV_T_MMA_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL
101101
vec_f32 temp00[4*2];
102102
vec_bf16 inp[4];
103103

104-
__builtin_mma_xxsetaccz(&temp0[0]);
105-
__builtin_mma_xxsetaccz(&temp0[1]);
104+
vec_setzero_2(&temp0[0]);
106105

107106
a0 = ap;
108107
a1 = ap + lda;
@@ -141,8 +140,7 @@ static void BF16GEMV_T_MMA_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL
141140
vec_loadN_mult12a_mma(&temp0[0], &va0[i], &va1[i], inp[0], n);
142141
}
143142

144-
__builtin_mma_disassemble_acc((void*)(temp00 + 0), &temp0[0]);
145-
__builtin_mma_disassemble_acc((void*)(temp00 + 4), &temp0[1]);
143+
vec_reduce_2(temp00, &temp0[0]);
146144

147145
y[0] = (alpha * (temp00[0][0] + temp00[1][1] + temp00[2][2] + temp00[3][3])) + (beta * y[0]);
148146
y[1] = (alpha * (temp00[4][0] + temp00[5][1] + temp00[6][2] + temp00[7][3])) + (beta * y[1]);
@@ -156,10 +154,7 @@ static void BF16GEMV_T_MMA_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL
156154
vec_f32 temp00[4*4];
157155
vec_bf16 inp[4];
158156

159-
__builtin_mma_xxsetaccz(&temp0[0]);
160-
__builtin_mma_xxsetaccz(&temp0[1]);
161-
__builtin_mma_xxsetaccz(&temp0[2]);
162-
__builtin_mma_xxsetaccz(&temp0[3]);
157+
vec_setzero_4(&temp0[0]);
163158

164159
a0 = ap;
165160
a1 = ap + lda;
@@ -202,10 +197,7 @@ static void BF16GEMV_T_MMA_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL
202197
vec_loadN_mult14_mma(&temp0[0], &va0[i], &va1[i], &va2[i], &va3[i], inp[0], n);
203198
}
204199

205-
__builtin_mma_disassemble_acc((void*)(temp00 + 0), &temp0[0]);
206-
__builtin_mma_disassemble_acc((void*)(temp00 + 4), &temp0[1]);
207-
__builtin_mma_disassemble_acc((void*)(temp00 + 8), &temp0[2]);
208-
__builtin_mma_disassemble_acc((void*)(temp00 + 12), &temp0[3]);
200+
vec_reduce_4(temp00, &temp0[0]);
209201

210202
vec_f32 t0, t1, t2, t3, t4, t5, t6, t7;
211203
vec_f32 a = { alpha, alpha, alpha, alpha };
@@ -239,23 +231,17 @@ static void BF16GEMV_T_MMA_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL
239231
vec_f32 temp00[4*8];
240232
vec_bf16 inp[4];
241233

242-
__builtin_mma_xxsetaccz(&temp0[0]);
243-
__builtin_mma_xxsetaccz(&temp0[1]);
244-
__builtin_mma_xxsetaccz(&temp0[2]);
245-
__builtin_mma_xxsetaccz(&temp0[3]);
246-
__builtin_mma_xxsetaccz(&temp0[4]);
247-
__builtin_mma_xxsetaccz(&temp0[5]);
248-
__builtin_mma_xxsetaccz(&temp0[6]);
249-
__builtin_mma_xxsetaccz(&temp0[7]);
234+
vec_setzero_8(&temp0[0]);
250235

236+
BLASLONG lda4 = lda << 2;
251237
a0 = ap;
252238
a1 = ap + lda;
253239
a2 = a1 + lda;
254240
a3 = a2 + lda;
255-
a4 = a3 + lda;
256-
a5 = a4 + lda;
257-
a6 = a5 + lda;
258-
a7 = a6 + lda;
241+
a4 = a0 + lda4;
242+
a5 = a1 + lda4;
243+
a6 = a2 + lda4;
244+
a7 = a3 + lda4;
259245
va0 = (vec_bf16 *)a0;
260246
va1 = (vec_bf16 *)a1;
261247
va2 = (vec_bf16 *)a2;
@@ -301,14 +287,7 @@ static void BF16GEMV_T_MMA_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL
301287
vec_loadN_mult14_mma(&temp0[4], &va4[i], &va5[i], &va6[i], &va7[i], inp[0], n);
302288
}
303289

304-
__builtin_mma_disassemble_acc((void*)(temp00 + 0), &temp0[0]);
305-
__builtin_mma_disassemble_acc((void*)(temp00 + 4), &temp0[1]);
306-
__builtin_mma_disassemble_acc((void*)(temp00 + 8), &temp0[2]);
307-
__builtin_mma_disassemble_acc((void*)(temp00 + 12), &temp0[3]);
308-
__builtin_mma_disassemble_acc((void*)(temp00 + 16), &temp0[4]);
309-
__builtin_mma_disassemble_acc((void*)(temp00 + 20), &temp0[5]);
310-
__builtin_mma_disassemble_acc((void*)(temp00 + 24), &temp0[6]);
311-
__builtin_mma_disassemble_acc((void*)(temp00 + 28), &temp0[7]);
290+
vec_reduce_8(temp00, &temp0[0]);
312291

313292
vec_f32 t0, t1, t2, t3, t4, t5, t6, t7, t10, t11, t12, t13, t14, t15, t16, t17;
314293
vec_f32 a = { alpha, alpha, alpha, alpha };

kernel/power/sbgemv_t_vsx.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -198,14 +198,15 @@ static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL
198198
vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 };
199199
vec_f32 inp[2];
200200

201+
BLASLONG lda4 = lda << 2;
201202
a0 = ap;
202203
a1 = ap + lda;
203204
a2 = a1 + lda;
204205
a3 = a2 + lda;
205-
a4 = a3 + lda;
206-
a5 = a4 + lda;
207-
a6 = a5 + lda;
208-
a7 = a6 + lda;
206+
a4 = a0 + lda4;
207+
a5 = a1 + lda4;
208+
a6 = a2 + lda4;
209+
a7 = a3 + lda4;
209210
va0 = (vec_bf16 *)a0;
210211
va1 = (vec_bf16 *)a1;
211212
va2 = (vec_bf16 *)a2;

0 commit comments

Comments
 (0)