Skip to content

Commit 6bcb06f

Browse files
authored
make further changes to icopy_8 easier
1 parent b7315f8 commit 6bcb06f

File tree

1 file changed

+61
-42
lines changed

1 file changed

+61
-42
lines changed

kernel/x86_64/dgemm_kernel_8x8_skylakex.c

Lines changed: 61 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
#include "common.h"
22
#include <stdint.h>
33
#include <immintrin.h>
4+
//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
45
/* row-major c_block */
5-
/* 64-bit pointer registers: a_block_pointer,b_block_pointer,c_pointer;*/
66
#define INNER_KERNEL_k1m1n8 \
77
"prefetcht0 384(%1);"\
88
"prefetcht0 768(%0); vmovupd (%1),%%zmm5; addq $64,%1;"\
@@ -158,7 +158,7 @@
158158
#define INNER_STORE_m1n8(c1,disp) \
159159
"kxnorw %%k1,%%k1,%%k1;"\
160160
"vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\
161-
"vaddpd %%zmm7,"#c1","#c1";"\
161+
"vfmadd132pd %%zmm3,%%zmm7,"#c1";"\
162162
"kxnorw %%k1,%%k1,%%k1;"\
163163
"vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};"
164164

@@ -227,26 +227,27 @@
227227
"vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
228228
"vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd %%zmm7,"#c8","#c8"%{%5%};"
229229

230+
//%7 is the k01 mask operand (input); it is used only when m=4
230231
#define INNER_STORE_4x8(c1,c2,c3,c4) \
231-
"vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vaddpd %%zmm4,"#c1","#c1";"\
232+
"vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
232233
"vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
233-
"vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vaddpd %%zmm5,"#c2","#c2";"\
234+
"vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
234235
"vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
235-
"vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vaddpd %%zmm6,"#c3","#c3";"\
236+
"vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
236237
"vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
237-
"vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vaddpd %%zmm7,"#c4","#c4";"\
238+
"vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
238239
"vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
239240
"leaq (%3,%4,4),%3;"
240241

241242
#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
242243
"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
243-
"vaddpd (%3),"#c1","#c1"; vmovupd "#c1",(%3); vaddpd (%3,%4,1),"#c2","#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\
244+
"vfmadd213pd (%3),%%zmm3,"#c1"; vmovupd "#c1",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\
244245
"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
245-
"vaddpd (%3),"#c3","#c3"; vmovupd "#c3",(%3); vaddpd (%3,%4,1),"#c4","#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\
246+
"vfmadd213pd (%3),%%zmm3,"#c3"; vmovupd "#c3",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\
246247
"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
247-
"vaddpd (%3),"#c5","#c5"; vmovupd "#c5",(%3); vaddpd (%3,%4,1),"#c6","#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\
248+
"vfmadd213pd (%3),%%zmm3,"#c5"; vmovupd "#c5",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\
248249
"prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
249-
"vaddpd (%3),"#c7","#c7"; vmovupd "#c7",(%3); vaddpd (%3,%4,1),"#c8","#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;"
250+
"vfmadd213pd (%3),%%zmm3,"#c7"; vmovupd "#c7",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;"
250251

251252
#define INNER_SAVE_m4n8 \
252253
INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\
@@ -292,6 +293,7 @@
292293

293294
#define COMPUTE_n8 {\
294295
__asm__ __volatile__(\
296+
"vbroadcastsd (%9),%%zmm3;"\
295297
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
296298
"cmpq $8,%8; jb 42222f;"\
297299
"42221:\n\t"\
@@ -327,12 +329,13 @@
327329
"42225:\n\t"\
328330
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
329331
"shlq $3,%4;addq %4,%3;shrq $3,%4;"\
330-
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
331-
::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\
332+
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
333+
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\
332334
a_block_pointer -= M * K;\
333335
}
334336
#define COMPUTE_n16 {\
335337
__asm__ __volatile__(\
338+
"vbroadcastsd (%9),%%zmm3;"\
336339
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
337340
"cmpq $8,%8; jb 32222f;"\
338341
"32221:\n\t"\
@@ -369,13 +372,14 @@
369372
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
370373
"shlq $4,%4;addq %4,%3;shrq $4,%4;"\
371374
"leaq (%1,%%r12,2),%1;"\
372-
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
373-
::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
375+
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
376+
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
374377
"zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
375378
a_block_pointer -= M * K;\
376379
}
377380
#define COMPUTE_n24 {\
378381
__asm__ __volatile__(\
382+
"vbroadcastsd (%9),%%zmm3;"\
379383
"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
380384
"cmpq $8,%8; jb 22222f;"\
381385
"22221:\n\t"\
@@ -412,21 +416,21 @@
412416
"movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
413417
"shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
414418
"leaq (%1,%%r12,2),%1; addq %%r12,%1;"\
415-
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
416-
::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
419+
:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
420+
::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
417421
"zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
418422
a_block_pointer -= M * K;\
419423
}
420424

421-
static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8
425+
static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8
422426
//perform C += alpha * A<pack> B<pack>
423427
if(k==0 || m==0 || ndiv8==0) return;
424428
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
425429
int64_t K = (int64_t)k; int64_t M = (int64_t)m;
426430
double *a_block_pointer;
427431
double *c_pointer = c;
428432
__mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033;
429-
BLASLONG ndiv8_count;
433+
BLASLONG m_count,ndiv8_count,k_count;
430434
double *packed_b_pointer = packed_b;
431435
a_block_pointer = packed_a;
432436
for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){
@@ -474,24 +478,27 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
474478
#define INIT_m8n2 zc2=INIT_m8n1
475479
#define INIT_m8n4 zc4=zc3=INIT_m8n2
476480
#define SAVE_m8n1 {\
477-
za1 = _mm512_loadu_pd(c_pointer);\
478-
zc1 = _mm512_add_pd(zc1,za1);\
481+
__asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\
482+
zb1 = _mm512_loadu_pd(c_pointer);\
483+
zc1 = _mm512_fmadd_pd(zc1,za1,zb1);\
479484
_mm512_storeu_pd(c_pointer,zc1);\
480485
c_pointer += 8;\
481486
}
482487
#define SAVE_m8n2 {\
488+
__asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\
483489
zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
484-
zc1 = _mm512_add_pd(zc1,zb1); zc2 = _mm512_add_pd(zc2,zb2);\
490+
zc1 = _mm512_fmadd_pd(zc1,za1,zb1); zc2 = _mm512_fmadd_pd(zc2,za1,zb2);\
485491
_mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\
486492
c_pointer += 8;\
487493
}
488494
#define SAVE_m8n4 {\
495+
__asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\
489496
zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
490-
zc1 = _mm512_add_pd(zc1,zb1); zc2 = _mm512_add_pd(zc2,zb2);\
497+
zc1 = _mm512_fmadd_pd(zc1,za1,zb1); zc2 = _mm512_fmadd_pd(zc2,za1,zb2);\
491498
_mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\
492499
c_pointer += LDC*2;\
493500
zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
494-
zc3 = _mm512_add_pd(zc3,zb1); zc4 = _mm512_add_pd(zc4,zb2);\
501+
zc3 = _mm512_fmadd_pd(zc3,za1,zb1); zc4 = _mm512_fmadd_pd(zc4,za1,zb2);\
495502
_mm512_storeu_pd(c_pointer,zc3); _mm512_storeu_pd(c_pointer+LDC,zc4);\
496503
c_pointer += 8-LDC*2;\
497504
}
@@ -518,24 +525,27 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
518525
#define INIT_m4n2 yc2=INIT_m4n1
519526
#define INIT_m4n4 yc4=yc3=INIT_m4n2
520527
#define SAVE_m4n1 {\
528+
yb1 = _mm256_broadcast_sd(alpha);\
521529
ya1 = _mm256_loadu_pd(c_pointer);\
522-
yc1 = _mm256_add_pd(yc1,ya1);\
530+
yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\
523531
_mm256_storeu_pd(c_pointer,yc1);\
524532
c_pointer += 4;\
525533
}
526534
#define SAVE_m4n2 {\
535+
ya1 = _mm256_broadcast_sd(alpha);\
527536
yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
528-
yc1 = _mm256_add_pd(yc1,yb1); yc2 = _mm256_add_pd(yc2,yb2);\
537+
yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\
529538
_mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\
530539
c_pointer += 4;\
531540
}
532541
#define SAVE_m4n4 {\
542+
ya1 = _mm256_broadcast_sd(alpha);\
533543
yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
534-
yc1 = _mm256_add_pd(yc1,yb1); yc2 = _mm256_add_pd(yc2,yb2);\
544+
yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\
535545
_mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\
536546
c_pointer += LDC*2;\
537547
yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
538-
yc3 = _mm256_add_pd(yc3,yb1); yc4 = _mm256_add_pd(yc4,yb2);\
548+
yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\
539549
_mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\
540550
c_pointer += 4-LDC*2;\
541551
}
@@ -553,14 +563,16 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
553563
#define INIT_m2n1 xc1=_mm_setzero_pd();
554564
#define INIT_m2n2 xc2=INIT_m2n1
555565
#define SAVE_m2n1 {\
566+
xb1 = _mm_loaddup_pd(alpha);\
556567
xa1 = _mm_loadu_pd(c_pointer);\
557-
xc1 = _mm_add_pd(xc1,xa1);\
568+
xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\
558569
_mm_storeu_pd(c_pointer,xc1);\
559570
c_pointer += 2;\
560571
}
561572
#define SAVE_m2n2 {\
573+
xa1 = _mm_loaddup_pd(alpha);\
562574
xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\
563-
xc1 = _mm_add_pd(xc1,xb1); xc2 = _mm_add_pd(xc2,xb2);\
575+
xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\
564576
_mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\
565577
c_pointer += 2;\
566578
}
@@ -571,7 +583,7 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
571583
}
572584
#define INIT_m1n1 sc1=0.0;
573585
#define SAVE_m1n1 {\
574-
*c_pointer += sc1;\
586+
*c_pointer += sc1 * (*alpha);\
575587
c_pointer++;\
576588
}
577589

@@ -596,6 +608,9 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
596608
#define INIT_m1n4 INIT_m4n1
597609
#define INIT_m2n4 INIT_m4n2
598610
#define SAVE_m2n4 {\
611+
ya1 = _mm256_broadcast_sd(alpha);\
612+
yc1 = _mm256_mul_pd(yc1,ya1);\
613+
yc2 = _mm256_mul_pd(yc2,ya1);\
599614
yb1 = _mm256_unpacklo_pd(yc1,yc2);\
600615
yb2 = _mm256_unpackhi_pd(yc1,yc2);\
601616
xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\
@@ -609,12 +624,16 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
609624
c_pointer += 2;\
610625
}
611626
#define SAVE_m1n2 {\
627+
xb1 = _mm_loaddup_pd(alpha);\
628+
xc1 = _mm_mul_pd(xc1,xb1);\
612629
*c_pointer += _mm_cvtsd_f64(xc1);\
613630
xa1 = _mm_unpackhi_pd(xc1,xc1);\
614631
c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\
615632
c_pointer ++;\
616633
}
617634
#define SAVE_m1n4 {\
635+
ya1 = _mm256_broadcast_sd(alpha);\
636+
yc1 = _mm256_mul_pd(yc1,ya1);\
618637
xb1 = _mm256_extractf128_pd(yc1,0);\
619638
*c_pointer += _mm_cvtsd_f64(xb1);\
620639
xb2 = _mm_unpackhi_pd(xb1,xb1);\
@@ -626,7 +645,7 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
626645
c_pointer ++;\
627646
}
628647

629-
static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8
648+
static void __attribute__ ((noinline)) KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8
630649
//perform C += alpha * A<pack> B<pack>; edge_n<8 must be satisfied!
631650
if(k==0 || m==0 || edge_n==0) return;
632651
double *a_block_pointer,*b_block_pointer,*b_base_pointer;
@@ -724,30 +743,30 @@ static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
724743
}
725744
}
726745
}
727-
static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k,double alpha){
728-
BLASLONG m_count,k_count;double *src1,*dst1,*src2;__m256d tmp,alp;
729-
src1 = src; dst1 = dst; src2 = src1 + 4 * k; alp = _mm256_set1_pd(alpha);
746+
static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){
747+
BLASLONG m_count,k_count;double *src1,*dst1,*src2;__m256d tmp;
748+
src1 = src; dst1 = dst; src2 = src1 + 4 * k;
730749
for(m_count=m;m_count>7;m_count-=8){
731750
for(k_count=k;k_count>0;k_count--){
732-
tmp = _mm256_loadu_pd(src1);tmp = _mm256_mul_pd(tmp,alp);_mm256_storeu_pd(dst1+0,tmp);src1+=4;
733-
tmp = _mm256_loadu_pd(src2);tmp = _mm256_mul_pd(tmp,alp);_mm256_storeu_pd(dst1+4,tmp);src2+=4;
751+
tmp = _mm256_loadu_pd(src1);_mm256_storeu_pd(dst1+0,tmp);src1+=4;
752+
tmp = _mm256_loadu_pd(src2);_mm256_storeu_pd(dst1+4,tmp);src2+=4;
734753
dst1+=8;
735754
}
736755
src1+=4*k;src2+=4*k;
737756
}
738757
for(;m_count>0;m_count--){
739758
for(k_count=k;k_count>0;k_count--){
740-
*dst1 = (*src1) * alpha; src1++; dst1++;
759+
*dst1 = (*src1); src1++; dst1++;
741760
}
742761
}
743762
}
744763
int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
745-
if(m==0 || n==0 || k==0) return 0;
746-
BLASLONG ndiv8 = n/8;
764+
if(m==0 || n==0 || k==0 || alpha == 0.0) return 0;
765+
BLASLONG ndiv8 = n/8;double ALPHA = alpha;
747766
double *packed_a = (double *)malloc(m*k*sizeof(double));
748-
copy_4_to_8(A,packed_a,m,k,alpha);
749-
if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C);
750-
if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8);
767+
copy_4_to_8(A,packed_a,m,k);
768+
if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA);
769+
if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA);
751770
free(packed_a);packed_a=NULL;
752771
return 0;
753772
}

0 commit comments

Comments
 (0)