|
330 | 330 | "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
|
331 | 331 | "shlq $3,%4;addq %4,%3;shrq $3,%4;"\
|
332 | 332 | :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
|
333 |
| - ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\ |
| 333 | + ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\ |
334 | 334 | a_block_pointer -= M * K;\
|
335 | 335 | }
|
336 | 336 | #define COMPUTE_n16 {\
|
@@ -645,8 +645,8 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
|
645 | 645 | c_pointer ++;\
|
646 | 646 | }
|
647 | 647 |
|
648 |
| -static void __attribute__ ((noinline)) KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 |
649 |
| -//perform C += A<pack> B<pack> , edge_n<8 must be satisfied ! |
| 648 | +static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8 |
| 649 | +//perform C += A<pack> B<pack> , edge_n<8 must be satisfied. |
650 | 650 | if(k==0 || m==0 || edge_n==0) return;
|
651 | 651 | double *a_block_pointer,*b_block_pointer,*b_base_pointer;
|
652 | 652 | double *c_pointer = c;
|
@@ -763,11 +763,16 @@ static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){
|
763 | 763 | int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
|
764 | 764 | if(m==0 || n==0 || k==0 || alpha == 0.0) return 0;
|
765 | 765 | BLASLONG ndiv8 = n/8;double ALPHA = alpha;
|
| 766 | +#ifdef ICOPY_4 |
766 | 767 | double *packed_a = (double *)malloc(m*k*sizeof(double));
|
767 | 768 | copy_4_to_8(A,packed_a,m,k);
|
| 769 | +#else //ICOPY_8 |
| 770 | + double *packed_a = A; |
| 771 | +#endif |
768 | 772 | if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA);
|
769 | 773 | if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA);
|
| 774 | +#ifdef ICOPY_4 |
770 | 775 | free(packed_a);packed_a=NULL;
|
| 776 | +#endif |
771 | 777 | return 0;
|
772 | 778 | }
|
773 |
| - |
|
0 commit comments