
Commit b7315f8
Add files via upload
1 parent 9b19e9e commit b7315f8

1 file changed: kernel/x86_64/dgemm_kernel_8x8_skylakex.c (26 additions, 49 deletions)
@@ -25,8 +25,8 @@
 "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
 
 #define INNER_KERNEL_k1m1n16 \
-"prefetcht0 384(%1); prefetcht0 448(%1);"\
-"prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; addq $128,%1;"\
+"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1);"\
+"prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; addq $64,%1;"\
 "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;"
 
 #define INNER_KERNEL_k1m2n16 \
@@ -46,8 +46,8 @@
 "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
 
 #define INNER_KERNEL_k1m1n24 \
-"prefetcht0 384(%1); prefetcht0 448(%1); prefetcht0 512(%1);"\
-"prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; vmovupd 128(%1),%%zmm7; addq $192,%1;"\
+"prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1); prefetcht0 128(%1,%%r12,2);"\
+"prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; vmovupd (%1,%%r12,2),%%zmm7; addq $64,%1;"\
 "vbroadcastsd (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;"
 
 #define INNER_KERNEL_k1m2n24 \
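
Note: the rewritten INNER_KERNEL_k1m1n16/k1m1n24 bodies no longer stream B out of a contiguous scratch copy; they read the second and third 8-double B panels in place through a byte stride kept in %%r12 (loaded with K*64 by the COMPUTE macros further down). A minimal C sketch of the address arithmetic behind "vmovupd (%1,%%r12,p)"; the function name and parameters are illustrative, not part of the kernel:

#include <stdint.h>

/* Address of the j-th double read from panel p at k-iteration ki, assuming
 * %1 starts at the panel group and advances by 8 doubles (64 bytes) per k,
 * while %%r12 holds the panel-to-panel stride of K*64 bytes. */
static inline const double *b_panel_element(const double *packed_b, int64_t K,
                                            int64_t p, int64_t ki, int64_t j)
{
    int64_t panel_stride = K * 8;   /* in doubles; equals %%r12 expressed in elements */
    return packed_b + p * panel_stride + ki * 8 + j;
}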
@@ -292,37 +292,37 @@
 
 #define COMPUTE_n8 {\
 __asm__ __volatile__(\
-"movq %8,%%r14;movq %2,%%r13;"\
+"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
 "cmpq $8,%8; jb 42222f;"\
 "42221:\n\t"\
 INNER_INIT_m8n8\
 INNER_KERNELm8(8)\
 INNER_SAVE_m8n8\
-"movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+"movq %%r13,%2; subq %%r12,%1;"\
 "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\
 "subq $8,%8; cmpq $8,%8; jnb 42221b;"\
 "42222:\n\t"\
 "cmpq $4,%8; jb 42223f;"\
 INNER_INIT_m4n8\
 INNER_KERNELm4(8)\
 INNER_SAVE_m4n8\
-"movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+"movq %%r13,%2; subq %%r12,%1;"\
 "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\
 "subq $4,%8;"\
 "42223:\n\t"\
 "cmpq $2,%8; jb 42224f;"\
 INNER_INIT_m2n8\
 INNER_KERNELm2(8)\
 INNER_SAVE_m2n8\
-"movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+"movq %%r13,%2; subq %%r12,%1;"\
 "addq $16,%3;"\
 "subq $2,%8;"\
 "42224:\n\t"\
 "cmpq $1,%8; jb 42225f;"\
 INNER_INIT_m1n8\
 INNER_KERNELm1(8)\
 INNER_SAVE_m1n8\
-"movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+"movq %%r13,%2; subq %%r12,%1;"\
 "addq $8,%3;"\
 "42225:\n\t"\
 "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
@@ -333,94 +333,92 @@
 }
 #define COMPUTE_n16 {\
 __asm__ __volatile__(\
-"movq %8,%%r14;movq %2,%%r13;"\
+"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
 "cmpq $8,%8; jb 32222f;"\
 "32221:\n\t"\
 INNER_INIT_m8n16\
 INNER_KERNELm8(16)\
 INNER_SAVE_m8n16\
-"movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+"movq %%r13,%2; subq %%r12,%1;"\
 "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
 "subq $8,%8; cmpq $8,%8; jnb 32221b;"\
 "32222:\n\t"\
 "cmpq $4,%8; jb 32223f;"\
 INNER_INIT_m4n16\
 INNER_KERNELm4(16)\
 INNER_SAVE_m4n16\
-"movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+"movq %%r13,%2; subq %%r12,%1;"\
 "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
 "subq $4,%8;"\
 "32223:\n\t"\
 "cmpq $2,%8; jb 32224f;"\
 INNER_INIT_m2n16\
 INNER_KERNELm2(16)\
 INNER_SAVE_m2n16\
-"movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+"movq %%r13,%2; subq %%r12,%1;"\
 "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\
 "subq $2,%8;"\
 "32224:\n\t"\
 "cmpq $1,%8; jb 32225f;"\
 INNER_INIT_m1n16\
 INNER_KERNELm1(16)\
 INNER_SAVE_m1n16\
-"movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+"movq %%r13,%2; subq %%r12,%1;"\
 "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\
 "32225:\n\t"\
 "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
 "shlq $4,%4;addq %4,%3;shrq $4,%4;"\
-:"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
+"leaq (%1,%%r12,2),%1;"\
+:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
 ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
-"zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r13","r14");\
+"zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
 a_block_pointer -= M * K;\
 }
 #define COMPUTE_n24 {\
 __asm__ __volatile__(\
-"movq %8,%%r14;movq %9,%%r15;movq %2,%%r13;"\
+"movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
 "cmpq $8,%8; jb 22222f;"\
 "22221:\n\t"\
 INNER_INIT_m8n24\
-"prefetcht2 (%%r15); prefetcht2 64(%%r15);"\
 INNER_KERNELm8(24)\
-"prefetcht2 128(%%r15); prefetcht2 192(%%r15);"\
 INNER_SAVE_m8n24\
-"prefetcht2 256(%%r15); prefetcht2 320(%%r15); addq $384,%%r15;"\
-"movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+"movq %%r13,%2; subq %%r12,%1;"\
 "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
 "subq $8,%8; cmpq $8,%8; jnb 22221b;"\
 "22222:\n\t"\
 "cmpq $4,%8; jb 22223f;"\
 INNER_INIT_m4n24\
 INNER_KERNELm4(24)\
 INNER_SAVE_m4n24\
-"movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+"movq %%r13,%2; subq %%r12,%1;"\
 "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
 "subq $4,%8;"\
 "22223:\n\t"\
 "cmpq $2,%8; jb 22224f;"\
 INNER_INIT_m2n24\
 INNER_KERNELm2(24)\
 INNER_SAVE_m2n24\
-"movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+"movq %%r13,%2; subq %%r12,%1;"\
 "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\
 "subq $2,%8;"\
 "22224:\n\t"\
 "cmpq $1,%8; jb 22225f;"\
 INNER_INIT_m1n24\
 INNER_KERNELm1(24)\
 INNER_SAVE_m1n24\
-"movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+"movq %%r13,%2; subq %%r12,%1;"\
 "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\
 "22225:\n\t"\
 "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
 "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
-:"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),\
-"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(packed_b_pointer)\
+"leaq (%1,%%r12,2),%1; addq %%r12,%1;"\
+:"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
 ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
-"zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r13","r14","r15");\
+"zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
 a_block_pointer -= M * K;\
 }
 
-static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8
+static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8
 //perform C += A<pack> B<pack>
 if(k==0 || m==0 || ndiv8==0) return;
 int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
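
Note: with %%r12 preloaded to K*64, the per-tile rewind of the B pointer in the COMPUTE macros shrinks from a shift/subtract/shift sequence on %2 to a single "subq %%r12,%1", because %1 now advances only 64 bytes per k instead of 128 or 192. A small, hedged C check of that bookkeeping for the n16 case (the values are arbitrary and the variable names are illustrative):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    int64_t K = 100;            /* arbitrary depth, for illustration */
    int64_t r12 = K << 6;       /* "movq %2,%%r12; shlq $6,%%r12" => K*64 bytes */

    /* Old n16 path: %1 walked a contiguous scratch copy, 128 bytes per k,
       and was rewound by K<<7 ("shlq $7,%2; subq %2,%1; shrq $7,%2"). */
    int64_t b_old = 0;
    b_old += 128 * K;
    b_old -= K << 7;

    /* New n16 path: %1 walks only the first packed panel, 64 bytes per k;
       the second panel is addressed as (%1,%%r12,1) and the rewind is one subq. */
    int64_t b_new = 0;
    b_new += 64 * K;
    b_new -= r12;

    /* Both schemes return the B pointer to the start of the panel group. */
    assert(b_old == 0 && b_new == 0);
    return 0;
}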
@@ -429,38 +429,17 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
 double *c_pointer = c;
 __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033;
 BLASLONG ndiv8_count;
-double *b_scratch;
-posix_memalign(&b_scratch,64,192*k);
 double *packed_b_pointer = packed_b;
 a_block_pointer = packed_a;
 for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){
-__asm__ __volatile__ (
-"testq %2,%2; jz 100002f;movq %2,%%r13;shlq $6,%%r13;"
-"100001:\n\t"
-"vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; vmovupd (%0,%%r13,2),%%zmm7; addq $64,%0;"
-"vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); vmovupd %%zmm7,128(%1); addq $192,%1;"
-"decq %2; testq %2,%2; jnz 100001b;"
-"100002:\n\t"
-"movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,2),%0;subq %%r13,%1;subq %%r13,%1;subq %%r13,%1;"
-:"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6","zmm7");
 COMPUTE_n24
 }
 for(;ndiv8_count>1;ndiv8_count-=2){
-__asm__ __volatile__ (
-"testq %2,%2; jz 1000002f;movq %2,%%r13;shlq $6,%%r13;"
-"1000001:\n\t"
-"vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; addq $64,%0;"
-"vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); addq $128,%1;"
-"decq %2; testq %2,%2; jnz 1000001b;"
-"1000002:\n\t"
-"movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,1),%0;subq %%r13,%1;subq %%r13,%1;"
-:"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6");
 COMPUTE_n16
 }
 if(ndiv8_count>0){
 COMPUTE_n8
 }
-free(b_scratch);b_scratch=NULL;
 }
 
 /* __m512d accumulators: zc1-zc4; temporary variables: za1,zb1-zb2 */
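
Note: at the C level, KERNEL_MAIN no longer allocates an aligned b_scratch buffer and no longer runs the interleaving copy before COMPUTE_n24/COMPUTE_n16; the compute macros read packed_b_pointer in place instead. For reference, a hedged C rendering of the copy pre-pass that was deleted (names and structure are illustrative; the original did this in inline assembly):

#include <stdlib.h>
#include <string.h>

/* What the removed n24 pre-pass did: gather the k-th 8-double row of each of
 * three packed B panels (spaced K*64 bytes apart) into one contiguous
 * 192-byte-per-k scratch buffer that the old COMPUTE_n24 then streamed through. */
static double *pack_three_b_panels(const double *packed_b, size_t K)
{
    double *b_scratch = NULL;
    if (posix_memalign((void **)&b_scratch, 64, 192 * K) != 0) return NULL;
    for (size_t ki = 0; ki < K; ki++)
        for (size_t p = 0; p < 3; p++)
            memcpy(b_scratch + ki * 24 + p * 8,       /* 24 doubles per k in scratch */
                   packed_b + p * K * 8 + ki * 8,     /* panel p, row ki, in packed B */
                   8 * sizeof(double));
    return b_scratch;   /* caller frees; the new code skips this step entirely */
}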
