Skip to content

Commit 0257f26

Browse files
authored
Merge pull request #21 from xianyi/develop
rebase
2 parents 1070518 + c45b7ae commit 0257f26

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+10564
-1898
lines changed

CONTRIBUTORS.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,3 +171,11 @@ In chronological order:
171171
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
172172
* [2019-03-14] power9 dgemm/dtrmm kernel
173173
* [2019-04-29] power9 sgemm/strmm kernel
174+
175+
* Jiachen Wang <https://github.com/wjc404>
176+
* [2019-07-29] optimize AVX2 DGEMM
177+
* [2019-10-20] AVX512 DGEMM kernel (4x8)
178+
* [2019-11-06] optimize AVX512 SGEMM
179+
* [2019-11-12] AVX512 CGEMM & ZGEMM kernels
180+
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
181+
* [2019-12-27] AVX2 CGEMM3M kernel

Makefile.arm64

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,10 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
3939
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
4040
endif
4141

42+
ifeq ($(GCCVERSIONGTEQ9), 1)
4243
ifeq ($(CORE), TSV110)
4344
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
4445
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
4546
endif
47+
endif
48+

Makefile.system

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,7 @@ ifeq ($(C_COMPILER), GCC)
326326
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
327327
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
328328
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
329+
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
329330
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
330331
ifeq ($(GCCVERSIONGT4), 1)
331332
# GCC Major version > 4
@@ -547,9 +548,14 @@ endif
547548

548549
ifeq ($(ARCH), arm64)
549550
DYNAMIC_CORE = ARMV8
551+
DYNAMIC_CORE += CORTEXA53
550552
DYNAMIC_CORE += CORTEXA57
553+
DYNAMIC_CORE += CORTEXA72
554+
DYNAMIC_CORE += CORTEXA73
555+
DYNAMIC_CORE += FALKOR
551556
DYNAMIC_CORE += THUNDERX
552557
DYNAMIC_CORE += THUNDERX2T99
558+
DYNAMIC_CORE += TSV110
553559
endif
554560

555561
ifeq ($(ARCH), power)

cmake/arch.cmake

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,11 @@ endif ()
4545

4646
if (DYNAMIC_ARCH)
4747
if (ARM64)
48-
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
48+
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110)
49+
endif ()
50+
51+
if (POWER)
52+
set(DYNAMIC_CORE POWER6 POWER8 POWER9)
4953
endif ()
5054

5155
if (X86)

cmake/prebuild.cmake

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,83 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
309309
set(ZGEMM_UNROLL_M 4)
310310
set(ZGEMM_UNROLL_N 4)
311311
set(SYMV_P 16)
312+
elseif ("${TCORE}" STREQUAL "TSV110")
313+
file(APPEND ${TARGET_CONF_TEMP}
314+
"#define ARMV8\n"
315+
"#define L1_CODE_SIZE\t65536\n"
316+
"#define L1_CODE_LINESIZE\t64\n"
317+
"#define L1_CODE_ASSOCIATIVE\t4\n"
318+
"#define L1_DATA_SIZE\t65536\n"
319+
"#define L1_DATA_LINESIZE\t64\n"
320+
"#define L1_DATA_ASSOCIATIVE\t4\n"
321+
"#define L2_SIZE\t524288\n"
322+
"#define L2_LINESIZE\t64\n"
323+
"#define L2_ASSOCIATIVE\t8\n"
324+
"#define DTB_DEFAULT_ENTRIES\t64\n"
325+
"#define DTB_SIZE\t4096\n")
326+
set(SGEMM_UNROLL_M 16)
327+
set(SGEMM_UNROLL_N 4)
328+
set(DGEMM_UNROLL_M 8)
329+
set(DGEMM_UNROLL_N 4)
330+
set(CGEMM_UNROLL_M 8)
331+
set(CGEMM_UNROLL_N 4)
332+
set(ZGEMM_UNROLL_M 4)
333+
set(ZGEMM_UNROLL_N 4)
334+
set(SYMV_P 16)
335+
elseif ("${TCORE}" STREQUAL "POWER6")
336+
file(APPEND ${TARGET_CONF_TEMP}
337+
"#define L1_DATA_SIZE 32768\n"
338+
"#define L1_DATA_LINESIZE 128\n"
339+
"#define L2_SIZE 524288\n"
340+
"#define L2_LINESIZE 128 \n"
341+
"#define DTB_DEFAULT_ENTRIES 128\n"
342+
"#define DTB_SIZE 4096\n"
343+
"#define L2_ASSOCIATIVE 8\n")
344+
set(SGEMM_UNROLL_M 4)
345+
set(SGEMM_UNROLL_N 4)
346+
set(DGEMM_UNROLL_M 4)
347+
set(DGEMM_UNROLL_N 4)
348+
set(CGEMM_UNROLL_M 2)
349+
set(CGEMM_UNROLL_N 4)
350+
set(ZGEMM_UNROLL_M 2)
351+
set(ZGEMM_UNROLL_N 4)
352+
set(SYMV_P 8)
353+
elseif ("${TCORE}" STREQUAL "POWER8")
354+
file(APPEND ${TARGET_CONF_TEMP}
355+
"#define L1_DATA_SIZE 32768\n"
356+
"#define L1_DATA_LINESIZE 128\n"
357+
"#define L2_SIZE 524288\n"
358+
"#define L2_LINESIZE 128 \n"
359+
"#define DTB_DEFAULT_ENTRIES 128\n"
360+
"#define DTB_SIZE 4096\n"
361+
"#define L2_ASSOCIATIVE 8\n")
362+
set(SGEMM_UNROLL_M 16)
363+
set(SGEMM_UNROLL_N 8)
364+
set(DGEMM_UNROLL_M 16)
365+
set(DGEMM_UNROLL_N 4)
366+
set(CGEMM_UNROLL_M 8)
367+
set(CGEMM_UNROLL_N 4)
368+
set(ZGEMM_UNROLL_M 8)
369+
set(ZGEMM_UNROLL_N 2)
370+
set(SYMV_P 8)
371+
elseif ("${TCORE}" STREQUAL "POWER9")
372+
file(APPEND ${TARGET_CONF_TEMP}
373+
"#define L1_DATA_SIZE 32768\n"
374+
"#define L1_DATA_LINESIZE 128\n"
375+
"#define L2_SIZE 524288\n"
376+
"#define L2_LINESIZE 128 \n"
377+
"#define DTB_DEFAULT_ENTRIES 128\n"
378+
"#define DTB_SIZE 4096\n"
379+
"#define L2_ASSOCIATIVE 8\n")
380+
set(SGEMM_UNROLL_M 16)
381+
set(SGEMM_UNROLL_N 8)
382+
set(DGEMM_UNROLL_M 16)
383+
set(DGEMM_UNROLL_N 4)
384+
set(CGEMM_UNROLL_M 8)
385+
set(CGEMM_UNROLL_N 4)
386+
set(ZGEMM_UNROLL_M 8)
387+
set(ZGEMM_UNROLL_N 2)
388+
set(SYMV_P 8)
312389
endif()
313390

314391
# Or should this actually be NUM_CORES?

common_power.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,35 @@
3939
#ifndef COMMON_POWER
4040
#define COMMON_POWER
4141

42+
#define str(x) #x
43+
44+
#ifdef OS_AIX
45+
#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z
46+
#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00
47+
#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11
48+
#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10
49+
#define XVMOVDP(T,A) xvcpsgndp T, A, A
50+
51+
#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t"
52+
#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t"
53+
#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t"
54+
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"
55+
56+
#else
57+
#define XXSPLTD(T,A,z) xxspltd T, A, z
58+
#define XXMRGHD(T,A,B) xxmrghd T, A, B
59+
#define XXMRGLD(T,A,B) xxmrgld T, A, B
60+
#define XXSWAPD(T,A) xxswapd T, A
61+
#define XVMOVDP(T,A) xvmovdp T, A
62+
63+
#define XXSPLTD_S(T,A,z) "xxspltd " str(T) ", " str(A) ", " str(z)" \n\t"
64+
#define XXMRGHD_S(T,A,B) "xxmrghd " str(T) ", " str(A) ", " str(B)" \n\t"
65+
#define XXMRGLD_S(T,A,B) "xxmrgld " str(T) ", " str(A) ", " str(B)" \n\t"
66+
#define XXSWAPD_S(T,A) "xxswapd " str(T) ", " str(A) " \n\t"
67+
68+
#endif
69+
70+
4271
#if defined(POWER8) || defined(POWER9)
4372
#define MB __asm__ __volatile__ ("eieio":::"memory")
4473
#define WMB __asm__ __volatile__ ("eieio":::"memory")

driver/level3/gemm3m_level3.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
338338

339339
for(jjs = js; jjs < js + min_j; jjs += min_jj){
340340
min_jj = min_j + js - jjs;
341-
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
341+
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
342342

343343
START_RPCC();
344344

@@ -398,7 +398,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
398398

399399
for(jjs = js; jjs < js + min_j; jjs += min_jj){
400400
min_jj = min_j + js - jjs;
401-
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
401+
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
402402

403403
START_RPCC();
404404

@@ -463,7 +463,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
463463

464464
for(jjs = js; jjs < js + min_j; jjs += min_jj){
465465
min_jj = min_j + js - jjs;
466-
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
466+
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
467467

468468
START_RPCC();
469469

driver/others/blas_server_win32.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,7 @@ int BLASFUNC(blas_thread_shutdown)(void){
462462

463463
for(i = 0; i < blas_num_threads - 1; i++){
464464
// Could also just use WaitForMultipleObjects
465-
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000);
465+
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
466466

467467
#ifndef OS_WINDOWSSTORE
468468
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP

driver/others/dynamic.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,8 @@ static gotoblas_t *get_coretype(void){
586586
}
587587
return NULL;
588588
case 7:
589+
if (model == 10) // Goldmont Plus
590+
return &gotoblas_NEHALEM;
589591
if (model == 14) {
590592
// Ice Lake
591593
if (support_avx512())

driver/others/dynamic_arm64.c

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,18 @@
4343
#endif
4444

4545
extern gotoblas_t gotoblas_ARMV8;
46+
extern gotoblas_t gotoblas_CORTEXA53;
4647
extern gotoblas_t gotoblas_CORTEXA57;
48+
extern gotoblas_t gotoblas_CORTEXA72;
49+
extern gotoblas_t gotoblas_CORTEXA73;
50+
extern gotoblas_t gotoblas_FALKOR;
4751
extern gotoblas_t gotoblas_THUNDERX;
4852
extern gotoblas_t gotoblas_THUNDERX2T99;
53+
extern gotoblas_t gotoblas_TSV110;
4954

5055
extern void openblas_warning(int verbose, const char * msg);
5156

52-
#define NUM_CORETYPES 4
57+
#define NUM_CORETYPES 9
5358

5459
/*
5560
* In case asm/hwcap.h is outdated on the build system, make sure
@@ -65,17 +70,27 @@ extern void openblas_warning(int verbose, const char * msg);
6570

6671
static char *corename[] = {
6772
"armv8",
73+
"cortexa53",
6874
"cortexa57",
75+
"cortexa72",
76+
"cortexa73",
77+
"falkor",
6978
"thunderx",
7079
"thunderx2t99",
80+
"tsv110",
7181
"unknown"
7282
};
7383

7484
char *gotoblas_corename(void) {
7585
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
76-
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
77-
if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
78-
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
86+
if (gotoblas == &gotoblas_CORTEXA53) return corename[ 1];
87+
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 2];
88+
if (gotoblas == &gotoblas_CORTEXA72) return corename[ 3];
89+
if (gotoblas == &gotoblas_CORTEXA73) return corename[ 4];
90+
if (gotoblas == &gotoblas_FALKOR) return corename[ 5];
91+
if (gotoblas == &gotoblas_THUNDERX) return corename[ 6];
92+
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7];
93+
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
7994
return corename[NUM_CORETYPES];
8095
}
8196

@@ -96,9 +111,14 @@ static gotoblas_t *force_coretype(char *coretype) {
96111
switch (found)
97112
{
98113
case 0: return (&gotoblas_ARMV8);
99-
case 1: return (&gotoblas_CORTEXA57);
100-
case 2: return (&gotoblas_THUNDERX);
101-
case 3: return (&gotoblas_THUNDERX2T99);
114+
case 1: return (&gotoblas_CORTEXA53);
115+
case 2: return (&gotoblas_CORTEXA57);
116+
case 3: return (&gotoblas_CORTEXA72);
117+
case 4: return (&gotoblas_CORTEXA73);
118+
case 5: return (&gotoblas_FALKOR);
119+
case 6: return (&gotoblas_THUNDERX);
120+
case 7: return (&gotoblas_THUNDERX2T99);
121+
case 8: return (&gotoblas_TSV110);
102122
}
103123
snprintf(message, 128, "Core not found: %s\n", coretype);
104124
openblas_warning(1, message);
@@ -136,10 +156,14 @@ static gotoblas_t *get_coretype(void) {
136156
case 0x41: // ARM
137157
switch (part)
138158
{
139-
case 0xd07: // Cortex A57
140-
case 0xd08: // Cortex A72
141159
case 0xd03: // Cortex A53
160+
return &gotoblas_CORTEXA53;
161+
case 0xd07: // Cortex A57
142162
return &gotoblas_CORTEXA57;
163+
case 0xd08: // Cortex A72
164+
return &gotoblas_CORTEXA72;
165+
case 0xd09: // Cortex A73
166+
return &gotoblas_CORTEXA73;
143167
}
144168
break;
145169
case 0x42: // Broadcom
@@ -158,6 +182,20 @@ static gotoblas_t *get_coretype(void) {
158182
return &gotoblas_THUNDERX2T99;
159183
}
160184
break;
185+
case 0x48: // HiSilicon
186+
switch (part)
187+
{
188+
case 0xd01: // tsv110
189+
return &gotoblas_TSV110;
190+
}
191+
break;
192+
case 0x51: // Qualcomm
193+
switch (part)
194+
{
195+
case 0xc00: // Falkor
196+
return &gotoblas_FALKOR;
197+
}
198+
break;
161199
}
162200
return NULL;
163201
}

0 commit comments

Comments
 (0)