Skip to content

Commit a17cf36

Browse files
authored
Merge pull request #2153 from quickwritereader/develop
improved power9 zgemm,sgemm
2 parents 909ad04 + 148c4cc commit a17cf36

File tree

10 files changed

+4465
-749
lines changed

10 files changed

+4465
-749
lines changed

benchmark/gemm.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
207207
for (i = 0; i < m * n * COMPSIZE; i++) {
208208
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
209209
}
210-
210+
211211
fprintf(stderr, " SIZE Flops Time\n");
212212

213213
for (i = from; i <= to; i += step) {

kernel/power/KERNEL.POWER9

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
STRMMKERNEL = sgemm_kernel_power9.S
77
DTRMMKERNEL = dgemm_kernel_power9.S
88
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
9-
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
9+
ZTRMMKERNEL = zgemm_kernel_power9.S
1010

1111
SGEMMKERNEL = sgemm_kernel_power9.S
1212
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
@@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
3838
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
3939
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
4040

41-
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
41+
ZGEMMKERNEL = zgemm_kernel_power9.S
4242
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
4343
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
4444
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c

kernel/power/dgemm_kernel_power9.S

Lines changed: 24 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
135135
std r14, 280(SP)
136136

137137

138-
stxv v20, 288(SP)
139-
stxv v21, 304(SP)
140-
stxv v22, 320(SP)
141-
stxv v23, 336(SP)
142-
stxv v24, 352(SP)
143-
stxv v25, 368(SP)
144-
stxv v26, 384(SP)
145-
stxv v27, 400(SP)
146-
stxv v28, 416(SP)
147-
stxv v29, 432(SP)
148-
stxv v30, 448(SP)
149-
stxv v31, 464(SP)
138+
stxv vs52, 288(SP)
139+
stxv vs53, 304(SP)
140+
stxv vs54, 320(SP)
141+
stxv vs55, 336(SP)
142+
stxv vs56, 352(SP)
143+
stxv vs57, 368(SP)
144+
stxv vs58, 384(SP)
145+
stxv vs59, 400(SP)
146+
stxv vs60, 416(SP)
147+
stxv vs61, 432(SP)
148+
stxv vs62, 448(SP)
149+
stxv vs63, 464(SP)
150150

151151

152152
stfd f1, ALPHA_SP
@@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
229229
ld r15, 272(SP)
230230
ld r14, 280(SP)
231231

232-
lxv v20, 288(SP)
233-
lxv v21, 304(SP)
234-
lxv v22, 320(SP)
235-
lxv v23, 336(SP)
236-
lxv v24, 352(SP)
237-
lxv v25, 368(SP)
238-
lxv v26, 384(SP)
239-
lxv v27, 400(SP)
240-
lxv v28, 416(SP)
241-
lxv v29, 432(SP)
242-
lxv v30, 448(SP)
243-
lxv v31, 464(SP)
232+
lxv vs52, 288(SP)
233+
lxv vs53, 304(SP)
234+
lxv vs54, 320(SP)
235+
lxv vs55, 336(SP)
236+
lxv vs56, 352(SP)
237+
lxv vs57, 368(SP)
238+
lxv vs58, 384(SP)
239+
lxv vs59, 400(SP)
240+
lxv vs60, 416(SP)
241+
lxv vs61, 432(SP)
242+
lxv vs62, 448(SP)
243+
lxv vs63, 464(SP)
244244

245245
addi SP, SP, STACKSIZE
246246
blr

kernel/power/sgemm_kernel_power9.S

Lines changed: 62 additions & 76 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3232

3333
#define LOAD ld
3434
#define STACKSIZE (512 )
35-
35+
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
3636
#define M r3
3737
#define N r4
3838
#define K r5
@@ -91,7 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
9191
PROFCODE
9292

9393
addi SP, SP, -STACKSIZE
94-
li r0, 0
94+
mflr r0
95+
9596

9697
stfd f14, 0(SP)
9798
stfd f15, 8(SP)
@@ -137,92 +138,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
137138
std r14, 280(SP)
138139

139140

140-
stxv v20, 288(SP)
141-
stxv v21, 304(SP)
142-
stxv v22, 320(SP)
143-
stxv v23, 336(SP)
144-
stxv v24, 352(SP)
145-
stxv v25, 368(SP)
146-
stxv v26, 384(SP)
147-
stxv v27, 400(SP)
148-
stxv v28, 416(SP)
149-
stxv v29, 432(SP)
150-
stxv v30, 448(SP)
151-
stxv v31, 464(SP)
152-
141+
stxv vs52, 288(SP)
142+
stxv vs53, 304(SP)
143+
stxv vs54, 320(SP)
144+
stxv vs55, 336(SP)
145+
stxv vs56, 352(SP)
146+
stxv vs57, 368(SP)
147+
stxv vs58, 384(SP)
148+
stxv vs59, 400(SP)
149+
stxv vs60, 416(SP)
150+
stxv vs61, 432(SP)
151+
stxv vs62, 448(SP)
152+
stxv vs63, 464(SP)
153+
std r0, FLINK_SAVE(SP)
153154

154155

155156
#if defined(TRMMKERNEL)
156157
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
157158
#endif
158159
slwi LDC, LDC, 2
159160

160-
161-
/* cmpwi cr0, M, 0
162-
ble .L999_H1
163-
cmpwi cr0, N, 0
164-
ble .L999_H1
165-
cmpwi cr0, K, 0
166-
ble .L999_H1
167-
*/
168161

169162

170163
/*alpha is stored in f1. convert to single and splat*/
171164
xscvdpspn alpha_r,vs1
172-
xxspltw alpha_r,alpha_r,0
173-
165+
xxspltw alpha_r,alpha_r,0
174166

175167
/*load reverse permute mask for big endian
176168
uint128 = 0xc0d0e0f08090a0b0405060700010203
177169
*/
178170

179171
lis T2, perm_const2@highest
180-
ori T2, T2, perm_const2@higher
181-
rldicr T2, T2, 32, 31
182-
oris T2, T2, perm_const2@h
183-
ori T2, T2, perm_const2@l
184-
185172
lis T1, perm_const1@highest
173+
lis T3, save_permute_12@highest
174+
lis T4, save_permute_11@highest
175+
lis T5, save_permute_22@highest
176+
lis T6, save_permute_21@highest
177+
ori T2, T2, perm_const2@higher
186178
ori T1, T1, perm_const1@higher
179+
ori T3, T3, save_permute_12@higher
180+
ori T4, T4, save_permute_11@higher
181+
ori T5, T5, save_permute_22@higher
182+
ori T6, T6, save_permute_21@higher
183+
rldicr T2, T2, 32, 31
187184
rldicr T1, T1, 32, 31
185+
rldicr T3, T3, 32, 31
186+
rldicr T4, T4, 32, 31
187+
rldicr T5, T5, 32, 31
188+
rldicr T6, T6, 32, 31
189+
oris T2, T2, perm_const2@h
188190
oris T1, T1, perm_const1@h
191+
oris T3, T3, save_permute_12@h
192+
oris T4, T4, save_permute_11@h
193+
oris T5, T5, save_permute_22@h
194+
oris T6, T6, save_permute_21@h
195+
ori T2, T2, perm_const2@l
189196
ori T1, T1, perm_const1@l
190-
197+
ori T3, T3, save_permute_12@l
198+
ori T4, T4, save_permute_11@l
199+
ori T5, T5, save_permute_22@l
200+
ori T6, T6, save_permute_21@l
201+
li r0,0
191202
mtvsrdd permute_mask,T2,T1
192-
193-
lis T2, save_permute_12@highest
194-
ori T2, T2, save_permute_12@higher
195-
rldicr T2, T2, 32, 31
196-
oris T2, T2, save_permute_12@h
197-
ori T2, T2, save_permute_12@l
198-
199-
lis T1, save_permute_11@highest
200-
ori T1, T1, save_permute_11@higher
201-
rldicr T1, T1, 32, 31
202-
oris T1, T1, save_permute_11@h
203-
ori T1, T1, save_permute_11@l
204-
205-
mtvsrdd save_permute_1,T2,T1
206-
207-
lis T2, save_permute_22@highest
208-
ori T2, T2, save_permute_22@higher
209-
rldicr T2, T2, 32, 31
210-
oris T2, T2, save_permute_22@h
211-
ori T2, T2, save_permute_22@l
212-
213-
lis T1, save_permute_21@highest
214-
ori T1, T1, save_permute_21@higher
215-
rldicr T1, T1, 32, 31
216-
oris T1, T1, save_permute_21@h
217-
ori T1, T1, save_permute_21@l
218-
219-
mtvsrdd save_permute_2,T2,T1
203+
mtvsrdd save_permute_1,T3,T4
204+
mtvsrdd save_permute_2,T5,T6
220205

221206
#include "sgemm_logic_power9.S"
222207

223-
.L999:
224-
addi r3, 0, 0
225-
208+
.L999:
226209
lfd f14, 0(SP)
227210
lfd f15, 8(SP)
228211
lfd f16, 16(SP)
@@ -264,23 +247,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
264247
ld r16, 264(SP)
265248
ld r15, 272(SP)
266249
ld r14, 280(SP)
267-
268-
lxv v20, 288(SP)
269-
lxv v21, 304(SP)
270-
lxv v22, 320(SP)
271-
lxv v23, 336(SP)
272-
lxv v24, 352(SP)
273-
lxv v25, 368(SP)
274-
lxv v26, 384(SP)
275-
lxv v27, 400(SP)
276-
lxv v28, 416(SP)
277-
lxv v29, 432(SP)
278-
lxv v30, 448(SP)
279-
lxv v31, 464(SP)
280250

251+
ld r0, FLINK_SAVE(SP)
281252

282-
addi SP, SP, STACKSIZE
253+
lxv vs52, 288(SP)
254+
lxv vs53, 304(SP)
255+
lxv vs54, 320(SP)
256+
lxv vs55, 336(SP)
257+
lxv vs56, 352(SP)
258+
lxv vs57, 368(SP)
259+
lxv vs58, 384(SP)
260+
lxv vs59, 400(SP)
261+
mtlr r0
262+
lxv vs60, 416(SP)
263+
lxv vs61, 432(SP)
264+
lxv vs62, 448(SP)
265+
lxv vs63, 464(SP)
266+
267+
addi SP, SP, STACKSIZE
283268
blr
284269

270+
285271
EPILOGUE
286272
#endif

0 commit comments

Comments
 (0)