Skip to content

Commit cdbfb89

Browse files
new sgemm 8x16
1 parent 148c4cc commit cdbfb89

File tree

3 files changed

+285
-248
lines changed

3 files changed

+285
-248
lines changed

kernel/power/sgemm_logic_power9.S

Lines changed: 106 additions & 87 deletions
Original file line number | Diff line number | Diff line change
@@ -3,89 +3,89 @@ b L8
33

44
MY_ALIGN
55
/*
 * LSGEMM_L8x16_LMAIN_SUB: main-loop subroutine, reached with `bl` and left
 * through the `blr` after LSGEMM_L8x16_LOOP_END.
 * Lines whose gutter ends in `-` are the removed side of the diff, `+` the
 * added side.  On the added side the subroutine primes with LOAD8x16_2 and
 * no longer executes `mtctr L` itself, so CTR must already be loaded by
 * whoever branches here.
 * One pass of the loop is: KERNEL8x16_L2 (index 0), KERNEL8x16_L2 (index 1),
 * then KERNEL8x16_I1_L4_2 for indices 1..31 (the last one with final
 * argument 1); `bdnz` repeats the pass while the decremented CTR is non-zero.
 */
LSGEMM_L8x16_LMAIN_SUB:
6-
LOAD8x16_0
7-
mtctr L
6+
LOAD8x16_2
87
MY_ALIGN
98

109
LSGEMM_L8x16_LOOP:
11-
12-
KERNEL8x16_I1_L4_2 64,32, 0,0
13-
KERNEL8x16_I1_L4_2 64,32, 1,0
14-
KERNEL8x16_I1_L4_2 64,32, 2,0
15-
KERNEL8x16_I1_L4_2 64,32, 3,0
16-
KERNEL8x16_I1_L4_2 64,32, 4,0
17-
KERNEL8x16_I1_L4_2 64,32, 5,0
18-
KERNEL8x16_I1_L4_2 64,32, 6,0
19-
KERNEL8x16_I1_L4_2 64,32, 7,0
20-
KERNEL8x16_I1_L4_2 64,32, 8,0
21-
KERNEL8x16_I1_L4_2 64,32, 9,0
22-
KERNEL8x16_I1_L4_2 64,32, 10,0
23-
KERNEL8x16_I1_L4_2 64,32, 11,0
24-
KERNEL8x16_I1_L4_2 64,32, 12,0
25-
KERNEL8x16_I1_L4_2 64,32, 13,0
26-
KERNEL8x16_I1_L4_2 64,32, 14,0
27-
KERNEL8x16_I1_L4_2 64,32, 15,0
28-
KERNEL8x16_I1_L4_2 64,32, 16,0
29-
KERNEL8x16_I1_L4_2 64,32, 17,0
30-
KERNEL8x16_I1_L4_2 64,32, 18,0
31-
KERNEL8x16_I1_L4_2 64,32, 19,0
32-
KERNEL8x16_I1_L4_2 64,32, 20,0
33-
KERNEL8x16_I1_L4_2 64,32, 21,0
34-
KERNEL8x16_I1_L4_2 64,32, 22,0
35-
KERNEL8x16_I1_L4_2 64,32, 23,0
36-
KERNEL8x16_I1_L4_2 64,32, 24,0
37-
KERNEL8x16_I1_L4_2 64,32, 25,0
38-
KERNEL8x16_I1_L4_2 64,32, 26,0
39-
KERNEL8x16_I1_L4_2 64,32, 27,0
40-
KERNEL8x16_I1_L4_2 64,32, 28,0
41-
KERNEL8x16_I1_L4_2 64,32, 29,0
42-
KERNEL8x16_I1_L4_2 64,32, 30,0
43-
KERNEL8x16_I1_L4_2 64,32, 31,1
10+
KERNEL8x16_L2 128,64,0,0
11+
/* Secondary entry point: code that has already issued its own
   LOAD8x16_2O and loaded CTR does `bl` here, skipping the first
   KERNEL8x16_L2 of the pass. */
LSGEMM_L8x16_K128:
12+
KERNEL8x16_L2 128,64,1,0
13+
KERNEL8x16_I1_L4_2 128,64, 1,0
14+
KERNEL8x16_I1_L4_2 128,64, 2,0
15+
KERNEL8x16_I1_L4_2 128,64, 3,0
16+
KERNEL8x16_I1_L4_2 128,64, 4,0
17+
KERNEL8x16_I1_L4_2 128,64, 5,0
18+
KERNEL8x16_I1_L4_2 128,64, 6,0
19+
KERNEL8x16_I1_L4_2 128,64, 7,0
20+
KERNEL8x16_I1_L4_2 128,64, 8,0
21+
KERNEL8x16_I1_L4_2 128,64, 9,0
22+
KERNEL8x16_I1_L4_2 128,64, 10,0
23+
KERNEL8x16_I1_L4_2 128,64, 11,0
24+
KERNEL8x16_I1_L4_2 128,64, 12,0
25+
KERNEL8x16_I1_L4_2 128,64, 13,0
26+
KERNEL8x16_I1_L4_2 128,64, 14,0
27+
KERNEL8x16_I1_L4_2 128,64, 15,0
28+
KERNEL8x16_I1_L4_2 128,64, 16,0
29+
KERNEL8x16_I1_L4_2 128,64, 17,0
30+
KERNEL8x16_I1_L4_2 128,64, 18,0
31+
KERNEL8x16_I1_L4_2 128,64, 19,0
32+
KERNEL8x16_I1_L4_2 128,64, 20,0
33+
KERNEL8x16_I1_L4_2 128,64, 21,0
34+
KERNEL8x16_I1_L4_2 128,64, 22,0
35+
KERNEL8x16_I1_L4_2 128,64, 23,0
36+
KERNEL8x16_I1_L4_2 128,64, 24,0
37+
KERNEL8x16_I1_L4_2 128,64, 25,0
38+
KERNEL8x16_I1_L4_2 128,64, 26,0
39+
KERNEL8x16_I1_L4_2 128,64, 27,0
40+
KERNEL8x16_I1_L4_2 128,64, 28,0
41+
KERNEL8x16_I1_L4_2 128,64, 29,0
42+
KERNEL8x16_I1_L4_2 128,64, 30,0
43+
KERNEL8x16_I1_L4_2 128,64, 31,1
4444
/* CTR = CTR - 1; branch back while CTR != 0, otherwise fall into LOOP_END. */
bdnz LSGEMM_L8x16_LOOP
4545

4646
MY_ALIGN
4747
LSGEMM_L8x16_LOOP_END:
48-
END8x16 0, AO, BO, 64, 32
48+
END8x16_2
4949
/* Return to the instruction after the caller's `bl`. */
blr
5050

5151
MY_ALIGN
5252
/*
 * LSGEMM_L8x16_L64_SUB: straight-line subroutine (no loop, no CTR use),
 * returns with `blr`.  Added side (`+` gutter): LOAD8x16_2, then
 * KERNEL8x16_I1_L4_2 for indices 0..14 and a closing KERNEL8x16_I1_L4_3
 * with index 15 and final argument 1.
 * NOTE(review): the label suggests this covers 64 k-iterations -- confirm
 * against the macro definitions, which are not part of this diff.
 */
LSGEMM_L8x16_L64_SUB:
53-
LOAD8x16_0
54-
KERNEL8x16_I1_L4_2 64,32, 0,0
55-
KERNEL8x16_I1_L4_2 64,32, 1,0
56-
KERNEL8x16_I1_L4_2 64,32, 2,0
57-
KERNEL8x16_I1_L4_2 64,32, 3,0
58-
KERNEL8x16_I1_L4_2 64,32, 4,0
59-
KERNEL8x16_I1_L4_2 64,32, 5,0
60-
KERNEL8x16_I1_L4_2 64,32, 6,0
61-
KERNEL8x16_I1_L4_2 64,32, 7,0
62-
KERNEL8x16_I1_L4_2 64,32, 8,0
63-
KERNEL8x16_I1_L4_2 64,32, 9,0
64-
KERNEL8x16_I1_L4_2 64,32, 10,0
65-
KERNEL8x16_I1_L4_2 64,32, 11,0
66-
KERNEL8x16_I1_L4_2 64,32, 12,0
67-
KERNEL8x16_I1_L4_2 64,32, 13,0
68-
KERNEL8x16_I1_L4_2 64,32, 14,0
69-
KERNEL8x16_I1_L4_3 64,32, 15,1
53+
LOAD8x16_2
54+
KERNEL8x16_I1_L4_2 128,64, 0,0
55+
KERNEL8x16_I1_L4_2 128,64, 1,0
56+
KERNEL8x16_I1_L4_2 128,64, 2,0
57+
KERNEL8x16_I1_L4_2 128,64,3,0
58+
KERNEL8x16_I1_L4_2 128,64,4,0
59+
KERNEL8x16_I1_L4_2 128,64,5,0
60+
KERNEL8x16_I1_L4_2 128,64,6,0
61+
KERNEL8x16_I1_L4_2 128,64,7,0
62+
KERNEL8x16_I1_L4_2 128,64,8,0
63+
KERNEL8x16_I1_L4_2 128,64,9,0
64+
KERNEL8x16_I1_L4_2 128,64,10,0
65+
KERNEL8x16_I1_L4_2 128,64,11,0
66+
KERNEL8x16_I1_L4_2 128,64,12,0
67+
KERNEL8x16_I1_L4_2 128,64,13,0
68+
KERNEL8x16_I1_L4_2 128,64,14,0
69+
/* Closing step uses the _3 macro variant with final argument 1. */
KERNEL8x16_I1_L4_3 128,64,15,1
7070
blr
7171
/*
 * LSGEMM_L8x16_L32_SUB: straight-line subroutine, returns with `blr`.
 * Added side (`+` gutter): LOAD8x16_2, KERNEL8x16_I1_L4_2 for indices 0..6,
 * then a closing KERNEL8x16_I1_L4_3 with index 7 and final argument 1.
 * NOTE(review): presumably 32 k-iterations, per the label -- confirm
 * against the macro definitions.
 */
LSGEMM_L8x16_L32_SUB:
72-
LOAD8x16_0
73-
KERNEL8x16_I1_L4_2 64,32, 0,0
74-
KERNEL8x16_I1_L4_2 64,32, 1,0
75-
KERNEL8x16_I1_L4_2 64,32, 2,0
76-
KERNEL8x16_I1_L4_2 64,32, 3,0
77-
KERNEL8x16_I1_L4_2 64,32, 4,0
78-
KERNEL8x16_I1_L4_2 64,32, 5,0
79-
KERNEL8x16_I1_L4_2 64,32, 6,0
80-
KERNEL8x16_I1_L4_3 64,32, 7,1
72+
LOAD8x16_2
73+
KERNEL8x16_I1_L4_2 128,64,0,0
74+
KERNEL8x16_I1_L4_2 128,64,1,0
75+
KERNEL8x16_I1_L4_2 128,64,2,0
76+
KERNEL8x16_I1_L4_2 128,64,3,0
77+
KERNEL8x16_I1_L4_2 128,64,4,0
78+
KERNEL8x16_I1_L4_2 128,64,5,0
79+
KERNEL8x16_I1_L4_2 128,64,6,0
80+
/* Closing step uses the _3 macro variant with final argument 1. */
KERNEL8x16_I1_L4_3 128,64,7,1
8181
blr
8282

8383
/*
 * LSGEMM_L8x16_L16_SUB: straight-line subroutine, returns with `blr`.
 * Added side (`+` gutter): LOAD8x16_2, KERNEL8x16_I1_L4_2 for indices 0..2,
 * then a closing KERNEL8x16_I1_L4_3 with index 3 and final argument 1.
 * NOTE(review): presumably 16 k-iterations, per the label -- confirm
 * against the macro definitions.
 */
LSGEMM_L8x16_L16_SUB:
84-
LOAD8x16_0
85-
KERNEL8x16_I1_L4_2 64,32, 0,0
86-
KERNEL8x16_I1_L4_2 64,32, 1,0
87-
KERNEL8x16_I1_L4_2 64,32, 2,0
88-
KERNEL8x16_I1_L4_3 64,32, 3,1
84+
LOAD8x16_2
85+
KERNEL8x16_I1_L4_2 128,64,0,0
86+
KERNEL8x16_I1_L4_2 128,64,1,0
87+
KERNEL8x16_I1_L4_2 128,64,2,0
88+
/* Closing step uses the _3 macro variant with final argument 1. */
KERNEL8x16_I1_L4_3 128,64,3,1
8989
blr
9090

9191
L8:
@@ -127,15 +127,16 @@ LSGEMM_L8x16_BEGIN:
127127
/*
 * Main-loop trip-count setup.  T12 receives the k-count (T11 for TRMM, K
 * otherwise) minus a bias, and L = T12 >> 7 is the number of passes handed
 * to LSGEMM_L8x16_LMAIN_SUB through CTR.  The added side (`+` gutter)
 * changes the bias from 1 to 2.
 */
#if defined(TRMMKERNEL)
128128
REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
129129
mr T12, T11
130-
addi T12,T12, -1
131-
srawi. L, T12, 7 /**(T11-1) % 128x */
130+
addi T12,T12, -2
131+
srawi. L, T12, 7 /* L = (T11-2) >> 7, i.e. (T11-2) / 128: a quotient, not a remainder; the dot form also sets CR0 */
132132
#else
133133
mr T12, K
134-
addi T12,T12, -1
135-
srawi. L, T12, 7 /**(K-1) % 128x */
134+
addi T12,T12, -2
135+
srawi. L, T12, 7 /* L = (K-2) >> 7, i.e. (K-2) / 128: a quotient, not a remainder; the dot form also sets CR0 */
136136
#endif
137137

138-
ZERO8x16
138+
ZERO8x16
139+
/* CTR = L for the bdnz inside LSGEMM_L8x16_LMAIN_SUB.  mtctr leaves CR0
   alone, so the ble below still acts on the srawi. result.
   NOTE(review): this also assumes ZERO8x16 does not modify CR0 -- confirm. */
mtctr L
139140
/* L <= 0: no full main-loop pass, go to the short-count handling. */
ble LSGEMM_L8x16_SUB0
140141
bl LSGEMM_L8x16_LMAIN_SUB
141142
/* L = T12 & 127: what is left of T12 after the 128-multiples counted above. */
andi. L, T12, 127
@@ -148,15 +149,33 @@ LSGEMM_L8x16_SUB0:
148149
/* The count == 129 case has its own path right below and count == 128 is
   tested separately at CMP8x16_128K, so this first compare has to be against
   129 in the TRMM build too, matching `cmpwi K,129` in the #else branch.
   Comparing T11 with 128 here would send T11 == 128 down the 129 path and
   let T11 == 129 miss both special cases. */
cmpwi T11,129
149150
#else
150151
andi. L, K, 255
152+
cmpwi K,129
153+
#endif
154+
/* T10 = 1 is the CTR value used by both special paths; li does not touch
   CR0, so the bne below still tests the cmpwi above. */
li T10,1
155+
bne CMP8x16_128K
156+
/* count == 129: subtract 32 from BO and 64 from AO, run a single
   LOAD8x16 64,32 + END8x16_WITHOUT_ADD step, then reload with LOAD8x16_2O
   and enter the main-loop body at LSGEMM_L8x16_K128.  With CTR = 1 the
   bdnz closing that body falls through after one pass and the subroutine
   returns here via blr. */
addi BO,BO,-32
157+
addi AO,AO,-64
158+
LOAD8x16 64,32
159+
END8x16_WITHOUT_ADD
160+
LOAD8x16_2O AO,BO, 128, 64
161+
mtctr T10
162+
bl LSGEMM_L8x16_K128
163+
b LSGEMM_L8x16_SAVE
164+
CMP8x16_128K:
165+
/*----------------------------------------*/
166+
#if defined(TRMMKERNEL)
167+
cmpwi T11,128
168+
#else
151169
cmpwi K,128
152-
#endif
153-
154-
bne LSGEMM_L8x16_SUB2
155-
MY_ALIGN
156-
LSGEMM_L8x16_SUB2_128:
157-
bl LSGEMM_L8x16_L64_SUB
158-
bl LSGEMM_L8x16_L64_SUB
159-
b LSGEMM_L8x16_SAVE
170+
#endif
171+
bne LSGEMM_L8x16_SUB2
172+
MY_ALIGN
173+
/* count == 128: CTR = 1, subtract 64 from BO and 128 from AO, prime with
   LOAD8x16_2O and run one pass of the main-loop body from
   LSGEMM_L8x16_K128 (replaces the two LSGEMM_L8x16_L64_SUB calls on the
   removed side). */
mtctr T10
174+
addi BO,BO,-64
175+
addi AO,AO,-128
176+
LOAD8x16_2O AO,BO, 128,64
177+
bl LSGEMM_L8x16_K128
178+
b LSGEMM_L8x16_SAVE
160179
MY_ALIGN
161180
LSGEMM_L8x16_SUB2:
162181
andi. T10,L,64
@@ -176,21 +195,21 @@ LSGEMM_L8x16_SUB2_16:
176195
/*
 * Remainder handling for the 8, 4 and 2 bits of L.  Each stage masks one bit
 * of L into T10 with `andi.`; that result is either zero or the tested bit,
 * so the following `ble` skips the stage exactly when the bit is clear.
 * Added side (`+` gutter): every stage primes with LOAD8x16_2 and passes
 * 128,64 to the kernel macros.
 */
LSGEMM_L8x16_SUB2_8:
177196
andi. T10,L, 8
178197
ble LSGEMM_L8x16_SUB2_4
179-
LOAD8x16_0
180-
KERNEL8x16_I1_L4_2 64,32, 0,0
181-
KERNEL8x16_I1_L4_3 64,32, 1,1
198+
LOAD8x16_2
199+
KERNEL8x16_I1_L4_2 128,64, 0,0
200+
KERNEL8x16_I1_L4_3 128,64, 1,1
182201
MY_ALIGN
183202
LSGEMM_L8x16_SUB2_4:
184203
andi. T10,L, 4
185204
ble LSGEMM_L8x16_SUB2_2
186-
LOAD8x16_0
187-
KERNEL8x16_I1_L4_3 64,32, 0,1
205+
LOAD8x16_2
206+
KERNEL8x16_I1_L4_3 128,64, 0,1
188207
MY_ALIGN
189208
LSGEMM_L8x16_SUB2_2:
190209
andi. T10,L, 2
191210
ble LSGEMM_L8x16_SUB2_1
192-
LOAD8x16_0
193-
KERNEL8x16_I1_L2_3 64,32, 0,1
211+
LOAD8x16_2
212+
/* The added side ends this stage with KERNEL8x16_E2 where the removed side
   used KERNEL8x16_I1_L2_3. */
KERNEL8x16_E2 128,64, 0,1
194213
MY_ALIGN
195214
LSGEMM_L8x16_SUB2_1:
196215
andi. T10,L, 1

0 commit comments

Comments
 (0)