3
3
4
4
MY_ALIGN
5
5
LSGEMM_L8x16_LMAIN_SUB:
6
- LOAD8x16_0
7
- mtctr L
6
+ LOAD8x16_2
8
7
MY_ALIGN
9
8
10
9
LSGEMM_L8x16_LOOP:
11
-
12
- KERNEL8x16_I1_L4_2 64 ,32 , 0 ,0
13
- KERNEL8x16_I1_L4_2 64 ,32 , 1 ,0
14
- KERNEL8x16_I1_L4_2 64 ,32 , 2 ,0
15
- KERNEL8x16_I1_L4_2 64 ,32 , 3 ,0
16
- KERNEL8x16_I1_L4_2 64 ,32 , 4 ,0
17
- KERNEL8x16_I1_L4_2 64 ,32 , 5 ,0
18
- KERNEL8x16_I1_L4_2 64 ,32 , 6 ,0
19
- KERNEL8x16_I1_L4_2 64 ,32 , 7 ,0
20
- KERNEL8x16_I1_L4_2 64 ,32 , 8 ,0
21
- KERNEL8x16_I1_L4_2 64 ,32 , 9 ,0
22
- KERNEL8x16_I1_L4_2 64 ,32 , 10 ,0
23
- KERNEL8x16_I1_L4_2 64 ,32 , 11 ,0
24
- KERNEL8x16_I1_L4_2 64 ,32 , 12 ,0
25
- KERNEL8x16_I1_L4_2 64 ,32 , 13 ,0
26
- KERNEL8x16_I1_L4_2 64 ,32 , 14 ,0
27
- KERNEL8x16_I1_L4_2 64 ,32 , 15 ,0
28
- KERNEL8x16_I1_L4_2 64 ,32 , 16 ,0
29
- KERNEL8x16_I1_L4_2 64 ,32 , 17 ,0
30
- KERNEL8x16_I1_L4_2 64 ,32 , 18 ,0
31
- KERNEL8x16_I1_L4_2 64 ,32 , 19 ,0
32
- KERNEL8x16_I1_L4_2 64 ,32 , 20 ,0
33
- KERNEL8x16_I1_L4_2 64 ,32 , 21 ,0
34
- KERNEL8x16_I1_L4_2 64 ,32 , 22 ,0
35
- KERNEL8x16_I1_L4_2 64 ,32 , 23 ,0
36
- KERNEL8x16_I1_L4_2 64 ,32 , 24 ,0
37
- KERNEL8x16_I1_L4_2 64 ,32 , 25 ,0
38
- KERNEL8x16_I1_L4_2 64 ,32 , 26 ,0
39
- KERNEL8x16_I1_L4_2 64 ,32 , 27 ,0
40
- KERNEL8x16_I1_L4_2 64 ,32 , 28 ,0
41
- KERNEL8x16_I1_L4_2 64 ,32 , 29 ,0
42
- KERNEL8x16_I1_L4_2 64 ,32 , 30 ,0
43
- KERNEL8x16_I1_L4_2 64 ,32 , 31 ,1
10
+ KERNEL8x16_L2 128 ,64 ,0 ,0
11
+ LSGEMM_L8x16_K128:
12
+ KERNEL8x16_L2 128 ,64 ,1 ,0
13
+ KERNEL8x16_I1_L4_2 128 ,64 , 1 ,0
14
+ KERNEL8x16_I1_L4_2 128 ,64 , 2 ,0
15
+ KERNEL8x16_I1_L4_2 128 ,64 , 3 ,0
16
+ KERNEL8x16_I1_L4_2 128 ,64 , 4 ,0
17
+ KERNEL8x16_I1_L4_2 128 ,64 , 5 ,0
18
+ KERNEL8x16_I1_L4_2 128 ,64 , 6 ,0
19
+ KERNEL8x16_I1_L4_2 128 ,64 , 7 ,0
20
+ KERNEL8x16_I1_L4_2 128 ,64 , 8 ,0
21
+ KERNEL8x16_I1_L4_2 128 ,64 , 9 ,0
22
+ KERNEL8x16_I1_L4_2 128 ,64 , 10 ,0
23
+ KERNEL8x16_I1_L4_2 128 ,64 , 11 ,0
24
+ KERNEL8x16_I1_L4_2 128 ,64 , 12 ,0
25
+ KERNEL8x16_I1_L4_2 128 ,64 , 13 ,0
26
+ KERNEL8x16_I1_L4_2 128 ,64 , 14 ,0
27
+ KERNEL8x16_I1_L4_2 128 ,64 , 15 ,0
28
+ KERNEL8x16_I1_L4_2 128 ,64 , 16 ,0
29
+ KERNEL8x16_I1_L4_2 128 ,64 , 17 ,0
30
+ KERNEL8x16_I1_L4_2 128 ,64 , 18 ,0
31
+ KERNEL8x16_I1_L4_2 128 ,64 , 19 ,0
32
+ KERNEL8x16_I1_L4_2 128 ,64 , 20 ,0
33
+ KERNEL8x16_I1_L4_2 128 ,64 , 21 ,0
34
+ KERNEL8x16_I1_L4_2 128 ,64 , 22 ,0
35
+ KERNEL8x16_I1_L4_2 128 ,64 , 23 ,0
36
+ KERNEL8x16_I1_L4_2 128 ,64 , 24 ,0
37
+ KERNEL8x16_I1_L4_2 128 ,64 , 25 ,0
38
+ KERNEL8x16_I1_L4_2 128 ,64 , 26 ,0
39
+ KERNEL8x16_I1_L4_2 128 ,64 , 27 ,0
40
+ KERNEL8x16_I1_L4_2 128 ,64 , 28 ,0
41
+ KERNEL8x16_I1_L4_2 128 ,64 , 29 ,0
42
+ KERNEL8x16_I1_L4_2 128 ,64 , 30 ,0
43
+ KERNEL8x16_I1_L4_2 128 ,64 , 31 ,1
44
44
bdnz LSGEMM_L8x16_LOOP
45
45
46
46
MY_ALIGN
47
47
LSGEMM_L8x16_LOOP_END:
48
- END8x16 0 , AO, BO, 64 , 32
48
+ END8x16_2
49
49
blr
50
50
51
51
MY_ALIGN
52
52
LSGEMM_L8x16_L64_SUB:
53
- LOAD8x16_0
54
- KERNEL8x16_I1_L4_2 64 , 32 , 0 ,0
55
- KERNEL8x16_I1_L4_2 64 , 32 , 1 ,0
56
- KERNEL8x16_I1_L4_2 64 , 32 , 2 ,0
57
- KERNEL8x16_I1_L4_2 64 ,32 , 3 ,0
58
- KERNEL8x16_I1_L4_2 64 ,32 , 4 ,0
59
- KERNEL8x16_I1_L4_2 64 ,32 , 5 ,0
60
- KERNEL8x16_I1_L4_2 64 ,32 , 6 ,0
61
- KERNEL8x16_I1_L4_2 64 ,32 , 7 ,0
62
- KERNEL8x16_I1_L4_2 64 ,32 , 8 ,0
63
- KERNEL8x16_I1_L4_2 64 ,32 , 9 ,0
64
- KERNEL8x16_I1_L4_2 64 ,32 , 10 ,0
65
- KERNEL8x16_I1_L4_2 64 ,32 , 11 ,0
66
- KERNEL8x16_I1_L4_2 64 ,32 , 12 ,0
67
- KERNEL8x16_I1_L4_2 64 ,32 , 13 ,0
68
- KERNEL8x16_I1_L4_2 64 ,32 , 14 ,0
69
- KERNEL8x16_I1_L4_3 64 ,32 , 15 ,1
53
+ LOAD8x16_2
54
+ KERNEL8x16_I1_L4_2 128 , 64 , 0 ,0
55
+ KERNEL8x16_I1_L4_2 128 , 64 , 1 ,0
56
+ KERNEL8x16_I1_L4_2 128 , 64 , 2 ,0
57
+ KERNEL8x16_I1_L4_2 128 , 64 ,3 ,0
58
+ KERNEL8x16_I1_L4_2 128 , 64 ,4 ,0
59
+ KERNEL8x16_I1_L4_2 128 , 64 ,5 ,0
60
+ KERNEL8x16_I1_L4_2 128 , 64 ,6 ,0
61
+ KERNEL8x16_I1_L4_2 128 , 64 ,7 ,0
62
+ KERNEL8x16_I1_L4_2 128 , 64 ,8 ,0
63
+ KERNEL8x16_I1_L4_2 128 , 64 ,9 ,0
64
+ KERNEL8x16_I1_L4_2 128 , 64 ,10 ,0
65
+ KERNEL8x16_I1_L4_2 128 , 64 ,11 ,0
66
+ KERNEL8x16_I1_L4_2 128 , 64 ,12 ,0
67
+ KERNEL8x16_I1_L4_2 128 , 64 ,13 ,0
68
+ KERNEL8x16_I1_L4_2 128 , 64 ,14 ,0
69
+ KERNEL8x16_I1_L4_3 128 , 64 ,15 ,1
70
70
blr
71
71
LSGEMM_L8x16_L32_SUB:
72
- LOAD8x16_0
73
- KERNEL8x16_I1_L4_2 64 ,32 , 0 ,0
74
- KERNEL8x16_I1_L4_2 64 ,32 , 1 ,0
75
- KERNEL8x16_I1_L4_2 64 ,32 , 2 ,0
76
- KERNEL8x16_I1_L4_2 64 ,32 , 3 ,0
77
- KERNEL8x16_I1_L4_2 64 ,32 , 4 ,0
78
- KERNEL8x16_I1_L4_2 64 ,32 , 5 ,0
79
- KERNEL8x16_I1_L4_2 64 ,32 , 6 ,0
80
- KERNEL8x16_I1_L4_3 64 ,32 , 7 ,1
72
+ LOAD8x16_2
73
+ KERNEL8x16_I1_L4_2 128 , 64 ,0 ,0
74
+ KERNEL8x16_I1_L4_2 128 , 64 ,1 ,0
75
+ KERNEL8x16_I1_L4_2 128 , 64 ,2 ,0
76
+ KERNEL8x16_I1_L4_2 128 , 64 ,3 ,0
77
+ KERNEL8x16_I1_L4_2 128 , 64 ,4 ,0
78
+ KERNEL8x16_I1_L4_2 128 , 64 ,5 ,0
79
+ KERNEL8x16_I1_L4_2 128 , 64 ,6 ,0
80
+ KERNEL8x16_I1_L4_3 128 , 64 ,7 ,1
81
81
blr
82
82
83
83
LSGEMM_L8x16_L16_SUB:
84
- LOAD8x16_0
85
- KERNEL8x16_I1_L4_2 64 ,32 , 0 ,0
86
- KERNEL8x16_I1_L4_2 64 ,32 , 1 ,0
87
- KERNEL8x16_I1_L4_2 64 ,32 , 2 ,0
88
- KERNEL8x16_I1_L4_3 64 ,32 , 3 ,1
84
+ LOAD8x16_2
85
+ KERNEL8x16_I1_L4_2 128 , 64 ,0 ,0
86
+ KERNEL8x16_I1_L4_2 128 , 64 ,1 ,0
87
+ KERNEL8x16_I1_L4_2 128 , 64 ,2 ,0
88
+ KERNEL8x16_I1_L4_3 128 , 64 ,3 ,1
89
89
blr
90
90
91
91
L8:
@@ -127,15 +127,16 @@ LSGEMM_L8x16_BEGIN:
127
127
#if defined(TRMMKERNEL)
128
128
REFRESH_TEMP_BK T11,K,TEMP_REG,16 ,8
129
129
mr T12, T11
130
- addi T12,T12, -1
131
- srawi. L, T12, 7 /**(T11-1 ) % 128x */
130
+ addi T12,T12, -2
131
+ srawi. L, T12, 7 /**(T11-2 ) % 128x */
132
132
#else
133
133
mr T12, K
134
- addi T12,T12, -1
135
- srawi. L, T12, 7 /**(K-1 ) % 128x */
134
+ addi T12,T12, -2
135
+ srawi. L, T12, 7 /**(K-2 ) % 128x */
136
136
#endif
137
137
138
- ZERO8x16
138
+ ZERO8x16
139
+ mtctr L
139
140
ble LSGEMM_L8x16_SUB0
140
141
bl LSGEMM_L8x16_LMAIN_SUB
141
142
andi. L, T12, 127
@@ -148,15 +149,33 @@ LSGEMM_L8x16_SUB0:
148
149
cmpwi T11,128
149
150
#else
150
151
andi. L, K, 255
152
+ cmpwi K,129
153
+ #endif
154
+ li T10,1
155
+ bne CMP8x16_128K
156
+ addi BO,BO,-32
157
+ addi AO,AO,-64
158
+ LOAD8x16 64 ,32
159
+ END8x16_WITHOUT_ADD
160
+ LOAD8x16_2O AO,BO, 128 , 64
161
+ mtctr T10
162
+ bl LSGEMM_L8x16_K128
163
+ b LSGEMM_L8x16_SAVE
164
+ CMP8x16_128K:
165
+ /*----------------------------------------*/
166
+ #if defined(TRMMKERNEL)
167
+ cmpwi T11,128
168
+ #else
151
169
cmpwi K,128
152
- #endif
153
-
154
- bne LSGEMM_L8x16_SUB2
155
- MY_ALIGN
156
- LSGEMM_L8x16_SUB2_128:
157
- bl LSGEMM_L8x16_L64_SUB
158
- bl LSGEMM_L8x16_L64_SUB
159
- b LSGEMM_L8x16_SAVE
170
+ #endif
171
+ bne LSGEMM_L8x16_SUB2
172
+ MY_ALIGN
173
+ mtctr T10
174
+ addi BO,BO,-64
175
+ addi AO,AO,-128
176
+ LOAD8x16_2O AO,BO, 128 ,64
177
+ bl LSGEMM_L8x16_K128
178
+ b LSGEMM_L8x16_SAVE
160
179
MY_ALIGN
161
180
LSGEMM_L8x16_SUB2:
162
181
andi. T10,L,64
@@ -176,21 +195,21 @@ LSGEMM_L8x16_SUB2_16:
176
195
LSGEMM_L8x16_SUB2_8:
177
196
andi. T10,L, 8
178
197
ble LSGEMM_L8x16_SUB2_4
179
- LOAD8x16_0
180
- KERNEL8x16_I1_L4_2 64 , 32 , 0 ,0
181
- KERNEL8x16_I1_L4_3 64 , 32 , 1 ,1
198
+ LOAD8x16_2
199
+ KERNEL8x16_I1_L4_2 128 , 64 , 0 ,0
200
+ KERNEL8x16_I1_L4_3 128 , 64 , 1 ,1
182
201
MY_ALIGN
183
202
LSGEMM_L8x16_SUB2_4:
184
203
andi. T10,L, 4
185
204
ble LSGEMM_L8x16_SUB2_2
186
- LOAD8x16_0
187
- KERNEL8x16_I1_L4_3 64 , 32 , 0 ,1
205
+ LOAD8x16_2
206
+ KERNEL8x16_I1_L4_3 128 , 64 , 0 ,1
188
207
MY_ALIGN
189
208
LSGEMM_L8x16_SUB2_2:
190
209
andi. T10,L, 2
191
210
ble LSGEMM_L8x16_SUB2_1
192
- LOAD8x16_0
193
- KERNEL8x16_I1_L2_3 64 , 32 , 0 ,1
211
+ LOAD8x16_2
212
+ KERNEL8x16_E2 128 , 64 , 0 ,1
194
213
MY_ALIGN
195
214
LSGEMM_L8x16_SUB2_1:
196
215
andi. T10,L, 1
0 commit comments