@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
32
33
33
#define LOAD ld
34
34
#define STACKSIZE (512 )
35
-
35
+ #define FLINK_SAVE (STACKSIZE+ 16 ) /* 16($r12): slot where the prologue stores the link register (copied through r0) and the epilogue reloads it; since SP has already been lowered by STACKSIZE, this is 16 bytes above the SP value on entry */
36
36
#define M r3
37
37
#define N r4
38
38
#define K r5
@@ -91,7 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
91
91
PROFCODE
92
92
93
93
addi SP, SP, -STACKSIZE
94
- li r0, 0
94
+ mflr r0
95
+
95
96
96
97
stfd f14, 0 (SP)
97
98
stfd f15, 8 (SP)
@@ -137,92 +138,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
137
138
std r14, 280 (SP)
138
139
139
140
140
- stxv v20 , 288 (SP)
141
- stxv v21 , 304 (SP)
142
- stxv v22 , 320 (SP)
143
- stxv v23 , 336 (SP)
144
- stxv v24 , 352 (SP)
145
- stxv v25 , 368 (SP)
146
- stxv v26 , 384 (SP)
147
- stxv v27 , 400 (SP)
148
- stxv v28 , 416 (SP)
149
- stxv v29 , 432 (SP)
150
- stxv v30 , 448 (SP)
151
- stxv v31 , 464 (SP)
152
-
141
+ stxv vs52 , 288 (SP)
142
+ stxv vs53 , 304 (SP)
143
+ stxv vs54 , 320 (SP)
144
+ stxv vs55 , 336 (SP)
145
+ stxv vs56 , 352 (SP)
146
+ stxv vs57 , 368 (SP)
147
+ stxv vs58 , 384 (SP)
148
+ stxv vs59 , 400 (SP)
149
+ stxv vs60 , 416 (SP)
150
+ stxv vs61 , 432 (SP)
151
+ stxv vs62 , 448 (SP)
152
+ stxv vs63 , 464 (SP)
153
+ std r0, FLINK_SAVE(SP)
153
154
154
155
155
156
#if defined(TRMMKERNEL)
156
157
ld OFFSET, FRAMESLOT(0 ) + STACKSIZE(SP)
157
158
#endif
158
159
slwi LDC, LDC, 2
159
160
160
-
161
- /* cmpwi cr0, M, 0
162
- ble .L999_H1
163
- cmpwi cr0, N, 0
164
- ble .L999_H1
165
- cmpwi cr0, K, 0
166
- ble .L999_H1
167
- */
168
161
169
162
170
163
/* alpha arrives in f1 (vs1): convert it to single precision and splat it into every word of alpha_r */
171
- xscvdpspn alpha_r,vs1
172
- xxspltw alpha_r,alpha_r,0
173
-
164
+ xscvdpspn alpha_r,vs1
165
+ xxspltw alpha_r,alpha_r,0
174
166
175
167
/*load reverse permute mask for big endian
176
168
uint128 = 0x0c0d0e0f08090a0b0405060700010203
177
169
*/
178
170
179
171
lis T2, perm_const2@highest
180
- ori T2, T2, perm_const2@higher
181
- rldicr T2, T2, 32 , 31
182
- oris T2, T2, perm_const2@h
183
- ori T2, T2, perm_const2@l
184
-
185
172
lis T1, perm_const1@highest
173
+ lis T3, save_permute_12@highest
174
+ lis T4, save_permute_11@highest
175
+ lis T5, save_permute_22@highest
176
+ lis T6, save_permute_21@highest
177
+ ori T2, T2, perm_const2@higher
186
178
ori T1, T1, perm_const1@higher
179
+ ori T3, T3, save_permute_12@higher
180
+ ori T4, T4, save_permute_11@higher
181
+ ori T5, T5, save_permute_22@higher
182
+ ori T6, T6, save_permute_21@higher
183
+ rldicr T2, T2, 32 , 31
187
184
rldicr T1, T1, 32 , 31
185
+ rldicr T3, T3, 32 , 31
186
+ rldicr T4, T4, 32 , 31
187
+ rldicr T5, T5, 32 , 31
188
+ rldicr T6, T6, 32 , 31
189
+ oris T2, T2, perm_const2@h
188
190
oris T1, T1, perm_const1@h
191
+ oris T3, T3, save_permute_12@h
192
+ oris T4, T4, save_permute_11@h
193
+ oris T5, T5, save_permute_22@h
194
+ oris T6, T6, save_permute_21@h
195
+ ori T2, T2, perm_const2@l
189
196
ori T1, T1, perm_const1@l
190
-
197
+ ori T3, T3, save_permute_12@l
198
+ ori T4, T4, save_permute_11@l
199
+ ori T5, T5, save_permute_22@l
200
+ ori T6, T6, save_permute_21@l
201
+ li r0,0
191
202
mtvsrdd permute_mask,T2,T1
192
-
193
- lis T2, save_permute_12@highest
194
- ori T2, T2, save_permute_12@higher
195
- rldicr T2, T2, 32 , 31
196
- oris T2, T2, save_permute_12@h
197
- ori T2, T2, save_permute_12@l
198
-
199
- lis T1, save_permute_11@highest
200
- ori T1, T1, save_permute_11@higher
201
- rldicr T1, T1, 32 , 31
202
- oris T1, T1, save_permute_11@h
203
- ori T1, T1, save_permute_11@l
204
-
205
- mtvsrdd save_permute_1,T2,T1
206
-
207
- lis T2, save_permute_22@highest
208
- ori T2, T2, save_permute_22@higher
209
- rldicr T2, T2, 32 , 31
210
- oris T2, T2, save_permute_22@h
211
- ori T2, T2, save_permute_22@l
212
-
213
- lis T1, save_permute_21@highest
214
- ori T1, T1, save_permute_21@higher
215
- rldicr T1, T1, 32 , 31
216
- oris T1, T1, save_permute_21@h
217
- ori T1, T1, save_permute_21@l
218
-
219
- mtvsrdd save_permute_2,T2,T1
203
+ mtvsrdd save_permute_1,T3,T4
204
+ mtvsrdd save_permute_2,T5,T6
220
205
221
206
#include "sgemm_logic_power9.S"
222
207
223
- .L999:
224
- addi r3, 0 , 0
225
-
208
+ .L999:
226
209
lfd f14, 0 (SP)
227
210
lfd f15, 8 (SP)
228
211
lfd f16, 16 (SP)
@@ -264,23 +247,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
264
247
ld r16, 264 (SP)
265
248
ld r15, 272 (SP)
266
249
ld r14, 280 (SP)
267
-
268
- lxv v20, 288 (SP)
269
- lxv v21, 304 (SP)
270
- lxv v22, 320 (SP)
271
- lxv v23, 336 (SP)
272
- lxv v24, 352 (SP)
273
- lxv v25, 368 (SP)
274
- lxv v26, 384 (SP)
275
- lxv v27, 400 (SP)
276
- lxv v28, 416 (SP)
277
- lxv v29, 432 (SP)
278
- lxv v30, 448 (SP)
279
- lxv v31, 464 (SP)
280
250
251
+ ld r0, FLINK_SAVE(SP)
281
252
282
- addi SP, SP, STACKSIZE
253
+ lxv vs52, 288 (SP)
254
+ lxv vs53, 304 (SP)
255
+ lxv vs54, 320 (SP)
256
+ lxv vs55, 336 (SP)
257
+ lxv vs56, 352 (SP)
258
+ lxv vs57, 368 (SP)
259
+ lxv vs58, 384 (SP)
260
+ lxv vs59, 400 (SP)
261
+ mtlr r0
262
+ lxv vs60, 416 (SP)
263
+ lxv vs61, 432 (SP)
264
+ lxv vs62, 448 (SP)
265
+ lxv vs63, 464 (SP)
266
+
267
+ addi SP, SP, STACKSIZE
283
268
blr
284
269
270
+
285
271
EPILOGUE
286
272
#endif
0 commit comments