Skip to content

Commit 2619ad7

Browse files
BLAS1 microkernels can be inlined by gcc. Refactoring (symbolic operand
names). Some fixes and tunings.
1 parent 87669d1 commit 2619ad7

File tree

18 files changed

+1593
-1800
lines changed

18 files changed

+1593
-1800
lines changed

kernel/zarch/dasum.c

Lines changed: 14 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -39,19 +39,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3939

4040

4141
static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
42-
FLOAT asum ;
42+
FLOAT asum ;
4343
__asm__ (
44-
"pfd 1, 0(%3) \n\t"
45-
"sllg %%r0,%2,3 \n\t"
46-
"agr %%r0,%3 \n\t"
44+
"pfd 1, 0(%[ptr_x]) \n\t"
45+
"sllg %%r0,%[n],3 \n\t"
46+
"agr %%r0,%[ptr_x] \n\t"
4747
"vzero %%v0 \n\t"
4848
"vzero %%v1 \n\t"
4949
"vzero %%v2 \n\t"
5050
"vzero %%v3 \n\t"
5151
".align 16 \n\t"
52-
"1: \n\t"
53-
"pfd 1, 256(%1 ) \n\t"
54-
"vlm %%v24,%%v31, 0(%1 ) \n\t"
52+
"1: \n\t"
53+
"pfd 1, 256(%[ptr_temp] ) \n\t"
54+
"vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t"
5555

5656
"vflpdb %%v24, %%v24 \n\t"
5757
"vflpdb %%v25, %%v25 \n\t"
@@ -71,7 +71,7 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
7171
"vfadb %%v2,%%v2,%%v30 \n\t"
7272
"vfadb %%v3,%%v3,%%v31 \n\t"
7373

74-
"vlm %%v24,%%v31, 128(%1) \n\t"
74+
"vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t"
7575

7676
"vflpdb %%v24, %%v24 \n\t"
7777
"vflpdb %%v25, %%v25 \n\t"
@@ -81,7 +81,7 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
8181
"vflpdb %%v29, %%v29 \n\t"
8282
"vflpdb %%v30, %%v30 \n\t"
8383
"vflpdb %%v31, %%v31 \n\t"
84-
"la %1,256(%1) \n\t"
84+
"la %[ptr_temp],256(%[ptr_temp]) \n\t"
8585
"vfadb %%v0,%%v0,%%v24 \n\t"
8686
"vfadb %%v1,%%v1,%%v25 \n\t"
8787
"vfadb %%v2,%%v2,%%v26 \n\t"
@@ -91,16 +91,16 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) {
9191
"vfadb %%v2,%%v2,%%v30 \n\t"
9292
"vfadb %%v3,%%v3,%%v31 \n\t"
9393

94-
"clgrjl %1,%%r0,1b \n\t"
94+
"clgrjl %[ptr_temp],%%r0,1b \n\t"
9595
"vfadb %%v24,%%v0,%%v1 \n\t"
9696
"vfadb %%v25,%%v2,%%v3 \n\t"
9797
"vfadb %%v0,%%v25,%%v24 \n\t"
9898
"vrepg %%v1,%%v0,1 \n\t"
9999
"adbr %%f0,%%f1 \n\t"
100-
"ldr %0,%%f0 \n\t"
101-
: "=f"(asum),"+&a"(x)
102-
: "r"(n), "1"(x)
103-
: "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
100+
"ldr %[asum],%%f0 \n\t"
101+
: [asum] "=f"(asum),[ptr_temp] "+&a"(x)
102+
: [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x)
103+
: "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
104104
);
105105
return asum;
106106

kernel/zarch/daxpy.c

Lines changed: 43 additions & 246 deletions
Original file line number | Diff line number | Diff line change
@@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727

2828

2929
#include "common.h"
30-
#define Z13_D 1
30+
3131
#define PREFETCH_INS 1
3232
#if defined(Z13_A)
3333
#include <vecintrin.h>
3434

35-
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
35+
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
3636
{
3737
BLASLONG i = 0;
38-
__vector double v_a = {*alpha,*alpha};
38+
__vector double v_a = {alpha,alpha};
3939
__vector double * v_y=(__vector double *)y;
4040
__vector double * v_x=(__vector double *)x;
4141

@@ -60,256 +60,53 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
6060
}
6161

6262
}
63-
#elif defined(Z13_B)
64-
static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
65-
{
66-
67-
68-
__asm__ volatile(
69-
#if defined(PREFETCH_INS)
70-
"pfd 1, 0(%1) \n\t"
71-
"pfd 2, 0(%2) \n\t"
72-
#endif
73-
"vlrepg %%v0 , 0(%3) \n\t"
74-
"srlg %3,%0,5 \n\t"
75-
"xgr %%r1,%%r1 \n\t"
76-
"vlr %%v1,%%v0 \n\t"
77-
".align 16 \n\t"
78-
"1: \n\t"
79-
#if defined(PREFETCH_INS)
80-
"pfd 1, 256(%%r1,%1) \n\t"
81-
"pfd 2, 256(%%r1,%2) \n\t"
82-
#endif
83-
84-
"vl %%v24, 0(%%r1,%2) \n\t"
85-
"vl %%v16, 0(%%r1,%1) \n\t"
86-
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
87-
"vst %%v16, 0(%%r1,%2) \n\t"
88-
"vl %%v25, 16(%%r1,%2) \n\t"
89-
"vl %%v17, 16(%%r1,%1) \n\t"
90-
"vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
91-
"vst %%v17, 16(%%r1,%2) \n\t"
92-
"vl %%v26, 32(%%r1,%2) \n\t"
93-
"vl %%v18, 32(%%r1,%1) \n\t"
94-
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
95-
"vst %%v18, 32(%%r1,%2) \n\t"
96-
"vl %%v27, 48(%%r1,%2) \n\t"
97-
"vl %%v19, 48(%%r1,%1) \n\t"
98-
"vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
99-
"vst %%v19, 48(%%r1,%2) \n\t"
100-
101-
"vl %%v24,( 0+64)(%%r1,%2) \n\t"
102-
"vl %%v16,( 0+64)(%%r1,%1) \n\t"
103-
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
104-
"vst %%v16,( 0+64)(%%r1,%2) \n\t"
105-
"vl %%v25, (16+64)(%%r1,%2) \n\t"
106-
"vl %%v17, (16+64)(%%r1,%1) \n\t"
107-
"vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
108-
"vst %%v17, (16+64)(%%r1,%2) \n\t"
109-
"vl %%v26, (32+64)(%%r1,%2) \n\t"
110-
"vl %%v18, (32+64)(%%r1,%1) \n\t"
111-
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
112-
"vst %%v18, (32+64)(%%r1,%2) \n\t"
113-
"vl %%v27, (48+64)(%%r1,%2) \n\t"
114-
"vl %%v19, (48+64)(%%r1,%1) \n\t"
115-
"vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
116-
"vst %%v19, (48+64)(%%r1,%2) \n\t"
117-
118-
"vl %%v24,( 0+128)(%%r1,%2) \n\t"
119-
"vl %%v16,( 0+128)(%%r1,%1) \n\t"
120-
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
121-
"vst %%v16,( 0+128)(%%r1,%2) \n\t"
122-
"vl %%v25, (16+128)(%%r1,%2) \n\t"
123-
"vl %%v17, (16+128)(%%r1,%1) \n\t"
124-
"vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
125-
"vst %%v17, (16+128)(%%r1,%2) \n\t"
126-
"vl %%v26, (32+128)(%%r1,%2) \n\t"
127-
"vl %%v18, (32+128)(%%r1,%1) \n\t"
128-
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
129-
"vst %%v18, (32+128)(%%r1,%2) \n\t"
130-
"vl %%v27, (48+128)(%%r1,%2) \n\t"
131-
"vl %%v19, (48+128)(%%r1,%1) \n\t"
132-
"vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
133-
"vst %%v19, (48+128)(%%r1,%2) \n\t"
134-
135-
"vl %%v24,( 0+192)(%%r1,%2) \n\t"
136-
"vl %%v16,( 0+192)(%%r1,%1) \n\t"
137-
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
138-
"vst %%v16,( 0+192)(%%r1,%2) \n\t"
139-
"vl %%v25, (16+192)(%%r1,%2) \n\t"
140-
"vl %%v17, (16+192)(%%r1,%1) \n\t"
141-
"vfmadb %%v17,%%v0,%%v17,%%v25 \n\t"
142-
"vst %%v17, (16+192)(%%r1,%2) \n\t"
143-
"vl %%v26, (32+192)(%%r1,%2) \n\t"
144-
"vl %%v18, (32+192)(%%r1,%1) \n\t"
145-
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
146-
"vst %%v18, (32+192)(%%r1,%2) \n\t"
147-
"vl %%v27, (48+192)(%%r1,%2) \n\t"
148-
"vl %%v19, (48+192)(%%r1,%1) \n\t"
149-
"vfmadb %%v19,%%v0,%%v19,%%v27 \n\t"
150-
"vst %%v19, (48+192)(%%r1,%2) \n\t"
151-
152-
153-
"la %%r1,256(%%r1) \n\t"
154-
"brctg %3,1b"
155-
:
156-
:"r"(n),"a"(x),"a"(y),"a"(alpha)
157-
:"cc", "memory", "r1" ,"v0" ,"v16","v17","v18","v19", "v24","v25","v26","v27"
158-
);
159-
}
160-
161-
#elif defined(Z13_C)
162-
static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
163-
{
164-
165-
__asm__ volatile(
166-
#if defined(PREFETCH_INS)
167-
"pfd 1, 0(%1) \n\t"
168-
"pfd 2, 0(%2) \n\t"
169-
#endif
170-
"vlrepg %%v0 , 0(%3) \n\t"
171-
"srlg %3,%0,5 \n\t"
172-
"xgr %%r1,%%r1 \n\t"
173-
"vlr %%v1,%%v0 \n\t"
174-
".align 16 \n\t"
175-
"1: \n\t"
176-
#if defined(PREFETCH_INS)
177-
"pfd 1, 256(%%r1,%1) \n\t"
178-
"pfd 2, 256(%%r1,%2) \n\t"
179-
#endif
180-
"vl %%v16, 0(%%r1,%1) \n\t"
181-
"vl %%v17, 16(%%r1,%1) \n\t"
182-
"vl %%v18, 32(%%r1,%1) \n\t"
183-
"vl %%v19, 48(%%r1,%1) \n\t"
184-
185-
"vl %%v24, 0(%%r1,%2) \n\t"
186-
"vl %%v25, 16(%%r1,%2) \n\t"
187-
"vl %%v26, 32(%%r1,%2) \n\t"
188-
"vl %%v27, 48(%%r1,%2) \n\t"
189-
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
190-
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
191-
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
192-
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
193-
"vst %%v16, 0(%%r1,%2) \n\t"
194-
"vst %%v17, 16(%%r1,%2) \n\t"
195-
"vst %%v18, 32(%%r1,%2) \n\t"
196-
"vst %%v19, 48(%%r1,%2) \n\t"
197-
198-
"vl %%v24, 64(%%r1,%1) \n\t"
199-
"vl %%v25, 80(%%r1,%1) \n\t"
200-
"vl %%v26, 96(%%r1,%1) \n\t"
201-
"vl %%v27, 112(%%r1,%1) \n\t"
202-
203-
"vl %%v16, 64(%%r1,%2) \n\t"
204-
"vl %%v17, 80(%%r1,%2) \n\t"
205-
"vl %%v18, 96(%%r1,%2) \n\t"
206-
"vl %%v19, 112(%%r1,%2) \n\t"
207-
208-
209-
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
210-
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
211-
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
212-
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
213-
214-
"vst %%v24, 64(%%r1,%2) \n\t"
215-
"vst %%v25, 80(%%r1,%2) \n\t"
216-
"vst %%v26, 96(%%r1,%2) \n\t"
217-
"vst %%v27, 112(%%r1,%2) \n\t"
218-
219-
"vl %%v16, (0+128)(%%r1,%1) \n\t"
220-
"vl %%v17, (16+128)(%%r1,%1) \n\t"
221-
"vl %%v18, (32+128)(%%r1,%1) \n\t"
222-
"vl %%v19, (48+128)(%%r1,%1) \n\t"
223-
224-
"vl %%v24, (0+128)(%%r1,%2) \n\t"
225-
"vl %%v25, (16+128)(%%r1,%2) \n\t"
226-
"vl %%v26, (32+128)(%%r1,%2) \n\t"
227-
"vl %%v27, (48+128)(%%r1,%2) \n\t"
228-
229-
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
230-
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
231-
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
232-
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
233-
"vst %%v16, (0+128)(%%r1,%2) \n\t"
234-
"vst %%v17, (16+128)(%%r1,%2) \n\t"
235-
"vst %%v18, (32+128)(%%r1,%2) \n\t"
236-
"vst %%v19, (48+128)(%%r1,%2) \n\t"
237-
238-
"vl %%v24, (64+128)(%%r1,%1) \n\t"
239-
"vl %%v25, (80+128)(%%r1,%1) \n\t"
240-
"vl %%v26, (96+128)(%%r1,%1) \n\t"
241-
"vl %%v27, (112+128)(%%r1,%1) \n\t"
242-
243-
"vl %%v16, (64+128)(%%r1,%2) \n\t"
244-
"vl %%v17, (80+128)(%%r1,%2) \n\t"
245-
"vl %%v18, (96+128)(%%r1,%2) \n\t"
246-
"vl %%v19, (112+128)(%%r1,%2) \n\t"
247-
248-
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
249-
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
250-
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
251-
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
252-
253-
"vst %%v24, (64+128)(%%r1,%2) \n\t"
254-
"vst %%v25, (80+128)(%%r1,%2) \n\t"
255-
"vst %%v26, (96+128)(%%r1,%2) \n\t"
256-
"vst %%v27, (112+128)(%%r1,%2) \n\t"
257-
258-
"la %%r1,256(%%r1) \n\t"
259-
"brctg %3,1b"
260-
:
261-
:"r"(n),"a"(x),"a"(y),"a"(alpha)
262-
:"cc", "memory", "r1" ,"v0","v1","v16","v17","v18","v19", "v24","v25","v26","v27"
263-
);
264-
}
265-
266-
267-
#elif defined(Z13_D)
268-
static void __attribute__ ((noinline)) daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
63+
#else
64+
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
26965
{
27066

27167
__asm__ volatile(
27268
#if defined(PREFETCH_INS)
273-
"pfd 1, 0(%1) \n\t"
274-
"pfd 2, 0(%2) \n\t"
275-
#endif
276-
"vlrepg %%v0 , 0(%3) \n\t"
277-
"srlg %3,%0,5 \n\t"
278-
"vlr %%v1,%%v0 \n\t"
69+
"pfd 1, 0(%[x_tmp]) \n\t"
70+
"pfd 2, 0(%[y_tmp]) \n\t"
71+
#endif
72+
"lgdr %%r0,%[alpha] \n\t"
73+
"vlvgp %%v0,%%r0,%%r0 \n\t"
74+
"srlg %%r0,%[n],5 \n\t"
75+
"vlr %%v1,%%v0 \n\t"
27976
".align 16 \n\t"
28077
"1: \n\t"
28178
#if defined(PREFETCH_INS)
282-
"pfd 1, 256(%1) \n\t"
283-
"pfd 2, 256(%2) \n\t"
79+
"pfd 1, 256(%[x_tmp]) \n\t"
80+
"pfd 2, 256(%[y_tmp]) \n\t"
28481
#endif
285-
"vlm %%v16,%%v23, 0(%1) \n\t"
286-
"vlm %%v24, %%v31, 0(%2) \n\t"
287-
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
288-
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
289-
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
290-
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
291-
"vfmadb %%v20,%%v0,%%v20,%%v28 \n\t"
292-
"vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
293-
"vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
294-
"vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
295-
"vstm %%v16,%%v23, 0(%2) \n\t"
296-
"vlm %%v24,%%v31, 128(%1) \n\t"
297-
"vlm %%v16,%%v23, 128(%2) \n\t"
298-
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
299-
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
300-
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
301-
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
302-
"vfmadb %%v28,%%v0,%%v28,%%v20 \n\t"
303-
"vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
304-
"vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
305-
"vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
306-
"la %1,256(%1) \n\t"
307-
"vstm %%v24, %%v31, 128(%2) \n\t"
308-
"la %2,256(%2) \n\t"
309-
"brctg %3,1b"
310-
:
311-
:"r"(n),"a"(x),"a"(y),"a"(alpha)
312-
:"cc", "memory", "v0","v1","v16","v17","v18","v19","v20","v21",
82+
"vlm %%v16,%%v23, 0(%[x_tmp]) \n\t"
83+
"vlm %%v24, %%v31, 0(%[y_tmp]) \n\t"
84+
"vfmadb %%v16,%%v0,%%v16,%%v24 \n\t"
85+
"vfmadb %%v17,%%v1,%%v17,%%v25 \n\t"
86+
"vfmadb %%v18,%%v0,%%v18,%%v26 \n\t"
87+
"vfmadb %%v19,%%v1,%%v19,%%v27 \n\t"
88+
"vfmadb %%v20,%%v0,%%v20,%%v28 \n\t"
89+
"vfmadb %%v21,%%v1,%%v21,%%v29 \n\t"
90+
"vfmadb %%v22,%%v0,%%v22,%%v30 \n\t"
91+
"vfmadb %%v23,%%v1,%%v23,%%v31 \n\t"
92+
"vstm %%v16,%%v23, 0(%[y_tmp]) \n\t"
93+
"vlm %%v24,%%v31, 128(%[x_tmp]) \n\t"
94+
"vlm %%v16,%%v23, 128(%[y_tmp]) \n\t"
95+
"vfmadb %%v24,%%v0,%%v24,%%v16 \n\t"
96+
"vfmadb %%v25,%%v1,%%v25,%%v17 \n\t"
97+
"vfmadb %%v26,%%v0,%%v26,%%v18 \n\t"
98+
"vfmadb %%v27,%%v1,%%v27,%%v19 \n\t"
99+
"vfmadb %%v28,%%v0,%%v28,%%v20 \n\t"
100+
"vfmadb %%v29,%%v1,%%v29,%%v21 \n\t"
101+
"vfmadb %%v30,%%v0,%%v30,%%v22 \n\t"
102+
"vfmadb %%v31,%%v1,%%v31,%%v23 \n\t"
103+
"la %[x_tmp],256(%[x_tmp]) \n\t"
104+
"vstm %%v24, %%v31, 128(%[y_tmp]) \n\t"
105+
"la %[y_tmp],256(%[y_tmp]) \n\t"
106+
"brctg %%r0,1b"
107+
: [mem_y] "+m" (*(double (*)[n])y), [x_tmp] "+&a"(x), [y_tmp] "+&a"(y)
108+
: [mem_x] "m" (*(const double (*)[n])x), [n] "r"(n), [alpha] "f"(alpha)
109+
:"cc", "r0", "v0","v1","v16","v17","v18","v19","v20","v21",
313110
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
314111
);
315112

@@ -334,7 +131,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
334131
BLASLONG n1 = n & -32;
335132

336133
if ( n1 )
337-
daxpy_kernel_32(n1, x, y , &da );
134+
daxpy_kernel_32(n1, x, y , da );
338135

339136
i = n1;
340137
while(i < n)

0 commit comments

Comments
 (0)