Skip to content

Commit b766c1e

Browse files
committed
Improve the performance of zasum and casum with AVX512 intrinsic
1 parent ff16329 commit b766c1e

File tree

5 files changed

+980
-0
lines changed

5 files changed

+980
-0
lines changed

kernel/x86_64/KERNEL.SKYLAKEX

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,6 @@ ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c
2727

2828
CSCALKERNEL = ../arm/zscal.c
2929
ZSCALKERNEL = ../arm/zscal.c
30+
31+
CASUMKERNEL = casum.c
32+
ZASUMKERNEL = zasum.c

kernel/x86_64/casum.c

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
#include "common.h"
2+
3+
#ifndef ABS_K
4+
/* Absolute value of `a`: expands to (a) when a > 0 and to -(a) otherwise.
 * The argument is evaluated twice (once in the comparison, once in the
 * selected branch), so it should be an expression without side effects. */
#define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
5+
#endif
6+
7+
#if defined(SKYLAKEX)
8+
#include "casum_microk_skylakex-2.c"
9+
#endif
10+
11+
#ifndef HAVE_CASUM_KERNEL
12+
/*
 * Portable fallback kernel for the unit-stride case, compiled only when no
 * HAVE_CASUM_KERNEL implementation has been provided.
 *
 * Returns the sum of ABS_K() over the 2*n consecutive FLOATs starting at x1,
 * i.e. over n (first, second) pairs -- the real and imaginary parts of n
 * complex elements.
 *
 * The main loop consumes 8 FLOATs (4 pairs) per pass into four independent
 * accumulators and covers the first (n & -8) pairs; the tail loop then
 * handles the remaining pairs one at a time.
 */
static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
{
    BLASLONG i = 0;
    BLASLONG n_8 = n & -8;      /* n rounded down to a multiple of 8 */
    FLOAT *x = x1;              /* read cursor shared by both loops */
    FLOAT temp0, temp1, temp2, temp3;
    FLOAT temp4, temp5, temp6, temp7;
    FLOAT sum0 = 0.0;
    FLOAT sum1 = 0.0;
    FLOAT sum2 = 0.0;
    FLOAT sum3 = 0.0;
    FLOAT sum4 = 0.0;

    while (i < n_8) {
        temp0 = ABS_K(x[0]);
        temp1 = ABS_K(x[1]);
        temp2 = ABS_K(x[2]);
        temp3 = ABS_K(x[3]);
        temp4 = ABS_K(x[4]);
        temp5 = ABS_K(x[5]);
        temp6 = ABS_K(x[6]);
        temp7 = ABS_K(x[7]);

        sum0 += temp0;
        sum1 += temp1;
        sum2 += temp2;
        sum3 += temp3;

        sum0 += temp4;
        sum1 += temp5;
        sum2 += temp6;
        sum3 += temp7;

        x += 8;     /* 8 FLOATs ... */
        i += 4;     /* ... are 4 pairs */
    }

    /*
     * Continue from x, which the loop above has already advanced past every
     * value it summed. Reading from x1 here would re-add the first elements
     * of the vector and never touch the trailing ones.
     */
    while (i < n) {
        sum4 += (ABS_K(x[0]) + ABS_K(x[1]));
        x += 2;
        i++;
    }

    return sum0+sum1+sum2+sum3+sum4;
}
58+
59+
#endif
60+
61+
/*
 * Single-threaded absolute-value sum over n pairs of FLOATs taken from x.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0. With inc_x == 1 the work is handed
 * to casum_kernel(); otherwise consecutive pairs are read 2 * inc_x FLOATs
 * apart and ABS_K() of both members of each pair is accumulated.
 */
static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT total = 0.0;
    BLASLONG pair_stride;
    BLASLONG offset;
    BLASLONG k;

    /* Nothing to sum for an empty vector or a non-positive increment. */
    if (n <= 0 || inc_x <= 0) {
        return total;
    }

    if (inc_x == 1) {
        total = casum_kernel(n, x);
        return total;
    }

    /* Strided case: each step skips inc_x pairs, i.e. 2 * inc_x FLOATs. */
    pair_stride = 2 * inc_x;
    offset = 0;
    for (k = 0; k < n; k++) {
        total += ABS_K(x[offset]) + ABS_K(x[offset + 1]);
        offset += pair_stride;
    }

    return total;
}
84+
85+
#if defined(SMP)
86+
/*
 * Worker callback passed to the level-1 threading driver by CNAME.
 *
 * Runs asum_compute() on the (n, x, inc_x) it is handed and stores the value
 * through `result`. The dummy0..dummy5 parameters are not used here; they
 * only pad the parameter list. Always returns 0.
 */
static int asum_thread_function(BLASLONG n,
    BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
    FLOAT *x, BLASLONG inc_x,
    FLOAT *dummy3, BLASLONG dummy4,
    FLOAT *result, BLASLONG dummy5)
{
    FLOAT partial = asum_compute(n, x, inc_x);

    *result = partial;
    return 0;
}
95+
96+
/*
 * Prototype of the level-1 threading driver that CNAME calls in its
 * multi-threaded branch; the implementation lives outside this file.
 *
 * The declared name must match the call site exactly
 * (blas_level1_thread_with_return_value); a prototype under any other name
 * would leave the real call without this declaration.
 */
extern int blas_level1_thread_with_return_value(int mode,
    BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
    void *a, BLASLONG lda,
    void *b, BLASLONG ldb,
    void *c, BLASLONG ldc,
    int (*function)(),
    int nthread);
103+
#endif
104+
105+
/*
 * Exported entry point: returns the absolute-value sum computed by
 * asum_compute() for (n, x, inc_x).
 *
 * Without SMP this is a direct call to asum_compute(). With SMP, inputs with
 * n > 10000 and inc_x > 0 may be split across threads: the thread count is
 * the smaller of num_cpu_avail(1) and n / 10000, and the per-thread partial
 * sums are added together afterwards.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    FLOAT total = 0.0;

#if defined(SMP)
    FLOAT dummy_alpha[2];
    int nthreads = 1;
    int num_cpu = num_cpu_avail(1);

    /* Use more than one thread only for large inputs with a positive stride. */
    if (n > 10000 && inc_x > 0) {
        nthreads = num_cpu < n/10000 ? num_cpu : n/10000;
    }

    if (nthreads != 1) {
        /* Scratch area for the partial sums, one slot per possible thread. */
        char result[MAX_CPU_NUMBER * sizeof(double) *2];
        int mode;
        int t;

#if defined(DOUBLE)
        mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
        mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif

        blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x,
            NULL, 0, result, 0, (void *)asum_thread_function, nthreads);

        /*
         * One FLOAT is read per thread, at a spacing of 2 * sizeof(double)
         * bytes. NOTE(review): presumably this matches how the threading
         * driver places each worker's `result` pointer -- confirm there.
         */
        for (t = 0; t < nthreads; t++) {
            total += *(FLOAT *)(result + t * (sizeof(double) * 2));
        }
    }
    else {
        total = asum_compute(n, x, inc_x);
    }
#else
    total = asum_compute(n, x, inc_x);
#endif

    return total;
}

0 commit comments

Comments
 (0)