Skip to content

Commit 9ef96b3

Browse files
authored
Add multithreading support to the x86_64 zdot kernel (#2222)
* Add multithreading support copied from the ThunderX2T99 kernel. For #2221
1 parent b48c025 commit 9ef96b3

File tree

1 file changed

+72
-14
lines changed

1 file changed

+72
-14
lines changed

kernel/x86_64/zdot.c

Lines changed: 72 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -86,18 +86,26 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
8686

8787
#endif
8888

89-
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
89+
90+
#if defined(SMP)
/*
 * Threading driver used by the multithreaded path of this kernel; it is
 * only declared here, its definition lives outside this file.
 *
 * As used below: m carries the vector length, a/lda and b/ldb carry the
 * two input vectors with their strides, c receives the per-thread partial
 * results, function is the per-thread worker and nthreads the number of
 * threads requested.  n, k and ldc are passed as 0 by this file.
 */
extern int blas_level1_thread_with_return_value(
	int mode,
	BLASLONG m, BLASLONG n, BLASLONG k,
	void *alpha,
	void *a, BLASLONG lda,
	void *b, BLASLONG ldb,
	void *c, BLASLONG ldc,
	int (*function)(),
	int nthreads);
#endif
95+
96+
97+
98+
static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,OPENBLAS_COMPLEX_FLOAT *result)
9099
{
91100
BLASLONG i;
92101
BLASLONG ix,iy;
93102
FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ;
94-
103+
95104
if ( n <= 0 )
96105
{
97-
// CREAL(result) = 0.0 ;
98-
// CIMAG(result) = 0.0 ;
99-
OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
100-
return(result);
106+
OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
107+
*result=res;
108+
return;
101109

102110
}
103111

@@ -150,18 +158,68 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
150158
}
151159

152160
#if !defined(CONJ)
153-
OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]);
154-
// CREAL(result) = dot[0] - dot[1];
155-
// CIMAG(result) = dot[2] + dot[3];
161+
OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]);
156162
#else
157-
OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]);
158-
// CREAL(result) = dot[0] + dot[1];
159-
// CIMAG(result) = dot[2] - dot[3];
163+
OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]);
164+
#endif
165+
*result=res;
166+
return;
167+
}
160168

169+
#if defined(SMP)
170+
/*
 * Per-thread worker handed to blas_level1_thread_with_return_value().
 *
 * It forwards its slice (n elements of x and y with their strides) to
 * zdot_compute() and stores that slice's partial dot product through
 * `result`.  Always returns 0.
 *
 * The dummy parameters exist only so the argument list lines up with what
 * the threading driver passes.  This worker is registered with a
 * BLAS_COMPLEX mode, and a complex scalar is passed as two FLOAT values
 * (real, imaginary), so two FLOAT placeholders are required.  With a single
 * placeholder the code only works on ABIs that pass floating-point and
 * integer arguments in independent register files; on a positional ABI
 * (e.g. Microsoft x64) every argument after the scalar would be shifted by
 * one and `x` would receive garbage.
 * NOTE(review): confirm against the driver's complex-mode dispatch that the
 * scalar is indeed expanded to two FLOAT arguments.
 */
static int zdot_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1,
	FLOAT dummy2, FLOAT dummy2i,
	FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
	FLOAT *result, BLASLONG dummy3)
{
	/* `result` addresses this thread's slot in the caller's buffer. */
	zdot_compute(n, x, inc_x, y, inc_y, (void *)result);

	return 0;
}
161177
#endif
162178

163-
return(result);
179+
/*
 * Complex dot product entry point.
 *
 * Returns the dot product of the n-element complex vectors x and y (strides
 * inc_x / inc_y) as computed by zdot_compute(); whether the conjugated or
 * unconjugated form is produced is decided there by the CONJ macro.
 *
 * Without SMP the work is done by a single zdot_compute() call.  With SMP,
 * the call stays single-threaded when either stride is zero or n <= 10000;
 * otherwise the vectors are split across num_cpu_avail(1) threads, each
 * thread writes a partial result into its own slot of a local buffer, and
 * the slots are summed here.
 */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
#if defined(SMP)
	int nthreads;
	/*
	 * Placeholder scalar for the threading driver.  The mode passed below
	 * is complex, so provide storage for a full (real, imaginary) pair and
	 * give it a defined value instead of handing over the address of a
	 * single uninitialised FLOAT.
	 */
	FLOAT dummy_alpha[2] = { 0.0, 0.0 };
#endif
	OPENBLAS_COMPLEX_FLOAT zdot;

	CREAL(zdot) = 0.0;
	CIMAG(zdot) = 0.0;

#if defined(SMP)
	/* Small problems and zero strides are not worth (or not safe to) split. */
	if (inc_x == 0 || inc_y == 0 || n <= 10000)
		nthreads = 1;
	else
		nthreads = num_cpu_avail(1);

	if (nthreads == 1) {
		zdot_compute(n, x, inc_x, y, inc_y, &zdot);
	} else {
		int mode, i;
		/*
		 * One slot of 2 * sizeof(double) bytes per possible thread, the
		 * same size as before.  Declared as double rather than char so
		 * the storage is suitably aligned for the complex accesses made
		 * through it, and zero-filled so that a slot no thread wrote to
		 * contributes nothing to the sum.
		 */
		double result[MAX_CPU_NUMBER * 2] = { 0.0 };
		char *slot = (char *)result;
		OPENBLAS_COMPLEX_FLOAT *ptr;

#if !defined(DOUBLE)
		mode = BLAS_SINGLE | BLAS_COMPLEX;
#else
		mode = BLAS_DOUBLE | BLAS_COMPLEX;
#endif

		blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha,
			x, inc_x, y, inc_y, result, 0,
			(void *)zdot_thread_function, nthreads);

		/*
		 * Reduce the partial results.  The slot stride is always
		 * 2 * sizeof(double), independent of FLOAT.
		 */
		for (i = 0; i < nthreads; i++) {
			ptr = (OPENBLAS_COMPLEX_FLOAT *)slot;
			CREAL(zdot) = CREAL(zdot) + CREAL(*ptr);
			CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr);
			slot += sizeof(double) * 2;
		}
	}
#else
	zdot_compute(n, x, inc_x, y, inc_y, &zdot);
#endif

	return zdot;
}
167225

0 commit comments

Comments
 (0)