Skip to content

Commit 8c3386b

Browse files
Added missing BLAS1 single-precision kernels {saxpy, caxpy, cdot, crot (refactored version of srot), isamax, isamin, icamax, icamin}.
Fixed idamin, icamin to choose the first-occurrence index among equal minima.
1 parent 28ca970 commit 8c3386b

File tree

11 files changed

+1802
-48
lines changed

11 files changed

+1802
-48
lines changed

kernel/power/KERNEL.POWER8

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
8989
#SMINKERNEL = ../arm/min.c
9090
#DMINKERNEL = ../arm/min.c
9191
#
92-
#ISAMAXKERNEL = ../arm/iamax.c
92+
ISAMAXKERNEL = isamax.c
9393
IDAMAXKERNEL = idamax.c
94-
#ICAMAXKERNEL = ../arm/izamax.c
95-
IZAMAXKERNEL = izamax.c
94+
ICAMAXKERNEL = icamax.c
95+
IZAMAXKERNEL = izamax.c
9696
#
97-
#ISAMINKERNEL = ../arm/iamin.c
98-
IDAMINKERNEL = idamin.c
99-
#ICAMINKERNEL = ../arm/izamin.c
97+
ISAMINKERNEL = isamin.c
98+
IDAMINKERNEL = idamin.c
99+
ICAMINKERNEL = icamin.c
100100
IZAMINKERNEL = izamin.c
101101
#
102102
#ISMAXKERNEL = ../arm/imax.c
@@ -110,9 +110,9 @@ DASUMKERNEL = dasum.c
110110
CASUMKERNEL = casum.c
111111
ZASUMKERNEL = zasum.c
112112
#
113-
#SAXPYKERNEL = ../arm/axpy.c
113+
SAXPYKERNEL = saxpy.c
114114
DAXPYKERNEL = daxpy.c
115-
#CAXPYKERNEL = ../arm/zaxpy.c
115+
CAXPYKERNEL = caxpy.c
116116
ZAXPYKERNEL = zaxpy.c
117117
#
118118
SCOPYKERNEL = scopy.c
@@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c
123123
SDOTKERNEL = sdot.c
124124
DDOTKERNEL = ddot.c
125125
DSDOTKERNEL = sdot.c
126-
#CDOTKERNEL = ../arm/zdot.c
126+
CDOTKERNEL = cdot.c
127127
ZDOTKERNEL = zdot.c
128128
#
129129
SNRM2KERNEL = ../arm/nrm2.c
@@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c
133133
#
134134
SROTKERNEL = srot.c
135135
DROTKERNEL = drot.c
136-
#CROTKERNEL = ../arm/zrot.c
136+
CROTKERNEL = crot.c
137137
ZROTKERNEL = zrot.c
138138
#
139139
SSCALKERNEL = sscal.c

kernel/power/caxpy.c

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
/*
2+
Copyright (c) 2013-2018, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#include "common.h"
29+
30+
31+
#ifndef HAVE_ASM_KERNEL
32+
#include <altivec.h>
33+
/*
 * caxpy_kernel_16 - SIMD inner loop for y += alpha * x on complex data.
 *
 * n                 loop-bound input: the loop visits n/2 four-float vectors,
 *                   eight per iteration, and has no remainder handling (the
 *                   caller in this file passes a multiple of 16).
 * x, y              reinterpreted as arrays of __vector float; y is updated
 *                   in place.  Assumes FLOAT is float and that real/imaginary
 *                   parts are interleaved -- TODO confirm against common.h.
 * alpha_r, alpha_i  real and imaginary parts of the scalar multiplier.
 */
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
34+
{
35+
36+
/* Plain product: per (re, im) pair, re += ar*xr - ai*xi and
   im += ar*xi + ai*xr.  The minus sign lives in the even lanes of valpha_i. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
37+
38+
register __vector float valpha_r = {alpha_r, alpha_r,alpha_r, alpha_r};
39+
register __vector float valpha_i = {-alpha_i, alpha_i,-alpha_i, alpha_i};
40+
41+
/* Conjugated-x product: per pair, re += ar*xr + ai*xi and
   im += -ar*xi + ai*xr.  The minus sign lives in the odd lanes of valpha_r. */
#else
42+
register __vector float valpha_r = {alpha_r, -alpha_r,alpha_r, -alpha_r};
43+
register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i};
44+
#endif
45+
46+
/* Byte permutation that exchanges the two 4-byte lanes inside each 8-byte
   pair, turning (a, b, c, d) into (b, a, d, c). */
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
47+
register __vector float *vy = (__vector float *) y;
48+
register __vector float *vx = (__vector float *) x;
49+
/* i indexes four-float vectors, not individual elements. */
BLASLONG i=0;
50+
for (; i < n/2; i += 8) {
51+
52+
/* Load eight vectors of y and eight vectors of x. */
register __vector float vy_0 = vy[i];
53+
register __vector float vy_1 = vy[i + 1];
54+
register __vector float vy_2 = vy[i + 2];
55+
register __vector float vy_3 = vy[i + 3];
56+
register __vector float vy_4 = vy[i + 4];
57+
register __vector float vy_5 = vy[i + 5];
58+
register __vector float vy_6 = vy[i + 6];
59+
register __vector float vy_7 = vy[i + 7];
60+
register __vector float vx_0 = vx[i];
61+
register __vector float vx_1 = vx[i + 1];
62+
register __vector float vx_2 = vx[i + 2];
63+
register __vector float vx_3 = vx[i + 3];
64+
register __vector float vx_4 = vx[i + 4];
65+
register __vector float vx_5 = vx[i + 5];
66+
register __vector float vx_6 = vx[i + 6];
67+
register __vector float vx_7 = vx[i + 7];
68+
/* First half of the product: x in its original lane order times valpha_r. */
vy_0 += vx_0*valpha_r;
69+
vy_1 += vx_1*valpha_r;
70+
vy_2 += vx_2*valpha_r;
71+
vy_3 += vx_3*valpha_r;
72+
vy_4 += vx_4*valpha_r;
73+
vy_5 += vx_5*valpha_r;
74+
vy_6 += vx_6*valpha_r;
75+
vy_7 += vx_7*valpha_r;
76+
/* Swap adjacent lanes of x so each lane sees its partner component. */
vx_0 = vec_perm(vx_0, vx_0, swap_mask);
77+
vx_1 = vec_perm(vx_1, vx_1, swap_mask);
78+
vx_2 = vec_perm(vx_2, vx_2, swap_mask);
79+
vx_3 = vec_perm(vx_3, vx_3, swap_mask);
80+
vx_4 = vec_perm(vx_4, vx_4, swap_mask);
81+
vx_5 = vec_perm(vx_5, vx_5, swap_mask);
82+
vx_6 = vec_perm(vx_6, vx_6, swap_mask);
83+
vx_7 = vec_perm(vx_7, vx_7, swap_mask);
84+
/* Second half of the product: lane-swapped x times valpha_i. */
vy_0 += vx_0*valpha_i;
85+
vy_1 += vx_1*valpha_i;
86+
vy_2 += vx_2*valpha_i;
87+
vy_3 += vx_3*valpha_i;
88+
vy_4 += vx_4*valpha_i;
89+
vy_5 += vx_5*valpha_i;
90+
vy_6 += vx_6*valpha_i;
91+
vy_7 += vx_7*valpha_i;
92+
/* Store the eight updated vectors back into y. */
vy[i] = vy_0;
93+
vy[i + 1] = vy_1;
94+
vy[i + 2] = vy_2;
95+
vy[i + 3] = vy_3;
96+
vy[i + 4] = vy_4;
97+
vy[i + 5] = vy_5 ;
98+
vy[i + 6] = vy_6 ;
99+
vy[i + 7] = vy_7 ;
100+
101+
}
102+
}
103+
#endif
104+
/*
 * Complex AXPY entry point: y += alpha * x, or y += alpha * conj(x) when
 * CONJ is defined, with alpha = da_r + i*da_i.
 *
 * n             number of complex elements; n <= 0 returns immediately.
 * da_r, da_i    real and imaginary parts of alpha.
 * x, inc_x      input vector and its stride in complex elements; each
 *               element is read as x[k] (real) and x[k + 1] (imaginary).
 * y, inc_y      vector updated in place, same layout and stride convention.
 * dummy0, dummy1, dummy, dummy2   not referenced in the body.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) {
105+
BLASLONG i = 0;
106+
BLASLONG ix = 0, iy = 0;
107+
if (n <= 0) return (0);
108+
/* Unit-stride path: SIMD kernel for the bulk, scalar loop for the tail. */
if ((inc_x == 1) && (inc_y == 1)) {
109+
/* Largest multiple of 16 that does not exceed n. */
BLASLONG n1 = n & -16;
110+
if (n1) {
111+
caxpy_kernel_16(n1, x, y, da_r,da_i);
112+
/* Two FLOATs per complex element: skip what the kernel already handled. */
ix = 2 * n1;
113+
}
114+
i = n1;
115+
/* Scalar tail for the remaining n - n1 elements; x and y share index ix. */
/* NOTE(review): this loop selects on CONJ only, while caxpy_kernel_16 also
   tests XCONJ; the two agree only if XCONJ is never defined for this
   kernel -- confirm against the build flags. */
while (i < n) {
116+
#if !defined(CONJ)
117+
y[ix] += (da_r * x[ix] - da_i * x[ix + 1]);
118+
y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
119+
#else
120+
y[ix] += (da_r * x[ix] + da_i * x[ix + 1]);
121+
y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
122+
#endif
123+
i++;
124+
ix += 2;
125+
}
126+
return (0);
127+
128+
}
129+
/* General-stride path: convert strides from complex elements to FLOATs. */
inc_x *= 2;
130+
inc_y *= 2;
131+
while (i < n) {
132+
#if !defined(CONJ)
133+
y[iy] += (da_r * x[ix] - da_i * x[ix + 1]);
134+
y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]);
135+
#else
136+
y[iy] += (da_r * x[ix] + da_i * x[ix + 1]);
137+
y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]);
138+
#endif
139+
ix += inc_x;
140+
iy += inc_y;
141+
i++;
142+
}
143+
return (0);
144+
}
145+

kernel/power/cdot.c

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
/*Copyright (c) 2013-2018, The OpenBLAS Project
2+
All rights reserved.
3+
Redistribution and use in source and binary forms, with or without
4+
modification, are permitted provided that the following conditions are
5+
met:
6+
1. Redistributions of source code must retain the above copyright
7+
notice, this list of conditions and the following disclaimer.
8+
2. Redistributions in binary form must reproduce the above copyright
9+
notice, this list of conditions and the following disclaimer in
10+
the documentation and/or other materials provided with the
11+
distribution.
12+
3. Neither the name of the OpenBLAS project nor the names of
13+
its contributors may be used to endorse or promote products
14+
derived from this software without specific prior written permission.
15+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
19+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
24+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
*****************************************************************************/
26+
27+
#include "common.h"
28+
29+
#ifndef HAVE_KERNEL_8
30+
#include <altivec.h>
31+
/*
 * cdot_kernel_8 - SIMD accumulation of the four partial sums of a complex
 * dot product.
 *
 * n      loop-bound input: the loop visits n/2 four-float vectors, four per
 *        iteration, and has no remainder handling (the caller in this file
 *        passes a multiple of 8).
 * x, y   reinterpreted as arrays of __vector float.  Assumes FLOAT is float
 *        and that real/imaginary parts are interleaved -- TODO confirm.
 * dot    output, four floats, overwritten (not accumulated into):
 *          dot[0] = sum of even-lane x * even-lane y
 *          dot[1] = sum of odd-lane  x * odd-lane  y
 *          dot[2] = sum of even-lane x * odd-lane  y
 *          dot[3] = sum of odd-lane  x * even-lane y
 */
static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
32+
{
33+
/* Byte permutation that exchanges the two 4-byte lanes inside each 8-byte
   pair, turning (a, b, c, d) into (b, a, d, c). */
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
34+
register __vector float *vy = (__vector float *) y;
35+
register __vector float *vx = (__vector float *) x;
36+
/* i indexes four-float vectors, not individual elements. */
BLASLONG i = 0;
37+
/* vd_*  accumulate x * y lane by lane;
   vdd_* accumulate x * (lane-swapped y).
   Four independent accumulators of each kind are kept and summed after the loop. */
register __vector float vd_0 = { 0 };
38+
register __vector float vd_1 = { 0 };
39+
register __vector float vd_2 = { 0 };
40+
register __vector float vd_3 = { 0 };
41+
register __vector float vdd_0 = { 0 };
42+
register __vector float vdd_1 = { 0 };
43+
register __vector float vdd_2 = { 0 };
44+
register __vector float vdd_3 = { 0 };
45+
for (; i < n/2; i += 4) {
46+
47+
register __vector float vyy_0 ;
48+
register __vector float vyy_1 ;
49+
register __vector float vyy_2 ;
50+
register __vector float vyy_3 ;
51+
52+
register __vector float vy_0 = vy[i];
53+
register __vector float vy_1 = vy[i + 1];
54+
register __vector float vy_2 = vy[i + 2];
55+
register __vector float vy_3 = vy[i + 3];
56+
register __vector float vx_0= vx[i];
57+
register __vector float vx_1 = vx[i + 1];
58+
register __vector float vx_2 = vx[i + 2];
59+
register __vector float vx_3 = vx[i + 3];
60+
/* Lane-swapped copies of y, used for the cross terms. */
vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
61+
vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
62+
vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
63+
vyy_3 = vec_perm(vy_3, vy_3, swap_mask);
64+
65+
/* Same-lane products. */
vd_0 += vx_0 * vy_0;
66+
vd_1 += vx_1 * vy_1;
67+
vd_2 += vx_2 * vy_2;
68+
vd_3 += vx_3 * vy_3;
69+
70+
/* Cross-lane products. */
vdd_0 += vx_0 * vyy_0;
71+
vdd_1 += vx_1 * vyy_1;
72+
vdd_2 += vx_2 * vyy_2;
73+
vdd_3 += vx_3 * vyy_3;
74+
75+
76+
}
77+
//aggregate
/* Collapse the four accumulators of each kind into one vector. */
78+
vd_0 = vd_0 + vd_1 +vd_2 +vd_3;
79+
vdd_0= vdd_0 + vdd_1 +vdd_2 +vdd_3;
80+
//reverse and aggregate
/* vec_xxpermdi(v, v, 2) is used here to swap the two 64-bit halves of v, so
   after the add lanes 0 and 1 hold lane0+lane2 and lane1+lane3 respectively. */
81+
vd_1=vec_xxpermdi(vd_0,vd_0,2) ;
82+
vdd_1=vec_xxpermdi(vdd_0,vdd_0,2);
83+
vd_2=vd_0+vd_1;
84+
vdd_2=vdd_0+vdd_1;
85+
86+
/* Only lanes 0 and 1 are read out; they already contain the full sums. */
dot[0]=vd_2[0];
87+
dot[1]=vd_2[1];
88+
dot[2]=vdd_2[0];
89+
dot[3]=vdd_2[1];
90+
91+
}
92+
#endif
93+
94+
95+
/*
 * Complex dot product entry point.
 *
 * n          number of complex elements; n <= 0 yields a result whose real
 *            and imaginary parts are both 0.0.
 * x, inc_x   first vector and its stride in complex elements; each element
 *            is read as x[k] (real) and x[k + 1] (imaginary).
 * y, inc_y   second vector, same layout and stride convention.
 *
 * Four partial sums are accumulated in dot[]:
 *   dot[0] = sum xr*yr, dot[1] = sum xi*yi,
 *   dot[2] = sum xr*yi, dot[3] = sum xi*yr.
 * Without CONJ the result is (dot[0] - dot[1]) + i*(dot[2] + dot[3]),
 * i.e. sum x*y; with CONJ it is (dot[0] + dot[1]) + i*(dot[2] - dot[3]),
 * i.e. sum conj(x)*y.
 */
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
96+
BLASLONG i = 0;
97+
BLASLONG ix=0, iy=0;
98+
OPENBLAS_COMPLEX_FLOAT result;
99+
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
100+
101+
if (n <= 0) {
102+
CREAL(result) = 0.0;
103+
CIMAG(result) = 0.0;
104+
return (result);
105+
106+
}
107+
108+
/* Unit-stride path: SIMD kernel for the bulk, scalar loop for the tail. */
if ((inc_x == 1) && (inc_y == 1)) {
109+
110+
/* Largest multiple of 8 that does not exceed n. */
BLASLONG n1 = n & -8;
111+
BLASLONG j=0;
112+
113+
if (n1){
114+
/* The kernel overwrites dot[0..3] with the sums over the first n1 elements. */
cdot_kernel_8(n1, x, y, dot);
115+
i = n1;
116+
/* FLOAT offset of the first element the kernel did not process. */
j = n1 <<1;
117+
}
118+
119+
120+
/* Scalar tail: add the remaining n - n1 elements onto the kernel's sums. */
while (i < n) {
121+
122+
dot[0] += x[j] * y[j];
123+
dot[1] += x[j + 1] * y[j + 1];
124+
dot[2] += x[j] * y[j + 1];
125+
dot[3] += x[j + 1] * y[j];
126+
127+
j += 2;
128+
i++;
129+
130+
}
131+
132+
133+
} else {
134+
/* General-stride path: fully scalar. */
i = 0;
135+
ix = 0;
136+
iy = 0;
137+
/* Convert strides from complex elements to FLOATs. */
inc_x <<= 1;
138+
inc_y <<= 1;
139+
while (i < n) {
140+
141+
dot[0] += x[ix] * y[iy];
142+
dot[1] += x[ix + 1] * y[iy + 1];
143+
dot[2] += x[ix] * y[iy + 1];
144+
dot[3] += x[ix + 1] * y[iy];
145+
146+
ix += inc_x;
147+
iy += inc_y;
148+
i++;
149+
150+
}
151+
}
152+
153+
/* Combine the four partial sums into the complex result. */
#if !defined(CONJ)
154+
CREAL(result) = dot[0] - dot[1];
155+
CIMAG(result) = dot[2] + dot[3];
156+
#else
157+
CREAL(result) = dot[0] + dot[1];
158+
CIMAG(result) = dot[2] - dot[3];
159+
160+
#endif
161+
162+
return (result);
163+
164+
}

0 commit comments

Comments
 (0)