/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
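
/* RVV (RISC-V Vector) kernel for a complex Hermitian matrix-vector update
   (chemv/zhemv style): for each of the first `offset` columns j it adds
   alpha*x[j] times the sub-diagonal part of column j to y, and adds alpha
   times the matching column dot product with x to y[j]; the HEMVREV switch
   flips the conjugation order used for the opposite storage. */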

#include "common.h"
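/* Map the generic vector macros onto the RVV intrinsics for the selected
   precision: e32 (single) or e64 (double), LMUL=4 for the working vectors
   and LMUL=1 for the reduction result. */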
#if !defined(DOUBLE)
#define VSETVL(n) __riscv_vsetvl_e32m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
#define FLOAT_V_T vfloat32m4_t
#define FLOAT_V_T_M1 vfloat32m1_t
#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu
#else
#define VSETVL(n) __riscv_vsetvl_e64m4(n)
#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
#define FLOAT_V_T vfloat64m4_t
#define FLOAT_V_T_M1 vfloat64m1_t
#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu
#endif

int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
    BLASLONG i, j, k;
    BLASLONG ix, iy, ia;
    BLASLONG jx, jy, ja;
    FLOAT temp_r1, temp_i1;
    FLOAT temp_r2, temp_i2;
    FLOAT *a_ptr = a;
    unsigned int gvl = 0;
    FLOAT_V_T_M1 v_res, v_z0;
    gvl = VSETVL_MAX;
    v_res = VFMVVF_FLOAT_M1(0, gvl);
    v_z0 = VFMVVF_FLOAT_M1(0, gvl);

    FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1;
    BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, len, lda2;

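    /* Strides for the strided load/store intrinsics are in bytes; each
       complex element occupies two consecutive FLOATs (real, imaginary). */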
    BLASLONG inc_x2 = incx * 2;
    BLASLONG inc_y2 = incy * 2;
    stride_x = inc_x2 * sizeof(FLOAT);
    stride_y = inc_y2 * sizeof(FLOAT);
    stride_a = 2 * sizeof(FLOAT);
    lda2 = lda * 2;

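    /* Loop over the first `offset` columns of A: temp1 = alpha * x[j] is applied
       to the part of column j below the diagonal, while the matching column dot
       product with x is collected for the update of y[j]. */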
    jx = 0;
    jy = 0;
    ja = 0;
    for(j = 0; j < offset; j++){
        temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];
        temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx];
        temp_r2 = 0;
        temp_i2 = 0;
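        /* Diagonal term: only a_ptr[ja], the real part of the diagonal entry,
           is used, since the diagonal of a Hermitian matrix is real. */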
        y[jy] += temp_r1 * a_ptr[ja];
        y[jy+1] += temp_i1 * a_ptr[ja];
        ix = jx + inc_x2;
        iy = jy + inc_y2;
        ia = ja + 2;
        i = j + 1;
        len = m - i;
        if(len > 0){
            gvl = VSETVL(len);
            inc_xv = incx * gvl * 2;
            inc_yv = incy * gvl * 2;
            inc_av = gvl * 2;
            vr0 = VFMVVF_FLOAT(0, gvl);
            vr1 = VFMVVF_FLOAT(0, gvl);
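            /* Stripmined loop over the sub-diagonal part of column j: load A(i,j)
               and y(i) with a stride, update y(i) += temp1 * A(i,j) (conjugation
               order swapped under HEMVREV), and accumulate the real/imaginary
               parts of the column dot product with x into vr0/vr1. */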
            for(k = 0; k < len / gvl; k++){
                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
#ifndef HEMVREV
                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
#else
                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
#endif
                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);

                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
#ifndef HEMVREV
                vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl);
                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
                vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl);
                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
#else
                vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl);
                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
                vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl);
                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
#endif
                i += gvl;
                ix += inc_xv;
                iy += inc_yv;
                ia += inc_av;
            }

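            /* Remainder of fewer than gvl elements: accumulate with the
               tail-undisturbed (_TU) intrinsics so the reduction below, taken
               over gvl elements, also picks up these partial sums. */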
            if(i < m){
                unsigned int gvl_rem = VSETVL(m-i);
                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem);
                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem);
                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem);
                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem);
#ifndef HEMVREV
                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
#else
                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
#endif
                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem);
                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem);

                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem);
                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem);
#ifndef HEMVREV
                vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
                vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
                vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
                vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
#else
                vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
                vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
                vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
                vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
#endif
            }
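            /* Reduce the accumulated partial sums to the scalar column dot product. */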
            v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
            temp_r2 = VFMVFS_FLOAT(v_res);
            v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
            temp_i2 = VFMVFS_FLOAT(v_res);
        }
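        /* Fold alpha times the column dot product into y[j], then advance to the
           next column of A and the next elements of x and y. */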
        y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2;
        y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2;
        jx += inc_x2;
        jy += inc_y2;
        ja += 2;
        a_ptr += lda2;
    }
    return(0);
}