|
| 1 | +/******************************************************************************* |
| 2 | +Copyright (c) 2024, The OpenBLAS Project |
| 3 | +All rights reserved. |
| 4 | +Redistribution and use in source and binary forms, with or without |
| 5 | +modification, are permitted provided that the following conditions are |
| 6 | +met: |
| 7 | +1. Redistributions of source code must retain the above copyright |
| 8 | +notice, this list of conditions and the following disclaimer. |
| 9 | +2. Redistributions in binary form must reproduce the above copyright |
| 10 | +notice, this list of conditions and the following disclaimer in |
| 11 | +the documentation and/or other materials provided with the |
| 12 | +distribution. |
| 13 | +3. Neither the name of the OpenBLAS project nor the names of |
| 14 | +its contributors may be used to endorse or promote products |
| 15 | +derived from this software without specific prior written permission. |
| 16 | +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 17 | +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE |
| 20 | +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 21 | +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| 22 | +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| 23 | +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| 24 | +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE |
| 25 | +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 26 | +*******************************************************************************/ |
| 27 | +#define ASSEMBLER |
| 28 | + |
| 29 | +#include "common.h" |
| 30 | +#include "loongarch64_asm.S" |
| 31 | + |
| 32 | +/********************************************************************* |
| 33 | +* 2024/02/20 guxiwei |
| 34 | +* UTEST : OK |
| 35 | +* CTEST : OK |
| 36 | +* TEST : OK |
| 37 | +* |
| 38 | +* |
| 39 | +*********************************************************************/ |
| 40 | + |
| 41 | +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, |
| 42 | + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) |
| 43 | + */ |
/*
 * Register aliases.
 *
 * Incoming arguments, in the order of the CNAME prototype. inc_y does not
 * arrive in a register: the prologue loads it from the stack into $r6.
 * NOTE(review): $r6 is presumably the register the unused dummy1 argument
 * arrives in under the LoongArch calling convention - confirm.
 */
#define M           $r4     /* m */
#define N           $r5     /* n */
#define ALPHA_R     $f0     /* alpha_r */
#define ALPHA_I     $f1     /* alpha_i */
#define A           $r7     /* a */
#define LDA         $r8     /* lda   (converted to bytes in the prologue) */
#define X           $r9     /* x */
#define INC_X       $r10    /* inc_x (converted to bytes in the prologue) */
#define Y           $r11    /* y */
#define INC_Y       $r6     /* inc_y (stack argument, converted to bytes) */

/* Loop counters, scratch and the eight column pointers into A. */
#define J           $r12    /* column loop counter */
#define I           $r13    /* row loop counter */
#define K           $r14    /* scratch / row index */
#define Y_ORG       $r15    /* copy of the original y pointer */
#define OFFSET      $r16
#define K_LDA       $r17    /* byte step that moves a column pointer on */
#define M8          $r18    /* m * 8 (bytes in one column of m elements) */
#define T0          $r19    /* temporary address */
#define PA0         $r20
#define PA1         $r23
#define PA2         $r24
#define PA3         $r25
#define PA4         $r26
#define PA5         $r27
#define PA6         $r28
#define PA7         $r29

/* LASX vector registers. $xrN shares its low 64 bits with $fN. */
#define VALPHA      $xr1    /* (alpha_r, alpha_i) pair in every 64-bit lane */
#define X0          $xr2
#define X1          $xr3
#define X2          $xr4
#define X3          $xr5
#define X4          $xr6
#define X5          $xr7
#define X6          $xr8
#define X7          $xr9
#define Y0          $xr10
#define Y1          $xr11
#define A0          $xr12
#define A1          $xr13
#define A2          $xr14
#define A3          $xr15
#define A4          $xr16
#define A5          $xr17
#define A6          $xr18
#define A7          $xr19
#define A8          $xr20
#define A9          $xr21
#define A10         $xr22
#define A11         $xr23
#define A12         $xr24
#define A13         $xr25
#define A14         $xr26
#define A15         $xr27
#define TMP0        $xr28
#define TMP1        $xr29
#define TMP2        $xr30
| 102 | + |
/*
 * Translate the build-time switches XCONJ and CONJ into the 0/1 flags
 * GXCONJ and GCONJ that are handed to GCOMPLEXMUL / GCOMPLEXMADD.
 * The two flags are independent of each other.
 */
#if defined(XCONJ)
#define GXCONJ 1
#else
#define GXCONJ 0
#endif

#if defined(CONJ)
#define GCONJ 1
#else
#define GCONJ 0
#endif
| 120 | + |
/*
 * CLOAD_X_8 - contiguous x.
 * Reads the eight 8-byte elements at X + 0x00 .. X + 0x38 into X0..X7 and
 * then runs GCOMPLEXMUL on each of them together with VALPHA, writing the
 * result back to the same register. TMP0-TMP2 are scratch. X is not advanced.
 * NOTE(review): GLDREPL and GCOMPLEXMUL are defined in loongarch64_asm.S;
 * presumably each element is broadcast to every lane and multiplied by
 * alpha (with conjugation selected by GXCONJ) - confirm against that file.
 */
.macro CLOAD_X_8
    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
                   X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
    GCOMPLEXMUL GXCONJ, \
                xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
                        X1, X1, VALPHA, TMP0, TMP1, TMP2, \
                        X2, X2, VALPHA, TMP0, TMP1, TMP2, \
                        X3, X3, VALPHA, TMP0, TMP1, TMP2, \
                        X4, X4, VALPHA, TMP0, TMP1, TMP2, \
                        X5, X5, VALPHA, TMP0, TMP1, TMP2, \
                        X6, X6, VALPHA, TMP0, TMP1, TMP2, \
                        X7, X7, VALPHA, TMP0, TMP1, TMP2
.endm
| 134 | + |
/*
 * CLOAD_X_8_GAP - strided x.
 * Same job as the contiguous loader, but the eight elements are INC_X bytes
 * apart: T0 walks from X in steps of INC_X and each 64-bit element is
 * loaded and replicated across a vector register with xvldrepl.d.
 * Afterwards every Xk goes through GCOMPLEXMUL with VALPHA.
 * Clobbers T0 and TMP0-TMP2. X is not advanced.
 */
.macro CLOAD_X_8_GAP
    /* element 0 sits at X itself */
    xvldrepl.d  X0,     X,      0x00
    PTR_ADD     T0,     X,      INC_X
    xvldrepl.d  X1,     T0,     0x00
    PTR_ADD     T0,     T0,     INC_X
    xvldrepl.d  X2,     T0,     0x00
    PTR_ADD     T0,     T0,     INC_X
    xvldrepl.d  X3,     T0,     0x00
    PTR_ADD     T0,     T0,     INC_X
    xvldrepl.d  X4,     T0,     0x00
    PTR_ADD     T0,     T0,     INC_X
    xvldrepl.d  X5,     T0,     0x00
    PTR_ADD     T0,     T0,     INC_X
    xvldrepl.d  X6,     T0,     0x00
    PTR_ADD     T0,     T0,     INC_X
    xvldrepl.d  X7,     T0,     0x00

    GCOMPLEXMUL GXCONJ, \
                xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
                        X1, X1, VALPHA, TMP0, TMP1, TMP2, \
                        X2, X2, VALPHA, TMP0, TMP1, TMP2, \
                        X3, X3, VALPHA, TMP0, TMP1, TMP2, \
                        X4, X4, VALPHA, TMP0, TMP1, TMP2, \
                        X5, X5, VALPHA, TMP0, TMP1, TMP2, \
                        X6, X6, VALPHA, TMP0, TMP1, TMP2, \
                        X7, X7, VALPHA, TMP0, TMP1, TMP2
.endm
| 162 | + |
/*
 * CLOAD_Y_8 - contiguous y.
 * Fills Y0 from Y + 0 and Y1 from Y + 0x20 through the GLD helper
 * (two 32-byte vectors, i.e. eight 8-byte elements in total).
 */
.macro CLOAD_Y_8
    GLD xv, , Y0, Y, 0, Y1, Y, 0x20
.endm
| 166 | + |
/*
 * CLOAD_Y_8_GAP - strided y.
 * Gathers eight 8-byte elements spaced INC_Y bytes apart into Y0 / Y1.
 * The scalar loads land in FP registers that alias the low 64 bits of the
 * vector registers:
 *   $f10 = Y0 lane 0, $f13/$f14/$f15 = low part of A1/A2/A3
 *   $f11 = Y1 lane 0, $f17/$f18/$f19 = low part of A5/A6/A7
 * GINSVE0 then moves those A registers into lanes 1-3 of Y0 and Y1.
 * Clobbers T0, A1-A3 and A5-A7.
 */
.macro CLOAD_Y_8_GAP
    /* elements 0 and 1 */
    fld.d       $f10,   Y,      0
    fldx.d      $f13,   Y,      INC_Y
    /* T0 = Y + 2 * INC_Y: elements 2 and 3 */
    PTR_ALSL    T0,     INC_Y,  Y,      1
    fld.d       $f14,   T0,     0
    fldx.d      $f15,   T0,     INC_Y
    /* T0 = Y + 4 * INC_Y: elements 4 and 5 */
    PTR_ALSL    T0,     INC_Y,  Y,      2
    fld.d       $f11,   T0,     0
    fldx.d      $f17,   T0,     INC_Y
    /* T0 = Y + 6 * INC_Y: elements 6 and 7 */
    PTR_ADD     T0,     T0,     INC_Y
    PTR_ADD     T0,     T0,     INC_Y
    fld.d       $f18,   T0,     0
    fldx.d      $f19,   T0,     INC_Y
    GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3
.endm
| 182 | + |
/*
 * CSTORE_Y_8_GAP - strided y.
 * Scatters the four 64-bit lanes of Y0 followed by the four lanes of Y1 to
 * eight addresses starting at Y and spaced INC_Y bytes apart.
 * Clobbers T0. Y is left unchanged.
 */
.macro CSTORE_Y_8_GAP
    /* lanes 0-3 of Y0 */
    xvstelm.d   Y0,     Y,      0,      0
    PTR_ADD     T0,     Y,      INC_Y
    xvstelm.d   Y0,     T0,     0,      1
    PTR_ADD     T0,     T0,     INC_Y
    xvstelm.d   Y0,     T0,     0,      2
    PTR_ADD     T0,     T0,     INC_Y
    xvstelm.d   Y0,     T0,     0,      3

    /* lanes 0-3 of Y1 */
    PTR_ADD     T0,     T0,     INC_Y
    xvstelm.d   Y1,     T0,     0,      0
    PTR_ADD     T0,     T0,     INC_Y
    xvstelm.d   Y1,     T0,     0,      1
    PTR_ADD     T0,     T0,     INC_Y
    xvstelm.d   Y1,     T0,     0,      2
    PTR_ADD     T0,     T0,     INC_Y
    xvstelm.d   Y1,     T0,     0,      3
.endm
| 201 | + |
/*
 * CGEMV_N_8x8 - one 8-row by 8-column tile.
 * For every column pointer PA0..PA7 two vectors are loaded (A(2k) and
 * A(2k+1) for column k), then GCOMPLEXMADD is applied so that Y0 is updated
 * from Xk and A(2k), and Y1 from Xk and A(2k+1), for k = 0..7.
 * TMP0-TMP2 are scratch.
 * NOTE(review): GLD_INC is defined in loongarch64_asm.S; presumably it
 * advances the source pointer by 0x20 after each load, leaving every PAk
 * 0x40 bytes further on - confirm. GCOMPLEXMADD presumably computes
 * out = in0 * in1 + in2 on complex pairs, with conjugation chosen by
 * GXCONJ / GCONJ - confirm.
 */
.macro CGEMV_N_8x8
    GLD_INC xv, , 0x20,              \
            A0,  PA0, 0, A1,  PA0, 0, \
            A2,  PA1, 0, A3,  PA1, 0, \
            A4,  PA2, 0, A5,  PA2, 0, \
            A6,  PA3, 0, A7,  PA3, 0, \
            A8,  PA4, 0, A9,  PA4, 0, \
            A10, PA5, 0, A11, PA5, 0, \
            A12, PA6, 0, A13, PA6, 0, \
            A14, PA7, 0, A15, PA7, 0

    GCOMPLEXMADD GXCONJ, GCONJ, \
                 xvf, s, Y0, X0, A0,  Y0, TMP0, TMP1, TMP2, Y1, X0, A1,  Y1, TMP0, TMP1, TMP2, \
                         Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, Y1, X1, A3,  Y1, TMP0, TMP1, TMP2, \
                         Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, Y1, X2, A5,  Y1, TMP0, TMP1, TMP2, \
                         Y0, X3, A6,  Y0, TMP0, TMP1, TMP2, Y1, X3, A7,  Y1, TMP0, TMP1, TMP2, \
                         Y0, X4, A8,  Y0, TMP0, TMP1, TMP2, Y1, X4, A9,  Y1, TMP0, TMP1, TMP2, \
                         Y0, X5, A10, Y0, TMP0, TMP1, TMP2, Y1, X5, A11, Y1, TMP0, TMP1, TMP2, \
                         Y0, X6, A12, Y0, TMP0, TMP1, TMP2, Y1, X6, A13, Y1, TMP0, TMP1, TMP2, \
                         Y0, X7, A14, Y0, TMP0, TMP1, TMP2, Y1, X7, A15, Y1, TMP0, TMP1, TMP2
.endm
| 223 | + |
/*
 * CSTORE_Y_8 - contiguous y.
 * Writes Y0 to Y + 0 and Y1 to Y + 0x20 through the GST helper; the
 * counterpart of the contiguous eight-element y loader.
 */
.macro CSTORE_Y_8
    GST xv, , Y0, Y, 0, Y1, Y, 0x20
.endm
| 227 | + |
/*
 * CLOAD_X_1 - single x element.
 * Loads the 8-byte element at X into X0 via GLDREPL and runs GCOMPLEXMUL on
 * it with VALPHA, result in X0. TMP0-TMP2 are scratch.
 * Used for both contiguous and strided x, since only X + 0 is touched.
 */
.macro CLOAD_X_1
    GLDREPL xv, d, X0, X, 0x00
    GCOMPLEXMUL GXCONJ, \
                xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2
.endm
| 233 | + |
/*
 * CLOAD_Y_1 - single y element.
 * Loads the 8 bytes at Y into $f10, which is the low 64 bits of Y0.
 */
.macro CLOAD_Y_1
    fld.d       $f10,   Y,      0
.endm
| 237 | + |
/*
 * CGEMV_N_1x8 - one row across eight columns.
 * Loads one 8-byte element from each of PA0..PA7 into $f12, $f14, ... $f26
 * (the low 64 bits of A0, A2, ... A14) and then applies GCOMPLEXMADD to
 * update Y0 from Xk and A(2k) for k = 0..7. TMP0-TMP2 are scratch.
 * NOTE(review): GLD_INC presumably bumps each PAk by 0x08 after its load -
 * confirm in loongarch64_asm.S.
 */
.macro CGEMV_N_1x8
    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
                        $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 xvf, s, Y0, X0, A0,  Y0, TMP0, TMP1, TMP2, \
                         Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, \
                         Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, \
                         Y0, X3, A6,  Y0, TMP0, TMP1, TMP2, \
                         Y0, X4, A8,  Y0, TMP0, TMP1, TMP2, \
                         Y0, X5, A10, Y0, TMP0, TMP1, TMP2, \
                         Y0, X6, A12, Y0, TMP0, TMP1, TMP2, \
                         Y0, X7, A14, Y0, TMP0, TMP1, TMP2
.endm
| 251 | + |
/*
 * CSTORE_Y_1 - single y element.
 * Stores $f10 (the low 64 bits of Y0) to the 8 bytes at Y.
 */
.macro CSTORE_Y_1
    fst.d       $f10,   Y,      0
.endm
| 255 | + |
/*
 * CGEMV_N_1x1 - one row of one column.
 * Loads the 8-byte element at PA0 into $f12 (low 64 bits of A0), advances
 * PA0 by 8 bytes and applies GCOMPLEXMADD to update Y0 from X0 and A0.
 * TMP0-TMP2 are scratch.
 */
.macro CGEMV_N_1x1
    fld.d       $f12,   PA0,    0
    PTR_ADDI    PA0,    PA0,    0x08
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
.endm
| 262 | + |
/*
 * CGEMV_N_LASX - complete column-by-column driver for one stride variant.
 *
 * Parameters
 *   XW   tag embedded in every local label so that each expansion gets
 *        its own label set
 *   X_8  suffix of the CLOAD_ macro that loads eight x elements
 *   X_1  suffix of the CLOAD_ macro that loads one x element
 *   Y_8  suffix of the CLOAD_ / CSTORE_ macros that move eight y elements
 *   Y_1  suffix of the CLOAD_ / CSTORE_ macros that move one y element
 *
 * Expects, as set up by the function prologue: LDA, INC_X, INC_Y and M8 in
 * bytes, PA0..PA7 pointing at eight consecutive columns of A, Y_ORG holding
 * the start of y, and VALPHA initialised.
 *
 * Structure
 *   1. N / 8 passes over groups of eight columns. Each pass walks all M
 *      rows, eight at a time and then one at a time for the M % 8 rest.
 *   2. N % 8 remaining columns, one column and one row at a time.
 * Every path ends with a branch to .L_END.
 *
 * K is reset and incremented alongside the row loops but is not read
 * inside this macro.
 */
.macro CGEMV_N_LASX XW:req, X_8:req, X_1:req, Y_8:req, Y_1:req
    PTR_SRLI    J,      N,      3
    beqz        J,      .L_\XW\()_N_7
    /* K_LDA = 8 * LDA - M8: after M rows were consumed, this moves a
     * column pointer to the same row-0 position eight columns later */
    PTR_SLLI    K_LDA,  LDA,    3
    PTR_SUB     K_LDA,  K_LDA,  M8
.L_\XW\()_N_L8:
    CLOAD_\X_8
    xor         K,      K,      K
    move        Y,      Y_ORG
    PTR_SRLI    I,      M,      3
    beqz        I,      .L_\XW\()_M_7
.align 5
.L_\XW\()_M_L8:
    /* eight rows by eight columns */
    CLOAD_\Y_8
    CGEMV_N_8x8
    CSTORE_\Y_8
    PTR_ADDI    I,      I,      -1
    PTR_ALSL    Y,      INC_Y,  Y,      3
    PTR_ADDI    K,      K,      8
    bnez        I,      .L_\XW\()_M_L8
.L_\XW\()_M_7:
    andi        I,      M,      7
    beqz        I,      .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
    /* leftover rows: one row by eight columns */
    CLOAD_\Y_1
    CGEMV_N_1x8
    CSTORE_\Y_1
    PTR_ADDI    I,      I,      -1
    PTR_ADD     Y,      Y,      INC_Y
    PTR_ADDI    K,      K,      1
    bnez        I,      .L_\XW\()_M_L1
.L_\XW\()_M_END:
    PTR_ADDI    J,      J,      -1
    /* step all eight column pointers to the next group of columns;
     * only a 32-bit GRLEN selects the word-sized add */
#if __loongarch_grlen == 32
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
    /* X += 8 * INC_X */
    PTR_ALSL    X,      INC_X,  X,      3
    bnez        J,      .L_\XW\()_N_L8
.L_\XW\()_N_7:
    andi        J,      N,      7
    beqz        J,      .L_END
.L_\XW\()_N_L1:
    /* leftover columns, handled one at a time through PA0 */
    CLOAD_\X_1
    xor         K,      K,      K
    move        Y,      Y_ORG
    move        I,      M
    beqz        I,      .L_END
.align 5
.L_\XW\()_N_1_M_L1:
    CLOAD_\Y_1
    CGEMV_N_1x1
    CSTORE_\Y_1
    PTR_ADDI    I,      I,      -1
    PTR_ADD     Y,      Y,      INC_Y
    PTR_ADDI    K,      K,      1
    bnez        I,      .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
    PTR_ADDI    J,      J,      -1
    /* PA0 moved on by M8 bytes in the row loop; LDA - M8 reaches the
     * top of the next column */
    PTR_SUB     K_LDA,  LDA,    M8
    PTR_ADD     PA0,    PA0,    K_LDA
    PTR_ADD     X,      X,      INC_X
    bnez        J,      .L_\XW\()_N_L1

    b           .L_END
.endm
| 336 | + |
/*
 * Function body.
 *
 * 1. Fetch inc_y from the stack and save registers.
 * 2. Build a 2-bit selector: bit 1 = (inc_x != 1), bit 0 = (inc_y != 1).
 * 3. Convert lda, inc_x, inc_y and m to byte counts (shift left by 3).
 * 4. Broadcast (alpha_r, alpha_i) into VALPHA and set up PA0..PA7.
 * 5. Jump through a table of 16-bit offsets to one of four expansions of
 *    the CGEMV_N_LASX driver macro, one per stride combination.
 *
 * NOTE(review): push_if_used / pop_if_used are project macros; presumably
 * they spill and restore the callee-saved registers used here - confirm.
 */
    PROLOGUE
    /* inc_y is read at $sp + 0 before the register save below */
    PTR_LD      INC_Y,  $sp,    0
    push_if_used 17 + 7, 31
    PTR_ADDI    K,      $r0,    0x01
    PTR_SUB     I,      INC_X,  K
    PTR_SUB     J,      INC_Y,  K
    maskeqz     I,      K,      I   /* I = (inc_x == 1) ? 0 : 1 */
    maskeqz     J,      K,      J   /* J = (inc_y == 1) ? 0 : 1 */
    PTR_ALSL    I,      I,      J,      1   /* I = 2 * I + J */
    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
    /* VALPHA: interleave alpha_r ($f0) with alpha_i ($f1), then replicate
     * the low 64-bit element across the whole register */
    xvpackev.w  $xr0,   $xr1,   $xr0
    xvreplve0.d VALPHA, $xr0
    move        Y_ORG,  Y
    move        PA0,    A
    /* PA(k) = PA(k-1) + LDA; only a 32-bit GRLEN selects the word add */
#if __loongarch_grlen == 32
    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#else
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#endif
    la.local    T0,     .L_GAP_TABLE
    PTR_ALSL    I,      I,      T0,     1   /* address of table entry I */
    ld.h        K,      I,      0           /* offset relative to the table */
    PTR_ADD     T0,     T0,     K
    jirl        $r0,    T0,     0
.L_GAP_TABLE:
    .hword  .L_GAP_0_0 - .L_GAP_TABLE
    .hword  .L_GAP_0_1 - .L_GAP_TABLE
    .hword  .L_GAP_1_0 - .L_GAP_TABLE
    .hword  .L_GAP_1_1 - .L_GAP_TABLE
.L_GAP_0_0: /* inc_x == 1 and inc_y == 1 */
    CGEMV_N_LASX GAP_0_0, X_8, X_1, Y_8, Y_1
.L_GAP_0_1: /* inc_x == 1 and inc_y != 1 */
    CGEMV_N_LASX GAP_0_1, X_8, X_1, Y_8_GAP, Y_1
.L_GAP_1_0: /* inc_x != 1 and inc_y == 1 */
    CGEMV_N_LASX GAP_1_0, X_8_GAP, X_1, Y_8, Y_1
.L_GAP_1_1: /* inc_x != 1 and inc_y != 1 */
    CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1
.L_END:
    pop_if_used 17 + 7, 31
    jirl        $r0,    $r1,    0x0
    EPILOGUE
0 commit comments