/***************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#include "common.h"

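/*
 * Apparently the alpha-scaling transpose copy used by the omatcopy-style
 * interfaces: each stride-lda source line is scaled by alpha and stored
 * transposed across stride-ldb destination lines. An AVX inline-assembly
 * path does the copy in blocked strips; the plain C version at the bottom
 * of the file is the fallback.
 */
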
#ifdef HAVE_AVX

#define COLS_OF_BLOCK 384
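/* The column counter dimension is processed in panels of at most
   COLS_OF_BLOCK entries, presumably so that one panel's source and
   destination lines stay cache resident; 384 is a tuning constant, not a
   hardware limit. */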

/* +r: %0 = src, %1 = dst, %2 = src_ld (bytes), %3 = dst_ld (bytes), %4 = dst_tmp */
/* m: %5 = num_cols, %6 = alpha */
/* xmm15 = alpha */
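/* TRANS_4x4 is the classic 4x4 float transpose: unpcklps/unpckhps
   interleave row pairs, then unpcklpd/unpckhpd merge the halves, leaving
   the transposed rows back in a1..a4 (t1..t4 are scratch). */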
#define TRANS_4x4(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\
 "vunpcklps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t1_no"; vunpckhps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t2_no";"\
 "vunpcklps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t3_no"; vunpckhps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t4_no";"\
 "vunpcklpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a1_no"; vunpckhpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a2_no";"\
 "vunpcklpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a3_no"; vunpckhpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a4_no";"

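/* Same shuffle network on ymm registers: the unpack instructions operate
   within each 128-bit lane, so this transposes two 4x4 blocks at once,
   one per lane; SAVE_4x8 below writes the two lanes to the right rows. */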
#define TRANS_4x8(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\
 "vunpcklps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t1_no"; vunpckhps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t2_no";"\
 "vunpcklps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t3_no"; vunpckhps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t4_no";"\
 "vunpcklpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a1_no"; vunpckhpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a2_no";"\
 "vunpcklpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a3_no"; vunpckhpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a4_no";"

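/* SAVE_4x4 writes four 4-float results to consecutive destination lines
   through the dst_tmp cursor (%4), stepping by the destination stride
   (%3); SAVE_4x8 then spills the upper 128-bit lanes with vextractf128
   to the next four lines. */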
#define SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\
 "vmovups %%xmm"#b1_no",(%4); vmovups %%xmm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\
 "vmovups %%xmm"#b3_no",(%4); vmovups %%xmm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;"

#define SAVE_4x8(b1_no,b2_no,b3_no,b4_no) SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\
 "vextractf128 $1,%%ymm"#b1_no",(%4); vextractf128 $1,%%ymm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\
 "vextractf128 $1,%%ymm"#b3_no",(%4); vextractf128 $1,%%ymm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;"

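/* Each COPY_<w>x<n> macro transposes one <w>-by-<n> tile: it loads <n>
   contiguous floats from each of <w> stride-%2 source lines, scales by
   alpha, transposes, and stores <w> contiguous floats to each of <n>
   stride-%3 destination lines, advancing dst (%1) by <w> floats. */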
#define COPY_4x16 "movq %1,%4; addq $16,%1;"\
 "vmulps (%0),%%ymm15,%%ymm0; vmulps 32(%0),%%ymm15,%%ymm4; vmulps (%0,%2,1),%%ymm15,%%ymm1; vmulps 32(%0,%2,1),%%ymm15,%%ymm5; leaq (%0,%2,2),%0;"\
 "vmulps (%0),%%ymm15,%%ymm2; vmulps 32(%0),%%ymm15,%%ymm6; vmulps (%0,%2,1),%%ymm15,%%ymm3; vmulps 32(%0,%2,1),%%ymm15,%%ymm7; leaq (%0,%2,2),%0;"\
 TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3)\
 TRANS_4x8(4,5,6,7,8,9,10,11) SAVE_4x8(4,5,6,7)

#define COPY_4x8 "movq %1,%4; addq $16,%1;"\
 "vmulps (%0),%%ymm15,%%ymm0; vmulps (%0,%2,1),%%ymm15,%%ymm1; leaq (%0,%2,2),%0;"\
 "vmulps (%0),%%ymm15,%%ymm2; vmulps (%0,%2,1),%%ymm15,%%ymm3; leaq (%0,%2,2),%0;"\
 TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3)

#define COPY_4x4 "movq %1,%4; addq $16,%1;"\
 "vmulps (%0),%%xmm15,%%xmm0; vmulps (%0,%2,1),%%xmm15,%%xmm1; leaq (%0,%2,2),%0;"\
 "vmulps (%0),%%xmm15,%%xmm2; vmulps (%0,%2,1),%%xmm15,%%xmm3; leaq (%0,%2,2),%0;"\
 TRANS_4x4(0,1,2,3,8,9,10,11) SAVE_4x4(0,1,2,3)

#define COPY_4x2 \
 "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\
 "vmovsd (%0),%%xmm1; vmovhpd (%0,%2,1),%%xmm1,%%xmm1; vmulps %%xmm15,%%xmm1,%%xmm1; leaq (%0,%2,2),%0;"\
 "vpermilps $216,%%xmm0,%%xmm0; vpermilps $216,%%xmm1,%%xmm1; vunpcklpd %%xmm1,%%xmm0,%%xmm2; vunpckhpd %%xmm1,%%xmm0,%%xmm3;"\
 "vmovups %%xmm2,(%1); vmovups %%xmm3,(%1,%3,1); addq $16,%1;"

#define COPY_4x1 \
 "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\
 "vinsertps $32,(%0),%%xmm0,%%xmm0; vinsertps $48,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\
 "vmulps %%xmm15,%%xmm0,%%xmm0; vmovups %%xmm0,(%1); addq $16,%1;"

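/* SAVE_2x4 interleaves two partially transposed lines with
   unpcklps/unpckhps, applies the alpha scaling, and scatters two floats
   to each of four destination lines with vmovsd/vmovhpd. */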
#define SAVE_2x4(c1_no,c2_no,t1_no,t2_no) \
 "vunpcklps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t1_no"; vmulps %%xmm15,%%xmm"#t1_no",%%xmm"#t1_no";"\
 "vmovsd %%xmm"#t1_no",(%4); vmovhpd %%xmm"#t1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\
 "vunpckhps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t2_no"; vmulps %%xmm15,%%xmm"#t2_no",%%xmm"#t2_no";"\
 "vmovsd %%xmm"#t2_no",(%4); vmovhpd %%xmm"#t2_no",(%4,%3,1); leaq (%4,%3,2),%4;"

#define COPY_2x16 "movq %1,%4; addq $8,%1;"\
 "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm2; vmovups (%0,%2,1),%%ymm1; vmovups 32(%0,%2,1),%%ymm3; leaq (%0,%2,2),%0;"\
 "vextractf128 $1,%%ymm0,%%xmm4; vextractf128 $1,%%ymm2,%%xmm6; vextractf128 $1,%%ymm1,%%xmm5; vextractf128 $1,%%ymm3,%%xmm7;"\
 SAVE_2x4(0,1,8,9) SAVE_2x4(4,5,8,9) SAVE_2x4(2,3,8,9) SAVE_2x4(6,7,8,9)

#define COPY_2x8 "movq %1,%4; addq $8,%1;"\
 "vmovups (%0),%%ymm0; vmovups (%0,%2,1),%%ymm1; leaq (%0,%2,2),%0;"\
 "vextractf128 $1,%%ymm0,%%xmm2; vextractf128 $1,%%ymm1,%%xmm3;"\
 SAVE_2x4(0,1,4,5) SAVE_2x4(2,3,4,5)

#define COPY_2x4 "movq %1,%4; addq $8,%1;"\
 "vmovups (%0),%%xmm0; vmovups (%0,%2,1),%%xmm1; leaq (%0,%2,2),%0;"\
 SAVE_2x4(0,1,4,5)

#define COPY_2x2 \
 "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vpermilps $216,%%xmm0,%%xmm0;"\
 "vmovsd %%xmm0,(%1); vmovhpd %%xmm0,(%1,%3,1); addq $8,%1;"

#define COPY_2x1 \
 "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vmovsd %%xmm0,(%1); addq $8,%1;"

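/* SAVE_1x4 scales one xmm register by alpha and scatters its four floats
   to four destination lines using vmovss and vextractps element stores. */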
#define SAVE_1x4(c1_no)\
 "vmulps %%xmm15,%%xmm"#c1_no",%%xmm"#c1_no"; vmovss %%xmm"#c1_no",(%4); vextractps $1,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\
 "vextractps $2,%%xmm"#c1_no",(%4); vextractps $3,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;"

#define COPY_1x16 "movq %1,%4; addq $4,%1;"\
 "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2)\
 "vmovups 32(%0),%%xmm1;" SAVE_1x4(1) "vmovups 48(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;"

#define COPY_1x8 "movq %1,%4; addq $4,%1;"\
 "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;"

#define COPY_1x4 "movq %1,%4; addq $4,%1; vmovups (%0),%%xmm1;" SAVE_1x4(1) "addq %2,%0;"

#define COPY_1x2 "vmovsd (%0),%%xmm1; addq %2,%0; vmulps %%xmm15,%%xmm1,%%xmm1; vmovss %%xmm1,(%1); vextractps $1,%%xmm1,(%1,%3,1); addq $4,%1;"

#define COPY_1x1 "vmulss (%0),%%xmm15,%%xmm1; vmovss %%xmm1,(%1); addq %2,%0; addq $4,%1;"

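/* COMPUTE(ndim) emits one asm block that copies an ndim-wide strip:
   alpha is broadcast into ymm15, r11 holds the remaining count (%5) and
   is peeled in chunks of 4, then 2, then 1 via COPY_{4,2,1}x##ndim. The
   numeric local labels (e.g. 1631 for ndim=16) are built by token
   pasting so every expansion gets unique jump targets. */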
#define COMPUTE(ndim){\
  src = src_base; dst = dst_base;\
  __asm__ __volatile__(\
    "vbroadcastss %6,%%ymm15; movq %5,%%r11; cmpq $4,%%r11; jb "#ndim"32f;"\
    #ndim"31:\n\t"\
    COPY_4x##ndim "subq $4,%%r11; cmpq $4,%%r11; jnb "#ndim"31b;"\
    #ndim"32:\n\t"\
    "cmpq $2,%%r11; jb "#ndim"33f;"\
    COPY_2x##ndim "subq $2,%%r11;"\
    #ndim"33:\n\t"\
    "testq %%r11,%%r11; jz "#ndim"34f;"\
    COPY_1x##ndim "subq $1,%%r11;"\
    #ndim"34:\n\t"\
    :"+r"(src),"+r"(dst),"+r"(src_ld_bytes),"+r"(dst_ld_bytes),"+r"(dst_tmp):"m"(num_cols),"m"(ALPHA):"r11","cc","memory"\
    ,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
}

int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){
  float *src, *dst, *dst_tmp=0, *src_base, *dst_base;
  uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_cols = 0;
  BLASLONG rows_left, cols_done; float ALPHA = alpha;
  if(ALPHA==0.0){
    dst_base = b;
    for(rows_left=rows;rows_left>0;rows_left--) {memset(dst_base,0,cols*sizeof(float)); dst_base += ldb;}
    return 0;
  }
  for(cols_done=0;cols_done<cols;cols_done+=num_cols){
    num_cols = cols-cols_done;
    if(num_cols > COLS_OF_BLOCK) num_cols = COLS_OF_BLOCK;
    rows_left = rows; src_base = a + (int64_t)lda * (int64_t)cols_done; dst_base = b + cols_done;
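    /* Use the 16-wide strip kernel only when ldb is at least four floats
       away from a multiple of 1024 floats (a 4 KiB stride); presumably
       near-4 KiB strides would make the 16 destination lines alias to
       the same cache sets, so the narrower kernels handle those cases. */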
    if(ldb%1024>3 && ldb%1024<1021) for(;rows_left>15;rows_left-=16){COMPUTE(16) src_base += 16; dst_base += 16 * ldb;}
    for(;rows_left>7;rows_left-=8){COMPUTE(8) src_base += 8; dst_base += 8 * ldb;}
    for(;rows_left>3;rows_left-=4){COMPUTE(4) src_base += 4; dst_base += 4 * ldb;}
    for(;rows_left>1;rows_left-=2){COMPUTE(2) src_base += 2; dst_base += 2 * ldb;}
    if(rows_left>0){COMPUTE(1) src_base ++; dst_base += ldb;}
  }
  return 0;
}

#else

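/* Generic fallback when the AVX inline-assembly path is not available:
   the same alpha-scaled transpose copy in scalar C, blocked four
   lda-strided lines at a time with 2- and 1-wide tails in each
   dimension. */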
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
  BLASLONG i, j;
  FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
  FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4;

  if (rows <= 0) return 0;
  if (cols <= 0) return 0;

  a_offset = a;
  b_offset = b;

  i = (cols >> 2);
  if (i > 0) {
    do {
      a_offset1 = a_offset;
      a_offset2 = a_offset1 + lda;
      a_offset3 = a_offset2 + lda;
      a_offset4 = a_offset3 + lda;
      a_offset += 4 * lda;

      b_offset1 = b_offset;
      b_offset2 = b_offset1 + ldb;
      b_offset3 = b_offset2 + ldb;
      b_offset4 = b_offset3 + ldb;
      b_offset += 4;

      j = (rows >> 2);
      if (j > 0) {
        do {
          /* Column 1 of MAT_B */
          *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A
          *(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
          *(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
          *(b_offset4 + 0) = *(a_offset1 + 3)*alpha;

          /* Column 2 of MAT_B */
          *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A
          *(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
          *(b_offset3 + 1) = *(a_offset2 + 2)*alpha;
          *(b_offset4 + 1) = *(a_offset2 + 3)*alpha;

          /* Column 3 of MAT_B */
          *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A
          *(b_offset2 + 2) = *(a_offset3 + 1)*alpha;
          *(b_offset3 + 2) = *(a_offset3 + 2)*alpha;
          *(b_offset4 + 2) = *(a_offset3 + 3)*alpha;

          /* Column 4 of MAT_B */
          *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A
          *(b_offset2 + 3) = *(a_offset4 + 1)*alpha;
          *(b_offset3 + 3) = *(a_offset4 + 2)*alpha;
          *(b_offset4 + 3) = *(a_offset4 + 3)*alpha;

          a_offset1 += 4;
          a_offset2 += 4;
          a_offset3 += 4;
          a_offset4 += 4;
          b_offset1 += ldb * 4;
          b_offset2 += ldb * 4;
          b_offset3 += ldb * 4;
          b_offset4 += ldb * 4;

          j--;
        } while (j > 0);
      } // if(j > 0)

      if (rows & 2) {
        *(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
        *(b_offset2 + 0) = *(a_offset1 + 1)*alpha;

        *(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
        *(b_offset2 + 1) = *(a_offset2 + 1)*alpha;

        *(b_offset1 + 2) = *(a_offset3 + 0)*alpha;
        *(b_offset2 + 2) = *(a_offset3 + 1)*alpha;

        *(b_offset1 + 3) = *(a_offset4 + 0)*alpha;
        *(b_offset2 + 3) = *(a_offset4 + 1)*alpha;

        a_offset1 += 2;
        a_offset2 += 2;
        a_offset3 += 2;
        a_offset4 += 2;

        b_offset1 += ldb*2;
      }

      if (rows & 1) {
        *(b_offset1 + 0) = *(a_offset1 + 0)*alpha;

        *(b_offset1 + 1) = *(a_offset2 + 0)*alpha;

        *(b_offset1 + 2) = *(a_offset3 + 0)*alpha;

        *(b_offset1 + 3) = *(a_offset4 + 0)*alpha;
      }

      i--;
    } while (i > 0);
  }

  if (cols & 2) {
    a_offset1 = a_offset;
    a_offset2 = a_offset1 + lda;
    a_offset += 2 * lda;

    b_offset1 = b_offset;
    b_offset2 = b_offset1 + ldb;
    b_offset3 = b_offset2 + ldb;
    b_offset4 = b_offset3 + ldb;
    b_offset += 2;

    j = (rows >> 2);
    if (j > 0){
      do {
        *(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
        *(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
        *(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
        *(b_offset4 + 0) = *(a_offset1 + 3)*alpha;

        *(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
        *(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
        *(b_offset3 + 1) = *(a_offset2 + 2)*alpha;
        *(b_offset4 + 1) = *(a_offset2 + 3)*alpha;

        a_offset1 += 4;
        a_offset2 += 4;
        b_offset1 += ldb * 4;
        b_offset2 += ldb * 4;
        b_offset3 += ldb * 4;
        b_offset4 += ldb * 4;

        j--;
      } while (j > 0);
    }

    if (rows & 2){
      *(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
      *(b_offset2 + 0) = *(a_offset1 + 1)*alpha;

      *(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
      *(b_offset2 + 1) = *(a_offset2 + 1)*alpha;

      a_offset1 += 2;
      a_offset2 += 2;
      b_offset1 += ldb*2;
    }

    if (rows & 1){
      *(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
      *(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
    }
  } // if (cols & 2)

  if (cols & 1) {
    a_offset1 = a_offset;
    a_offset += lda;

    b_offset1 = b_offset;
    b_offset2 = b_offset1 + ldb;
    b_offset3 = b_offset2 + ldb;
    b_offset4 = b_offset3 + ldb;

    j = (rows >> 2);
    if (j > 0){
      do {
        *(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
        *(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
        *(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
        *(b_offset4 + 0) = *(a_offset1 + 3)*alpha;

        a_offset1 += 4;
        b_offset1 += ldb * 4;
        b_offset2 += ldb * 4;
        b_offset3 += ldb * 4;
        b_offset4 += ldb * 4;

        j--;
      } while (j > 0);
    }

    if (rows & 2){
      *(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
      *(b_offset2 + 0) = *(a_offset1 + 1)*alpha;

      a_offset1 += 2;
      b_offset1 += ldb * 2;
    }

    if (rows & 1){
      *(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
    }
  }

  return 0;
}

#endif