OpenMathLib
diff --git a/‎GotoBLAS_06WeirdPerformance.txt
Lines changed: 1 addition & 1 deletion b/‎GotoBLAS_06WeirdPerformance.txt
Lines changed: 1 addition & 1 deletion
diff --git a/‎kernel/loongarch64/KERNEL.LOONGSON2K1000
Lines changed: 51 additions & 0 deletions b/‎kernel/loongarch64/KERNEL.LOONGSON2K1000
Lines changed: 51 additions & 0 deletions
diff --git a/‎kernel/loongarch64/KERNEL.LOONGSON3R5
Lines changed: 51 additions & 0 deletions b/‎kernel/loongarch64/KERNEL.LOONGSON3R5
Lines changed: 51 additions & 0 deletions
diff --git a/‎kernel/loongarch64/damax_lasx.S
Lines changed: 183 additions & 0 deletions b/‎kernel/loongarch64/damax_lasx.S
Lines changed: 183 additions & 0 deletions
@@ -11,7 +11,7 @@
    operation is finished.
 
 
-2. Simlar problem may happen under virtual machine. If supervisor
+2. Similar problem may happen under virtual machine. If supervisor
    allocates different cores for each scheduling, BLAS performnace
    will be bad. This is because BLAS also utilizes all cache,
    unexpected re-schedule for different core may result of heavy
 
@@ -4,4 +4,55 @@ SDOTKERNEL  = dot_lsx.S
 DSDOTKERNEL = dot_lsx.S
 DDOTKERNEL  = dot_lsx.S
 
+SSCALKERNEL  = sscal_lsx.S
+DSCALKERNEL  = dscal_lsx.S
+
+SAMAXKERNEL =  samax_lsx.S
+DAMAXKERNEL =  damax_lsx.S
+
+SAMINKERNEL =  samin_lsx.S
+DAMINKERNEL =  damin_lsx.S
+
+SMAXKERNEL  =  smax_lsx.S
+DMAXKERNEL  =  dmax_lsx.S
+
+SMINKERNEL  =  smin_lsx.S
+DMINKERNEL  =  dmin_lsx.S
+
+ISMAXKERNEL =  ismax_lsx.S
+IDMAXKERNEL =  idmax_lsx.S
+
+ISMINKERNEL =  ismin_lsx.S
+IDMINKERNEL =  idmin_lsx.S
+
+ISAMAXKERNEL = isamax_lsx.S
+IDAMAXKERNEL = idamax_lsx.S
+
+ISAMINKERNEL = isamin_lsx.S
+IDAMINKERNEL = idamin_lsx.S
+
+SCOPYKERNEL =  scopy_lsx.S
+DCOPYKERNEL =  dcopy_lsx.S
+
+SSWAPKERNEL =  sswap_lsx.S
+DSWAPKERNEL =  dswap_lsx.S
+
+SAXPYKERNEL =  saxpy_lsx.S
+DAXPYKERNEL =  daxpy_lsx.S
+
+SAXPBYKERNEL = saxpby_lsx.S
+DAXPBYKERNEL = daxpby_lsx.S
+
+SSUMKERNEL  =  ssum_lsx.S
+DSUMKERNEL  =  dsum_lsx.S
+
+SASUMKERNEL =  sasum_lsx.S
+DASUMKERNEL =  dasum_lsx.S
+
+SROTKERNEL  =  srot_lsx.S
+DROTKERNEL  =  drot_lsx.S
+
+SNRM2KERNEL =  snrm2_lsx.S
+DNRM2KERNEL =  dnrm2_lsx.S
+
 endif
@@ -4,6 +4,57 @@ SDOTKERNEL  = dot_lasx.S
 DSDOTKERNEL = dot_lasx.S
 DDOTKERNEL  = dot_lasx.S
 
+SSCALKERNEL  = sscal_lasx.S
+DSCALKERNEL  = dscal_lasx.S
+
+SAMAXKERNEL =  samax_lasx.S
+DAMAXKERNEL =  damax_lasx.S
+
+SAMINKERNEL =  samin_lasx.S
+DAMINKERNEL =  damin_lasx.S
+
+SMAXKERNEL  =  smax_lasx.S
+DMAXKERNEL =   dmax_lasx.S
+
+SMINKERNEL =   smin_lasx.S
+DMINKERNEL =   dmin_lasx.S
+
+ISMAXKERNEL =  ismax_lasx.S
+IDMAXKERNEL =  idmax_lasx.S
+
+ISMINKERNEL =  ismin_lasx.S
+IDMINKERNEL =  idmin_lasx.S
+
+ISAMAXKERNEL = isamax_lasx.S
+IDAMAXKERNEL = idamax_lasx.S
+
+ISAMINKERNEL = isamin_lasx.S
+IDAMINKERNEL = idamin_lasx.S
+
+SCOPYKERNEL =  scopy_lasx.S
+DCOPYKERNEL =  dcopy_lasx.S
+
+SSWAPKERNEL =  sswap_lasx.S
+DSWAPKERNEL =  dswap_lasx.S
+
+SAXPYKERNEL =  saxpy_lasx.S
+DAXPYKERNEL =  daxpy_lasx.S
+
+SAXPBYKERNEL = saxpby_lasx.S
+DAXPBYKERNEL = daxpby_lasx.S
+
+SSUMKERNEL  =  ssum_lasx.S
+DSUMKERNEL  =  dsum_lasx.S
+
+SASUMKERNEL =  sasum_lasx.S
+DASUMKERNEL =  dasum_lasx.S
+
+SROTKERNEL  =  srot_lasx.S
+DROTKERNEL  =  drot_lasx.S
+
+SNRM2KERNEL =  snrm2_lasx.S
+DNRM2KERNEL =  dnrm2_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
 
@@ -0,0 +1,183 @@
+#define ASSEMBLER
+
+#include "common.h"
+
+/* damax (LASX, double precision): returns in $f0 the largest |x[i]| over N
+ * elements read from X with a stride of INCX elements; 0.0 when N <= 0 or
+ * INCX <= 0.  VM0 carries the running maximum by magnitude and its lane 0
+ * ($f22) is the value returned.  Tail elements are loaded one at a time
+ * (xvldrepl.d) so that no load reaches past the last element of x. */
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define J $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define TEMP $r16
+#define m0 $xr8
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr20
+#define VX1 $xr21
+#define VM0 $xr22
+#define VM1 $xr23
+#define VM2 $xr18
+
+    PROLOGUE
+
+    xvxor.v VM0, VM0, VM0 // result is 0.0 if we bail out on N <= 0 or INCX <= 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT // stride in bytes from here on
+    bne INCX, TEMP, .L20
+    xvldrepl.d VM0, X, 0 // seed all lanes with x[0]; reads a single element
+    srai.d I, N, 3 // number of full 8-element groups
+    bge $r0, I, .L12
+    .align 3
+
+.L10: // INCX==1: fold 8 elements per iteration into VM0
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    xvfmaxa.d VM1, VX1, VX0
+    addi.d X, X, 8 * SIZE
+    xvfmaxa.d VM0, VM0, VM1
+    blt $r0, I, .L10
+    .align 3
+
+.L11: // reduce the four lanes of VM0 into lane 0
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmaxa.d VM1, x1, x2
+    xvfmaxa.d VM2, x3, x4
+    xvfmaxa.d VM0, VM1, VM2
+    .align 3
+
+.L12: // INCX==1: handle the N%8 tail elements
+    andi I, N, 7
+    li.d J, 4
+    bge J, I, .L13 // tail of 0..4 elements: one-at-a-time loop
+    xvld VX0, X, 0 // tail[0..3]
+    sub.d I, I, J // tail-4 = index of the first of the last four tail elements
+    slli.d I, I, BASE_SHIFT
+    xvldx VX1, X, I // tail[tail-4..tail-1]: overlaps VX0, stays in bounds
+    xvfmaxa.d m0, VX0, VX1 // partial repeat read
+    xvpickve.d x1, m0, 0
+    xvpickve.d x2, m0, 1
+    xvpickve.d x3, m0, 2
+    xvpickve.d x4, m0, 3
+    xvfmaxa.d VM1, x1, x2
+    xvfmaxa.d m0, x3, x4
+    xvfmaxa.d m0, m0, VM1
+    xvfmaxa.d VM0, m0, VM0
+    fabs.d $f22, $f22
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L13: // INCX==1 and a tail of 0..4 elements
+    bge $r0, I, .L15
+    .align 3
+
+.L14:
+    xvldrepl.d x1, X, 0 // one element only; a 256-bit xvld would overrun x
+    addi.d I, I, -1
+    xvfmaxa.d VM0, VM0, x1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    .align 3
+
+.L15:
+    fabs.d $f22, $f22
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // initialize the maxa value
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    .align 3
+
+.L21: // INCX!=1: gather 8 strided elements per iteration and fold into VM0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    addi.d I, I, -1
+    xvfmaxa.d VM1, VX1, VX0
+    xvfmaxa.d VM0, VM1, VM0
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // reduce the four lanes of VM0 into lane 0
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmaxa.d VM1, x1, x2
+    xvfmaxa.d VM2, x3, x4
+    xvfmaxa.d VM0, VM1, VM2
+    .align 3
+
+.L23: // INCX!=1: handle the N%8 tail elements
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    xvldrepl.d x1, X, 0 // one element only; a 256-bit xvld would overrun x
+    addi.d I, I, -1
+    xvfmaxa.d VM0, VM0, x1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999: // common exit: |lane 0 of VM0| -> $f0
+    fabs.d $f22, $f22
+    fmov.d $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE