@@ -43,45 +43,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define t2 $r13
#define t3 $r14
#define t4 $r15
-
- /* Don't change following FR unless you know the effects. */
#define VX0 $xr15
#define VX1 $xr16
#define VX2 $xr17
#define VX3 $xr18
#define VX4 $xr21
+ #define VX5 $xr22
+ /* Don't change following FR unless you know the effects. */
#define res1 $xr19
#define res2 $xr20
+ #define RCP $f2
+ #define VALPHA $xr3
+
+ // The optimization for snrm2 cannot simply involve
+ // extending the data type from float to double and
+ // then summing the squares of the data. LAPACK tests
+ // have shown that this approach can still lead to data overflow.
+ // Instead, we need to find the maximum absolute value in the entire
+ // array and divide each data element by this maximum value before
+ // performing the calculation. This approach can avoid overflow (and does not require extending the data type).

PROLOGUE

#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
+ bge $r0, N, .L999
+ beq $r0, INCX, .L999

+ addi.d $sp, $sp, -32
+ st.d $ra, $sp, 0
+ st.d N, $sp, 8
+ st.d X, $sp, 16
+ st.d INCX, $sp, 24
+ #ifdef DYNAMIC_ARCH
+ bl samax_k_LA264
+ #else
+ bl samax_k
+ #endif
+ ld.d $ra, $sp, 0
+ ld.d N, $sp, 8
+ ld.d X, $sp, 16
+ ld.d INCX, $sp, 24
+ addi.d $sp, $sp, 32
+
+ frecip.s RCP, $f0
+ vreplvei.w $vr3, $vr2, 0
+ xvpermi.d VALPHA, $xr3, 0x00
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
- bge $r0, N, .L999
- beq $r0, INCX, .L999
+ fcmp.ceq.s $fcc0, $f0, $f19
+ bcnez $fcc0, .L999
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
- srai.d I, N, 3
+ srai.d I, N, 4
bne INCX, TEMP, .L20
- bge $r0, I, .L997
+ bge $r0, I, .L997
.align 3

.L10:
- xvld VX0, X, 0
- xvfcvtl.d.s VX1, VX0
- xvfcvth.d.s VX2, VX0
- xvfmadd.d res1, VX1, VX1, res1
- xvfmadd.d res2, VX2, VX2, res2
+ xvld VX0, X, 0
+ xvld VX5, X, 8 * SIZE
addi.d I, I, -1
- addi.d X, X, 8 * SIZE
+ addi.d X, X, 16 * SIZE
+
+ xvfmul.s VX0, VX0, VALPHA
+ xvfmul.s VX5, VX5, VALPHA
+
+ xvfmadd.s res1, VX0, VX0, res1
+ xvfmadd.s res2, VX5, VX5, res2
blt $r0, I, .L10
- .align 3
b .L996
+ .align 3

.L20:
bge $r0, I, .L997
@@ -107,47 +141,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
+ add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
+ xvfmul.s VX0, VX0, VALPHA
+ xvfmadd.s res1, VX0, VX0, res1
+
+ ld.w t1, X, 0
+ add.d X, X, INCX
+ ld.w t2, X, 0
add.d X, X, INCX
- xvfcvtl.d.s VX1, VX0
- xvfcvth.d.s VX2, VX0
- xvfmadd.d res1, VX1, VX1, res1
- xvfmadd.d res2, VX2, VX2, res2
+ ld.w t3, X, 0
+ add.d X, X, INCX
+ ld.w t4, X, 0
+ add.d X, X, INCX
+ xvinsgr2vr.w VX0, t1, 0
+ xvinsgr2vr.w VX0, t2, 1
+ xvinsgr2vr.w VX0, t3, 2
+ xvinsgr2vr.w VX0, t4, 3
+ ld.w t1, X, 0
+ add.d X, X, INCX
+ ld.w t2, X, 0
+ add.d X, X, INCX
+ ld.w t3, X, 0
+ add.d X, X, INCX
+ ld.w t4, X, 0
+ add.d X, X, INCX
+ xvinsgr2vr.w VX0, t1, 4
+ xvinsgr2vr.w VX0, t2, 5
+ xvinsgr2vr.w VX0, t3, 6
+ xvinsgr2vr.w VX0, t4, 7
+ xvfmul.s VX0, VX0, VALPHA
+ xvfmadd.s res2, VX0, VX0, res2
addi.d I, I, -1
blt $r0, I, .L21
- b .L996
+ .align 3

.L996:
- xvfadd.d res1, res1, res2
- xvpickve.d VX1, res1, 1
- xvpickve.d VX2, res1, 2
- xvpickve.d VX3, res1, 3
- fadd.d $f19, $f19, $f16
- fadd.d $f19, $f19, $f17
- fadd.d $f19, $f19, $f18
+ xvfadd.s res1, res1, res2
+ xvpermi.d VX1, res1, 0x4e
+ xvfadd.s res1, res1, VX1
+ vreplvei.w $vr16, $vr19, 1
+ vreplvei.w $vr17, $vr19, 2
+ vreplvei.w $vr18, $vr19, 3
+ xvfadd.s res1, VX1, res1
+ xvfadd.s res1, VX2, res1
+ xvfadd.s res1, VX3, res1
.align 3

.L997:
- andi I, N, 7
+ andi I, N, 15
bge $r0, I, .L999
.align 3

.L998:
fld.s $f15, X, 0
- add.d X, X, INCX
- addi.d I, I, -1
- fcvt.d.s $f15, $f15
- fmadd.d $f19, $f15, $f15, $f19
+ addi.d I, I, -1
+ fmul.s $f15, $f15, RCP
+ fmadd.s $f19, $f15, $f15, $f19
+ add.d X, X, INCX
blt $r0, I, .L998
.align 3

.L999:
- fsqrt.d $f19, $f19
+ fsqrt.s $f19, $f19
+ fmul.s $f0, $f19, $f0
move $r4, $r17
- fcvt.s.d $f0, $f19
jirl $r0, $r1, 0x0
+ .align 3

EPILOGUE
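
The comment block added by this patch describes the overflow-safe approach: find the largest absolute value in the vector (the samax_k call above), multiply every element by its reciprocal so that each squared term is at most 1, accumulate the sum of squares in single precision, and rescale at the end (fsqrt.s followed by fmul.s with the saved maximum in $f0). Below is a minimal scalar C sketch of that idea; the function name scaled_snrm2, the two explicit passes, and the assumption of a positive stride are illustrative only and do not reproduce the actual LASX kernel, which vectorizes the second pass with xvfmul.s/xvfmadd.s and keeps partial sums in res1/res2.

#include <math.h>

/* Hypothetical scalar illustration of the scaling scheme; not the OpenBLAS kernel. */
static float scaled_snrm2(long n, const float *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0.0f;   /* sketch assumes a positive stride */

    /* Pass 1: largest absolute value, the job of the samax_k call in the patch. */
    float amax = 0.0f;
    for (long i = 0; i < n; i++) {
        float a = fabsf(x[i * incx]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) return 0.0f;          /* mirrors the fcmp.ceq.s early exit */

    /* Pass 2: sum of squares of x[i] / amax; every term is <= 1.0f,
     * so the single-precision accumulator cannot overflow. */
    float rcp = 1.0f / amax;                /* frecip.s RCP, $f0 */
    float sum = 0.0f;
    for (long i = 0; i < n; i++) {
        float s = x[i * incx] * rcp;
        sum += s * s;
    }

    /* Undo the scaling: ||x||_2 = amax * sqrt(sum), i.e. fsqrt.s then fmul.s. */
    return amax * sqrtf(sum);
}

Bounding every squared term by 1 is what makes single-precision accumulation safe: the partial sums grow at most linearly in N rather than with the squares of the raw data, so the float-to-double widening used before this patch (xvfcvtl.d.s/xvfcvth.d.s) is no longer needed.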