Skip to content

Commit 13b8c44

Browse files
committed
loongarch: Add optimization for dsdot kernel.
1 parent 3def6a8 commit 13b8c44

File tree

2 files changed

+74
-14
lines changed

2 files changed

+74
-14
lines changed

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
ifndef NO_LASX
22

3-
SDOTKERNEL = dot_lasx.S
4-
DDOTKERNEL = dot_lasx.S
3+
SDOTKERNEL = dot_lasx.S
4+
DSDOTKERNEL = dot_lasx.S
5+
DDOTKERNEL = dot_lasx.S
56

67
DGEMMKERNEL = dgemm_kernel_16x4.S
78
DGEMMINCOPY = dgemm_ncopy_16.S

kernel/loongarch64/dot_lasx.S

Lines changed: 71 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ PROLOGUE
5151
LDINT INCX, 0(INCX)
5252
LDINT INCY, 0(INCY)
5353
#endif
54+
55+
/* init $f8 and $f9 to zero */
5456
SUB s1, s1, s1
5557
SUB s2, s2, s2
5658
slli.d INCX, INCX, BASE_SHIFT
@@ -59,25 +61,33 @@ PROLOGUE
5961
bge $r0, N, .L999
6062
bne INCX, TEMP, .L20 /* inc_x=1 */
6163
bne INCY, TEMP, .L20 /* inc_y=1 */
62-
#ifdef DOUBLE
63-
srai.d I, N, 4
64-
#else
65-
srai.d I, N, 5
66-
#endif
64+
65+
/* (inc_x == 1) && (inc_y == 1): contiguous fast path (other strides branch to .L20) */
6766

6867
/* init $xr8 and $xr9 to zero */
6968
#ifdef DOUBLE
7069
xvldrepl.d $xr0, X, 0
7170
#else
7271
xvldrepl.w $xr0, X, 0
7372
#endif
73+
#ifdef DSDOT
74+
xvfcvtl.d.s $xr0, $xr0
75+
xvxor.v $xr8, $xr8, $xr8 /* bitwise clear: +0.0 in every lane even when X[0] is Inf/NaN (x - x would give NaN) */
76+
xvxor.v $xr9, $xr9, $xr9 /* second accumulator, same data-independent zero */
77+
#else
7478
xvxor.v $xr8, $xr8, $xr8 /* bitwise clear: +0.0 in every lane even when X[0] is Inf/NaN (x - x would give NaN) */
7579
xvxor.v $xr9, $xr9, $xr9 /* second accumulator, same data-independent zero */
80+
#endif
7681

77-
/* !((inc_x == 1) && (inc_y == 1)) */
78-
bge $r0, I, .L12 /* <32 */
82+
#ifdef DOUBLE
83+
srai.d I, N, 4
84+
#else
85+
srai.d I, N, 5
86+
#endif
87+
bge $r0, I, .L12 /* FLOAT: <32 ; DOUBLE: <16 */
88+
.align 3
7989
.L11:
80-
/* case 32~ */
90+
/* FLOAT: 32~ ; DOUBLE: 16~ */
8191
xvld $xr0, X, 0
8292
xvld $xr1, X, 32
8393
xvld $xr2, X, 64
@@ -89,11 +99,39 @@ PROLOGUE
8999
addi.w I, I, -1
90100
addi.d X, X, 128
91101
addi.d Y, Y, 128
102+
#ifdef DSDOT
103+
xvfcvtl.d.s $xr10, $xr0 /* $xr10 = (double) low-half floats of X block 0 */
104+
xvfcvtl.d.s $xr11, $xr4 /* $xr11 = (double) low-half floats of Y block 0 */
105+
xvfcvth.d.s $xr12, $xr0 /* $xr12 = (double) high-half floats of X block 0 */
106+
xvfcvth.d.s $xr13, $xr4 /* $xr13 = (double) high-half floats of Y block 0 */
107+
xvfmadd.d $xr8, $xr10, $xr11, $xr8 /* acc0 += Xlow * Ylow  (pair X with Y, not X with X) */
108+
xvfmadd.d $xr9, $xr12, $xr13, $xr9 /* acc1 += Xhigh * Yhigh */
109+
xvfcvtl.d.s $xr10, $xr1
110+
xvfcvtl.d.s $xr11, $xr5
111+
xvfcvth.d.s $xr12, $xr1
112+
xvfcvth.d.s $xr13, $xr5
113+
xvfmadd.d $xr8, $xr10, $xr11, $xr8
114+
xvfmadd.d $xr9, $xr12, $xr13, $xr9
115+
xvfcvtl.d.s $xr10, $xr2
116+
xvfcvtl.d.s $xr11, $xr6
117+
xvfcvth.d.s $xr12, $xr2
118+
xvfcvth.d.s $xr13, $xr6
119+
xvfmadd.d $xr8, $xr10, $xr11, $xr8
120+
xvfmadd.d $xr9, $xr12, $xr13, $xr9
121+
xvfcvtl.d.s $xr10, $xr3
122+
xvfcvtl.d.s $xr11, $xr7
123+
xvfcvth.d.s $xr12, $xr3
124+
xvfcvth.d.s $xr13, $xr7
125+
xvfmadd.d $xr8, $xr10, $xr11, $xr8
126+
xvfmadd.d $xr9, $xr12, $xr13, $xr9
127+
#else
92128
XVFMADD $xr8, $xr0, $xr4, $xr8
93129
XVFMADD $xr9, $xr1, $xr5, $xr9
94130
XVFMADD $xr8, $xr2, $xr6, $xr8
95131
XVFMADD $xr9, $xr3, $xr7, $xr9
132+
#endif
96133
bnez I, .L11
134+
.align 3
97135
.L12:
98136
#ifdef DOUBLE
99137
andi I, N, 0xf
@@ -102,18 +140,37 @@ PROLOGUE
102140
andi I, N, 0x1f
103141
srai.d I, I, 3
104142
#endif
105-
bge $r0, I, .L14 /* <8 */
143+
bge $r0, I, .L14 /* DOUBLE: <4 ; FLOAT: <8 */
144+
.align 3
106145
.L13:
107-
/* case 8~31 */
146+
/* FLOAT: 8~31 ; DOUBLE: 4~15 */
108147
xvld $xr0, X, 0
109148
xvld $xr4, Y, 0
110149
addi.w I, I, -1
111150
addi.d X, X, 32
112151
addi.d Y, Y, 32
152+
#ifdef DSDOT
153+
xvfcvtl.d.s $xr10, $xr0 /* $xr10 = (double) low-half floats of X */
154+
xvfcvtl.d.s $xr11, $xr4 /* $xr11 = (double) low-half floats of Y */
155+
xvfcvth.d.s $xr12, $xr0 /* $xr12 = (double) high-half floats of X */
156+
xvfcvth.d.s $xr13, $xr4 /* $xr13 = (double) high-half floats of Y */
157+
xvfmadd.d $xr8, $xr10, $xr11, $xr8 /* acc0 += Xlow * Ylow  (pair X with Y, not X with X) */
158+
xvfmadd.d $xr9, $xr12, $xr13, $xr9 /* acc1 += Xhigh * Yhigh */
159+
#else
113160
XVFMADD $xr8, $xr0, $xr4, $xr8
161+
#endif
114162
bnez I, .L13
163+
.align 3
115164
.L14:
116165
/* store dot in s1 $f8 */
166+
#ifdef DSDOT
167+
xvfadd.d $xr8, $xr8, $xr9 /* fold the two double accumulators into $xr8 */
168+
movgr2fr.d s2, $r0 /* set s2 to 0.0 from the zero register: independent of the old double bits in s2 */
169+
xvpermi.q $xr0, $xr8, 0x1 /* bring the other 128-bit half of $xr8 into $vr0 */
170+
vfadd.d $vr8, $vr8, $vr0 /* add the two 128-bit halves */
171+
vpackod.d $vr0, $vr8, $vr8 /* copy the odd (upper) double into the low lane of $vr0 */
172+
vfadd.d $vr8, $vr8, $vr0 /* low lane of $vr8 now holds the full horizontal sum */
173+
#else
117174
XVFADD $xr8, $xr8, $xr9
118175
SUB s2, s2, s2 /* set s2 to 0.0 */
119176
xvpermi.q $xr0, $xr8, 0x1
@@ -125,7 +182,9 @@ PROLOGUE
125182
VFADD $vr8, $vr8, $vr0
126183
vpackod.w $vr0, $vr8, $vr8
127184
VFADD $vr8, $vr8, $vr0
128-
#endif
185+
#endif /* defined DOUBLE */
186+
#endif /* defined DSDOT */
187+
.align 3
129188
.L15:
130189
#ifdef DOUBLE
131190
andi I, N, 0x3
@@ -135,7 +194,7 @@ PROLOGUE
135194
bge $r0, I, .L999 /* =0 */
136195
.align 3
137196
.L16:
138-
/* case 1~7 */
197+
/* FLOAT: 1~7 ; DOUBLE: 1~3 */
139198
LD a1, X, 0
140199
LD b1, Y, 0
141200
#ifdef DSDOT

0 commit comments

Comments
 (0)