Skip to content

Commit 7bc714d

Browse files
limeidan authored and abner-chenc committed
internal/bytealg: optimize the function indexbyte using SIMD on loong64
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3C5000 @ 2200.00MHz
               │     old      │                 new                  │
               │    sec/op    │    sec/op     vs base                │
IndexByte/10      19.32n ± 0%    11.84n ± 0%  -38.72% (p=0.000 n=10)
IndexByte/32      49.34n ± 0%    14.11n ± 0%  -71.40% (p=0.000 n=10)
IndexByte/4K     5608.0n ± 0%    138.8n ± 0%  -97.52% (p=0.000 n=10)
IndexByte/4M     3822.8µ ± 0%    119.4µ ± 0%  -96.88% (p=0.000 n=10)
IndexByte/64M    61.826m ± 1%    3.812m ± 0%  -93.83% (p=0.000 n=10)
geomean           16.61µ         1.602µ       -90.35%

goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
               │     old      │                 new                  │
               │    sec/op    │    sec/op     vs base                │
IndexByte/10      6.809n ± 0%    5.804n ± 0%  -14.75% (p=0.000 n=10)
IndexByte/32     16.015n ± 0%    6.404n ± 0%  -60.01% (p=0.000 n=10)
IndexByte/4K    1651.00n ± 0%    52.83n ± 0%  -96.80% (p=0.000 n=10)
IndexByte/4M    1680.76µ ± 0%    91.10µ ± 0%  -94.58% (p=0.000 n=10)
IndexByte/64M    26.878m ± 0%    2.010m ± 27% -92.52% (p=0.000 n=10)
geomean           6.054µ         815.0n       -86.54%

Change-Id: Ib75b997249708f921c6717eba43543c6650bf376
Reviewed-on: https://go-review.googlesource.com/c/go/+/668055
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
1 parent d65c209 commit 7bc714d

File tree

1 file changed

+269
-29
lines changed

1 file changed

+269
-29
lines changed

src/internal/bytealg/indexbyte_loong64.s

Lines changed: 269 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -5,48 +5,288 @@
55
#include "go_asm.h"
66
#include "textflag.h"
77

8+
// input:
9+
// R4 = b_base
10+
// R5 = b_len
11+
// R6 = b_cap (unused)
12+
// R7 = byte to find
813
// IndexByte entry point: reduces the search byte in R7 to its low 8 bits and
// then jumps to indexbytebody, which computes the result in R4 and returns.
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
9-
// R4 = b_base
10-
// R5 = b_len
11-
// R6 = b_cap (unused)
12-
// R7 = byte to find
1314
AND $0xff, R7 // keep only the low 8 bits so R7 can match the zero-extended bytes the body extracts
15+
JMP indexbytebody<>(SB) // tail jump: the shared body issues the RET
16+
17+
// input:
18+
// R4 = s_base
19+
// R5 = s_len
20+
// R6 = byte to find
21+
// IndexByteString entry point: the search byte arrives in R6; it is masked to
// 8 bits and placed in R7, the register indexbytebody expects, before jumping there.
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
22+
AND $0xff, R6, R7 // R7 = R6 & 0xff: byte to find, moved into the body's input register
23+
JMP indexbytebody<>(SB) // tail jump: the shared body issues the RET
24+
25+
// input:
26+
// R4: b_base
27+
// R5: len
28+
// R7: byte to find
29+
// indexbytebody scans R5 bytes starting at R4 for the byte held in R7.
// Result is left in R4: the offset of the first match from the original base,
// or -1 when no byte matches (including len == 0).
// Register use inside the body:
//   R6       = saved base, subtracted at "found" to turn the pointer into an index
//   R8       = end pointer (base + len), the bound for the byte-at-a-time loop
//   R5       = bytes remaining, decremented by the word and vector loops
//   R9       = current loop threshold / scratch (also receives the CPU feature flags)
//   R10, R11 = scratch: loaded data, extracted byte, offset of the matching block
// Paths: len >= 32 tries LASX (32-byte vectors), then LSX (16-byte vectors) if
// LASX is unavailable; everything else is handled by the scalar "tail" code.
TEXT indexbytebody<>(SB),NOSPLIT,$0
30+
BEQ R5, notfound // len == 0
31+
1432
MOVV R4, R6 // store base for later
15-
ADDV R4, R5 // end
16-
ADDV $-1, R4
33+
ADDV R4, R5, R8 // end
34+
35+
MOVV $32, R9
36+
BGE R5, R9, lasx // at least 32 bytes: try the vector paths
37+
tail:
38+
MOVV $8, R9 // threshold for the 8-bytes-per-iteration scalar loop
39+
BLT R5, R9, lt_8
40+
generic8_loop:
41+
MOVV (R4), R10 // load 8 bytes at once; only reached while R5 >= 8
42+
43+
AND $0xff, R10, R11 // bits 7:0 are treated as the byte at offset 0 (lowest address)
44+
BEQ R7, R11, found
45+
46+
BSTRPICKV $15, R10, $8, R11 // bits 15:8 -> byte at offset 1
47+
BEQ R7, R11, byte_1th
48+
49+
BSTRPICKV $23, R10, $16, R11 // bits 23:16 -> byte at offset 2
50+
BEQ R7, R11, byte_2th
51+
52+
BSTRPICKV $31, R10, $24, R11 // bits 31:24 -> byte at offset 3
53+
BEQ R7, R11, byte_3th
1754

18-
PCALIGN $16
19-
loop:
55+
BSTRPICKV $39, R10, $32, R11 // bits 39:32 -> byte at offset 4
56+
BEQ R7, R11, byte_4th
57+
58+
BSTRPICKV $47, R10, $40, R11 // bits 47:40 -> byte at offset 5
59+
BEQ R7, R11, byte_5th
60+
61+
BSTRPICKV $55, R10, $48, R11 // bits 55:48 -> byte at offset 6
62+
BEQ R7, R11, byte_6th
63+
64+
BSTRPICKV $63, R10, $56, R11 // bits 63:56 -> byte at offset 7
65+
BEQ R7, R11, byte_7th
66+
67+
ADDV $8, R4 // no match in this word: advance the pointer ...
68+
ADDV $-8, R5 // ... and shrink the remaining count
69+
BGE R5, R9, generic8_loop // R9 is still 8 here
70+
71+
lt_8:
72+
BEQ R4, R8, notfound // byte loop is bounded by the end pointer, not by R5
73+
MOVBU (R4), R10 // zero-extended single byte
74+
BEQ R7, R10, found
2075
ADDV $1, R4
21-
BEQ R4, R5, notfound
22-
MOVBU (R4), R8
23-
BNE R7, R8, loop
76+
JMP lt_8
2477

25-
SUBV R6, R4 // remove base
78+
byte_1th:
79+
ADDV $1, R4 // match at offset 1 of the current word
80+
SUBV R6, R4 // pointer - base = index
2681
RET
2782

28-
notfound:
29-
MOVV $-1, R4
83+
byte_2th:
84+
ADDV $2, R4 // match at offset 2 of the current word
85+
SUBV R6, R4
3086
RET
3187

32-
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
33-
// R4 = s_base
34-
// R5 = s_len
35-
// R6 = byte to find
36-
MOVV R4, R7 // store base for later
37-
ADDV R4, R5 // end
38-
ADDV $-1, R4
39-
40-
PCALIGN $16
41-
loop:
42-
ADDV $1, R4
43-
BEQ R4, R5, notfound
44-
MOVBU (R4), R8
45-
BNE R6, R8, loop
88+
byte_3th:
89+
ADDV $3, R4 // match at offset 3 of the current word
90+
SUBV R6, R4
91+
RET
92+
93+
byte_4th:
94+
ADDV $4, R4 // match at offset 4 of the current word
95+
SUBV R6, R4
96+
RET
97+
98+
byte_5th:
99+
ADDV $5, R4 // match at offset 5 of the current word
100+
SUBV R6, R4
101+
RET
102+
103+
byte_6th:
104+
ADDV $6, R4 // match at offset 6 of the current word
105+
SUBV R6, R4
106+
RET
107+
108+
byte_7th:
109+
ADDV $7, R4 // match at offset 7; falls through into found
46110

47-
SUBV R7, R4 // remove base
111+
found:
112+
SUBV R6, R4 // R4 points at the match: subtract the saved base to get the index
48113
RET
49114

50115
notfound:
51116
MOVV $-1, R4 // no match: return -1
52117
RET
118+
119+
lasx:
120+
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R9 // runtime CPU feature flag (overwrites the 32 held in R9)
121+
BEQ R9, lsx // flag is zero: no LASX, try LSX instead
122+
XVMOVQ R7, X0.B32 // replicate the search byte into all 32 byte lanes of X0
123+
124+
MOVV $128, R9
125+
BLT R5, R9, lasx32_loop // NOTE(review): lasx32_loop is entered with R9 == 128 and R5 < 128, so its back-edge (BGE R5, R9) cannot be taken; only one 32-byte block is vectorized before the scalar tail takes over. Result is still correct; confirm this is intended.
126+
lasx128_loop:
127+
XVMOVQ 0(R4), X1 // load four 32-byte blocks up front
128+
XVMOVQ 32(R4), X2
129+
XVMOVQ 64(R4), X3
130+
XVMOVQ 96(R4), X4
131+
132+
XVSEQB X1, X0, X1 // per-byte equality mask; always written to X1, which lasx_index_cal reads
133+
XVSETNEV X1, FCC0 // FCC0 = mask is non-zero, i.e. some byte matched
134+
BFPT lasx_found_add_0
135+
136+
XVSEQB X2, X0, X1
137+
XVSETNEV X1, FCC0
138+
BFPT lasx_found_add_32
139+
140+
XVSEQB X3, X0, X1
141+
XVSETNEV X1, FCC0
142+
BFPT lasx_found_add_64
143+
144+
XVSEQB X4, X0, X1
145+
XVSETNEV X1, FCC0
146+
BFPT lasx_found_add_96
147+
148+
ADDV $128, R4
149+
ADDV $-128, R5
150+
BGE R5, R9, lasx128_loop // R9 is still 128 here
151+
152+
BEQ R5, notfound // input was an exact multiple of 128 bytes
153+
154+
MOVV $32, R9 // switch the threshold to one vector per iteration
155+
BLT R5, R9, tail
156+
lasx32_loop:
157+
XVMOVQ 0(R4), X1
158+
159+
XVSEQB X1, X0, X1
160+
XVSETNEV X1, FCC0
161+
BFPT lasx_found_add_0
162+
163+
ADDV $32, R4
164+
ADDV $-32, R5
165+
BGE R5, R9, lasx32_loop // R9 is 32 when reached from lasx128_loop, 128 when entered directly from lasx
166+
167+
BEQ R5, notfound
168+
169+
JMP tail // fewer bytes than the threshold remain: finish with the scalar code
170+
171+
lasx_found_add_0:
172+
MOVV R0, R11 // R11 = offset of the matching 32-byte block relative to R4
173+
JMP lasx_index_cal
174+
175+
lasx_found_add_32:
176+
MOVV $32, R11
177+
JMP lasx_index_cal
178+
179+
lasx_found_add_64:
180+
MOVV $64, R11
181+
JMP lasx_index_cal
182+
183+
lasx_found_add_96:
184+
MOVV $96, R11
185+
JMP lasx_index_cal
186+
187+
lasx_index_cal:
188+
MOVV $64, R9 // CTZV yields 64 for an all-zero 64-bit element
189+
XVMOVQ X1.V[0], R10 // inspect the mask 8 bytes at a time, lowest element first
190+
CTZV R10, R10
191+
BNE R10, R9, index_cal // element has a set bit: the first match is in it
192+
ADDV $8, R11 // element was empty: skip its 8 bytes
193+
194+
XVMOVQ X1.V[1], R10
195+
CTZV R10, R10
196+
BNE R10, R9, index_cal
197+
ADDV $8, R11
198+
199+
XVMOVQ X1.V[2], R10
200+
CTZV R10, R10
201+
BNE R10, R9, index_cal
202+
ADDV $8, R11
203+
204+
XVMOVQ X1.V[3], R10 // the mask was non-zero and elements 0-2 were empty, so the match is here
205+
CTZV R10, R10
206+
JMP index_cal
207+
208+
lsx:
209+
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R9 // runtime CPU feature flag
210+
BEQ R9, tail // no LSX either: scalar code handles the whole input
211+
VMOVQ R7, V0.B16 // replicate the search byte into all 16 byte lanes of V0
212+
213+
MOVV $64, R9
214+
BLT R5, R9, lsx16_loop // NOTE(review): same shape as the LASX path: lsx16_loop is entered with R9 == 64 and R5 < 64, so its back-edge cannot be taken and only one 16-byte block is vectorized. Result is still correct; confirm this is intended. (This label is only reached from lasx, so R5 >= 32 here.)
215+
lsx64_loop:
216+
VMOVQ 0(R4), V1 // load four 16-byte blocks up front
217+
VMOVQ 16(R4), V2
218+
VMOVQ 32(R4), V3
219+
VMOVQ 48(R4), V4
220+
221+
VSEQB V1, V0, V1 // per-byte equality mask; always written to V1, which lsx_index_cal reads
222+
VSETNEV V1, FCC0 // FCC0 = mask is non-zero, i.e. some byte matched
223+
BFPT lsx_found_add_0
224+
225+
VSEQB V2, V0, V1
226+
VSETNEV V1, FCC0
227+
BFPT lsx_found_add_16
228+
229+
VSEQB V3, V0, V1
230+
VSETNEV V1, FCC0
231+
BFPT lsx_found_add_32
232+
233+
VSEQB V4, V0, V1
234+
VSETNEV V1, FCC0
235+
BFPT lsx_found_add_48
236+
237+
ADDV $64, R4
238+
ADDV $-64, R5
239+
BGE R5, R9, lsx64_loop // R9 is still 64 here
240+
241+
BEQ R5, notfound // input was an exact multiple of 64 bytes
242+
243+
MOVV $16, R9 // switch the threshold to one vector per iteration
244+
BLT R5, R9, tail
245+
lsx16_loop:
246+
VMOVQ 0(R4), V1
247+
248+
VSEQB V1, V0, V1
249+
VSETNEV V1, FCC0
250+
BFPT lsx_found_add_0
251+
252+
ADDV $16, R4
253+
ADDV $-16, R5
254+
BGE R5, R9, lsx16_loop // R9 is 16 when reached from lsx64_loop, 64 when entered directly from lsx
255+
256+
BEQ R5, notfound
257+
258+
JMP tail // fewer bytes than the threshold remain: finish with the scalar code
259+
260+
lsx_found_add_0:
261+
MOVV R0, R11 // R11 = offset of the matching 16-byte block relative to R4
262+
JMP lsx_index_cal
263+
264+
lsx_found_add_16:
265+
MOVV $16, R11
266+
JMP lsx_index_cal
267+
268+
lsx_found_add_32:
269+
MOVV $32, R11
270+
JMP lsx_index_cal
271+
272+
lsx_found_add_48:
273+
MOVV $48, R11
274+
JMP lsx_index_cal
275+
276+
lsx_index_cal:
277+
MOVV $64, R9 // CTZV yields 64 for an all-zero 64-bit element
278+
279+
VMOVQ V1.V[0], R10 // low 8 bytes of the mask
280+
CTZV R10, R10
281+
BNE R10, R9, index_cal // low half has a set bit: the first match is in it
282+
ADDV $8, R11 // low half was empty: skip its 8 bytes
283+
284+
VMOVQ V1.V[1], R10 // the mask was non-zero and the low half was empty, so the match is here
285+
CTZV R10, R10
286+
JMP index_cal
287+
288+
index_cal:
289+
SRLV $3, R10 // trailing-zero bit count / 8 = byte position inside the 64-bit element
290+
ADDV R11, R10 // plus the block and element offset accumulated in R11
291+
ADDV R10, R4 // R4 now points at the matching byte
292+
JMP found // found subtracts the base and returns

0 commit comments

Comments
 (0)