|
5 | 5 | #include "go_asm.h" |
6 | 6 | #include "textflag.h" |
7 | 7 |
|
// func IndexByte(b []byte, c byte) int
//
// input:
//   R4 = b_base
//   R5 = b_len
//   R6 = b_cap (unused)
//   R7 = byte to find
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
	AND	$0xff, R7, R7		// keep only the low 8 bits of c
	JMP	indexbytebody<>(SB)	// tail-jump to the shared scanner
| 16 | + |
// func IndexByteString(s string, c byte) int
//
// input:
//   R4 = s_base
//   R5 = s_len
//   R6 = byte to find
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
	MOVV	R6, R7			// shared scanner expects the byte in R7
	AND	$0xff, R7		// keep only the low 8 bits of c
	JMP	indexbytebody<>(SB)	// tail-jump to the shared scanner
| 24 | + |
// indexbytebody is the scan loop shared by IndexByte and IndexByteString
// (both normalize their arguments and JMP here).
//
// input:
//   R4: b_base (pointer to the first byte to scan)
//   R5: len    (number of bytes to scan)
//   R7: byte to find (callers mask it to 8 bits)
// output:
//   R4: index of the first occurrence of the byte, or -1 if not found
//
// Register roles below:
//   R4 = cursor, R5 = bytes remaining, R6 = saved base,
//   R8 = end pointer, R9 = loop threshold, R10/R11 = scratch.
TEXT indexbytebody<>(SB),NOSPLIT,$0
	BEQ	R5, notfound	// len == 0: nothing to scan

	MOVV	R4, R6		// store base for later (index = cursor - base)
	ADDV	R4, R5, R8	// end = base + len

	MOVV	$32, R9
	BGE	R5, R9, lasx	// >= 32 bytes: try the vector paths
tail:
	// Scalar path: 8 bytes per iteration, then byte-by-byte.
	MOVV	$8, R9
	BLT	R5, R9, lt_8
generic8_loop:
	MOVV	(R4), R10	// load 8 bytes; byte k sits in bits [8k, 8k+7]

	AND	$0xff, R10, R11			// byte 0
	BEQ	R7, R11, found

	BSTRPICKV	$15, R10, $8, R11	// byte 1 (bits 8..15)
	BEQ	R7, R11, byte_1th

	BSTRPICKV	$23, R10, $16, R11	// byte 2
	BEQ	R7, R11, byte_2th

	BSTRPICKV	$31, R10, $24, R11	// byte 3
	BEQ	R7, R11, byte_3th

	BSTRPICKV	$39, R10, $32, R11	// byte 4
	BEQ	R7, R11, byte_4th

	BSTRPICKV	$47, R10, $40, R11	// byte 5
	BEQ	R7, R11, byte_5th

	BSTRPICKV	$55, R10, $48, R11	// byte 6
	BEQ	R7, R11, byte_6th

	BSTRPICKV	$63, R10, $56, R11	// byte 7
	BEQ	R7, R11, byte_7th

	ADDV	$8, R4
	ADDV	$-8, R5
	BGE	R5, R9, generic8_loop	// R9 is still 8 here

lt_8:
	// Fewer than 8 bytes left: linear scan up to the end pointer.
	BEQ	R4, R8, notfound
	MOVBU	(R4), R10
	BEQ	R7, R10, found
	ADDV	$1, R4
	JMP	lt_8

// byte_Nth: match is N bytes past the cursor; return cursor+N-base.
byte_1th:
	ADDV	$1, R4
	SUBV	R6, R4
	RET

byte_2th:
	ADDV	$2, R4
	SUBV	R6, R4
	RET

byte_3th:
	ADDV	$3, R4
	SUBV	R6, R4
	RET

byte_4th:
	ADDV	$4, R4
	SUBV	R6, R4
	RET

byte_5th:
	ADDV	$5, R4
	SUBV	R6, R4
	RET

byte_6th:
	ADDV	$6, R4
	SUBV	R6, R4
	RET

byte_7th:
	ADDV	$7, R4
	// falls through to found

found:
	SUBV	R6, R4		// index = cursor - base
	RET

notfound:
	MOVV	$-1, R4
	RET

lasx:
	// 256-bit LASX path (reached with len >= 32).
	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R9
	BEQ	R9, lsx			// no LASX: try the 128-bit LSX path
	XVMOVQ	R7, X0.B32		// broadcast target byte to all 32 lanes of X0

	MOVV	$128, R9
	BLT	R5, R9, lasx32_loop
lasx128_loop:
	// Compare 128 bytes per iteration.
	XVMOVQ	0(R4), X1
	XVMOVQ	32(R4), X2
	XVMOVQ	64(R4), X3
	XVMOVQ	96(R4), X4

	XVSEQB	X1, X0, X1	// matching lanes become 0xff, others 0
	XVSETNEV	X1, FCC0	// FCC0 = 1 if any lane is non-zero
	BFPT	lasx_found_add_0

	XVSEQB	X2, X0, X1
	XVSETNEV	X1, FCC0
	BFPT	lasx_found_add_32

	XVSEQB	X3, X0, X1
	XVSETNEV	X1, FCC0
	BFPT	lasx_found_add_64

	XVSEQB	X4, X0, X1
	XVSETNEV	X1, FCC0
	BFPT	lasx_found_add_96

	ADDV	$128, R4
	ADDV	$-128, R5
	BGE	R5, R9, lasx128_loop

	BEQ	R5, notfound	// consumed everything

	MOVV	$32, R9
	BLT	R5, R9, tail
lasx32_loop:
	// NOTE(review): when entered directly from the BLT above, R9 is still
	// 128, so this body runs only once and lengths in [64,128) finish in
	// the scalar tail — confirm this is the intended trade-off.
	XVMOVQ	0(R4), X1

	XVSEQB	X1, X0, X1
	XVSETNEV	X1, FCC0
	BFPT	lasx_found_add_0

	ADDV	$32, R4
	ADDV	$-32, R5
	BGE	R5, R9, lasx32_loop

	BEQ	R5, notfound

	JMP	tail

// lasx_found_add_N: match is in the 32-byte vector loaded from offset N
// of the current group; set R11 = N, then locate the byte within X1.
lasx_found_add_0:
	MOVV	R0, R11
	JMP	lasx_index_cal

lasx_found_add_32:
	MOVV	$32, R11
	JMP	lasx_index_cal

lasx_found_add_64:
	MOVV	$64, R11
	JMP	lasx_index_cal

lasx_found_add_96:
	MOVV	$96, R11
	JMP	lasx_index_cal

lasx_index_cal:
	// Scan the four 64-bit lanes of the match mask. CTZV of an all-zero
	// lane (no match) yields 64; otherwise the bit index of the match.
	MOVV	$64, R9
	XVMOVQ	X1.V[0], R10
	CTZV	R10, R10
	BNE	R10, R9, index_cal
	ADDV	$8, R11		// no match in this lane: skip its 8 bytes

	XVMOVQ	X1.V[1], R10
	CTZV	R10, R10
	BNE	R10, R9, index_cal
	ADDV	$8, R11

	XVMOVQ	X1.V[2], R10
	CTZV	R10, R10
	BNE	R10, R9, index_cal
	ADDV	$8, R11

	XVMOVQ	X1.V[3], R10
	CTZV	R10, R10
	JMP	index_cal	// match must be in the last lane by now

lsx:
	// 128-bit LSX path (reached with len >= 32 and no LASX).
	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R9
	BEQ	R9, tail	// no SIMD at all: scalar path
	VMOVQ	R7, V0.B16	// broadcast target byte to all 16 lanes of V0

	MOVV	$64, R9
	BLT	R5, R9, lsx16_loop
lsx64_loop:
	// Compare 64 bytes per iteration.
	VMOVQ	0(R4), V1
	VMOVQ	16(R4), V2
	VMOVQ	32(R4), V3
	VMOVQ	48(R4), V4

	VSEQB	V1, V0, V1	// matching lanes become 0xff, others 0
	VSETNEV	V1, FCC0	// FCC0 = 1 if any lane is non-zero
	BFPT	lsx_found_add_0

	VSEQB	V2, V0, V1
	VSETNEV	V1, FCC0
	BFPT	lsx_found_add_16

	VSEQB	V3, V0, V1
	VSETNEV	V1, FCC0
	BFPT	lsx_found_add_32

	VSEQB	V4, V0, V1
	VSETNEV	V1, FCC0
	BFPT	lsx_found_add_48

	ADDV	$64, R4
	ADDV	$-64, R5
	BGE	R5, R9, lsx64_loop

	BEQ	R5, notfound	// consumed everything

	MOVV	$16, R9
	BLT	R5, R9, tail
lsx16_loop:
	// NOTE(review): as with lasx32_loop, R9 is still 64 when entered
	// from the BLT above, so this body runs only once in that case.
	VMOVQ	0(R4), V1

	VSEQB	V1, V0, V1
	VSETNEV	V1, FCC0
	BFPT	lsx_found_add_0

	ADDV	$16, R4
	ADDV	$-16, R5
	BGE	R5, R9, lsx16_loop

	BEQ	R5, notfound

	JMP	tail

// lsx_found_add_N: match is in the 16-byte vector loaded from offset N
// of the current group; set R11 = N, then locate the byte within V1.
lsx_found_add_0:
	MOVV	R0, R11
	JMP	lsx_index_cal

lsx_found_add_16:
	MOVV	$16, R11
	JMP	lsx_index_cal

lsx_found_add_32:
	MOVV	$32, R11
	JMP	lsx_index_cal

lsx_found_add_48:
	MOVV	$48, R11
	JMP	lsx_index_cal

lsx_index_cal:
	// Same lane scan as lasx_index_cal, but only two 64-bit lanes.
	MOVV	$64, R9

	VMOVQ	V1.V[0], R10
	CTZV	R10, R10
	BNE	R10, R9, index_cal
	ADDV	$8, R11		// no match in this lane: skip its 8 bytes

	VMOVQ	V1.V[1], R10
	CTZV	R10, R10
	JMP	index_cal	// match must be in the last lane by now

index_cal:
	// R10 = bit index of the first matching byte's low bit in its lane,
	// R11 = byte offset of that lane within the current group.
	SRLV	$3, R10		// bit index -> byte index within the lane
	ADDV	R11, R10	// + lane offset = byte offset from cursor
	ADDV	R10, R4		// cursor now points at the matching byte
	JMP	found		// found computes cursor - base
0 commit comments