|
1 | 1 | using System;
|
2 | 2 | using System.Runtime.Intrinsics;
|
3 | 3 | using System.Runtime.Intrinsics.X86;
|
| 4 | +using System.Runtime.Intrinsics.Arm; |
4 | 5 | using System.Runtime.CompilerServices;
|
5 | 6 |
|
6 | 7 | namespace SimdUnicode
|
@@ -253,6 +254,8 @@ public static class UTF8
|
253 | 254 | Vector256<byte> shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21);
|
254 | 255 | prevInputBlock = currentBlock;
|
255 | 256 | Vector256<byte> prev1 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 1));
|
| 257 | + // Vector256.Shuffle vs Avx2.Shuffle |
| 258 | + // https://github.com/dotnet/runtime/blob/1400c1e7a888ea1e710e5c08d55c800e0b04bf8a/docs/coding-guidelines/vectorization-guidelines.md#vector256shuffle-vs-avx2shuffle |
256 | 259 | Vector256<byte> byte_1_high = Avx2.Shuffle(shuf1, Avx2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
|
257 | 260 | Vector256<byte> byte_1_low = Avx2.Shuffle(shuf2, (prev1 & v0f));
|
258 | 261 | Vector256<byte> byte_2_high = Avx2.Shuffle(shuf3, Avx2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
|
@@ -304,16 +307,171 @@ public static class UTF8
|
304 | 307 | return pInputBuffer + inputLength;
|
305 | 308 | }
|
306 | 309 |
|
/// <summary>
/// Validates a UTF-8 buffer with ARM64 AdvSimd instructions, 16 bytes per iteration,
/// after skipping any leading run of 64-byte all-ASCII chunks. Whatever the SIMD loop
/// does not cover is handed to <c>GetPointerToFirstInvalidByteScalar</c>.
/// </summary>
/// <param name="pInputBuffer">Pointer to the first byte to validate.</param>
/// <param name="inputLength">Number of bytes to validate.</param>
/// <returns>
/// <c>pInputBuffer + inputLength</c> when no error is found. When an error is found, the
/// value reported by <c>RewindAndValidateWithErrors</c> or
/// <c>GetPointerToFirstInvalidByteScalar</c> (presumably the first invalid byte; confirm
/// against those helpers). A null pointer or a non-positive length returns
/// <paramref name="pInputBuffer"/> unchanged.
/// </returns>
public unsafe static byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength)
{
    int processedLength = 0;
    // True when the SIMD loop stopped right after a block whose last bytes open a
    // multi-byte sequence that needs bytes located after that block.
    bool endsWithIncompleteSequence = false;

    if (pInputBuffer == null || inputLength <= 0)
    {
        return pInputBuffer;
    }
    if (inputLength > 128)
    {
        // We skip any ASCII characters at the start of the buffer, 64 bytes at a time.
        int asciirun = 0;
        for (; asciirun + 64 <= inputLength; asciirun += 64)
        {
            Vector128<byte> block1 = AdvSimd.LoadVector128(pInputBuffer + asciirun);
            Vector128<byte> block2 = AdvSimd.LoadVector128(pInputBuffer + asciirun + 16);
            Vector128<byte> block3 = AdvSimd.LoadVector128(pInputBuffer + asciirun + 32);
            Vector128<byte> block4 = AdvSimd.LoadVector128(pInputBuffer + asciirun + 48);
            Vector128<byte> or = AdvSimd.Or(AdvSimd.Or(block1, block2), AdvSimd.Or(block3, block4));
            // Any byte above 127 in the 64-byte chunk means it is not pure ASCII.
            if (AdvSimd.Arm64.MaxAcross(or).ToScalar() > 127)
            {
                break;
            }
        }
        processedLength = asciirun;

        if (processedLength + 32 < inputLength)
        {
            // We still have work to do!
            Vector128<byte> prevInputBlock = Vector128<byte>.Zero;

            // Per-lane thresholds: only the last three lanes can yield a non-zero
            // saturating difference, i.e. a lead byte too close to the end of the block.
            Vector128<byte> maxValue = Vector128.Create(
                    255, 255, 255, 255, 255, 255, 255, 255,
                    255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
            Vector128<byte> prevIncomplete = AdvSimd.SubtractSaturate(prevInputBlock, maxValue);

            // Lookup table indexed by the high nibble of the previous byte.
            Vector128<byte> shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
                    TOO_SHORT | OVERLONG_2,
                    TOO_SHORT,
                    TOO_SHORT | OVERLONG_3 | SURROGATE,
                    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);

            // Lookup table indexed by the low nibble of the previous byte.
            Vector128<byte> shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
                    CARRY | OVERLONG_2,
                    CARRY,
                    CARRY,
                    CARRY | TOO_LARGE,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000);

            // Lookup table indexed by the high nibble of the current byte.
            Vector128<byte> shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
                    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);

            Vector128<byte> thirdByte = Vector128.Create((byte)(0b11100000u - 0x80));
            Vector128<byte> fourthByte = Vector128.Create((byte)(0b11110000u - 0x80));
            Vector128<byte> v0f = Vector128.Create((byte)0x0F);
            Vector128<byte> v80 = Vector128.Create((byte)0x80);
            // Performance note: we could process 64 bytes at a time for better speed in some cases.
            for (; processedLength + 16 <= inputLength; processedLength += 16)
            {
                Vector128<byte> currentBlock = AdvSimd.LoadVector128(pInputBuffer + processedLength);

                // Every byte is <= 127 exactly when the block is pure ASCII.
                if (AdvSimd.Arm64.MaxAcross(currentBlock).ToScalar() <= 127)
                {
                    // We have an ASCII block, no need to process it, but
                    // we need to check if the previous block was incomplete.
                    if (AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0)
                    {
                        return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
                    }
                    prevIncomplete = Vector128<byte>.Zero;
                }
                else
                {
                    // Contains non-ASCII characters, we need to do non-trivial processing.
                    // prevN holds, for every lane, the byte located N positions earlier in the stream.
                    Vector128<byte> prev1 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 1));
                    Vector128<byte> byte_1_high = Vector128.Shuffle(shuf1, AdvSimd.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
                    Vector128<byte> byte_1_low = Vector128.Shuffle(shuf2, (prev1 & v0f));
                    Vector128<byte> byte_2_high = Vector128.Shuffle(shuf3, AdvSimd.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
                    Vector128<byte> sc = AdvSimd.And(AdvSimd.And(byte_1_high, byte_1_low), byte_2_high);
                    Vector128<byte> prev2 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 2));
                    Vector128<byte> prev3 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 3));
                    prevInputBlock = currentBlock;
                    // The high bit ends up set where a 3-byte (resp. 4-byte) lead sits two
                    // (resp. three) bytes back, i.e. where a continuation byte is required.
                    Vector128<byte> isThirdByte = AdvSimd.SubtractSaturate(prev2, thirdByte);
                    Vector128<byte> isFourthByte = AdvSimd.SubtractSaturate(prev3, fourthByte);
                    Vector128<byte> must23 = AdvSimd.Or(isThirdByte, isFourthByte);
                    Vector128<byte> must23As80 = AdvSimd.And(must23, v80);
                    Vector128<byte> error = AdvSimd.Xor(must23As80, sc);
                    if (AdvSimd.Arm64.MaxAcross(error).ToScalar() != 0)
                    {
                        return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
                    }
                    prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue);
                }
            }

            // The loop never looks past the last full block, so a sequence opened in the
            // final bytes of that block has not had its continuation bytes checked yet.
            endsWithIncompleteSequence = AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0;
        }
    }
    // We have processed all the blocks using SIMD, we need to process the remaining bytes.

    if (endsWithIncompleteSequence)
    {
        // Only the last three lanes of a block can be flagged as incomplete, and at least
        // one 16-byte block was processed, so stepping back three bytes stays in bounds and
        // guarantees the scalar pass below re-examines the unfinished sequence.
        processedLength -= 3;
    }

    // Process the remaining bytes with the scalar function
    if (processedLength < inputLength)
    {
        // We need to possibly backtrack to the start of the last code point
        // worst possible case is 4 bytes, where we need to backtrack 3 bytes
        // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
        // ((sbyte)b <= -65 is true exactly for continuation bytes 0x80..0xBF.)
        if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
        {
            processedLength -= 1;
            if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
            {
                processedLength -= 1;
                if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
                {
                    processedLength -= 1;
                }
            }
        }
        byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength);
        if (invalidBytePointer != pInputBuffer + inputLength)
        {
            // An invalid byte was found by the scalar function
            return invalidBytePointer;
        }
    }

    return pInputBuffer + inputLength;
}
/// <summary>
/// Entry point for UTF-8 validation: forwards to the ARM64 AdvSimd implementation when
/// that instruction set is supported, otherwise to the AVX2 implementation when AVX2 is
/// supported, and to the scalar implementation in every other case.
/// </summary>
/// <param name="pInputBuffer">Pointer to the first byte to validate.</param>
/// <param name="inputLength">Number of bytes to validate.</param>
/// <returns>The pointer returned by the selected implementation.</returns>
public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength)
{
    // TODO add support for other ISAs (an AVX-512 path guarded by
    // Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported, and an SSE2 path).
    if (AdvSimd.Arm64.IsSupported)
    {
        return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength);
    }
    else if (Avx2.IsSupported)
    {
        return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength);
    }
    else
    {
        return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength);
    }
}
|
318 | 476 |
|
319 | 477 | }
|
|
0 commit comments