From 1cce4a6441fb6617495ed5ea336974a160254ed9 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 25 Jun 2024 19:03:13 +0000 Subject: [PATCH 1/3] trying to reduce the cost of the 4-byte char --- src/UTF8.cs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index 2f4d1f3..a0cf99f 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -1352,6 +1352,8 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust Vector128 fourthByte = Vector128.Create((byte)(0b11110000u - 0x80)); Vector128 v0f = Vector128.Create((byte)0x0F); Vector128 v80 = Vector128.Create((byte)0x80); + Vector128 fourthByteMinusOne = Vector128.Create((byte)(0b11110000u - 1)); + Vector128 largestcont = Vector128.Create((sbyte)-65); // -65 => 0b10111111 // Performance note: we could process 64 bytes at a time for better speed in some cases. int start_point = processedLength; @@ -1457,15 +1459,15 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust return invalidBytePointer; } prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue); - Vector128 largestcont = Vector128.Create((sbyte)-65); // -65 => 0b10111111 contbytes += -AdvSimd.Arm64.AddAcross(AdvSimd.CompareLessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont)).ToScalar(); - - // computing n4 is more expensive than we would like: - Vector128 fourthByteMinusOne = Vector128.Create((byte)(0b11110000u - 1)); Vector128 largerthan0f = AdvSimd.CompareGreaterThan(currentBlock, fourthByteMinusOne); - byte n4add = (byte)AdvSimd.Arm64.AddAcross(largerthan0f).ToScalar(); - int negn4add = (int)(byte)-n4add; - n4 += negn4add; + ulong n4marker = AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(largerthan0f)).ToScalar(); + if (n4marker != 0) + { + byte n4add = (byte)AdvSimd.Arm64.AddAcross(largerthan0f).ToScalar(); + int negn4add = (int)(byte)-n4add; + n4 += negn4add; + } } } bool hasIncompete = 
AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(prevIncomplete)).ToScalar() != 0; From 7bdac733ac1e2197fa0762ee7213314d3ef59214 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Tue, 25 Jun 2024 19:45:58 +0000 Subject: [PATCH 2/3] adding results --- README.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/README.md b/README.md index 1068010..f97e54f 100644 --- a/README.md +++ b/README.md @@ -178,6 +178,39 @@ boost as the Neoverse V1. | Latin-Lipsum | 50 | 20 | 2.5 x | | Russian-Lipsum | 4.0 | 1.2 | 3.3 x | + +On a Neoverse V2 (Graviton 4), our validation function is 1.3 to over four times +faster than the standard library. + +| data set | SimdUnicode speed (GB/s) | .NET speed (GB/s) | speed up | +|:----------------|:-----------|:--------------------------|:-------------------| +| Twitter.json | 12 | 8.7 | 1.4 x | +| Arabic-Lipsum | 3.4 | 2.0 | 1.7 x | +| Chinese-Lipsum | 3.4 | 2.6 | 1.3 x | +| Emoji-Lipsum | 3.4 | 0.8 | 4.3 x | +| Hebrew-Lipsum | 3.4 | 2.0 | 1.7 x | +| Hindi-Lipsum | 3.4 | 1.6 | 2.1 x | +| Japanese-Lipsum | 3.4 | 2.4  | 1.4 x | +| Korean-Lipsum | 3.4 | 1.3 | 2.6 x | +| Latin-Lipsum | 42 | 17 | 2.5 x | +| Russian-Lipsum | 3.3 | 0.95 | 3.5 x | + +On a Neoverse N1 (Graviton 2), our validation function is up to three times +faster than the standard library. + +| data set | SimdUnicode speed (GB/s) | .NET speed (GB/s) | speed up | +|:----------------|:-----------|:--------------------------|:-------------------| +| Twitter.json | 7.0 | 5.7 | 1.2 x | +| Arabic-Lipsum | 2.2 | 0.9 | 2.4 x | +| Chinese-Lipsum | 2.1 | 1.8 | 1.1 x | +| Emoji-Lipsum | 1.8 | 0.7 | 2.6 x | +| Hebrew-Lipsum | 2.0 | 0.9 | 2.2 x | +| Hindi-Lipsum | 2.0 | 1.0 | 2.0 x | +| Japanese-Lipsum | 2.1 | 1.7  | 1.2 x | +| Korean-Lipsum | 2.2 | 1.0 | 2.2 x | +| Latin-Lipsum | 24 | 13 | 1.8 x | +| Russian-Lipsum | 2.1 | 0.7 | 3.0 x | + One difficulty with ARM processors is that they have varied SIMD/NEON performance.
For example, Neoverse N1 processors, not to be confused with the Neoverse V1 design used by AWS Graviton 3, have weak SIMD performance. Of course, one can pick and choose which approach is best and it is not necessary to apply SimdUnicode is all cases. We expect good performance on recent ARM-based Qualcomm processors. ## Building the library From 906201f2817d394e7dd377d3abb0060753f72c69 Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Tue, 25 Jun 2024 16:58:33 -0400 Subject: [PATCH 3/3] making some of the arm code prettier --- src/UTF8.cs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/UTF8.cs b/src/UTF8.cs index a0cf99f..c23c099 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -1364,13 +1364,13 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust { Vector128 currentBlock = AdvSimd.LoadVector128(pInputBuffer + processedLength); - if (AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(AdvSimd.And(currentBlock, v80))).ToScalar() == 0) + if ((currentBlock & v80) == Vector128.Zero) // We could also use (AdvSimd.Arm64.MaxAcross(currentBlock).ToScalar() <= 127) but it is slower on some // hardware. { // We have an ASCII block, no need to process it, but // we need to check if the previous block was incomplete. - if (AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0) + if (prevIncomplete != Vector128.Zero) { int off = processedLength >= 3 ? 
processedLength - 3 : processedLength; byte* invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(16 - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); @@ -1404,7 +1404,7 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust Vector128 block4 = AdvSimd.LoadVector128(pInputBuffer + processedLength + localasciirun + 48); Vector128 or = AdvSimd.Or(AdvSimd.Or(block1, block2), AdvSimd.Or(block3, block4)); - if (AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(AdvSimd.And(or, v80))).ToScalar() != 0) + if ((or & v80) != Vector128.Zero) { break; } @@ -1435,7 +1435,7 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust // AdvSimd.Arm64.MaxAcross(error) works, but it might be slower // than AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(error)) on some // hardware: - if (AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(error)).ToScalar() != 0) + if (error != Vector128.Zero) { byte* invalidBytePointer; if (processedLength == 0) @@ -1461,8 +1461,7 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue); contbytes += -AdvSimd.Arm64.AddAcross(AdvSimd.CompareLessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont)).ToScalar(); Vector128 largerthan0f = AdvSimd.CompareGreaterThan(currentBlock, fourthByteMinusOne); - ulong n4marker = AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(largerthan0f)).ToScalar(); - if (n4marker != 0) + if (largerthan0f != Vector128.Zero) { byte n4add = (byte)AdvSimd.Arm64.AddAcross(largerthan0f).ToScalar(); int negn4add = (int)(byte)-n4add; @@ -1470,7 +1469,7 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust } } } - bool hasIncompete = AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(prevIncomplete)).ToScalar() != 0; + bool hasIncompete = (prevIncomplete != Vector128.Zero); if (processedLength < inputLength || hasIncompete) { byte* 
invalidBytePointer;