|
5 | 5 | using System.Runtime.CompilerServices;
|
6 | 6 | using System.Runtime.Intrinsics.Arm;
|
7 | 7 | using static System.Net.Mime.MediaTypeNames;
|
| 8 | +using System.Numerics; |
| 9 | + |
8 | 10 |
|
9 | 11 | // C# already has something that is *more or less* equivalent to our C++ simd class:
|
10 | 12 | // Vector256 https://learn.microsoft.com/en-us/dotnet/api/system.runtime.intrinsics.vector256-1?view=net-7.0
|
@@ -117,6 +119,11 @@ public static unsafe class Utf8Utility
|
117 | 119 | // return string.Join(" ", binaryStrings);
|
118 | 120 | // }
|
119 | 121 |
|
/// <summary>
/// Entry point reserved for the SIMD UTF-8 validator.
/// </summary>
/// <param name="pInputBuffer">Pointer to the first byte of the buffer to validate.</param>
/// <param name="inputLength">Number of bytes to validate, starting at <paramref name="pInputBuffer"/>.</param>
/// <returns>
/// Whatever <see cref="GetPointerToFirstInvalidByte"/> returns for the same arguments; that
/// routine is documented in this file as returning a pointer to the first invalid byte, or a
/// pointer to the end of the buffer when the input is valid.
/// </returns>
public static byte* SIMDGetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength)
{
    // The original body was empty, which does not compile for a method returning byte*
    // (CS0161: not all code paths return a value). Until a dedicated SIMD implementation
    // lands here, forward to the existing validator so callers get a usable result.
    // NOTE(review): assumes GetPointerToFirstInvalidByte never calls back into this method - confirm.
    return GetPointerToFirstInvalidByte(pInputBuffer, inputLength);
}
| 126 | + |
120 | 127 | // Returns a pointer to the first invalid byte in the input buffer if it's invalid, or a pointer to the end if it's valid.
|
121 | 128 | // [MethodImpl(MethodImplOptions.AggressiveInlining)]
|
122 | 129 | public static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength)
|
@@ -194,34 +201,77 @@ public static unsafe class Utf8Utility
|
194 | 201 |
|
195 | 202 | // CheckForGCCollections("After processed remaining bytes");
|
196 | 203 |
|
197 |
| - // | Method | FileName | Mean | Error | StdDev | Median | Allocated | |
198 |
| - // |---------------------------- |----------------------- |----------:|----------:|----------:|----------:|----------:| |
199 |
| - // | SIMDUtf8ValidationRealData | data/arabic.utf8.txt | 36.968 us | 0.4637 us | 0.4111 us | 37.018 us | 56 B | |
200 |
| - // | SIMDUtf8ValidationErrorData | data/arabic.utf8.txt | 31.851 us | 0.2252 us | 0.2107 us | 31.865 us | 56 B | |
201 |
| - // | SIMDUtf8ValidationRealData | data/chinese.utf8.txt | 10.541 us | 0.0870 us | 0.0814 us | 10.490 us | 56 B | |
202 |
| - // | SIMDUtf8ValidationErrorData | data/chinese.utf8.txt | 12.404 us | 0.2447 us | 0.2913 us | 12.355 us | 56 B | |
203 |
| - // | SIMDUtf8ValidationRealData | data/english.utf8.txt | 14.297 us | 0.2786 us | 0.4655 us | 14.225 us | 56 B | |
204 |
| - // | SIMDUtf8ValidationErrorData | data/english.utf8.txt | 14.207 us | 0.0272 us | 0.0241 us | 14.211 us | 56 B | |
205 |
| - // | SIMDUtf8ValidationRealData | data/french.utf8.txt | 23.993 us | 0.2287 us | 0.2140 us | 23.879 us | 56 B | |
206 |
| - // | SIMDUtf8ValidationErrorData | data/french.utf8.txt | 26.856 us | 0.5314 us | 1.4367 us | 27.005 us | 56 B | |
207 |
| - // | SIMDUtf8ValidationRealData | data/german.utf8.txt | 8.541 us | 0.1702 us | 0.2090 us | 8.456 us | 56 B | |
208 |
| - // | SIMDUtf8ValidationErrorData | data/german.utf8.txt | 8.728 us | 0.1618 us | 0.3938 us | 8.567 us | 56 B | |
209 |
| - // | SIMDUtf8ValidationRealData | data/japanese.utf8.txt | 11.108 us | 0.1767 us | 0.1653 us | 11.184 us | 56 B | |
210 |
| - // | SIMDUtf8ValidationErrorData | data/japanese.utf8.txt | 9.533 us | 0.1288 us | 0.1205 us | 9.490 us | 56 B | |
211 |
| - // | SIMDUtf8ValidationRealData | data/turkish.utf8.txt | 10.128 us | 0.0544 us | 0.0454 us | 10.126 us | 56 B | |
212 |
| - // | SIMDUtf8ValidationErrorData | data/turkish.utf8.txt | 10.078 us | 0.0499 us | 0.0467 us | 10.079 us | 56 B | |
213 |
| - |
| 204 | +// | Method | FileName | Mean | Error | StdDev | Allocated | |
| 205 | +// |---------------------------- |----------------------- |----------:|----------:|----------:|----------:| |
| 206 | +// | SIMDUtf8ValidationRealData | data/arabic.utf8.txt | 31.509 us | 0.2234 us | 0.2089 us | - | |
| 207 | +// | SIMDUtf8ValidationErrorData | data/arabic.utf8.txt | 28.280 us | 0.2042 us | 0.1810 us | - | |
| 208 | +// | SIMDUtf8ValidationRealData | data/chinese.utf8.txt | 6.682 us | 0.0400 us | 0.0354 us | - | |
| 209 | +// | SIMDUtf8ValidationErrorData | data/chinese.utf8.txt | 6.750 us | 0.1294 us | 0.1080 us | - | |
| 210 | +// | SIMDUtf8ValidationRealData | data/english.utf8.txt | 9.291 us | 0.0345 us | 0.0323 us | - | |
| 211 | +// | SIMDUtf8ValidationErrorData | data/english.utf8.txt | 9.483 us | 0.0486 us | 0.0454 us | - | |
| 212 | +// | SIMDUtf8ValidationRealData | data/french.utf8.txt | 19.547 us | 0.3349 us | 0.3132 us | - | |
| 213 | +// | SIMDUtf8ValidationErrorData | data/french.utf8.txt | 18.264 us | 0.2890 us | 0.2703 us | - | |
| 214 | +// | SIMDUtf8ValidationRealData | data/german.utf8.txt | 4.972 us | 0.0402 us | 0.0357 us | - | |
| 215 | +// | SIMDUtf8ValidationErrorData | data/german.utf8.txt | 4.936 us | 0.0468 us | 0.0438 us | - | |
| 216 | +// | SIMDUtf8ValidationRealData | data/japanese.utf8.txt | 6.039 us | 0.0680 us | 0.0636 us | - | |
| 217 | +// | SIMDUtf8ValidationErrorData | data/japanese.utf8.txt | 5.683 us | 0.0970 us | 0.0907 us | - | |
| 218 | +// | SIMDUtf8ValidationRealData | data/turkish.utf8.txt | 6.054 us | 0.1161 us | 0.1627 us | - | |
| 219 | +// | SIMDUtf8ValidationErrorData | data/turkish.utf8.txt | 5.909 us | 0.0483 us | 0.0452 us | - | |
214 | 220 | // scalar results:
|
215 |
| - if (processedLength < inputLength) |
216 |
| - { |
217 |
| - byte* invalidBytePointer = UTF8.RewindAndValidateWithErrors(pInputBuffer + processedLength, inputLength - processedLength); |
218 |
| - if (invalidBytePointer != pInputBuffer + inputLength) |
219 |
| - { |
220 |
| - // An invalid byte was found. Adjust error handling as needed. |
221 |
| - error = Vector256.Create((byte)1); |
222 |
| - } |
223 |
| - processedLength += (int)(invalidBytePointer - (pInputBuffer + processedLength)); |
224 |
| - } |
| 221 | + // if (processedLength < inputLength) |
| 222 | + // { |
| 223 | + // byte* invalidBytePointer = UTF8.RewindAndValidateWithErrors(pInputBuffer + processedLength, inputLength - processedLength); |
| 224 | + // // This makes little difference |
| 225 | + // if (invalidBytePointer != pInputBuffer + inputLength) |
| 226 | + // { |
| 227 | + // // An invalid byte was found. Adjust error handling as needed. |
| 228 | + // error = Vector256.Create((byte)1); |
| 229 | + // } |
| 230 | + // processedLength += (int)(invalidBytePointer - (pInputBuffer + processedLength)); |
| 231 | + // } |
| 232 | + |
| 233 | + |
| 234 | +// ThreadStaticAttribute approach is buggy |
| 235 | + // if (processedLength < inputLength) |
| 236 | + // { |
| 237 | + |
| 238 | + // // int mask = Avx2.MoveMask(prev_incomplete.AsSByte()); |
| 239 | + // // int index = BitOperations.TrailingZeroCount(mask); |
| 240 | + |
| 241 | + |
| 242 | + // // byte* invalidBytePointer = UTF8.RewindAndValidateWithErrors(pInputBuffer + processedLength, inputLength - processedLength); |
| 243 | + // // // This makes little difference |
| 244 | + // // if (invalidBytePointer != pInputBuffer + inputLength) |
| 245 | + // // { |
| 246 | + // // // An invalid byte was found. Adjust error handling as needed. |
| 247 | + // // error = Vector256.Create((byte)1); |
| 248 | + // // } |
| 249 | + |
| 250 | + // // Find the position of the first set bit in incompleteMask, indicating the start of an incomplete sequence. |
| 251 | + // int incompleteMask = Avx2.MoveMask(prev_incomplete.AsSByte()); |
| 252 | + // int firstIncompletePos = BitOperations.LeadingZeroCount((uint)incompleteMask); |
| 253 | + |
| 254 | + // // Calculate the pointer adjustment based on the position of the incomplete sequence. |
| 255 | + // byte* startPtrForScalarValidation = pInputBuffer + processedLength + firstIncompletePos; |
| 256 | + |
| 257 | + // // Ensure startPtrForScalarValidation does not precede pInputBuffer. |
| 258 | + // // startPtrForScalarValidation = Math.Max(pInputBuffer, startPtrForScalarValidation); |
| 259 | + |
| 260 | + // // Now, ensure startPtrForScalarValidation points to a leading byte by backtracking if it's pointing to a continuation byte. |
| 261 | + // // while (startPtrForScalarValidation > pInputBuffer && (*startPtrForScalarValidation & 0xC0) == 0x80) { |
| 262 | + // // startPtrForScalarValidation--; |
| 263 | + // // } |
| 264 | + |
| 265 | + // // Invoke scalar validation from the identified leading byte position. |
| 266 | + // byte* invalidBytePointer = UTF8.GetPointerToFirstInvalidByte(startPtrForScalarValidation, inputLength - (int)(startPtrForScalarValidation - pInputBuffer)); |
| 267 | + // if (invalidBytePointer != pInputBuffer + inputLength) |
| 268 | + // { |
| 269 | + // // An invalid byte was found. Adjust error handling as needed. |
| 270 | + // error = Vector256.Create((byte)1); |
| 271 | + // } |
| 272 | + // processedLength += (int)(invalidBytePointer - (pInputBuffer + processedLength)); |
| 273 | + // } |
| 274 | + |
225 | 275 |
|
226 | 276 |
|
227 | 277 | // | Method | FileName | Mean | Error | StdDev | Allocated |
|
|
0 commit comments