Skip to content

Commit 40fd1f1

Browse files
committed
save game
1 parent e144a1b commit 40fd1f1

File tree

1 file changed

+77
-27
lines changed

1 file changed

+77
-27
lines changed

src/UTF8_validation.cs

Lines changed: 77 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
using System.Runtime.CompilerServices;
66
using System.Runtime.Intrinsics.Arm;
77
using static System.Net.Mime.MediaTypeNames;
8+
using System.Numerics;
9+
810

911
// C# already has something that is *more or less* equivalent to our C++ simd class:
1012
// Vector256 https://learn.microsoft.com/en-us/dotnet/api/system.runtime.intrinsics.vector256-1?view=net-7.0
@@ -117,6 +119,11 @@ public static unsafe class Utf8Utility
117119
// return string.Join(" ", binaryStrings);
118120
// }
119121

122+
public static byte* SIMDGetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength)
123+
{
124+
125+
}
126+
120127
// Returns a pointer to the first invalid byte in the input buffer if the input is invalid, or a pointer to the end of the buffer if the input is valid.
121128
// [MethodImpl(MethodImplOptions.AggressiveInlining)]
122129
public static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength)
@@ -194,34 +201,77 @@ public static unsafe class Utf8Utility
194201

195202
// CheckForGCCollections("After processed remaining bytes");
196203

197-
// | Method | FileName | Mean | Error | StdDev | Median | Allocated |
198-
// |---------------------------- |----------------------- |----------:|----------:|----------:|----------:|----------:|
199-
// | SIMDUtf8ValidationRealData | data/arabic.utf8.txt | 36.968 us | 0.4637 us | 0.4111 us | 37.018 us | 56 B |
200-
// | SIMDUtf8ValidationErrorData | data/arabic.utf8.txt | 31.851 us | 0.2252 us | 0.2107 us | 31.865 us | 56 B |
201-
// | SIMDUtf8ValidationRealData | data/chinese.utf8.txt | 10.541 us | 0.0870 us | 0.0814 us | 10.490 us | 56 B |
202-
// | SIMDUtf8ValidationErrorData | data/chinese.utf8.txt | 12.404 us | 0.2447 us | 0.2913 us | 12.355 us | 56 B |
203-
// | SIMDUtf8ValidationRealData | data/english.utf8.txt | 14.297 us | 0.2786 us | 0.4655 us | 14.225 us | 56 B |
204-
// | SIMDUtf8ValidationErrorData | data/english.utf8.txt | 14.207 us | 0.0272 us | 0.0241 us | 14.211 us | 56 B |
205-
// | SIMDUtf8ValidationRealData | data/french.utf8.txt | 23.993 us | 0.2287 us | 0.2140 us | 23.879 us | 56 B |
206-
// | SIMDUtf8ValidationErrorData | data/french.utf8.txt | 26.856 us | 0.5314 us | 1.4367 us | 27.005 us | 56 B |
207-
// | SIMDUtf8ValidationRealData | data/german.utf8.txt | 8.541 us | 0.1702 us | 0.2090 us | 8.456 us | 56 B |
208-
// | SIMDUtf8ValidationErrorData | data/german.utf8.txt | 8.728 us | 0.1618 us | 0.3938 us | 8.567 us | 56 B |
209-
// | SIMDUtf8ValidationRealData | data/japanese.utf8.txt | 11.108 us | 0.1767 us | 0.1653 us | 11.184 us | 56 B |
210-
// | SIMDUtf8ValidationErrorData | data/japanese.utf8.txt | 9.533 us | 0.1288 us | 0.1205 us | 9.490 us | 56 B |
211-
// | SIMDUtf8ValidationRealData | data/turkish.utf8.txt | 10.128 us | 0.0544 us | 0.0454 us | 10.126 us | 56 B |
212-
// | SIMDUtf8ValidationErrorData | data/turkish.utf8.txt | 10.078 us | 0.0499 us | 0.0467 us | 10.079 us | 56 B |
213-
204+
// | Method | FileName | Mean | Error | StdDev | Allocated |
205+
// |---------------------------- |----------------------- |----------:|----------:|----------:|----------:|
206+
// | SIMDUtf8ValidationRealData | data/arabic.utf8.txt | 31.509 us | 0.2234 us | 0.2089 us | - |
207+
// | SIMDUtf8ValidationErrorData | data/arabic.utf8.txt | 28.280 us | 0.2042 us | 0.1810 us | - |
208+
// | SIMDUtf8ValidationRealData | data/chinese.utf8.txt | 6.682 us | 0.0400 us | 0.0354 us | - |
209+
// | SIMDUtf8ValidationErrorData | data/chinese.utf8.txt | 6.750 us | 0.1294 us | 0.1080 us | - |
210+
// | SIMDUtf8ValidationRealData | data/english.utf8.txt | 9.291 us | 0.0345 us | 0.0323 us | - |
211+
// | SIMDUtf8ValidationErrorData | data/english.utf8.txt | 9.483 us | 0.0486 us | 0.0454 us | - |
212+
// | SIMDUtf8ValidationRealData | data/french.utf8.txt | 19.547 us | 0.3349 us | 0.3132 us | - |
213+
// | SIMDUtf8ValidationErrorData | data/french.utf8.txt | 18.264 us | 0.2890 us | 0.2703 us | - |
214+
// | SIMDUtf8ValidationRealData | data/german.utf8.txt | 4.972 us | 0.0402 us | 0.0357 us | - |
215+
// | SIMDUtf8ValidationErrorData | data/german.utf8.txt | 4.936 us | 0.0468 us | 0.0438 us | - |
216+
// | SIMDUtf8ValidationRealData | data/japanese.utf8.txt | 6.039 us | 0.0680 us | 0.0636 us | - |
217+
// | SIMDUtf8ValidationErrorData | data/japanese.utf8.txt | 5.683 us | 0.0970 us | 0.0907 us | - |
218+
// | SIMDUtf8ValidationRealData | data/turkish.utf8.txt | 6.054 us | 0.1161 us | 0.1627 us | - |
219+
// | SIMDUtf8ValidationErrorData | data/turkish.utf8.txt | 5.909 us | 0.0483 us | 0.0452 us | - |
214220
// scalar results:
215-
if (processedLength < inputLength)
216-
{
217-
byte* invalidBytePointer = UTF8.RewindAndValidateWithErrors(pInputBuffer + processedLength, inputLength - processedLength);
218-
if (invalidBytePointer != pInputBuffer + inputLength)
219-
{
220-
// An invalid byte was found. Adjust error handling as needed.
221-
error = Vector256.Create((byte)1);
222-
}
223-
processedLength += (int)(invalidBytePointer - (pInputBuffer + processedLength));
224-
}
221+
// if (processedLength < inputLength)
222+
// {
223+
// byte* invalidBytePointer = UTF8.RewindAndValidateWithErrors(pInputBuffer + processedLength, inputLength - processedLength);
224+
// // This makes little difference
225+
// if (invalidBytePointer != pInputBuffer + inputLength)
226+
// {
227+
// // An invalid byte was found. Adjust error handling as needed.
228+
// error = Vector256.Create((byte)1);
229+
// }
230+
// processedLength += (int)(invalidBytePointer - (pInputBuffer + processedLength));
231+
// }
232+
233+
234+
// ThreadStaticAttribute approach is buggy
235+
// if (processedLength < inputLength)
236+
// {
237+
238+
// // int mask = Avx2.MoveMask(prev_incomplete.AsSByte());
239+
// // int index = BitOperations.TrailingZeroCount(mask);
240+
241+
242+
// // byte* invalidBytePointer = UTF8.RewindAndValidateWithErrors(pInputBuffer + processedLength, inputLength - processedLength);
243+
// // // This makes little difference
244+
// // if (invalidBytePointer != pInputBuffer + inputLength)
245+
// // {
246+
// // // An invalid byte was found. Adjust error handling as needed.
247+
// // error = Vector256.Create((byte)1);
248+
// // }
249+
250+
// // Find the position of the first set bit in incompleteMask, indicating the start of an incomplete sequence.
251+
// int incompleteMask = Avx2.MoveMask(prev_incomplete.AsSByte());
252+
// int firstIncompletePos = BitOperations.LeadingZeroCount((uint)incompleteMask);
253+
254+
// // Calculate the pointer adjustment based on the position of the incomplete sequence.
255+
// byte* startPtrForScalarValidation = pInputBuffer + processedLength + firstIncompletePos;
256+
257+
// // Ensure startPtrForScalarValidation does not precede pInputBuffer.
258+
// // startPtrForScalarValidation = Math.Max(pInputBuffer, startPtrForScalarValidation);
259+
260+
// // Now, ensure startPtrForScalarValidation points to a leading byte by backtracking if it's pointing to a continuation byte.
261+
// // while (startPtrForScalarValidation > pInputBuffer && (*startPtrForScalarValidation & 0xC0) == 0x80) {
262+
// // startPtrForScalarValidation--;
263+
// // }
264+
265+
// // Invoke scalar validation from the identified leading byte position.
266+
// byte* invalidBytePointer = UTF8.GetPointerToFirstInvalidByte(startPtrForScalarValidation, inputLength - (int)(startPtrForScalarValidation - pInputBuffer));
267+
// if (invalidBytePointer != pInputBuffer + inputLength)
268+
// {
269+
// // An invalid byte was found. Adjust error handling as needed.
270+
// error = Vector256.Create((byte)1);
271+
// }
272+
// processedLength += (int)(invalidBytePointer - (pInputBuffer + processedLength));
273+
// }
274+
225275

226276

227277
// | Method | FileName | Mean | Error | StdDev | Allocated |

0 commit comments

Comments
 (0)