12
12
// Vector256 https://learn.microsoft.com/en-us/dotnet/api/system.runtime.intrinsics.vector256-1?view=net-7.0
13
13
// I extend it as needed
14
14
15
- // non-static benchmarks
16
- // | Method | FileName | Mean | Error | StdDev | Median | Allocated |
17
- // |---------------------------- |----------------------- |-----------:|----------:|----------:|-----------:|----------:|
18
- // | SIMDUtf8ValidationRealData | data/arabic.utf8.txt | 419.461 us | 4.7151 us | 4.4105 us | 420.020 us | - |
19
- // | SIMDUtf8ValidationErrorData | data/arabic.utf8.txt | 268.504 us | 2.5139 us | 2.2285 us | 267.491 us | - |
20
- // | SIMDUtf8ValidationRealData | data/chinese.utf8.txt | 113.877 us | 2.2331 us | 3.4101 us | 113.649 us | - |
21
- // | SIMDUtf8ValidationErrorData | data/chinese.utf8.txt | 16.100 us | 0.3168 us | 0.3648 us | 16.059 us | - |
22
- // | SIMDUtf8ValidationRealData | data/english.utf8.txt | 11.170 us | 0.1277 us | 0.1132 us | 11.130 us | - |
23
- // | SIMDUtf8ValidationErrorData | data/english.utf8.txt | 11.010 us | 0.1023 us | 0.0957 us | 11.007 us | - |
24
- // | SIMDUtf8ValidationRealData | data/french.utf8.txt | 12.987 us | 0.1030 us | 0.0963 us | 12.980 us | - |
25
- // | SIMDUtf8ValidationErrorData | data/french.utf8.txt | 12.786 us | 0.1989 us | 0.1860 us | 12.824 us | - |
26
- // | SIMDUtf8ValidationRealData | data/german.utf8.txt | 100.692 us | 2.0088 us | 5.2921 us | 102.429 us | - |
27
- // | SIMDUtf8ValidationErrorData | data/german.utf8.txt | 33.260 us | 0.4813 us | 0.4502 us | 33.186 us | - |
28
- // | SIMDUtf8ValidationRealData | data/japanese.utf8.txt | 134.439 us | 1.0321 us | 0.9149 us | 134.324 us | - |
29
- // | SIMDUtf8ValidationErrorData | data/japanese.utf8.txt | 65.396 us | 1.2923 us | 1.1456 us | 65.504 us | - |
30
- // | SIMDUtf8ValidationRealData | data/turkish.utf8.txt | 5.519 us | 0.0311 us | 0.0275 us | 5.517 us | - |
31
- // | SIMDUtf8ValidationErrorData | data/turkish.utf8.txt | 5.470 us | 0.0270 us | 0.0253 us | 5.466 us | - |
32
-
33
15
34
16
// | Method | N | Mean | Error | StdDev | Gen0 | Allocated |
35
17
// |---------------------------- |----- |-----------:|----------:|----------:|-------:|----------:|
@@ -159,7 +141,7 @@ public static unsafe class Utf8Utility
159
141
public static byte * GetPointerToFirstInvalidByte ( byte * pInputBuffer , int inputLength )
160
142
{
161
143
162
- var checker = new Utf8Validation . utf8_checker ( ) ;
144
+
163
145
164
146
int processedLength = 0 ;
165
147
@@ -171,15 +153,10 @@ public static unsafe class Utf8Utility
171
153
while ( processedLength + 64 <= inputLength )
172
154
{
173
155
174
- // SIMDGetPointerToFirstInvalidByte(pInputBuffer,processedLength);
175
-
176
- Vector256 < byte > currentBlock = Avx . LoadVector256 ( pInputBuffer + processedLength ) ;
177
- checker . CheckNextInput ( currentBlock ) ;
178
- currentBlock = Avx . LoadVector256 ( pInputBuffer + processedLength ) ;
179
- checker . CheckNextInput ( currentBlock ) ;
180
-
181
- checker . CheckEof ( ) ;
182
- if ( checker . Errors ( ) )
156
+ SIMDGetPointerToFirstInvalidByte ( pInputBuffer , processedLength ) ;
157
+
158
+ Utf8Validation . utf8_checker . CheckEof ( ) ;
159
+ if ( Utf8Validation . utf8_checker . Errors ( ) )
183
160
{
184
161
// return pInputBuffer + processedLength;
185
162
return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( pInputBuffer + processedLength , inputLength - processedLength ) ;
@@ -243,10 +220,10 @@ public static unsafe class Utf8Utility
243
220
244
221
ReadOnlySpan < Byte > remainingBytesReadOnly = remainingBytes ;
245
222
Vector256 < byte > remainingBlock = Vector256 . Create ( remainingBytesReadOnly ) ;
246
- checker . CheckNextInput ( remainingBlock ) ;
223
+ Utf8Validation . utf8_checker . CheckNextInput ( remainingBlock ) ;
247
224
248
- checker . CheckEof ( ) ;
249
- if ( checker . Errors ( ) )
225
+ Utf8Validation . utf8_checker . CheckEof ( ) ;
226
+ if ( Utf8Validation . utf8_checker . Errors ( ) )
250
227
{
251
228
// return pInputBuffer + processedLength;
252
229
return SimdUnicode . UTF8 . GetPointerToFirstInvalidByte ( pInputBuffer + processedLength , inputLength - processedLength ) ;
@@ -264,28 +241,28 @@ public static unsafe class Utf8Utility
264
241
265
242
// Returns a pointer to the first invalid byte in the input buffer if it's invalid, or a pointer to the end if it's valid.
266
243
// [MethodImpl(MethodImplOptions.AggressiveInlining)]
267
- // public static byte* SIMDGetPointerToFirstInvalidByte(byte* pInputBuffer, int processedLength)
268
- // {
269
- // ////////////////
270
- // // TODO: I recommend taking this code and calling it something
271
- // // else. Then have the current function (GetPointerToFirstInvalidByte)
272
- // // call the SIMD function only if inputLength is sufficiently large (maybe 64 bytes),
273
- // // otherwise, use the scalar function.
274
- // ////////////////
244
+ public static byte * SIMDGetPointerToFirstInvalidByte ( byte * pInputBuffer , int processedLength )
245
+ {
246
+ ////////////////
247
+ // TODO: I recommend taking this code and calling it something
248
+ // else. Then have the current function (GetPointerToFirstInvalidByte)
249
+ // call the SIMD function only if inputLength is sufficiently large (maybe 64 bytes),
250
+ // otherwise, use the scalar function.
251
+ ////////////////
275
252
276
253
277
254
278
- // Vector256<byte> currentBlock = Avx.LoadVector256(pInputBuffer + processedLength);
279
- // checker .CheckNextInput(currentBlock);
255
+ Vector256 < byte > currentBlock = Avx . LoadVector256 ( pInputBuffer + processedLength ) ;
256
+ Utf8Validation . utf8_checker . CheckNextInput ( currentBlock ) ;
280
257
281
- // processedLength += 32;
258
+ processedLength += 32 ;
282
259
283
- // currentBlock = Avx.LoadVector256(pInputBuffer + processedLength);
284
- // checker .CheckNextInput(currentBlock);
285
- // processedLength += 32;
260
+ currentBlock = Avx . LoadVector256 ( pInputBuffer + processedLength ) ;
261
+ Utf8Validation . utf8_checker . CheckNextInput ( currentBlock ) ;
262
+ processedLength += 32 ;
286
263
287
- // return pInputBuffer + processedLength;
288
- // }
264
+ return pInputBuffer + processedLength ;
265
+ }
289
266
}
290
267
291
268
// C# docs suggest that classes are allocated on the heap:
@@ -296,9 +273,9 @@ public struct utf8_checker
296
273
{
297
274
298
275
299
- Vector256 < byte > error = Vector256 < byte > . Zero ;
300
- Vector256 < byte > prev_input_block = Vector256 < byte > . Zero ;
301
- Vector256 < byte > prev_incomplete = Vector256 < byte > . Zero ;
276
+ static Vector256 < byte > error = Vector256 < byte > . Zero ;
277
+ static Vector256 < byte > prev_input_block = Vector256 < byte > . Zero ;
278
+ static Vector256 < byte > prev_incomplete = Vector256 < byte > . Zero ;
302
279
303
280
// Explicit constructor
304
281
public utf8_checker ( )
@@ -315,7 +292,7 @@ public utf8_checker()
315
292
// This is the simplest, least time-consuming implementation.
316
293
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
317
294
318
- public void CheckNextInput ( Vector256 < byte > input )
295
+ public static void CheckNextInput ( Vector256 < byte > input )
319
296
{
320
297
// Compiles to:
321
298
/*
@@ -381,7 +358,7 @@ je G_M000_IG04
381
358
382
359
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
383
360
384
- public void CheckUtf8Bytes ( Vector256 < byte > input )
361
+ public static void CheckUtf8Bytes ( Vector256 < byte > input )
385
362
{
386
363
// compiles to
387
364
// vmovups ymm0, ymmword ptr [rcx]
@@ -422,7 +399,7 @@ public void CheckUtf8Bytes(Vector256<byte> input)
422
399
423
400
// [MethodImpl(MethodImplOptions.AggressiveInlining)]
424
401
425
- public bool Errors ( )
402
+ public static bool Errors ( )
426
403
{
427
404
// Console.WriteLine("Error Vector at the end: " + VectorToString(error));
428
405
// compiles to:
@@ -434,7 +411,7 @@ public bool Errors()
434
411
435
412
// [MethodImpl(MethodImplOptions.AggressiveInlining)]
436
413
437
- public void CheckEof ( )
414
+ public static void CheckEof ( )
438
415
{
439
416
// Console.WriteLine("Error Vector before check_eof(): " + VectorToString(error));
440
417
// Console.WriteLine("prev_incomplete Vector in check_eof(): " + VectorToString(prev_incomplete));
@@ -460,7 +437,7 @@ public void CheckEof()
460
437
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
461
438
462
439
// This corresponds to Section 6.1 of the paper (see e.g. Table 6), e.g. 1-2 bytes
463
- private Vector256 < byte > CheckSpecialCases ( Vector256 < byte > input , Vector256 < byte > prev1 )
440
+ private static Vector256 < byte > CheckSpecialCases ( Vector256 < byte > input , Vector256 < byte > prev1 )
464
441
{
465
442
466
443
// define bits that indicate error code
@@ -556,7 +533,7 @@ private Vector256<byte> CheckSpecialCases(Vector256<byte> input, Vector256<byte>
556
533
}
557
534
558
535
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
559
- private Vector256 < byte > CheckMultibyteLengths ( Vector256 < byte > input , Vector256 < byte > prev_input , Vector256 < byte > sc )
536
+ private static Vector256 < byte > CheckMultibyteLengths ( Vector256 < byte > input , Vector256 < byte > prev_input , Vector256 < byte > sc )
560
537
{
561
538
// Console.WriteLine("sc: " + VectorToString(sc));
562
539
@@ -587,7 +564,7 @@ private Vector256<byte> CheckMultibyteLengths(Vector256<byte> input, Vector256<b
587
564
}
588
565
589
566
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
590
- private Vector256 < byte > MustBe23Continuation ( Vector256 < byte > prev2 , Vector256 < byte > prev3 )
567
+ private static Vector256 < byte > MustBe23Continuation ( Vector256 < byte > prev2 , Vector256 < byte > prev3 )
591
568
{
592
569
// Compiles to
593
570
// vmovups ymm0, ymmword ptr [rdx]
@@ -621,7 +598,7 @@ private Vector256<byte> MustBe23Continuation(Vector256<byte> prev2, Vector256<by
621
598
622
599
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
623
600
624
- private Vector256 < byte > IsIncomplete ( Vector256 < byte > input )
601
+ private static Vector256 < byte > IsIncomplete ( Vector256 < byte > input )
625
602
{
626
603
// Console.WriteLine("Input Vector is_incomplete: " + VectorToString(input));
627
604
// byte[] maxArray = new byte[32]
@@ -647,7 +624,7 @@ private Vector256<byte> IsIncomplete(Vector256<byte> input)
647
624
648
625
[ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
649
626
650
- private Vector256 < byte > SaturatingSubtractUnsigned ( Vector256 < byte > left , Vector256 < byte > right )
627
+ private static Vector256 < byte > SaturatingSubtractUnsigned ( Vector256 < byte > left , Vector256 < byte > right )
651
628
{
652
629
// Compiles to
653
630
// vpsubusw ymm0, ymm0, ymmword ptr [r8]
0 commit comments