@@ -546,6 +546,21 @@ public static unsafe class Utf8Utility
546
546
return pInputBuffer ;
547
547
}
548
548
549
+ // while (processedLength + 128 <= inputLength)
550
+ // {
551
+
552
+ // SIMDGetPointerToFirstInvalidByte(pInputBuffer,processedLength);
553
+
554
+ // Utf8Validation.utf8_checker.CheckEof();
555
+ // if (Utf8Validation.utf8_checker.Errors())
556
+ // {
557
+ // // return pInputBuffer + processedLength;
558
+ // return SimdUnicode.UTF8.RewindAndValidateWithErrors(pInputBuffer + processedLength,inputLength - processedLength);
559
+ // }
560
+ // processedLength += 128;
561
+
562
+ // }
563
+
549
564
while ( processedLength + 64 <= inputLength )
550
565
{
551
566
@@ -559,7 +574,24 @@ public static unsafe class Utf8Utility
559
574
}
560
575
processedLength += 64 ;
561
576
562
- }
577
+ }
578
+
579
+
580
+
581
+ // while (processedLength + 32 <= inputLength)
582
+ // {
583
+
584
+ // SIMDGetPointerToFirstInvalidByte(pInputBuffer,processedLength);
585
+
586
+ // Utf8Validation.utf8_checker.CheckEof();
587
+ // if (Utf8Validation.utf8_checker.Errors())
588
+ // {
589
+ // // return pInputBuffer + processedLength;
590
+ // return SimdUnicode.UTF8.RewindAndValidateWithErrors(pInputBuffer + processedLength,inputLength - processedLength);
591
+ // }
592
+ // processedLength += 32;
593
+
594
+ // }
563
595
564
596
// First fix benchmarks static utf checker
565
597
//
@@ -583,15 +615,15 @@ public static unsafe class Utf8Utility
583
615
584
616
585
617
// Process the remaining bytes with the scalar function
586
- // if (processedLength < inputLength)
587
- // {
588
- // byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInputBuffer + processedLength, inputLength - processedLength);
589
- // if (invalidBytePointer != pInputBuffer + inputLength)
590
- // {
591
- // // An invalid byte was found by the scalar function
592
- // return invalidBytePointer;
593
- // }
594
- // }
618
+ if ( processedLength < inputLength )
619
+ {
620
+ byte * invalidBytePointer = SimdUnicode . UTF8 . GetPointerToFirstInvalidByte ( pInputBuffer + processedLength , inputLength - processedLength ) ;
621
+ if ( invalidBytePointer != pInputBuffer + inputLength )
622
+ {
623
+ // An invalid byte was found by the scalar function
624
+ return invalidBytePointer ;
625
+ }
626
+ }
595
627
596
628
// | Method | FileName | Mean | Error | StdDev | Allocated |
597
629
// |---------------------------- |----------------------- |-----------:|----------:|----------:|----------:|
@@ -647,26 +679,26 @@ public static unsafe class Utf8Utility
647
679
// | SIMDUtf8ValidationErrorData | data/turkish.utf8.txt | 109.607 us | 0.6636 us | 0.5542 us | - |
648
680
649
681
650
- if ( processedLength < inputLength )
651
- {
682
+ // if (processedLength < inputLength)
683
+ // {
652
684
653
- Span < byte > remainingBytes = stackalloc byte [ 64 ] ;
654
- for ( int i = 0 ; i < inputLength - processedLength ; i ++ )
655
- {
656
- remainingBytes [ i ] = pInputBuffer [ processedLength + i ] ;
657
- }
685
+ // Span<byte> remainingBytes = stackalloc byte[64];
686
+ // for (int i = 0; i < inputLength - processedLength; i++)
687
+ // {
688
+ // remainingBytes[i] = pInputBuffer[processedLength + i];
689
+ // }
658
690
659
- ReadOnlySpan < Byte > remainingBytesReadOnly = remainingBytes ;
660
- Vector256 < byte > remainingBlock = Vector256 . Create ( remainingBytesReadOnly ) ;
661
- Utf8Validation . utf8_checker . CheckNextInput ( remainingBlock ) ;
662
- Utf8Validation . utf8_checker . CheckEof ( ) ;
663
- if ( Utf8Validation . utf8_checker . Errors ( ) )
664
- {
665
- // return pInputBuffer + processedLength;
666
- return SimdUnicode . UTF8 . GetPointerToFirstInvalidByte ( pInputBuffer + processedLength , inputLength - processedLength ) ;
667
- }
668
- processedLength += inputLength - processedLength ;
669
- }
691
+ // ReadOnlySpan<Byte> remainingBytesReadOnly = remainingBytes;
692
+ // Vector256<byte> remainingBlock = Vector256.Create(remainingBytesReadOnly);
693
+ // Utf8Validation.utf8_checker.CheckNextInput(remainingBlock);
694
+ // Utf8Validation.utf8_checker.CheckEof();
695
+ // if (Utf8Validation.utf8_checker.Errors())
696
+ // {
697
+ // // return pInputBuffer + processedLength;
698
+ // return SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInputBuffer + processedLength,inputLength - processedLength);
699
+ // }
700
+ // processedLength += inputLength - processedLength;
701
+ // }
670
702
671
703
672
704
@@ -675,18 +707,47 @@ public static unsafe class Utf8Utility
675
707
676
708
}
677
709
678
- Returns a pointer to the first invalid byte in the input buffer if it's invalid, or a pointer to the end if it's valid.
679
- // [MethodImpl(MethodImplOptions.AggressiveInlining)]
680
- public static byte * SIMDGetPointerToFirstInvalidByte ( byte * pInputBuffer , int processedLength )
681
- {
682
- ////////////////
683
- // TODO: I recommend taking this code and calling it something
684
- // else. Then have the current function (GetPointerToFirstInvalidByte)
685
- // call the SIMD function only if inputLength is sufficiently large (maybe 64 bytes),
686
- // otherwise, use the scalar function.
687
- ////////////////
710
+ // | Method | FileName | Mean | Error | StdDev | Allocated |
711
+ // |---------------------------- |----------------------- |-----------:|----------:|-----------:|----------:|
712
+ // | SIMDUtf8ValidationRealData | data/arabic.utf8.txt | 472.648 us | 9.2039 us | 14.3294 us | - |
713
+ // | SIMDUtf8ValidationErrorData | data/arabic.utf8.txt | 270.666 us | 1.8206 us | 1.6139 us | - |
714
+ // | SIMDUtf8ValidationRealData | data/chinese.utf8.txt | 129.587 us | 2.4394 us | 2.2818 us | - |
715
+ // | SIMDUtf8ValidationErrorData | data/chinese.utf8.txt | 14.699 us | 0.2902 us | 0.4254 us | - |
716
+ // | SIMDUtf8ValidationRealData | data/english.utf8.txt | 10.944 us | 0.1793 us | 0.1590 us | - |
717
+ // | SIMDUtf8ValidationErrorData | data/english.utf8.txt | 10.954 us | 0.1190 us | 0.1113 us | - |
718
+ // | SIMDUtf8ValidationRealData | data/french.utf8.txt | 12.971 us | 0.2540 us | 0.2495 us | - |
719
+ // | SIMDUtf8ValidationErrorData | data/french.utf8.txt | 12.692 us | 0.1270 us | 0.1126 us | - |
720
+ // | SIMDUtf8ValidationRealData | data/german.utf8.txt | 5.751 us | 0.0576 us | 0.0539 us | - |
721
+ // | SIMDUtf8ValidationErrorData | data/german.utf8.txt | 5.735 us | 0.0164 us | 0.0145 us | - |
722
+ // | SIMDUtf8ValidationRealData | data/japanese.utf8.txt | 132.404 us | 1.3084 us | 1.2239 us | - |
723
+ // | SIMDUtf8ValidationErrorData | data/japanese.utf8.txt | 74.305 us | 1.4385 us | 1.4128 us | - |
724
+ // | SIMDUtf8ValidationRealData | data/turkish.utf8.txt | 161.232 us | 1.5357 us | 1.4365 us | - |
725
+ // | SIMDUtf8ValidationErrorData | data/turkish.utf8.txt | 107.539 us | 1.0781 us | 0.9557 us | - |
726
+
727
+ // public static byte* SIMDGetPointerToFirstInvalidByte(byte* pInputBuffer, int processedLength)
728
+ // {
729
+ // Vector256<byte> currentBlock = Avx.LoadVector256(pInputBuffer + processedLength);
730
+ // Utf8Validation.utf8_checker.CheckNextInput(currentBlock);
731
+
732
+ // processedLength += 32;
733
+
734
+ // currentBlock = Avx.LoadVector256(pInputBuffer + processedLength);
735
+ // Utf8Validation.utf8_checker.CheckNextInput(currentBlock);
736
+ // processedLength += 32;
737
+
738
+ // currentBlock = Avx.LoadVector256(pInputBuffer + processedLength);
739
+ // Utf8Validation.utf8_checker.CheckNextInput(currentBlock);
688
740
689
- // G_M000_IG01:; ; offset = 0x0000
741
+ // processedLength += 32;
742
+
743
+ // currentBlock = Avx.LoadVector256(pInputBuffer + processedLength);
744
+ // Utf8Validation.utf8_checker.CheckNextInput(currentBlock);
745
+ // processedLength += 32;
746
+
747
+ // return pInputBuffer + processedLength;
748
+ // }
749
+
750
+ // G_M000_IG01:; ; offset = 0x0000
690
751
// push rbp
691
752
// sub rsp, 112
692
753
// vzeroupper
@@ -729,7 +790,33 @@ public static unsafe class Utf8Utility
729
790
730
791
//; Total bytes of code 114
731
792
732
-
793
+ ////////////////
794
+ // TODO: I recommend taking this code and calling it something
795
+ // else. Then have the current function (GetPointerToFirstInvalidByte)
796
+ // call the SIMD function only if inputLength is sufficiently large (maybe 64 bytes),
797
+ // otherwise, use the scalar function.
798
+ ////////////////
799
+ // | Method | FileName | Mean | Error | StdDev | Allocated |
800
+ // |---------------------------- |----------------------- |-----------:|----------:|----------:|----------:|
801
+ // | SIMDUtf8ValidationRealData | data/arabic.utf8.txt | 428.127 us | 7.9313 us | 7.7896 us | - |
802
+ // | SIMDUtf8ValidationErrorData | data/arabic.utf8.txt | 263.689 us | 5.2244 us | 7.4927 us | - |
803
+ // | SIMDUtf8ValidationRealData | data/chinese.utf8.txt | 112.669 us | 1.7434 us | 1.5455 us | - |
804
+ // | SIMDUtf8ValidationErrorData | data/chinese.utf8.txt | 16.209 us | 0.3105 us | 0.4250 us | - |
805
+ // | SIMDUtf8ValidationRealData | data/english.utf8.txt | 10.804 us | 0.0878 us | 0.0821 us | - |
806
+ // | SIMDUtf8ValidationErrorData | data/english.utf8.txt | 10.873 us | 0.0428 us | 0.0379 us | - |
807
+ // | SIMDUtf8ValidationRealData | data/french.utf8.txt | 12.423 us | 0.0771 us | 0.0721 us | - |
808
+ // | SIMDUtf8ValidationErrorData | data/french.utf8.txt | 13.878 us | 0.2719 us | 0.4152 us | - |
809
+ // | SIMDUtf8ValidationRealData | data/german.utf8.txt | 6.425 us | 0.1266 us | 0.2044 us | - |
810
+ // | SIMDUtf8ValidationErrorData | data/german.utf8.txt | 6.452 us | 0.1281 us | 0.2277 us | - |
811
+ // | SIMDUtf8ValidationRealData | data/japanese.utf8.txt | 148.702 us | 2.9438 us | 6.1447 us | - |
812
+ // | SIMDUtf8ValidationErrorData | data/japanese.utf8.txt | 81.048 us | 1.5900 us | 3.3538 us | - |
813
+ // | SIMDUtf8ValidationRealData | data/turkish.utf8.txt | 177.423 us | 3.5294 us | 7.2096 us | - |
814
+ // | SIMDUtf8ValidationErrorData | data/turkish.utf8.txt | 116.685 us | 2.3214 us | 4.0044 us | - |
815
+
816
+ // Returns a pointer to the first invalid byte in the input buffer if it's invalid, or a pointer to the end if it's valid.
817
+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
818
+ public static byte * SIMDGetPointerToFirstInvalidByte ( byte * pInputBuffer , int processedLength )
819
+ {
733
820
Vector256 < byte > currentBlock = Avx . LoadVector256 ( pInputBuffer + processedLength ) ;
734
821
Utf8Validation . utf8_checker . CheckNextInput ( currentBlock ) ;
735
822
@@ -741,8 +828,42 @@ public static unsafe class Utf8Utility
741
828
742
829
return pInputBuffer + processedLength ;
743
830
}
831
+
832
+
833
+ // | Method | FileName | Mean | Error | StdDev | Allocated |
834
+ // |---------------------------- |----------------------- |-----------:|----------:|----------:|----------:|
835
+ // | SIMDUtf8ValidationRealData | data/arabic.utf8.txt | 456.220 us | 9.1097 us | 9.7472 us | - |
836
+ // | SIMDUtf8ValidationErrorData | data/arabic.utf8.txt | 263.690 us | 3.8144 us | 3.3813 us | - |
837
+ // | SIMDUtf8ValidationRealData | data/chinese.utf8.txt | 128.735 us | 2.1841 us | 2.0430 us | - |
838
+ // | SIMDUtf8ValidationErrorData | data/chinese.utf8.txt | 14.677 us | 0.2860 us | 0.3060 us | - |
839
+ // | SIMDUtf8ValidationRealData | data/english.utf8.txt | 11.059 us | 0.1237 us | 0.1157 us | - |
840
+ // | SIMDUtf8ValidationErrorData | data/english.utf8.txt | 11.031 us | 0.1627 us | 0.1270 us | - |
841
+ // | SIMDUtf8ValidationRealData | data/french.utf8.txt | 12.780 us | 0.2398 us | 0.2126 us | - |
842
+ // | SIMDUtf8ValidationErrorData | data/french.utf8.txt | 12.776 us | 0.2530 us | 0.2367 us | - |
843
+ // | SIMDUtf8ValidationRealData | data/german.utf8.txt | 5.851 us | 0.1000 us | 0.0887 us | - |
844
+ // | SIMDUtf8ValidationErrorData | data/german.utf8.txt | 5.801 us | 0.0567 us | 0.0530 us | - |
845
+ // | SIMDUtf8ValidationRealData | data/japanese.utf8.txt | 133.673 us | 2.1092 us | 1.7612 us | - |
846
+ // | SIMDUtf8ValidationErrorData | data/japanese.utf8.txt | 73.525 us | 0.8027 us | 0.7116 us | - |
847
+ // | SIMDUtf8ValidationRealData | data/turkish.utf8.txt | 165.167 us | 3.1097 us | 3.3274 us | - |
848
+ // | SIMDUtf8ValidationErrorData | data/turkish.utf8.txt | 113.276 us | 2.1983 us | 2.9346 us | - |
849
+
850
+
851
+ // // unroll once
852
+ // public static byte* SIMDGetPointerToFirstInvalidByte(byte* pInputBuffer, int processedLength)
853
+ // {
854
+ // Vector256<byte> currentBlock = Avx.LoadVector256(pInputBuffer + processedLength);
855
+ // Utf8Validation.utf8_checker.CheckNextInput(currentBlock);
856
+
857
+ // processedLength += 32;
858
+
859
+ // return pInputBuffer + processedLength;
860
+ // }
744
861
}
745
862
863
+
864
+
865
+
866
+
746
867
// C# docs suggest that classes are allocated on the heap:
747
868
// it doesn't seem to do much in this case, but I thought the suggestion was sensible.
748
869
public struct Utf8Validation
0 commit comments