@@ -10,7 +10,7 @@ namespace SimdUnicode
10
10
public static class UTF8
11
11
{
12
12
13
- // //debug helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index
13
+
14
14
static void PrintHexAndBinary ( byte [ ] bytes , int highlightIndex = - 1 )
15
15
{
16
16
int chunkSize = 16 ; // 128 bits = 16 bytes
@@ -78,20 +78,20 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
78
78
79
79
public unsafe static byte * RewindAndValidateWithErrors ( int howFarBack , byte * buf , int len , ref int utf16CodeUnitCountAdjustment , ref int scalarCountAdjustment )
80
80
{
81
- // // Console.WriteLine("CALLING REWIND");//debug
81
+
82
82
int extraLen = 0 ;
83
83
bool foundLeadingBytes = false ;
84
84
85
85
for ( int i = 0 ; i <= howFarBack ; i ++ )
86
86
{
87
87
byte candidateByte = buf [ 0 - i ] ;
88
88
foundLeadingBytes = ( candidateByte & 0b11000000 ) != 0b10000000 ;
89
- // Console.WriteLine($"Rewinding byte to offset {-i}: {candidateByte:X2}");//debug
90
- // Console.WriteLine(foundLeadingBytes);//debug
89
+
90
+
91
91
92
92
if ( foundLeadingBytes )
93
93
{
94
- // Console.WriteLine("Found leading byte");//debug
94
+
95
95
buf -= i ;
96
96
break ;
97
97
}
@@ -257,8 +257,8 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
257
257
{
258
258
if ( ( pInputBuffer [ - i ] & 0b11000000 ) != 0b10000000 )
259
259
{
260
- // string binaryString = Convert.ToString(pInputBuffer[-i], 2).PadLeft(8, '0');//debug
261
- // Console.WriteLine($"Stopping at byte {binaryString}"); //debug
260
+
261
+
262
262
break ;
263
263
}
264
264
contbyteadjust -= 1 ;
@@ -278,40 +278,18 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
278
278
279
279
/// <summary>
/// Derives the UTF-16 code-unit adjustment and the scalar-count adjustment from
/// aggregate byte counts.
/// </summary>
/// <remarks>
/// The expressions for n3 and n2 are the unique solution of the linear system
///   totalbyte = asciibytes + 2*n2 + 3*n3 + 4*n4
///   contbytes = n2 + 2*n3 + 3*n4
/// i.e. n2 and n3 are presumably the numbers of 2-byte and 3-byte sequences
/// (inferred from the names and the algebra; confirm against the callers).
/// No range check is performed: inputs that do not satisfy the system yield
/// n2/n3 values that may be negative.
/// </remarks>
/// <param name="asciibytes">Presumably the number of single-byte (ASCII) characters counted.</param>
/// <param name="n4">Presumably the number of 4-byte sequences counted.</param>
/// <param name="contbytes">Presumably the number of continuation bytes (10xxxxxx) counted.</param>
/// <param name="totalbyte">Presumably the total number of bytes covered by the counts above.</param>
/// <returns>
/// A tuple where utfadjust = -(2*n4 + 2*n3 + n2) and scalaradjust = -n4.
/// </returns>
public static ( int utfadjust , int scalaradjust ) CalculateN2N3FinalSIMDAdjustments ( int asciibytes , int n4 , int contbytes , int totalbyte )
280
280
{
281
- // Console.WriteLine("---------"); //debug
282
- // Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug
281
+
282
+
283
283
int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte ;
284
284
int n2 = - 2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte ;
285
285
int utfadjust = - 2 * n4 - 2 * n3 - n2 ;
286
286
int scalaradjust = - n4 ;
287
287
288
- // Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug
288
+
289
289
290
290
return ( utfadjust , scalaradjust ) ;
291
291
}
292
292
293
- // public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) //todo: add an extra bool parameter 'TooLongErroronEdge' which defaults to false
294
- // {
295
- // // Calculate the total bytes from start_point to processedLength
296
- // int totalbyte = processedLength - start_point;
297
- // int adjusttotalbyte = 0, backedupByHowMuch = 0, adjustascii = 0, adjustcont = 0, adjustn4 = 0;
298
-
299
- // // Adjust the length to include a complete character, if necessary
300
- // if (totalbyte > 0)
301
- // {
302
- // (adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength);
303
- // }
304
-
305
- // // Pseudocode:
306
- // // if 'TooLongErroronEdge' bool is true then
307
- // // then substract (remove) adjustascii, adjustcont, adjustn4 from their respective counterpart in the following function:
308
-
309
- // var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4 , contbytes , totalbyte + adjusttotalbyte);
310
-
311
-
312
- // return (utfadjust, scalaradjust);
313
- // }
314
-
315
293
public unsafe static ( int utfadjust , int scalaradjust ) calculateErrorPathadjust ( int start_point , int processedLength , byte * pInputBuffer , int asciibytes , int n4 , int contbytes , bool TooLongErroronEdge = false )
316
294
{
317
295
// Calculate the total bytes from start_point to processedLength
@@ -324,10 +302,8 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
324
302
( adjusttotalbyte , backedupByHowMuch , adjustascii , adjustcont , adjustn4 ) = adjustmentFactor ( pInputBuffer + processedLength ) ;
325
303
}
326
304
327
- // Adjust the counters if 'TooLongErroronEdge' is true
328
305
if ( TooLongErroronEdge )
329
306
{
330
- // If you can figure out why this makes a difference,youre golden
331
307
asciibytes += adjustascii ;
332
308
contbytes += adjustcont ;
333
309
n4 += adjustn4 ;
@@ -494,9 +470,9 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
494
470
495
471
public unsafe static byte * GetPointerToFirstInvalidByteAvx2 ( byte * pInputBuffer , int inputLength , out int utf16CodeUnitCountAdjustment , out int scalarCountAdjustment )
496
472
{
497
- // Console.ForegroundColor = ConsoleColor.Blue; //debug
498
- // Console.WriteLine("-------------------------------------");//debug
499
- // Console.ResetColor();//debug
473
+
474
+
475
+
500
476
501
477
int processedLength = 0 ;
502
478
int TempUtf16CodeUnitCountAdjustment = 0 ;
@@ -674,78 +650,29 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
674
650
Vector256 < byte > byte_1_low = Avx2 . Shuffle ( shuf2 , ( prev1 & v0f ) ) ; // takes the 0000 XXXX part of the previous part
675
651
Vector256 < byte > byte_2_high = Avx2 . Shuffle ( shuf3 , Avx2 . ShiftRightLogical ( currentBlock . AsUInt16 ( ) , 4 ) . AsByte ( ) & v0f ) ; // takes the XXXX 0000 part of the current byte
676
652
Vector256 < byte > sc = Avx2 . And ( Avx2 . And ( byte_1_high , byte_1_low ) , byte_2_high ) ;
677
-
678
- // Create a span from the Vector256<byte>
679
- // Console.WriteLine("");
680
- // Span<byte> byteSpan = MemoryMarshal.Cast<Vector256<byte>, byte>(MemoryMarshal.CreateSpan(ref sc, 1));
681
- // byte[] scbytes = byteSpan.ToArray();
682
- // PrintHexAndBinary(scbytes);55555555555555555
683
-
684
653
Vector256 < byte > prev2 = Avx2 . AlignRight ( prevInputBlock , shuffled , ( byte ) ( 16 - 2 ) ) ;
685
654
Vector256 < byte > prev3 = Avx2 . AlignRight ( prevInputBlock , shuffled , ( byte ) ( 16 - 3 ) ) ;
686
655
Vector256 < byte > isThirdByte = Avx2 . SubtractSaturate ( prev2 , thirdByte ) ;
687
656
Vector256 < byte > isFourthByte = Avx2 . SubtractSaturate ( prev3 , fourthByte ) ;
688
657
Vector256 < byte > must23 = Avx2 . Or ( isThirdByte , isFourthByte ) ;
689
658
Vector256 < byte > must23As80 = Avx2 . And ( must23 , v80 ) ;
690
659
Vector256 < byte > error = Avx2 . Xor ( must23As80 , sc ) ;
691
- // if (!Avx2.TestZ(error, error))
692
- // {
693
- // Console.WriteLine($"--Error! @ {processedLength} bytes");//debug
694
-
695
- // int off = processedLength >= 32 ? processedLength - 32 : processedLength;
696
- // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
697
-
698
- // utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment;
699
- // scalarCountAdjustment = TailScalarCodeUnitCountAdjustment;
700
-
701
- // // We need to take care of eg
702
- // // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
703
- // // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011
704
- // // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge
705
- // // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100
706
- // // In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
707
- // // Normally , if there is an error, this does not cause an issue: most erronous utf-8 unit will not be counted
708
- // // but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10-----
709
- // // the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too much
710
- // // If this error arrive at the edge of 2 simd vector, that is where problem abound: the rewind scalar function will backup
711
-
712
- // // so in short , we want to solve this error while at the same time not disturbing anything else
713
- // // we know that there is a continuation on the edge eg at the 64 byte, we need te check that
714
- // // *TODO:Fill code here *
715
- // // Peudocode for now
716
- // // if invalidbyte is of typo 10XX XXXX & invalidbyto pointer % 32 byte == 0 then
717
- // // pass on true to the
718
660
719
661
720
- // int totalbyteasciierror = processedLength - start_point;
721
- // var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes);
722
-
723
- // utf16CodeUnitCountAdjustment += utfadjustasciierror;
724
- // scalarCountAdjustment += scalaradjustasciierror;
725
-
726
- // TailScalarCodeUnitCountAdjustment =0;
727
- // TailUtf16CodeUnitCountAdjustment =0;
728
-
729
-
730
-
731
- // return invalidBytePointer;
732
- // }
733
-
734
662
if ( ! Avx2 . TestZ ( error , error ) )
735
663
{
736
- // Console.WriteLine($"--Error! @ {processedLength} bytes");//debug
664
+
737
665
738
666
int off = processedLength > 32 ? processedLength - 32 : processedLength ; // this does not backup ff processedlength = 32
739
- // int off = processedLength >= 32 ? processedLength - 32 : processedLength; original/main algorithm working
740
667
741
- // Console.WriteLine($"Offset backup by: {off}");//debug
668
+
742
669
byte * invalidBytePointer = SimdUnicode . UTF8 . RewindAndValidateWithErrors ( off , pInputBuffer + processedLength , inputLength - processedLength , ref TailUtf16CodeUnitCountAdjustment , ref TailScalarCodeUnitCountAdjustment ) ;
743
670
bool TooLongErroronEdge = false ;
744
671
745
672
utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment ;
746
673
scalarCountAdjustment = TailScalarCodeUnitCountAdjustment ;
747
674
748
- // Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCodeUnitCountAdjustment}");//debug
675
+
749
676
750
677
// We need to take care of eg
751
678
// 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
@@ -756,26 +683,21 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
756
683
// Normally, if there is an error, this does not cause an issue: most erroneous UTF-8 units will not be counted,
757
684
// but it does matter in the "too long" case: take for example (1111---- 10----- 10----- 10-----) 10-----
758
685
// the part between parentheses will be counted as valid, and thus scalaradjust will be incremented one time too many.
759
- // If this error arrive at the edge of 2 simd vector, that is where problem abound: the rewind scalar function will backup
760
-
761
- // so in short , we want to solve this error while at the same time not disturbing anything else
762
- // we know that there is a continuation on the edge eg at the 64 byte, we need te check that
763
- // *TODO:Fill code here *
764
- // Peudocode for now
765
- // if invalidbyte is of typo 10XX XXXX & invalidbyto pointer % 32 byte == 0 then
766
- // pass on true to the
686
+ // If this error arrives at the edge between two SIMD vectors, that is where problems abound
767
687
768
688
// Calculate the offset of the invalid byte pointer from the start of the input buffer
769
689
ulong offsetFromStart = ( ulong ) ( invalidBytePointer - pInputBuffer ) ;
770
690
771
691
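// Classify the invalid byte: is it a continuation byte (10xxxxxx), and does it sit exactly at pInputBuffer + processedLength?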
692
+
772
693
bool isContinuationByte = ( invalidBytePointer [ 0 ] & 0xC0 ) == 0x80 ;
694
+
773
695
bool isOneByteAfterProcessedLength = ( invalidBytePointer == pInputBuffer + processedLength ) ;
774
696
775
- // if (isContinuationByte && isAtBoundary && isOneByteAfterProcessedLength)// this alone creates false positives
697
+
776
698
if ( isContinuationByte && isOneByteAfterProcessedLength )
777
699
{
778
- // Console.WriteLine("Triggering TooLongErrorOnEdge adjustment");//debug
700
+
779
701
TooLongErroronEdge = true ;
780
702
}
781
703
@@ -797,31 +719,11 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
797
719
var ( totalbyteadjustment , i , tempascii , tempcont , tempn4 ) = adjustmentFactor ( pInputBuffer + processedLength + 32 ) ;
798
720
799
721
processedLength -= i ;
800
- n4 += tempn4 ; // this is + because the adjustment function returns something negative already
722
+ n4 += tempn4 ;
801
723
contbytes += tempcont ;
802
- // Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug
803
- }
804
-
805
-
806
-
807
-
808
-
809
- // Vector256<byte> contbyto = Vector256.Create((byte)(0b11000000u - 0x80));
810
- // Vector256<byte> isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte);
811
- // Vector256<byte> isStartOf3OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, thirdByte);
812
- // Vector256<byte> isStartOf2OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, secondByte);
813
-
814
- // uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf2OrMoreByteSequence));
815
- // uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf3OrMoreByteSequence));
816
- // uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence));
817
724
725
+ }
818
726
819
- // No errors! Updating the variables we keep track of
820
- // We use one instruction (MoveMask) to update ncon, plus one arithmetic operation.
821
-
822
- // contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc)); // this actually counts the number of 2 consecutive continuous bytes
823
- // Placeholder until andether way to do with contbyte is found
824
-
825
727
Vector256 < byte > top2bits = Vector256 . Create ( ( byte ) 0b11000000 ) ; // Mask to isolate the two most significant bits
826
728
Vector256 < byte > contbytemask = Vector256 . Create ( ( byte ) 0b10000000 ) ; // The expected pattern for continuation bytes: 10xxxxxx
827
729
@@ -843,9 +745,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
843
745
asciibytes += ( int ) ( 32 - Popcnt . PopCount ( ( uint ) mask ) ) ;
844
746
}
845
747
846
-
847
-
848
-
849
748
// There are 2 possible scenarios here : either
850
749
// A) it arrives flush on the border, i.e. it doesn't need to be processed further
851
750
// B) There are some bytes remaining, in which case we need to call the scalar function
@@ -862,11 +761,11 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
862
761
// We have processed all the blocks using SIMD, we need to process the remaining bytes.
863
762
// Process the remaining bytes with the scalar function
864
763
865
-
866
764
// worst possible case is 4 bytes, where we need to backtrack 3 bytes
867
765
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
868
766
if ( processedLength < inputLength )
869
767
{
768
+
870
769
byte * invalidBytePointer = SimdUnicode . UTF8 . RewindAndValidateWithErrors ( 32 , pInputBuffer + processedLength , inputLength - processedLength , ref TailUtf16CodeUnitCountAdjustment , ref TailScalarCodeUnitCountAdjustment ) ;
871
770
if ( invalidBytePointer != pInputBuffer + inputLength )
872
771
{
0 commit comments