@@ -24,38 +24,133 @@ public static class UTF8
24
24
int extraLen = 0 ;
25
25
bool foundLeadingBytes = false ;
26
26
27
+ // this is the generic function called when there is an error:
27
28
// TODO: adjust for double counting iff there is an error eg invalidpointerbyte != length
28
29
// Even with no errors, it sometime double counts, why.. ? because it goes back even further
29
30
// even though the scalar doesnt thread
30
31
// adjust for double counting
31
32
// for (int i = 0; i <= howFarBack; i++)
32
- for ( int i = 0 ; i <= howFarBack ; i ++ )
33
+ // {
34
+ // if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior
35
+ // // TODO: written like this for readability, I know its ugly so this needs to be rewritten
36
+ // byte candidateByte = buf[0 - i];
37
+ // foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
38
+ // if (foundLeadingBytes)
39
+ // {
40
+
41
+ // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
42
+ // // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
43
+
44
+ // // adjustment to avoid double counting
45
+ // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
46
+ // {
47
+ // // Console.WriteLine("Found 2 byte");
48
+ // TempUtf16CodeUnitCountAdjustment += 1;
49
+ // }
50
+ // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
51
+ // {
52
+ // // Console.WriteLine("Found 3 byte");
53
+ // TempUtf16CodeUnitCountAdjustment += 2;
54
+ // }
55
+ // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
56
+ // {
57
+ // // Console.WriteLine("Found 4 byte");
58
+ // TempUtf16CodeUnitCountAdjustment += 2;
59
+ // TempScalarCountAdjustment += 1;
60
+ // }
61
+ // break;
62
+ // }
63
+ // }
64
+
65
+ for ( int i = 0 ; i <= howFarBack ; i ++ )
66
+ {
67
+ Console . WriteLine ( "backup stat:" + i ) ;
68
+ byte candidateByte = buf [ 0 - i ] ;
69
+ foundLeadingBytes = ( candidateByte & 0b11000000 ) != 0b10000000 ;
70
+ if ( foundLeadingBytes )
71
+ {
72
+ buf -= i ;
73
+ extraLen = i ;
74
+ Console . WriteLine ( howFarBack ) ;
75
+ Console . WriteLine ( "Found leading byte at:" + i + ",Byte:" + Convert . ToString ( candidateByte , 2 ) . PadLeft ( 8 , '0' ) ) ;
76
+
77
+ // Console.WriteLine("Backed up " + extraLen + 1 + " bytes");
78
+ break ;
79
+ }
80
+ }
81
+
82
+
83
+ if ( ! foundLeadingBytes )
84
+ {
85
+ return buf - howFarBack ;
86
+ }
87
+
88
+ utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment ;
89
+ scalarCountAdjustment += TempScalarCountAdjustment ;
90
+
91
+ int TailUtf16CodeUnitCountAdjustment = 0 ;
92
+ int TailScalarCountAdjustment = 0 ;
93
+
94
+ // Now buf points to the start of a UTF-8 sequence or the start of the buffer.
95
+ // Validate from this new start point with the adjusted length.
96
+
97
+ // TODO:figure out why calling SIMD here breaks the tests filter.This just breaks stuff?!?!?!
98
+ byte * invalidBytePointer = GetPointerToFirstInvalidByteScalar ( buf , len + extraLen , out TailUtf16CodeUnitCountAdjustment , out TailScalarCountAdjustment ) ;
99
+
100
+ utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment ;
101
+ scalarCountAdjustment += TailScalarCountAdjustment ;
102
+
103
+ Console . WriteLine ( "rewind utf16 Doublecount adjustment(Temp):" + TempUtf16CodeUnitCountAdjustment ) ;
104
+ Console . WriteLine ( "scalarcount adjstment after rewind:" + TempScalarCountAdjustment ) ;
105
+ Console . WriteLine ( " " ) ;
106
+ Console . WriteLine ( "rewinds utf16 count(done by GetPointerToFirstInvalidByteScalar):" + TailUtf16CodeUnitCountAdjustment ) ;
107
+ Console . WriteLine ( "scalarcount after rewind(ditto):" + TailScalarCountAdjustment ) ;
108
+
109
+ return invalidBytePointer ;
110
+ }
111
+
112
+ // I separate this function because, for the tail, we know that there has been no error thus far; but remember the SIMD
113
+ // function calculates
114
+ public unsafe static byte * RewindAndValidateWithErrorsRemaining ( int howFarBack , byte * buf , int len , ref int utf16CodeUnitCountAdjustment , ref int scalarCountAdjustment )
115
+ {
116
+ Console . WriteLine ( "--Rewind Validate with Errors Remaining" ) ;
117
+ Console . WriteLine ( "current Byte:" + Convert . ToString ( buf [ 0 ] , 2 ) . PadLeft ( 8 , '0' ) ) ;
118
+
119
+ int TempUtf16CodeUnitCountAdjustment = 0 ;
120
+ int TempScalarCountAdjustment = 0 ;
121
+
122
+ int extraLen = 0 ;
123
+ bool foundLeadingBytes = false ;
124
+
125
+ for ( int i = 0 ; i <= 3 ; i ++ )
33
126
{
34
127
if ( i == 0 ) { continue ; } ; // we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior
128
+ // TODO: written like this for readability, I know its ugly so this needs to be rewritten
35
129
byte candidateByte = buf [ 0 - i ] ;
130
+ Console . WriteLine ( "Checking Byte:" + candidateByte . ToString ( "X2" ) ) ;
131
+
36
132
foundLeadingBytes = ( candidateByte & 0b11000000 ) != 0b10000000 ;
37
133
if ( foundLeadingBytes )
38
134
{
39
-
40
- // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
41
- Console . WriteLine ( "Found leading byte at:" + i + ",Byte:" + Convert . ToString ( candidateByte , 2 ) . PadLeft ( 8 , '0' ) ) ;
135
+ Console . WriteLine ( "Found leading byte at:" + i + ",Byte:" + candidateByte . ToString ( "X2" ) ) ;
136
+ // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
42
137
43
138
// adjustment to avoid double counting
44
139
if ( ( candidateByte & 0b11100000 ) == 0b11000000 ) // Start of a 2-byte sequence
45
140
{
46
- // Console.WriteLine("Found 2 byte");
47
- TempUtf16CodeUnitCountAdjustment + = 1 ;
141
+ Console . WriteLine ( "Found 2 byte" ) ;
142
+ TempUtf16CodeUnitCountAdjustment - = 1 ;
48
143
}
49
144
if ( ( candidateByte & 0b11110000 ) == 0b11100000 ) // Start of a 3-byte sequence
50
145
{
51
- // Console.WriteLine("Found 3 byte");
52
- TempUtf16CodeUnitCountAdjustment + = 2 ;
146
+ Console . WriteLine ( "Found 3 byte" ) ;
147
+ TempUtf16CodeUnitCountAdjustment - = 2 ;
53
148
}
54
149
if ( ( candidateByte & 0b11111000 ) == 0b11110000 ) // Start of a 4-byte sequence
55
150
{
56
- // Console.WriteLine("Found 4 byte");
57
- TempUtf16CodeUnitCountAdjustment + = 2 ;
58
- TempScalarCountAdjustment + = 1 ;
151
+ Console . WriteLine ( "Found 4 byte" ) ;
152
+ TempUtf16CodeUnitCountAdjustment - = 2 ;
153
+ TempScalarCountAdjustment - = 1 ;
59
154
}
60
155
break ;
61
156
}
@@ -73,7 +168,6 @@ public static class UTF8
73
168
Console . WriteLine ( howFarBack ) ;
74
169
Console . WriteLine ( "Found leading byte at:" + i + ",Byte:" + Convert . ToString ( candidateByte , 2 ) . PadLeft ( 8 , '0' ) ) ;
75
170
76
-
77
171
// Console.WriteLine("Backed up " + extraLen + 1 + " bytes");
78
172
break ;
79
173
}
@@ -109,6 +203,7 @@ public static class UTF8
109
203
return invalidBytePointer ;
110
204
}
111
205
206
+
112
207
public unsafe static byte * GetPointerToFirstInvalidByteScalar ( byte * pInputBuffer , int inputLength , out int utf16CodeUnitCountAdjustment , out int scalarCountAdjustment )
113
208
{
114
209
@@ -511,15 +606,57 @@ public static class UTF8
511
606
Vector256 < byte > v0f = Vector256 . Create ( ( byte ) 0x0F ) ;
512
607
Vector256 < byte > v80 = Vector256 . Create ( ( byte ) 0x80 ) ;
513
608
609
+ bool prevWasSimd = false ;
610
+
514
611
for ( ; processedLength + 32 <= inputLength ; processedLength += 32 )
515
612
{
613
+
614
+
615
+
616
+ // TODO: there is a problem with the fastpath : namely that if it is followed by a vector with all ascii,
617
+ // there is a gap where
618
+ // this is because
619
+ // Now we have 2 choices : either still use prev3 to count dutf and check if there is a gap here OR
620
+ //
516
621
Vector256 < byte > currentBlock = Avx . LoadVector256 ( pInputBuffer + processedLength ) ;
517
622
518
623
int mask = Avx2 . MoveMask ( currentBlock ) ;
519
624
if ( mask == 0 )
520
625
{
521
626
// We have an ASCII block, no need to process it, but
522
627
// we need to check if the previous block was incomplete.
628
+
629
+ if ( prevWasSimd ) { // recall that the non ascii simd checks counts the adjustment on prev3, hence we need to backtrack in case the
630
+ // it was called
631
+ Console . WriteLine ( "--prev was simd!" ) ;
632
+ for ( int k = 1 ; k <= 3 ; k ++ ) // we dont want to double count the current byte
633
+ {
634
+ int candidateByte = pInputBuffer [ processedLength - k ] ;
635
+ if ( ( candidateByte & 0b11000000 ) == 0b11000000 )
636
+ {
637
+ {
638
+ if ( ( candidateByte & 0b11100000 ) == 0b11000000 ) // Start of a 2-byte sequence
639
+ {
640
+ TempUtf16CodeUnitCountAdjustment -= 1 ;
641
+ }
642
+ if ( ( candidateByte & 0b11110000 ) == 0b11100000 ) // Start of a 3-byte sequence
643
+ {
644
+ TempUtf16CodeUnitCountAdjustment -= 2 ;
645
+ }
646
+ if ( ( candidateByte & 0b11111000 ) == 0b11110000 ) // Start of a 4-byte sequence
647
+ {
648
+ Console . WriteLine ( "Found 4-byte" ) ;
649
+ TempUtf16CodeUnitCountAdjustment -= 2 ;
650
+ TempScalarCountAdjustment -= 1 ;
651
+ }
652
+ // break;
653
+
654
+ }
655
+ }
656
+ }
657
+
658
+ }
659
+
523
660
if ( ! Avx2 . TestZ ( prevIncomplete , prevIncomplete ) )
524
661
{
525
662
@@ -562,14 +699,15 @@ public static class UTF8
562
699
// }
563
700
564
701
565
- // TODO this needs S
566
702
return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( off , pInputBuffer + off , inputLength - off , ref utf16CodeUnitCountAdjustment , ref scalarCountAdjustment ) ;
567
703
}
568
704
prevIncomplete = Vector256 < byte > . Zero ;
705
+ prevWasSimd = false ;
569
706
}
570
707
else // Contains non-ASCII characters, we need to do non-trivial processing
571
708
{
572
709
Console . WriteLine ( "--Found non-ascii:triggering SIMD routine at " + processedLength + "bytes" ) ;
710
+ prevWasSimd = true ;
573
711
574
712
// Use SubtractSaturate to effectively compare if bytes in block are greater than markers.
575
713
// TODO:integrate this better with the rest of the code
@@ -616,6 +754,7 @@ public static class UTF8
616
754
Vector256 < byte > must23 = Avx2 . Or ( isThirdByte , isFourthByte ) ;
617
755
Vector256 < byte > must23As80 = Avx2 . And ( must23 , v80 ) ;
618
756
Vector256 < byte > error = Avx2 . Xor ( must23As80 , sc ) ;
757
+
619
758
if ( ! Avx2 . TestZ ( error , error ) ) //context: we are dealing with a 32 bit
620
759
{
621
760
Console . WriteLine ( "-----Error path!!" ) ;
@@ -676,9 +815,9 @@ public static class UTF8
676
815
677
816
for ( int k = 0 ; k < 3 ; k ++ )
678
817
{
679
- int candidateByte = pInputBuffer [ processedLength + k ] ;
818
+ int candidateByte = pInputBuffer [ processedLength + 32 + k ] ;
680
819
Console . WriteLine ( "Backing up " + k + " bytes" ) ;
681
- Console . WriteLine ( "CurrentByte after backing up:" + Convert . ToString ( candidateByte , 2 ) . PadLeft ( 8 , '0' ) ) ;
820
+ Console . WriteLine ( "Byte after backing up:" + Convert . ToString ( candidateByte , 2 ) . PadLeft ( 8 , '0' ) ) ;
682
821
683
822
backedup = 3 - k + 1 ;
684
823
// TODO:
@@ -743,21 +882,21 @@ public static class UTF8
743
882
if ( processedLength < inputLength )
744
883
{
745
884
746
- Console . WriteLine ( "----Process remaining Scalar" ) ;
747
- Console . WriteLine ( "processed length before:" + processedLength ) ;
885
+ Console . WriteLine ( "----Process remaining Scalar @ " + processedLength + "bytes ") ;
886
+ // Console.WriteLine("processed length before:" + processedLength);
748
887
int overlapCount = 0 ;
749
888
750
889
// // We need to possibly backtrack to the start of the last code point
751
- while ( processedLength > 0 && ( sbyte ) pInputBuffer [ processedLength ] <= - 65 )
752
- {
753
- processedLength -= 1 ;
754
- overlapCount += 1 ;
755
- }
890
+ // while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
891
+ // {
892
+ // processedLength -= 1;
893
+ // overlapCount +=1;
894
+ // }
756
895
757
896
Console . WriteLine ( "processed length after backtrack:" + processedLength ) ;
758
897
759
898
760
- // TOCHECK:See if rewind is better here
899
+ // PERFORMANCE TOCHECK:See if rewind is better here
761
900
// for(int k = 0; k < overlapCount; k++)
762
901
// {
763
902
// // There is no error here hence the loop is straigthforward and we avoid double counting every byte
@@ -785,8 +924,8 @@ public static class UTF8
785
924
Console . WriteLine ( "TempScalar '' '' '':" + TempScalarCountAdjustment ) ;
786
925
787
926
788
- byte * invalidBytePointer = SimdUnicode . UTF8 . GetPointerToFirstInvalidByteScalar ( pInputBuffer + processedLength , inputLength - processedLength , out TailUtf16CodeUnitCountAdjustment , out TailScalarCodeUnitCountAdjustment ) ;
789
- // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(3 ,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
927
+ // byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment);
928
+ byte * invalidBytePointer = SimdUnicode . UTF8 . RewindAndValidateWithErrorsRemaining ( 32 , pInputBuffer + processedLength , inputLength - processedLength , ref TailUtf16CodeUnitCountAdjustment , ref TailScalarCodeUnitCountAdjustment ) ;
790
929
if ( invalidBytePointer != pInputBuffer + inputLength )
791
930
{
792
931
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment ;
0 commit comments