@@ -111,7 +111,7 @@ public static class UTF8
111
111
112
112
// I separate this function as for the tail, we know that there has been no error thus far: but remember the SIMD
113
113
// function calculates
114
- public unsafe static byte * RewindAndValidateWithErrorsRemaining ( int howFarBack , byte * buf , int len , ref int utf16CodeUnitCountAdjustment , ref int scalarCountAdjustment )
114
+ public unsafe static byte * RewindAndValidateWithErrorsRemaining ( int howFarBack , byte * buf , int len , ref int utf16CodeUnitCountAdjustment , ref int scalarCountAdjustment , bool prevWasUnterminated = false )
115
115
{
116
116
Console . WriteLine ( "--Rewind Validate with Errors Remaining" ) ;
117
117
Console . WriteLine ( "current Byte:" + Convert . ToString ( buf [ 0 ] , 2 ) . PadLeft ( 8 , '0' ) ) ;
@@ -122,40 +122,43 @@ public static class UTF8
122
122
int extraLen = 0 ;
123
123
bool foundLeadingBytes = false ;
124
124
125
- for ( int i = 0 ; i <= 3 ; i ++ )
126
- {
127
- if ( i == 0 ) { continue ; } ; // we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior
128
- // TODO: written like this for readability, I know its ugly so this needs to be rewritten
129
- byte candidateByte = buf [ 0 - i ] ;
130
- Console . WriteLine ( "Checking Byte:" + candidateByte . ToString ( "X2" ) ) ;
131
-
132
- foundLeadingBytes = ( candidateByte & 0b11000000 ) != 0b10000000 ;
133
- if ( foundLeadingBytes )
125
+ // This was created in the context of incomplete tests: namely a gap is created when the SIMD vector is followed by a processremainingscalar
126
+ if ( ! prevWasUnterminated ) //
127
+ {
128
+ for ( int i = 0 ; i <= 3 ; i ++ )
134
129
{
135
- Console . WriteLine ( "Found leading byte at:" + i + ",Byte:" + candidateByte . ToString ( "X2" ) ) ;
136
- // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
130
+ if ( i == 0 ) { continue ; } ; // we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior
131
+ // TODO: written like this for readability, I know it's ugly so this needs to be rewritten
132
+ byte candidateByte = buf [ 0 - i ] ;
133
+ Console . WriteLine ( "Checking Byte:" + candidateByte . ToString ( "X2" ) ) ;
137
134
138
- // adjustment to avoid double counting
139
- if ( ( candidateByte & 0b11100000 ) == 0b11000000 ) // Start of a 2-byte sequence
140
- {
141
- Console . WriteLine ( "Found 2 byte" ) ;
142
- TempUtf16CodeUnitCountAdjustment -= 1 ;
143
- }
144
- if ( ( candidateByte & 0b11110000 ) == 0b11100000 ) // Start of a 3-byte sequence
145
- {
146
- Console . WriteLine ( "Found 3 byte" ) ;
147
- TempUtf16CodeUnitCountAdjustment -= 2 ;
148
- }
149
- if ( ( candidateByte & 0b11111000 ) == 0b11110000 ) // Start of a 4-byte sequence
135
+ foundLeadingBytes = ( candidateByte & 0b11000000 ) != 0b10000000 ;
136
+ if ( foundLeadingBytes )
150
137
{
151
- Console . WriteLine ( "Found 4 byte" ) ;
152
- TempUtf16CodeUnitCountAdjustment -= 2 ;
153
- TempScalarCountAdjustment -= 1 ;
138
+ Console . WriteLine ( "Double counting.Found leading byte at:" + i + ",Byte:" + candidateByte . ToString ( "X2" ) ) ;
139
+ // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
140
+
141
+ // adjustment to avoid double counting
142
+ if ( ( candidateByte & 0b11100000 ) == 0b11000000 ) // Start of a 2-byte sequence
143
+ {
144
+ Console . WriteLine ( "Found 2 byte" ) ;
145
+ TempUtf16CodeUnitCountAdjustment -= 1 ;
146
+ }
147
+ if ( ( candidateByte & 0b11110000 ) == 0b11100000 ) // Start of a 3-byte sequence
148
+ {
149
+ Console . WriteLine ( "Found 3 byte" ) ;
150
+ TempUtf16CodeUnitCountAdjustment -= 2 ;
151
+ }
152
+ if ( ( candidateByte & 0b11111000 ) == 0b11110000 ) // Start of a 4-byte sequence
153
+ {
154
+ Console . WriteLine ( "Found 4 byte" ) ;
155
+ TempUtf16CodeUnitCountAdjustment -= 2 ;
156
+ TempScalarCountAdjustment -= 1 ;
157
+ }
158
+ break ;
154
159
}
155
- break ;
156
160
}
157
161
}
158
-
159
162
for ( int i = 0 ; i <= howFarBack ; i ++ )
160
163
{
161
164
Console . WriteLine ( "backup stat:" + i ) ;
@@ -501,6 +504,7 @@ public static class UTF8
501
504
int TailUtf16CodeUnitCountAdjustment = 0 ;
502
505
503
506
bool prevWasSimd = false ;
507
+ bool prevWasUnterminated = false ;
504
508
505
509
506
510
if ( pInputBuffer == null || inputLength <= 0 )
@@ -836,39 +840,42 @@ public static class UTF8
836
840
837
841
Console . WriteLine ( "Backed up " + backedup + " bytes" ) ;
838
842
839
- for ( int k = backedup ; k < 3 ; k ++ )
840
- {
841
- int candidateByte = pInputBuffer [ processedLength - k ] ;
842
- if ( ( candidateByte & 0b11000000 ) == 0b11000000 )
843
- {
844
- // TODO:The SIMD code backs up by at most 3, but recall that the count is now done on prev3
845
- // This has the advantage that we KNOW that the bytes here are valid instead of having to take into account the error
846
- // if (k != 0)
847
- {
848
- if ( ( candidateByte & 0b11100000 ) == 0b11000000 ) // Start of a 2-byte sequence
849
- {
850
- TempUtf16CodeUnitCountAdjustment += 1 ;
851
- }
852
- if ( ( candidateByte & 0b11110000 ) == 0b11100000 ) // Start of a 3-byte sequence
853
- {
854
- TempUtf16CodeUnitCountAdjustment += 2 ;
855
- }
856
- if ( ( candidateByte & 0b11111000 ) == 0b11110000 ) // Start of a 4-byte sequence
857
- {
858
- Console . WriteLine ( "Found 4-byte" ) ;
859
- TempUtf16CodeUnitCountAdjustment += 2 ;
860
- TempScalarCountAdjustment += 1 ;
861
- }
862
- // break;
843
+ // for(int k = backedup; k < 3 ; k++)
844
+ // {
845
+ // int candidateByte = pInputBuffer[processedLength - k];
846
+ // if ((candidateByte & 0b11000000) == 0b11000000)
847
+ // {
848
+ // // TODO:The SIMD code backs up by at most 3, but recall that the count is now done on prev3
849
+ // // This has the advantage that we KNOW that the bytes here are valid instead of having to take into account the error
850
+ // // if (k != 0)
851
+ // {
852
+ // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
853
+ // {
854
+ // TempUtf16CodeUnitCountAdjustment += 1;
855
+ // }
856
+ // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
857
+ // {
858
+ // TempUtf16CodeUnitCountAdjustment += 2;
859
+ // }
860
+ // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
861
+ // {
862
+ // Console.WriteLine("Found 4-byte");
863
+ // TempUtf16CodeUnitCountAdjustment += 2;
864
+ // TempScalarCountAdjustment += 1;
865
+ // }
866
+ // // break;
863
867
864
- }
865
- }
866
- }
868
+ // }
869
+ // }
870
+ // }
867
871
868
872
869
873
Console . WriteLine ( "TempUTF16:" + TempUtf16CodeUnitCountAdjustment ) ;
870
874
Console . WriteLine ( "TempScalar:" + TempScalarCountAdjustment ) ;
871
875
Console . WriteLine ( "-----------------" ) ;
876
+
877
+ prevWasUnterminated = true ;
878
+ prevWasSimd = true ;
872
879
}
873
880
}
874
881
}
@@ -927,7 +934,7 @@ public static class UTF8
927
934
928
935
929
936
// byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment);
930
- byte * invalidBytePointer = SimdUnicode . UTF8 . RewindAndValidateWithErrorsRemaining ( 32 , pInputBuffer + processedLength , inputLength - processedLength , ref TailUtf16CodeUnitCountAdjustment , ref TailScalarCodeUnitCountAdjustment ) ;
937
+ byte * invalidBytePointer = SimdUnicode . UTF8 . RewindAndValidateWithErrorsRemaining ( 32 , pInputBuffer + processedLength , inputLength - processedLength , ref TailUtf16CodeUnitCountAdjustment , ref TailScalarCodeUnitCountAdjustment , prevWasUnterminated ) ;
931
938
if ( invalidBytePointer != pInputBuffer + inputLength )
932
939
{
933
940
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment ;
@@ -939,28 +946,38 @@ public static class UTF8
939
946
940
947
Console . WriteLine ( "TempUTF16 after tail remaining check:" + TempUtf16CodeUnitCountAdjustment ) ;
941
948
Console . WriteLine ( "TempScalar '' '' '':" + TempScalarCountAdjustment ) ;
949
+ // prevWasSimd = false;
950
+
951
+ }
952
+ else
953
+ if ( processedLength == inputLength && prevWasSimd ) { // without this there is a 3 byte gap at the end
954
+ Console . Write ( "Closing in the gap\n " ) ;
942
955
943
- } else if ( processedLength == inputLength && prevWasSimd ) {
944
- for ( int k = 0 ; k < 3 ; k ++ )
956
+ for ( int k = 0 ; k <= 3 ; k ++ )
945
957
{
946
- // There is no error here hence the loop is straigthforward and we avoid double counting every byte
958
+
959
+ // There is no error here hence the loop is straightforward and we avoid double counting every byte
947
960
int candidateByte = pInputBuffer [ processedLength - k ] ;
948
961
if ( ( candidateByte & 0b11000000 ) == 0b11000000 )
949
962
{
950
963
if ( ( candidateByte & 0b11100000 ) == 0b11000000 ) // Start of a 2-byte sequence
951
964
{
965
+ Console . Write ( "Found 2 byte \n " ) ;
966
+
952
967
TempUtf16CodeUnitCountAdjustment -= 1 ;
953
968
}
954
969
if ( ( candidateByte & 0b11110000 ) == 0b11100000 ) // Start of a 3-byte sequence
955
970
{
971
+ Console . Write ( "Found 3 byte \n " ) ;
956
972
TempUtf16CodeUnitCountAdjustment -= 2 ;
957
973
}
958
974
if ( ( candidateByte & 0b11111000 ) == 0b11110000 ) // Start of a 4-byte sequence
959
975
{
976
+ Console . Write ( "Found 4 byte \n " ) ;
960
977
TempUtf16CodeUnitCountAdjustment -= 2 ;
961
978
TempScalarCountAdjustment -= 1 ;
962
979
}
963
- break ;
980
+ // break;
964
981
}
965
982
}
966
983
}
0 commit comments