@@ -76,12 +76,44 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
76
76
77
77
// Debugging aid: renders a byte as exactly eight binary digits, left-padded with '0'.
static Func<byte, string> byteToBinaryString = value =>
{
    string bits = Convert.ToString(value, 2);
    return bits.PadLeft(8, '0');
};
78
78
79
/// <summary>
/// Classifies <paramref name="headerByte"/> by its UTF-8 lead-byte bit pattern and
/// returns the pair of count adjustments associated with that class. Intended to
/// prevent double counting when a "too long" error falls on the edge.
/// </summary>
/// <param name="headerByte">The byte to classify.</param>
/// <returns>
/// (1, 0) for a 2-byte lead (110xxxxx), (2, 0) for a 3-byte lead (1110xxxx),
/// (2, 1) for a 4-byte lead (11110xxx), and (0, 0) for every other byte
/// (ASCII, continuation bytes, and 11111xxx).
/// </returns>
public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byte headerByte)
{
    // The three prefixes are mutually exclusive, so the order of the checks is irrelevant.

    // 11110xxx: lead byte of a 4-byte sequence.
    if ((headerByte >> 3) == 0b11110)
    {
        return (2, 1);
    }

    // 1110xxxx: lead byte of a 3-byte sequence.
    if ((headerByte >> 4) == 0b1110)
    {
        return (2, 0);
    }

    // 110xxxxx: lead byte of a 2-byte sequence.
    if ((headerByte >> 5) == 0b110)
    {
        return (1, 0);
    }

    // Anything else needs no adjustment.
    return (0, 0);
}
101
+
102
+
79
103
public unsafe static byte * RewindAndValidateWithErrors ( int howFarBack , byte * buf , int len , ref int utf16CodeUnitCountAdjustment , ref int scalarCountAdjustment )
80
104
{
81
105
82
106
int extraLen = 0 ;
83
107
bool foundLeadingBytes = false ;
84
108
109
+ // Remember where buf pointed before rewinding; compared with the first invalid byte pointer below
110
+ byte * PinputPlusProcessedlength = buf ;
111
+
112
+
113
+
114
+ int TooLongErroronEdgeUtfadjust = 0 ;
115
+ int TooLongErroronEdgeScalaradjust = 0 ;
116
+
85
117
for ( int i = 0 ; i <= howFarBack ; i ++ )
86
118
{
87
119
byte candidateByte = buf [ 0 - i ] ;
@@ -92,6 +124,8 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
92
124
if ( foundLeadingBytes )
93
125
{
94
126
127
+ ( TooLongErroronEdgeUtfadjust , TooLongErroronEdgeScalaradjust ) = GetFinalScalarUtfAdjustments ( candidateByte ) ;
128
+
95
129
buf -= i ;
96
130
break ;
97
131
}
@@ -108,6 +142,24 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
108
142
byte * invalidBytePointer = GetPointerToFirstInvalidByteScalar ( buf , len + extraLen , out TailUtf16CodeUnitCountAdjustment , out TailScalarCountAdjustment ) ;
109
143
// Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCountAdjustment}");
110
144
145
+ bool isContinuationByte = ( invalidBytePointer [ 0 ] & 0xC0 ) == 0x80 ;
146
+ bool isOneByteAfterProcessedLength = ( invalidBytePointer == PinputPlusProcessedlength ) ;
147
+
148
+
149
+
150
+ // // Print the byte value at the invalidBytePointer
151
+
152
+
153
+
154
+
155
+ if ( isContinuationByte && isOneByteAfterProcessedLength )
156
+ {
157
+
158
+ utf16CodeUnitCountAdjustment += TooLongErroronEdgeUtfadjust ;
159
+ scalarCountAdjustment += TooLongErroronEdgeScalaradjust ;
160
+
161
+ }
162
+
111
163
112
164
utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment ;
113
165
scalarCountAdjustment += TailScalarCountAdjustment ;
@@ -302,12 +354,12 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
302
354
( adjusttotalbyte , backedupByHowMuch , adjustascii , adjustcont , adjustn4 ) = adjustmentFactor ( pInputBuffer + processedLength ) ;
303
355
}
304
356
305
- if ( TooLongErroronEdge )
306
- {
307
- asciibytes += adjustascii ;
308
- contbytes += adjustcont ;
309
- n4 += adjustn4 ;
310
- }
357
+ // if (TooLongErroronEdge)
358
+ // {
359
+ // asciibytes += adjustascii;
360
+ // contbytes += adjustcont;
361
+ // n4 += adjustn4;
362
+ // }
311
363
312
364
var ( utfadjust , scalaradjust ) = CalculateN2N3FinalSIMDAdjustments ( asciibytes , n4 , contbytes , totalbyte + adjusttotalbyte ) ;
313
365
@@ -698,7 +750,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
698
750
if ( isContinuationByte && isOneByteAfterProcessedLength )
699
751
{
700
752
701
- TooLongErroronEdge = true ;
753
+ // TooLongErroronEdge = true;
702
754
}
703
755
704
756
0 commit comments