@@ -10,7 +10,7 @@ namespace SimdUnicode
10
10
public static class UTF8
11
11
{
12
12
13
- // helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index
13
+ // debug helper: it prints a green byte every 32 bytes and a red byte at a given index
14
14
static void PrintHexAndBinary ( byte [ ] bytes , int highlightIndex = - 1 )
15
15
{
16
16
int chunkSize = 16 ; // 128 bits = 16 bytes
@@ -78,19 +78,20 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
78
78
79
79
public unsafe static byte * RewindAndValidateWithErrors ( int howFarBack , byte * buf , int len , ref int utf16CodeUnitCountAdjustment , ref int scalarCountAdjustment )
80
80
{
81
-
82
- int TempUtf16CodeUnitCountAdjustment = 0 ;
83
- int TempScalarCountAdjustment = 0 ;
84
-
81
+ // Console.WriteLine("CALLING REWIND");
85
82
int extraLen = 0 ;
86
83
bool foundLeadingBytes = false ;
87
84
88
85
for ( int i = 0 ; i <= howFarBack ; i ++ )
89
86
{
90
87
byte candidateByte = buf [ 0 - i ] ;
91
88
foundLeadingBytes = ( candidateByte & 0b11000000 ) != 0b10000000 ;
89
+ Console . WriteLine ( $ "Rewinding byte to offset { - i } : { candidateByte : X2} ") ;
90
+ Console . WriteLine ( foundLeadingBytes ) ;
91
+
92
92
if ( foundLeadingBytes )
93
- {
93
+ {
94
+ Console . WriteLine ( "Found leading byte" ) ;
94
95
buf -= i ;
95
96
break ;
96
97
}
@@ -101,13 +102,12 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
101
102
return buf - howFarBack ;
102
103
}
103
104
104
- utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment ;
105
- scalarCountAdjustment += TempScalarCountAdjustment ;
106
-
107
105
int TailUtf16CodeUnitCountAdjustment = 0 ;
108
106
int TailScalarCountAdjustment = 0 ;
109
107
110
108
byte * invalidBytePointer = GetPointerToFirstInvalidByteScalar ( buf , len + extraLen , out TailUtf16CodeUnitCountAdjustment , out TailScalarCountAdjustment ) ;
109
+ // Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCountAdjustment}");
110
+
111
111
112
112
utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment ;
113
113
scalarCountAdjustment += TailScalarCountAdjustment ;
@@ -219,7 +219,7 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
219
219
}
220
220
else
221
221
{
222
- // we may have a continuation
222
+ // we may have a continuation byte or a too-long error
223
223
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment ;
224
224
scalarCountAdjustment = TempScalarCountAdjustment ;
225
225
return pInputBuffer + pos ;
@@ -257,12 +257,11 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
257
257
{
258
258
if ( ( pInputBuffer [ - i ] & 0b11000000 ) != 0b10000000 )
259
259
{
260
- string binaryString = Convert . ToString ( pInputBuffer [ - i ] , 2 ) . PadLeft ( 8 , '0' ) ;
261
- // Console.WriteLine($"Stopping at byte {binaryString}"); //debug
260
+ string binaryString = Convert . ToString ( pInputBuffer [ - i ] , 2 ) . PadLeft ( 8 , '0' ) ; //debug
261
+ Console . WriteLine ( $ "Stopping at byte { binaryString } ") ; //debug
262
262
break ;
263
263
}
264
264
contbyteadjust -= 1 ;
265
-
266
265
}
267
266
if ( ( pInputBuffer [ - i ] & 0b10000000 ) == 0 ) {
268
267
return ( 0 , i , - 1 , contbyteadjust , 0 ) ; // We must have that i == 1
@@ -279,19 +278,41 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
279
278
280
279
public static ( int utfadjust , int scalaradjust ) CalculateN2N3FinalSIMDAdjustments ( int asciibytes , int n4 , int contbytes , int totalbyte )
281
280
{
282
- // Console.WriteLine("---------"); //debug
283
- // Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug
281
+ Console . WriteLine ( "---------" ) ; //debug
282
+ Console . WriteLine ( "CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte ) ; //debug
284
283
int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte ;
285
284
int n2 = - 2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte ;
286
285
int utfadjust = - 2 * n4 - 2 * n3 - n2 ;
287
286
int scalaradjust = - n4 ;
288
287
289
- // Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug
288
+ Console . WriteLine ( "CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust ) ; //debug
290
289
291
290
return ( utfadjust , scalaradjust ) ;
292
291
}
293
292
294
- public unsafe static ( int utfadjust , int scalaradjust ) calculateErrorPathadjust ( int start_point , int processedLength , byte * pInputBuffer , int asciibytes , int n4 , int contbytes )
293
+ // public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) //todo: add an extra bool parameter 'TooLongErroronEdge' which defaults to false
294
+ // {
295
+ // // Calculate the total bytes from start_point to processedLength
296
+ // int totalbyte = processedLength - start_point;
297
+ // int adjusttotalbyte = 0, backedupByHowMuch = 0, adjustascii = 0, adjustcont = 0, adjustn4 = 0;
298
+
299
+ // // Adjust the length to include a complete character, if necessary
300
+ // if (totalbyte > 0)
301
+ // {
302
+ // (adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength);
303
+ // }
304
+
305
+ // // Pseudocode:
306
+ // // if 'TooLongErroronEdge' bool is true then
307
+ // // subtract (remove) adjustascii, adjustcont, adjustn4 from their respective counterparts in the following function:
308
+
309
+ // var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4 , contbytes , totalbyte + adjusttotalbyte);
310
+
311
+
312
+ // return (utfadjust, scalaradjust);
313
+ // }
314
+
315
+ public unsafe static ( int utfadjust , int scalaradjust ) calculateErrorPathadjust ( int start_point , int processedLength , byte * pInputBuffer , int asciibytes , int n4 , int contbytes , bool TooLongErroronEdge = false )
295
316
{
296
317
// Calculate the total bytes from start_point to processedLength
297
318
int totalbyte = processedLength - start_point ;
@@ -300,17 +321,25 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
300
321
// Adjust the length to include a complete character, if necessary
301
322
if ( totalbyte > 0 )
302
323
{
303
- ( adjusttotalbyte , backedupByHowMuch , adjustascii , adjustcont , adjustn4 ) = adjustmentFactor ( pInputBuffer + processedLength ) ;
324
+ ( adjusttotalbyte , backedupByHowMuch , adjustascii , adjustcont , adjustn4 ) = adjustmentFactor ( pInputBuffer + processedLength ) ;
304
325
}
305
326
306
- // var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes + adjustascii, n4 + adjustn4, contbytes + adjustcont, totalbyte + adjusttotalbyte);
307
- var ( utfadjust , scalaradjust ) = CalculateN2N3FinalSIMDAdjustments ( asciibytes , n4 , contbytes , totalbyte + adjusttotalbyte ) ;
327
+ // Adjust the counters if 'TooLongErroronEdge' is true
328
+ if ( TooLongErroronEdge )
329
+ {
330
+ // If you can figure out why this makes a difference, you're golden
331
+ asciibytes += adjustascii ;
332
+ contbytes += adjustcont ;
333
+ n4 += adjustn4 ;
334
+ }
308
335
336
+ var ( utfadjust , scalaradjust ) = CalculateN2N3FinalSIMDAdjustments ( asciibytes , n4 , contbytes , totalbyte + adjusttotalbyte ) ;
309
337
310
338
return ( utfadjust , scalaradjust ) ;
311
339
}
312
340
313
341
342
+
314
343
public unsafe static byte * GetPointerToFirstInvalidByteSse ( byte * pInputBuffer , int inputLength )
315
344
{
316
345
@@ -465,9 +494,9 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
465
494
466
495
public unsafe static byte * GetPointerToFirstInvalidByteAvx2 ( byte * pInputBuffer , int inputLength , out int utf16CodeUnitCountAdjustment , out int scalarCountAdjustment )
467
496
{
468
- // Console.ForegroundColor = ConsoleColor.Blue; //debug
469
- // Console.WriteLine("-------------------------------------");//debug
470
- // Console.ResetColor();//debug
497
+ Console . ForegroundColor = ConsoleColor . Blue ; //debug
498
+ Console . WriteLine ( "-------------------------------------" ) ; //debug
499
+ Console . ResetColor ( ) ; //debug
471
500
472
501
int processedLength = 0 ;
473
502
int TempUtf16CodeUnitCountAdjustment = 0 ;
@@ -659,23 +688,100 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
659
688
Vector256 < byte > must23 = Avx2 . Or ( isThirdByte , isFourthByte ) ;
660
689
Vector256 < byte > must23As80 = Avx2 . And ( must23 , v80 ) ;
661
690
Vector256 < byte > error = Avx2 . Xor ( must23As80 , sc ) ;
662
- if ( ! Avx2 . TestZ ( error , error ) )
663
- {
664
- // Console.WriteLine($"--Error! @ {processedLength} bytes");//debug
665
- int totalbyteasciierror = processedLength - start_point ;
666
- var ( utfadjustasciierror , scalaradjustasciierror ) = calculateErrorPathadjust ( start_point , processedLength , pInputBuffer , asciibytes , n4 , contbytes ) ;
691
+ // if (!Avx2.TestZ(error, error))
692
+ // {
693
+ // Console.WriteLine($"--Error! @ {processedLength} bytes");//debug
667
694
668
- utf16CodeUnitCountAdjustment = utfadjustasciierror ;
669
- scalarCountAdjustment = scalaradjustasciierror ;
695
+ // int off = processedLength >= 32 ? processedLength - 32 : processedLength;
696
+ // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
697
+
698
+ // utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment;
699
+ // scalarCountAdjustment = TailScalarCodeUnitCountAdjustment;
700
+
701
+ // // We need to take care of eg
702
+ // // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
703
+ // // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011
704
+ // // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge
705
+ // // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100
706
+ // // In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
707
+ // // Normally , if there is an error, this does not cause an issue: most erronous utf-8 unit will not be counted
708
+ // // but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10-----
709
+ // // the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too much
710
+ // // If this error arrive at the edge of 2 simd vector, that is where problem abound: the rewind scalar function will backup
711
+
712
+ // // so in short , we want to solve this error while at the same time not disturbing anything else
713
+ // // we know that there is a continuation on the edge eg at the 64 byte, we need te check that
714
+ // // *TODO:Fill code here *
715
+ // // Peudocode for now
716
+ // // if invalidbyte is of typo 10XX XXXX & invalidbyto pointer % 32 byte == 0 then
717
+ // // pass on true to the
718
+
719
+
720
+ // int totalbyteasciierror = processedLength - start_point;
721
+ // var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes);
722
+
723
+ // utf16CodeUnitCountAdjustment += utfadjustasciierror;
724
+ // scalarCountAdjustment += scalaradjustasciierror;
725
+
726
+ // TailScalarCodeUnitCountAdjustment =0;
727
+ // TailUtf16CodeUnitCountAdjustment =0;
670
728
671
- TailScalarCodeUnitCountAdjustment = 0 ;
672
- TailUtf16CodeUnitCountAdjustment = 0 ;
729
+
730
+
731
+ // return invalidBytePointer;
732
+ // }
733
+
734
+ if ( ! Avx2 . TestZ ( error , error ) )
735
+ {
736
+ Console . WriteLine ( $ "--Error! @ { processedLength } bytes") ; //debug
673
737
674
738
int off = processedLength >= 32 ? processedLength - 32 : processedLength ;
675
739
byte * invalidBytePointer = SimdUnicode . UTF8 . RewindAndValidateWithErrors ( off , pInputBuffer + processedLength , inputLength - processedLength , ref TailUtf16CodeUnitCountAdjustment , ref TailScalarCodeUnitCountAdjustment ) ;
740
+ bool TooLongErroronEdge = false ;
741
+
742
+ utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment ;
743
+ scalarCountAdjustment = TailScalarCodeUnitCountAdjustment ;
744
+
745
+ Console . WriteLine ( $ "RewindScalarValidation's function utf16adjust:{ TailUtf16CodeUnitCountAdjustment } , scalaradjust:{ TailScalarCodeUnitCountAdjustment } ") ;
746
+
747
+ // We need to take care of inputs such as, e.g.:
748
+ // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
749
+ // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011
750
+ // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge
751
+ // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100
752
+ // In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
753
+ // Normally, if there is an error, this does not cause an issue: most erroneous UTF-8 units will not be counted,
754
+ // but it is an issue in the too-long case: if you take, for example, (1111---- 10------ 10------ 10------) 10------,
755
+ // the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too often.
756
+ // If this error occurs at the edge between two SIMD vectors, that is where problems arise: the rewind scalar function will back up
757
+
758
+ // So, in short, we want to solve this error while at the same time not disturbing anything else.
759
+ // We know that there is a continuation byte on the edge, e.g. at the 64th byte; we need to check that.
760
+ // *TODO:Fill code here *
761
+ // Pseudocode for now:
762
+ // if the invalid byte is of type 10XX XXXX and the invalid byte pointer % 32 bytes == 0 then
763
+ // pass true on to calculateErrorPathadjust via its TooLongErroronEdge parameter
764
+
765
+ // Calculate the offset of the invalid byte pointer from the start of the input buffer
766
+ ulong offsetFromStart = ( ulong ) ( invalidBytePointer - pInputBuffer ) ;
767
+
768
+ // Check whether the invalid byte is a continuation byte located exactly at pInputBuffer + processedLength
769
+ bool isContinuationByte = ( invalidBytePointer [ 0 ] & 0xC0 ) == 0x80 ;
770
+ bool isOneByteAfterProcessedLength = ( invalidBytePointer == pInputBuffer + processedLength ) ;
771
+
772
+ // if (isContinuationByte && isAtBoundary && isOneByteAfterProcessedLength)// this alone creates false positives
773
+ if ( isContinuationByte && isOneByteAfterProcessedLength )
774
+ {
775
+ Console . WriteLine ( "Triggering TooLongErrorOnEdge adjustment" ) ;
776
+ TooLongErroronEdge = true ;
777
+ }
676
778
677
- utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment ;
678
- scalarCountAdjustment += TailScalarCodeUnitCountAdjustment ;
779
+
780
+ int totalbyteasciierror = processedLength - start_point ;
781
+ var ( utfadjustasciierror , scalaradjustasciierror ) = calculateErrorPathadjust ( start_point , processedLength , pInputBuffer , asciibytes , n4 , contbytes , TooLongErroronEdge ) ;
782
+
783
+ utf16CodeUnitCountAdjustment += utfadjustasciierror ;
784
+ scalarCountAdjustment += scalaradjustasciierror ;
679
785
680
786
return invalidBytePointer ;
681
787
}
@@ -690,7 +796,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
690
796
processedLength -= i ;
691
797
n4 += tempn4 ; // this is + because the adjustment function returns something negative already
692
798
contbytes += tempcont ;
693
- // Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug
799
+ Console . WriteLine ( $ "Unterminated! @ { processedLength } Backing up by { i } ") ; //debug
694
800
}
695
801
696
802
@@ -763,6 +869,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
763
869
{
764
870
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment ;
765
871
scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment ;
872
+
766
873
// An invalid byte was found by the scalar function
767
874
return invalidBytePointer ;
768
875
}
0 commit comments