@@ -10,72 +10,6 @@ namespace SimdUnicode
10
10
public static class UTF8
11
11
{
12
12
13
-
14
- static void PrintHexAndBinary ( byte [ ] bytes , int highlightIndex = - 1 )
15
- {
16
- int chunkSize = 16 ; // 128 bits = 16 bytes
17
-
18
- // Process each chunk for hexadecimal
19
- Console . Write ( "Hex: " ) ;
20
- for ( int i = 0 ; i < bytes . Length ; i ++ )
21
- {
22
- if ( i > 0 && i % chunkSize == 0 )
23
- Console . WriteLine ( ) ; // New line after every 16 bytes
24
-
25
- if ( i == highlightIndex )
26
- {
27
- Console . ForegroundColor = ConsoleColor . Red ;
28
- Console . Write ( $ "{ bytes [ i ] : X2} ") ;
29
- Console . ResetColor ( ) ;
30
- }
31
- else if ( i % ( chunkSize * 2 ) == 0 ) // print green every 256 bytes
32
- {
33
- Console . ForegroundColor = ConsoleColor . Green ;
34
- Console . Write ( $ "{ bytes [ i ] : X2} ") ;
35
- Console . ResetColor ( ) ;
36
- }
37
- else
38
- {
39
- Console . Write ( $ "{ bytes [ i ] : X2} ") ;
40
- }
41
-
42
- if ( ( i + 1 ) % chunkSize != 0 ) Console . Write ( " " ) ; // Add space between bytes but not at the end of the line
43
- }
44
- Console . WriteLine ( "\n " ) ; // New line for readability and to separate hex from binary
45
-
46
- // Process each chunk for binary
47
- Console . Write ( "Binary: " ) ;
48
- for ( int i = 0 ; i < bytes . Length ; i ++ )
49
- {
50
- if ( i > 0 && i % chunkSize == 0 )
51
- Console . WriteLine ( ) ; // New line after every 16 bytes
52
-
53
- string binaryString = Convert . ToString ( bytes [ i ] , 2 ) . PadLeft ( 8 , '0' ) ;
54
- if ( i == highlightIndex )
55
- {
56
- Console . ForegroundColor = ConsoleColor . Red ;
57
- Console . Write ( $ "{ binaryString } ") ;
58
- Console . ResetColor ( ) ;
59
- }
60
- else if ( i % ( chunkSize * 2 ) == 0 ) // print green every 256 bytes
61
- {
62
- Console . ForegroundColor = ConsoleColor . Green ;
63
- Console . Write ( $ "{ binaryString } ") ;
64
- Console . ResetColor ( ) ;
65
- }
66
- else
67
- {
68
- Console . Write ( $ "{ binaryString } ") ;
69
- }
70
-
71
- if ( ( i + 1 ) % chunkSize != 0 ) Console . Write ( " " ) ; // Add space between bytes but not at the end of the line
72
- }
73
- Console . WriteLine ( ) ; // New line for readability
74
- }
75
-
76
-
77
- static Func < byte , string > byteToBinaryString = b => Convert . ToString ( b , 2 ) . PadLeft ( 8 , '0' ) ; //for debugging
78
-
79
13
// prevents double counting in case there is a toolong error on the edge
80
14
public static ( int utfAdjust , int scalarAdjust ) GetFinalScalarUtfAdjustments ( byte headerByte )
81
15
{
@@ -92,7 +26,6 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
92
26
// Check if the header byte belongs to a 4-byte UTF-8 character
93
27
else if ( ( headerByte & 0b11111000 ) == 0b11110000 )
94
28
{
95
-
96
29
return ( 2 , 1 ) ;
97
30
}
98
31
// Otherwise, it's a 1-byte character or continuation byte
@@ -107,10 +40,7 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
107
40
bool foundLeadingBytes = false ;
108
41
109
42
// Print the byte value at the buf pointer
110
- byte * PinputPlusProcessedlength = buf ;
111
-
112
-
113
-
43
+ byte * PinputPlusProcessedlength = buf ;
114
44
int TooLongErroronEdgeUtfadjust = 0 ;
115
45
int TooLongErroronEdgeScalaradjust = 0 ;
116
46
@@ -119,8 +49,6 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
119
49
byte candidateByte = buf [ 0 - i ] ;
120
50
foundLeadingBytes = ( candidateByte & 0b11000000 ) != 0b10000000 ;
121
51
122
-
123
-
124
52
if ( foundLeadingBytes )
125
53
{
126
54
@@ -140,27 +68,26 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
140
68
int TailScalarCountAdjustment = 0 ;
141
69
142
70
byte * invalidBytePointer = GetPointerToFirstInvalidByteScalar ( buf , len + extraLen , out TailUtf16CodeUnitCountAdjustment , out TailScalarCountAdjustment ) ;
143
- // Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCountAdjustment}");
144
-
145
- bool isContinuationByte = ( invalidBytePointer [ 0 ] & 0xC0 ) == 0x80 ;
146
- bool isOneByteAfterProcessedLength = ( invalidBytePointer == PinputPlusProcessedlength ) ;
147
-
148
-
149
-
150
- // // Print the byte value at the invalidBytePointer
151
-
152
71
72
+ // We need to take care of eg
73
+ // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
74
+ // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 *11110000* 10011001 10101011 10000011
75
+ // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge
76
+ // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100
77
+ // Without the following check, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
78
+ // Normally, if there is an error, this does not cause an issue: most erroneous UTF-8 units will not be counted
79
+ // but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10-----
80
+ // the part between parentheses will be counted as valid and thus scalaradjust/utfadjust will be incremented one time too many
153
81
82
+ bool isContinuationByte = ( invalidBytePointer [ 0 ] & 0xC0 ) == 0x80 ;
83
+ bool isOnEdge = ( invalidBytePointer == PinputPlusProcessedlength ) ;
154
84
155
- if ( isContinuationByte && isOneByteAfterProcessedLength )
85
+ if ( isContinuationByte && isOnEdge )
156
86
{
157
-
158
87
utf16CodeUnitCountAdjustment += TooLongErroronEdgeUtfadjust ;
159
88
scalarCountAdjustment += TooLongErroronEdgeScalaradjust ;
160
-
161
89
}
162
90
163
-
164
91
utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment ;
165
92
scalarCountAdjustment += TailScalarCountAdjustment ;
166
93
@@ -295,7 +222,7 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
295
222
const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS ;
296
223
297
224
// Assuming that a valid UTF-8 sequence ends at pInputBuffer,
298
- // computes how many bytes are needed (eg what type of byte) to complete the last character. also counts the number of n4, n2 and ascii affected
225
+ // computes how many bytes are needed to complete the last character. also counts the number of n4, n2 and ascii affected
299
226
// This will return 1, 2, 3. If the whole byte sequence is valid UTF-8,
300
227
// and this function returns returnedvalue>0, then the bytes at pInputBuffer[0],
301
228
// ... pInputBuffer[returnedvalue - 1] should be continuation bytes.
@@ -309,8 +236,6 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
309
236
{
310
237
if ( ( pInputBuffer [ - i ] & 0b11000000 ) != 0b10000000 )
311
238
{
312
-
313
-
314
239
break ;
315
240
}
316
241
contbyteadjust -= 1 ;
@@ -330,19 +255,15 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
330
255
331
256
public static ( int utfadjust , int scalaradjust ) CalculateN2N3FinalSIMDAdjustments ( int asciibytes , int n4 , int contbytes , int totalbyte )
332
257
{
333
-
334
-
335
258
int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte ;
336
259
int n2 = - 2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte ;
337
260
int utfadjust = - 2 * n4 - 2 * n3 - n2 ;
338
261
int scalaradjust = - n4 ;
339
262
340
-
341
-
342
263
return ( utfadjust , scalaradjust ) ;
343
264
}
344
265
345
- public unsafe static ( int utfadjust , int scalaradjust ) calculateErrorPathadjust ( int start_point , int processedLength , byte * pInputBuffer , int asciibytes , int n4 , int contbytes , bool TooLongErroronEdge = false )
266
+ public unsafe static ( int utfadjust , int scalaradjust ) calculateErrorPathadjust ( int start_point , int processedLength , byte * pInputBuffer , int asciibytes , int n4 , int contbytes )
346
267
{
347
268
// Calculate the total bytes from start_point to processedLength
348
269
int totalbyte = processedLength - start_point ;
@@ -353,21 +274,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
353
274
{
354
275
( adjusttotalbyte , backedupByHowMuch , adjustascii , adjustcont , adjustn4 ) = adjustmentFactor ( pInputBuffer + processedLength ) ;
355
276
}
356
-
357
- // if (TooLongErroronEdge)
358
- // {
359
- // asciibytes += adjustascii;
360
- // contbytes += adjustcont;
361
- // n4 += adjustn4;
362
- // }
363
-
364
277
var ( utfadjust , scalaradjust ) = CalculateN2N3FinalSIMDAdjustments ( asciibytes , n4 , contbytes , totalbyte + adjusttotalbyte ) ;
365
-
366
278
return ( utfadjust , scalaradjust ) ;
367
279
}
368
280
369
-
370
-
371
281
public unsafe static byte * GetPointerToFirstInvalidByteSse ( byte * pInputBuffer , int inputLength )
372
282
{
373
283
@@ -522,10 +432,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
522
432
523
433
public unsafe static byte * GetPointerToFirstInvalidByteAvx2 ( byte * pInputBuffer , int inputLength , out int utf16CodeUnitCountAdjustment , out int scalarCountAdjustment )
524
434
{
525
-
526
-
527
-
528
-
529
435
int processedLength = 0 ;
530
436
int TempUtf16CodeUnitCountAdjustment = 0 ;
531
437
int TempScalarCountAdjustment = 0 ;
@@ -678,7 +584,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
678
584
//
679
585
if ( ! Avx2 . TestZ ( prevIncomplete , prevIncomplete ) )
680
586
{
681
- // TODO : this path is not explicitly tested, write tests
587
+ // Note/todo : this path is not yet explicitly tested
682
588
int totalbyteasciierror = processedLength - start_point ;
683
589
var ( utfadjustasciierror , scalaradjustasciierror ) = CalculateN2N3FinalSIMDAdjustments ( asciibytes , n4 , contbytes , totalbyteasciierror ) ;
684
590
@@ -713,49 +619,13 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
713
619
714
620
if ( ! Avx2 . TestZ ( error , error ) )
715
621
{
716
-
717
-
718
622
int off = processedLength > 32 ? processedLength - 32 : processedLength ; // this does not back up if processedLength = 32
719
-
720
-
721
623
byte * invalidBytePointer = SimdUnicode . UTF8 . RewindAndValidateWithErrors ( off , pInputBuffer + processedLength , inputLength - processedLength , ref TailUtf16CodeUnitCountAdjustment , ref TailScalarCodeUnitCountAdjustment ) ;
722
- bool TooLongErroronEdge = false ;
723
-
724
624
utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment ;
725
625
scalarCountAdjustment = TailScalarCodeUnitCountAdjustment ;
726
626
727
-
728
-
729
- // We need to take care of eg
730
- // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
731
- // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011
732
- // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge
733
- // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100
734
- // In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
735
- // Normally , if there is an error, this does not cause an issue: most erronous utf-8 unit will not be counted
736
- // but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10-----
737
- // the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too much
738
- // If this error arrive at the edge of 2 simd vector, that is where problem abound
739
-
740
- // Calculate the offset of the invalid byte pointer from the start of the input buffer
741
- ulong offsetFromStart = ( ulong ) ( invalidBytePointer - pInputBuffer ) ;
742
-
743
- // Debugging output
744
-
745
- bool isContinuationByte = ( invalidBytePointer [ 0 ] & 0xC0 ) == 0x80 ;
746
-
747
- bool isOneByteAfterProcessedLength = ( invalidBytePointer == pInputBuffer + processedLength ) ;
748
-
749
-
750
- if ( isContinuationByte && isOneByteAfterProcessedLength )
751
- {
752
-
753
- // TooLongErroronEdge = true;
754
- }
755
-
756
-
757
627
int totalbyteasciierror = processedLength - start_point ;
758
- var ( utfadjustasciierror , scalaradjustasciierror ) = calculateErrorPathadjust ( start_point , processedLength , pInputBuffer , asciibytes , n4 , contbytes , TooLongErroronEdge ) ;
628
+ var ( utfadjustasciierror , scalaradjustasciierror ) = calculateErrorPathadjust ( start_point , processedLength , pInputBuffer , asciibytes , n4 , contbytes ) ;
759
629
760
630
utf16CodeUnitCountAdjustment += utfadjustasciierror ;
761
631
scalarCountAdjustment += scalaradjustasciierror ;
@@ -769,13 +639,17 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
769
639
{
770
640
// We have an unterminated sequence.
771
641
var ( totalbyteadjustment , i , tempascii , tempcont , tempn4 ) = adjustmentFactor ( pInputBuffer + processedLength + 32 ) ;
772
-
773
642
processedLength -= i ;
774
643
n4 += tempn4 ;
775
644
contbytes += tempcont ;
776
-
777
645
}
778
646
647
+ // (Nick Nuon) The counts for continuation bytes can probably be optimized:
648
+ // The draft had something like this line:
649
+ // contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc));
650
+ // this actually counts the number of 2 consecutive continuation bytes
651
+ // I put in something that was bound to work regardless, as a slow but temporary fix:
652
+
779
653
Vector256 < byte > top2bits = Vector256 . Create ( ( byte ) 0b11000000 ) ; // Mask to isolate the two most significant bits
780
654
Vector256 < byte > contbytemask = Vector256 . Create ( ( byte ) 0b10000000 ) ; // The expected pattern for continuation bytes: 10xxxxxx
781
655
@@ -797,10 +671,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
797
671
asciibytes += ( int ) ( 32 - Popcnt . PopCount ( ( uint ) mask ) ) ;
798
672
}
799
673
800
- // There are 2 possible scenarios here : either
801
- // A) it arrives flush en the border. eg it doesnt need to be processed further
802
- // B) There is some bytes remaining in which case we need to call the scalar functien
803
- // Either way we need to calculate n2,n3 and update the utf16adjust and scalar adjust
804
674
int totalbyte = processedLength - start_point ;
805
675
var ( utf16adjust , scalaradjust ) = CalculateN2N3FinalSIMDAdjustments ( asciibytes , n4 , contbytes , totalbyte ) ;
806
676
0 commit comments