@@ -15,6 +15,9 @@ public static class UTF8
15
15
int TempUtf16CodeUnitCountAdjustment = 0 ;
16
16
int TempScalarCountAdjustment = 0 ;
17
17
18
+ int TailUtf16CodeUnitCountAdjustment = 0 ;
19
+ int TailScalarCountAdjustment = 0 ;
20
+
18
21
int howFarBack = priorBytes ;
19
22
int extraLen = 0 ;
20
23
bool foundLeadingBytes = false ;
@@ -24,27 +27,47 @@ public static class UTF8
24
27
foundLeadingBytes = ( b & 0b11000000 ) != 0b10000000 ;
25
28
if ( foundLeadingBytes )
26
29
{
30
+
31
+
32
+ if ( ( b & 0b11100000 ) == 0b11000000 ) // Start of a 2-byte sequence
33
+ {
34
+ TempUtf16CodeUnitCountAdjustment += 1 ;
35
+ }
36
+ if ( ( b & 0b11110000 ) == 0b11100000 ) // Start of a 3-byte sequence
37
+ {
38
+ TempUtf16CodeUnitCountAdjustment += 2 ;
39
+ }
40
+ if ( ( b & 0b11111000 ) == 0b11110000 ) // Start of a 4-byte sequence
41
+ {
42
+ TempUtf16CodeUnitCountAdjustment += 2 ;
43
+ TempScalarCountAdjustment += 1 ;
44
+ }
45
+
46
+
27
47
buf -= i ;
28
48
extraLen = i ;
29
49
break ;
30
50
}
31
51
}
52
+
53
+ utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment ;
54
+ scalarCountAdjustment += TempScalarCountAdjustment ;
55
+
56
+
32
57
if ( ! foundLeadingBytes )
33
58
{
34
- utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment ;
35
- scalarCountAdjustment += TempScalarCountAdjustment ;
36
59
return buf - howFarBack ;
37
60
}
38
61
39
- // TODO : fix Count handling here
62
+
40
63
41
64
42
65
// Now buf points to the start of a UTF-8 sequence or the start of the buffer.
43
66
// Validate from this new start point with the adjusted length.
44
- byte * invalidByte = GetPointerToFirstInvalidByteScalar ( buf , len + extraLen , out TempUtf16CodeUnitCountAdjustment , out TempScalarCountAdjustment ) ;
67
+ byte * invalidByte = GetPointerToFirstInvalidByteScalar ( buf , len + extraLen , out TailUtf16CodeUnitCountAdjustment , out TailScalarCountAdjustment ) ;
45
68
46
- utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment ;
47
- scalarCountAdjustment += TempScalarCountAdjustment ;
69
+ utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment ;
70
+ scalarCountAdjustment += TailScalarCountAdjustment ;
48
71
49
72
return invalidByte ;
50
73
}
@@ -220,6 +243,8 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
220
243
{
221
244
222
245
int processedLength = 0 ;
246
+ int TempUtf16CodeUnitCountAdjustment = 0 ;
247
+ int TempScalarCountAdjustment = 0 ;
223
248
224
249
if ( pInputBuffer == null || inputLength <= 0 )
225
250
{
@@ -309,7 +334,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
309
334
// return pInputBuffer + processedLength;
310
335
311
336
// Console.WriteLine("not ascii");
312
- return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( processedLength , pInputBuffer + processedLength , inputLength - processedLength ) ;
337
+ return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( processedLength , pInputBuffer + processedLength , inputLength - processedLength , ref TempUtf16CodeUnitCountAdjustment , ref TempScalarCountAdjustment ) ;
313
338
}
314
339
prevIncomplete = Vector128 < byte > . Zero ;
315
340
}
@@ -331,7 +356,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
331
356
Vector128 < byte > error = Sse2 . Xor ( must23As80 , sc ) ;
332
357
if ( Sse2 . MoveMask ( error ) != 0 )
333
358
{
334
- return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( processedLength , pInputBuffer + processedLength , inputLength - processedLength ) ;
359
+ return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( processedLength , pInputBuffer + processedLength , inputLength - processedLength , ref TempUtf16CodeUnitCountAdjustment , ref TempScalarCountAdjustment ) ;
335
360
}
336
361
prevIncomplete = Sse2 . SubtractSaturate ( currentBlock , maxValue ) ;
337
362
}
@@ -527,12 +552,12 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
527
552
{
528
553
529
554
// TODO/think about : this path is not explicitly tested
530
- // Console.WriteLine("----Checkpoint 1:All ASCII need rewind");
555
+ Console . WriteLine ( "----Checkpoint 1:All ASCII need rewind" ) ;
531
556
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment ;
532
557
scalarCountAdjustment = TempScalarCountAdjustment ;
533
558
534
559
// int off = processedLength >= 3 ? processedLength - 3 : processedLength;
535
- int off = 0 ;
560
+ int off = processedLength ;
536
561
537
562
if ( processedLength >= 32 + 3 ) {
538
563
off = processedLength - 32 - 3 ;
@@ -560,9 +585,10 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
560
585
}
561
586
}
562
587
}
563
- else { off = processedLength ; }
588
+ // else{ off = processedLength;}
564
589
565
- return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( off , pInputBuffer + off , inputLength - off ) ;
590
+ // return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
591
+ return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( off , pInputBuffer + off , inputLength - off , ref utf16CodeUnitCountAdjustment , ref scalarCountAdjustment ) ;
566
592
}
567
593
prevIncomplete = Vector256 < byte > . Zero ;
568
594
}
@@ -633,12 +659,14 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
633
659
Vector256 < byte > error = Avx2 . Xor ( must23As80 , sc ) ;
634
660
if ( ! Avx2 . TestZ ( error , error ) )
635
661
{
636
- // TODO: add error handling for Code count
662
+ Console . WriteLine ( "-----Error path!!" ) ;
637
663
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment ;
638
664
scalarCountAdjustment = TempScalarCountAdjustment ;
639
665
640
666
int off = processedLength >= 32 ? processedLength - 32 : processedLength ;
641
- return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( off , pInputBuffer + off , inputLength - off ) ;
667
+ // return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
668
+ return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( off , pInputBuffer + off , inputLength - off , ref utf16CodeUnitCountAdjustment , ref scalarCountAdjustment ) ;
669
+
642
670
}
643
671
prevIncomplete = Avx2 . SubtractSaturate ( currentBlock , maxValue ) ;
644
672
}
@@ -647,7 +675,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
647
675
if ( ! Avx2 . TestZ ( prevIncomplete , prevIncomplete ) )
648
676
{
649
677
650
- // Console.WriteLine("----Checkpoint 2:SIMD rewind");
678
+ Console . WriteLine ( "----Checkpoint 2:SIMD rewind" ) ;
651
679
// We have an unterminated sequence.
652
680
processedLength -= 3 ;
653
681
for ( int k = 0 ; k < 3 ; k ++ )
@@ -680,7 +708,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
680
708
if ( processedLength < inputLength )
681
709
{
682
710
683
- // Console.WriteLine("----Process remaining Scalar");
711
+ Console . WriteLine ( "----Process remaining Scalar" ) ;
684
712
int overlapCount = 0 ;
685
713
686
714
// // We need to possibly backtrack to the start of the last code point
@@ -736,6 +764,11 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
736
764
{
737
765
int processedLength = 0 ;
738
766
767
+ int TempUtf16CodeUnitCountAdjustment = 0 ;
768
+ int TempScalarCountAdjustment = 0 ;
769
+
770
+ int utf16CodeUnitCountAdjustment = 0 , scalarCountAdjustment = 0 ;
771
+
739
772
if ( pInputBuffer == null || inputLength <= 0 )
740
773
{
741
774
return pInputBuffer ;
@@ -817,7 +850,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
817
850
// we need to check if the previous block was incomplete.
818
851
if ( AdvSimd . Arm64 . MaxAcross ( prevIncomplete ) . ToScalar ( ) != 0 )
819
852
{
820
- return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( processedLength , pInputBuffer + processedLength , inputLength - processedLength ) ;
853
+ return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( processedLength , pInputBuffer + processedLength , inputLength - processedLength , ref utf16CodeUnitCountAdjustment , ref scalarCountAdjustment ) ;
821
854
}
822
855
prevIncomplete = Vector128 < byte > . Zero ;
823
856
}
@@ -839,7 +872,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
839
872
Vector128 < byte > error = AdvSimd . Xor ( must23As80 , sc ) ;
840
873
if ( AdvSimd . Arm64 . MaxAcross ( error ) . ToScalar ( ) != 0 )
841
874
{
842
- return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( processedLength , pInputBuffer + processedLength , inputLength - processedLength ) ;
875
+ return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( processedLength , pInputBuffer + processedLength , inputLength - processedLength , ref utf16CodeUnitCountAdjustment , ref scalarCountAdjustment ) ;
843
876
}
844
877
prevIncomplete = AdvSimd . SubtractSaturate ( currentBlock , maxValue ) ;
845
878
}
0 commit comments