@@ -70,43 +70,6 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
70
70
}
71
71
}
72
72
73
-
74
-
75
- // public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment, int skippedBytes = 0)
76
- // {
77
- // utf16CodeUnitCountAdjustment = 0;
78
- // scalarCountAdjustment = 0;
79
-
80
- // // Call the original function first. Assuming GetPointerToFirstInvalidByteOriginal exists and does the primary checking.
81
- // byte* result = GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
82
-
83
- // // If the adjustments are still 0 and there are skipped bytes to consider,
84
- // // loop through the skipped bytes and adjust the counts as needed.
85
- // if (utf16CodeUnitCountAdjustment == 0 && scalarCountAdjustment == 0 && skippedBytes > 0)
86
- // {
87
- // for (int i = 0; i < skippedBytes; i++)
88
- // {
89
- // byte currentByte = *(pInputBuffer + i);
90
- // if (currentByte >= 0xC0 && currentByte < 0xE0)
91
- // {
92
- // // 2-byte sequence
93
- // utf16CodeUnitCountAdjustment -= 1; // Adjust according to your logic
94
- // scalarCountAdjustment -= 1;
95
- // }
96
- // else if ((currentByte >= 0xE0 && currentByte < 0xF0) || (currentByte >= 0xF0))
97
- // {
98
- // // 3-byte or 4-byte sequence
99
- // utf16CodeUnitCountAdjustment -= 1; // This might need to be adjusted based on your specific logic for 3-byte and 4-byte sequences
100
- // scalarCountAdjustment -= 1;
101
- // }
102
- // // Adjust for other conditions as necessary
103
- // }
104
- // }
105
-
106
- // return result; // Return the pointer from the original check
107
- // }
108
-
109
-
110
73
public unsafe static byte * GetPointerToFirstInvalidByteScalar ( byte * pInputBuffer , int inputLength , out int utf16CodeUnitCountAdjustment , out int scalarCountAdjustment )
111
74
{
112
75
@@ -400,6 +363,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
400
363
401
364
public unsafe static byte * GetPointerToFirstInvalidByteAvx2 ( byte * pInputBuffer , int inputLength , out int utf16CodeUnitCountAdjustment , out int scalarCountAdjustment )
402
365
{
366
+ Console . WriteLine ( "--------------------------Calling function----------------------------------" ) ;
403
367
int processedLength = 0 ;
404
368
int TempUtf16CodeUnitCountAdjustment = 0 ;
405
369
int TempScalarCountAdjustment = 0 ;
@@ -551,10 +515,42 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
551
515
if ( ! Avx2 . TestZ ( prevIncomplete , prevIncomplete ) )
552
516
{
553
517
518
+ // TODO/think about: this path is not explicitly tested
519
+ Console . WriteLine ( "----Checkpoint 1:All ASCII need rewind" ) ;
554
520
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment ;
555
521
scalarCountAdjustment = TempScalarCountAdjustment ;
556
522
557
- int off = processedLength >= 3 ? processedLength - 3 : processedLength ;
523
+ // int off = processedLength >= 3 ? processedLength - 3 : processedLength;
524
+ int off = 0 ;
525
+
526
+ if ( processedLength >= 32 + 3 ) {
527
+ off = processedLength - 32 - 3 ;
528
+ int overlapCount = 3 ;
529
+
530
+ for ( int k = 0 ; k < overlapCount ; k ++ )
531
+ {
532
+
533
+ int candidateByte = pInputBuffer [ processedLength + k ] ;
534
+ if ( ( candidateByte & 0b11000000 ) == 0b11000000 )
535
+ {
536
+ if ( ( candidateByte & 0b11100000 ) == 0b11000000 ) // Start of a 2-byte sequence
537
+ {
538
+ TempUtf16CodeUnitCountAdjustment += 1 ;
539
+ }
540
+ if ( ( candidateByte & 0b11110000 ) == 0b11100000 ) // Start of a 3-byte sequence
541
+ {
542
+ TempUtf16CodeUnitCountAdjustment += 2 ;
543
+ }
544
+ if ( ( candidateByte & 0b11111000 ) == 0b11110000 ) // Start of a 4-byte sequence
545
+ {
546
+ TempUtf16CodeUnitCountAdjustment += 2 ;
547
+ TempScalarCountAdjustment += 1 ;
548
+ }
549
+ }
550
+ }
551
+ }
552
+ else { off = processedLength ; }
553
+
558
554
return SimdUnicode . UTF8 . RewindAndValidateWithErrors ( off , pInputBuffer + off , inputLength - off ) ;
559
555
}
560
556
prevIncomplete = Vector256 < byte > . Zero ;
@@ -626,7 +622,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
626
622
Vector256 < byte > error = Avx2 . Xor ( must23As80 , sc ) ;
627
623
if ( ! Avx2 . TestZ ( error , error ) )
628
624
{
629
-
625
+ // TODO: add error handling for Code count
630
626
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment ;
631
627
scalarCountAdjustment = TempScalarCountAdjustment ;
632
628
@@ -639,13 +635,15 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
639
635
640
636
if ( ! Avx2 . TestZ ( prevIncomplete , prevIncomplete ) )
641
637
{
638
+
639
+ Console . WriteLine ( "----Checkpoint 2:SIMD rewind" ) ;
642
640
// We have an unterminated sequence.
643
641
processedLength -= 3 ;
644
642
for ( int k = 0 ; k < 3 ; k ++ )
645
643
{
646
644
647
645
int candidateByte = pInputBuffer [ processedLength + k ] ;
648
- if ( ( pInputBuffer [ processedLength + k ] & 0b11000000 ) == 0b11000000 )
646
+ if ( ( candidateByte & 0b11000000 ) == 0b11000000 )
649
647
{
650
648
if ( ( candidateByte & 0b11100000 ) == 0b11000000 ) // Start of a 2-byte sequence
651
649
{
@@ -660,15 +658,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
660
658
TempUtf16CodeUnitCountAdjustment += 2 ;
661
659
TempScalarCountAdjustment += 1 ;
662
660
}
663
-
664
-
665
- processedLength += k ;
666
- // break;
667
-
668
661
}
669
-
670
-
671
-
672
662
}
673
663
}
674
664
}
@@ -678,10 +668,40 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
678
668
// Process the remaining bytes with the scalar function
679
669
if ( processedLength < inputLength )
680
670
{
671
+
672
+ Console . WriteLine ( "----Process remaining Scalar" ) ;
673
+ int overlapCount = 0 ;
674
+
681
675
// // We need to possibly backtrack to the start of the last code point
682
676
while ( processedLength > 0 && ( sbyte ) pInputBuffer [ processedLength ] <= - 65 )
683
677
{
684
678
processedLength -= 1 ;
679
+ overlapCount += 1 ;
680
+ }
681
+
682
+ for ( int k = 0 ; k < overlapCount ; k ++ )
683
+ {
684
+
685
+ int candidateByte = pInputBuffer [ processedLength + k ] ;
686
+ if ( ( candidateByte & 0b11000000 ) == 0b11000000 )
687
+ {
688
+ if ( ( candidateByte & 0b11100000 ) == 0b11000000 ) // Start of a 2-byte sequence
689
+ {
690
+ TempUtf16CodeUnitCountAdjustment += 1 ;
691
+ }
692
+ if ( ( candidateByte & 0b11110000 ) == 0b11100000 ) // Start of a 3-byte sequence
693
+ {
694
+ TempUtf16CodeUnitCountAdjustment += 2 ;
695
+ }
696
+ if ( ( candidateByte & 0b11111000 ) == 0b11110000 ) // Start of a 4-byte sequence
697
+ {
698
+ TempUtf16CodeUnitCountAdjustment += 2 ;
699
+ TempScalarCountAdjustment += 1 ;
700
+ }
701
+
702
+ // processedLength += k;
703
+ break ;
704
+ }
685
705
}
686
706
687
707
byte * invalidBytePointer = SimdUnicode . UTF8 . GetPointerToFirstInvalidByteScalar ( pInputBuffer + processedLength , inputLength - processedLength , out TailUtf16CodeUnitCountAdjustment , out TailScalarCodeUnitCountAdjustment ) ;
0 commit comments