@@ -16,6 +16,8 @@ public static class UTF8
16
16
public unsafe static byte * RewindAndValidateWithErrors ( int howFarBack , byte * buf , int len , ref int utf16CodeUnitCountAdjustment , ref int scalarCountAdjustment )
17
17
{
18
18
Console . WriteLine ( "--Rewind Validate with Errors" ) ;
19
+ Console . WriteLine ( "current Byte:" + Convert . ToString ( buf [ 0 ] , 2 ) . PadLeft ( 8 , '0' ) ) ;
20
+
19
21
int TempUtf16CodeUnitCountAdjustment = 0 ;
20
22
int TempScalarCountAdjustment = 0 ;
21
23
@@ -26,47 +28,53 @@ public static class UTF8
26
28
// Even with no errors, it sometimes double counts. Why? Because it goes back even further
27
29
// even though the scalar doesn't thread
28
30
// adjust for double counting
29
- // for (int i = 0; i <= howFarBack; i++)
30
- // {
31
- // byte candidateByte = buf[0 - i];
32
- // foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
33
- // if (foundLeadingBytes)
34
- // {
35
- // // if (i == 0) {break;}
36
- // // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
37
- // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
38
-
39
- // // adjustment to avoid double counting
40
- // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
41
- // {
42
- // // Console.WriteLine("Found 2 byte");
43
- // TempUtf16CodeUnitCountAdjustment += 1;
44
- // }
45
- // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
46
- // {
47
- // // Console.WriteLine("Found 3 byte");
48
- // TempUtf16CodeUnitCountAdjustment += 2;
49
- // }
50
- // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
51
- // {
52
- // // Console.WriteLine("Found 4 byte");
53
- // TempUtf16CodeUnitCountAdjustment += 2;
54
- // TempScalarCountAdjustment += 1;
55
- // }
56
- // break;
57
- // }
58
- // }
31
+ // for (int i = 0; i <= howFarBack; i++)
32
+ for ( int i = 0 ; i <= howFarBack ; i ++ )
33
+ {
34
+ if ( i == 0 ) { continue ; } ; // we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior
35
+ byte candidateByte = buf [ 0 - i ] ;
36
+ foundLeadingBytes = ( candidateByte & 0b11000000 ) != 0b10000000 ;
37
+ if ( foundLeadingBytes )
38
+ {
39
+
40
+ // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
41
+ Console . WriteLine ( "Found leading byte at:" + i + ",Byte:" + Convert . ToString ( candidateByte , 2 ) . PadLeft ( 8 , '0' ) ) ;
42
+
43
+ // adjustment to avoid double counting
44
+ if ( ( candidateByte & 0b11100000 ) == 0b11000000 ) // Start of a 2-byte sequence
45
+ {
46
+ // Console.WriteLine("Found 2 byte");
47
+ TempUtf16CodeUnitCountAdjustment += 1 ;
48
+ }
49
+ if ( ( candidateByte & 0b11110000 ) == 0b11100000 ) // Start of a 3-byte sequence
50
+ {
51
+ // Console.WriteLine("Found 3 byte");
52
+ TempUtf16CodeUnitCountAdjustment += 2 ;
53
+ }
54
+ if ( ( candidateByte & 0b11111000 ) == 0b11110000 ) // Start of a 4-byte sequence
55
+ {
56
+ // Console.WriteLine("Found 4 byte");
57
+ TempUtf16CodeUnitCountAdjustment += 2 ;
58
+ TempScalarCountAdjustment += 1 ;
59
+ }
60
+ break ;
61
+ }
62
+ }
59
63
60
64
for ( int i = 0 ; i <= howFarBack ; i ++ )
61
65
{
66
+ Console . WriteLine ( "backup stat:" + i ) ;
62
67
byte candidateByte = buf [ 0 - i ] ;
63
68
foundLeadingBytes = ( candidateByte & 0b11000000 ) != 0b10000000 ;
64
69
if ( foundLeadingBytes )
65
70
{
66
71
buf -= i ;
67
72
extraLen = i ;
68
73
Console . WriteLine ( howFarBack ) ;
69
- Console . WriteLine ( "Backed up " + i + 1 + " bytes" ) ;
74
+ Console . WriteLine ( "Found leading byte at:" + i + ",Byte:" + Convert . ToString ( candidateByte , 2 ) . PadLeft ( 8 , '0' ) ) ;
75
+
76
+
77
+ // Console.WriteLine("Backed up " + extraLen + 1 + " bytes");
70
78
break ;
71
79
}
72
80
}
@@ -663,21 +671,31 @@ public static class UTF8
663
671
// Console.WriteLine("incomplete utf16 count", incompleteUtf16CodeUnitPreventDoubleCounting);
664
672
int backedup = 0 ;
665
673
674
+ int currentByte = pInputBuffer [ processedLength ] ;
675
+ Console . WriteLine ( "CurrentByte:" + Convert . ToString ( currentByte , 2 ) . PadLeft ( 8 , '0' ) ) ;
676
+
666
677
for ( int k = 0 ; k < 3 ; k ++ )
667
678
{
668
679
int candidateByte = pInputBuffer [ processedLength + k ] ;
680
+ Console . WriteLine ( "Backing up " + k + " bytes" ) ;
681
+ Console . WriteLine ( "CurrentByte after backing up:" + Convert . ToString ( candidateByte , 2 ) . PadLeft ( 8 , '0' ) ) ;
682
+
683
+ backedup = 3 - k + 1 ;
684
+ // TODO:
685
+ // the weird + 1 is so I don't have to put an else on the conditional below
686
+ // less readable, there might be a more elegant way to rewrite it but I am taking the path of convenience for now
687
+
669
688
if ( ( candidateByte & 0b11000000 ) == 0b11000000 )
670
689
{
671
- backedup = 3 - k ;
672
- Console . WriteLine ( "Backing up " + backedup + " bytes" ) ;
673
-
674
690
// Whatever you do, do not delete this
675
691
processedLength += k ;
676
692
break ;
677
693
}
678
694
}
679
695
680
- for ( int k = backedup ; k < 3 ; k ++ )
696
+ Console . WriteLine ( "Backed up " + backedup + " bytes" ) ;
697
+
698
+ for ( int k = backedup ; k < 3 ; k ++ )
681
699
{
682
700
int candidateByte = pInputBuffer [ processedLength - k ] ;
683
701
if ( ( candidateByte & 0b11000000 ) == 0b11000000 )
@@ -726,20 +744,20 @@ public static class UTF8
726
744
{
727
745
728
746
Console . WriteLine ( "----Process remaining Scalar" ) ;
729
- // Console.WriteLine("processed length before:" + processedLength);
747
+ Console . WriteLine ( "processed length before:" + processedLength ) ;
730
748
int overlapCount = 0 ;
731
749
732
750
// // We need to possibly backtrack to the start of the last code point
733
751
while ( processedLength > 0 && ( sbyte ) pInputBuffer [ processedLength ] <= - 65 )
734
752
{
735
753
processedLength -= 1 ;
736
- // overlapCount +=1;
754
+ overlapCount += 1 ;
737
755
}
738
756
739
- // Console.WriteLine("processed length after:" + processedLength);
757
+ Console . WriteLine ( "processed length after backtrack :" + processedLength ) ;
740
758
741
759
742
- // Best use rewind I think
760
+ // TOCHECK:See if rewind is better here
743
761
// for(int k = 0; k < overlapCount; k++)
744
762
// {
745
763
// // There is no error here hence the loop is straightforward and we avoid double counting every byte
@@ -763,11 +781,12 @@ public static class UTF8
763
781
// }
764
782
// }
765
783
766
- // Console.WriteLine("TempUTF16 before tail remaining check:"+ TempUtf16CodeUnitCountAdjustment);
767
- // Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment);
784
+ Console . WriteLine ( "TempUTF16 before tail remaining check:" + TempUtf16CodeUnitCountAdjustment ) ;
785
+ Console . WriteLine ( "TempScalar '' '' '':" + TempScalarCountAdjustment ) ;
768
786
769
787
770
788
byte * invalidBytePointer = SimdUnicode . UTF8 . GetPointerToFirstInvalidByteScalar ( pInputBuffer + processedLength , inputLength - processedLength , out TailUtf16CodeUnitCountAdjustment , out TailScalarCodeUnitCountAdjustment ) ;
789
+ // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(3,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
771
790
if ( invalidBytePointer != pInputBuffer + inputLength )
772
791
{
773
792
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment ;
@@ -777,8 +796,8 @@ public static class UTF8
777
796
return invalidBytePointer ;
778
797
}
779
798
780
- // Console.WriteLine("TempUTF16 after tail remaining check:"+ TempUtf16CodeUnitCountAdjustment);
781
- // Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment);
799
+ Console . WriteLine ( "TempUTF16 after tail remaining check:" + TempUtf16CodeUnitCountAdjustment ) ;
800
+ Console . WriteLine ( "TempScalar '' '' '':" + TempScalarCountAdjustment ) ;
782
801
783
802
}
784
803
0 commit comments