@@ -13,7 +13,7 @@ public static class UTF8
13
13
static Func < byte , string > byteToBinaryString = b => Convert . ToString ( b , 2 ) . PadLeft ( 8 , '0' ) ;
14
14
15
15
16
- public unsafe static byte * RewindAndValidateWithErrors ( int howFarBack , byte * buf , int len , ref int utf16CodeUnitCountAdjustment , ref int scalarCountAdjustment )
16
+ public unsafe static byte * RewindAndValidateWithErrors ( int howFarBack , byte * buf , int len , ref int utf16CodeUnitCountAdjustment , ref int scalarCountAdjustment , bool prevWasSimd = false )
17
17
{
18
18
Console . WriteLine ( "--Rewind Validate with Errors" ) ;
19
19
Console . WriteLine ( "current Byte:" + Convert . ToString ( buf [ 0 ] , 2 ) . PadLeft ( 8 , '0' ) ) ;
@@ -23,12 +23,50 @@ public static class UTF8
23
23
24
24
int extraLen = 0 ;
25
25
bool foundLeadingBytes = false ;
26
+ // Console.WriteLine(prevWasSimd);
27
+
28
+ // adjust for filling in gap
29
+ // If an error is found, since we start counting the adjustments on prev3, a gap is left that needs to be counted in case the previous operation was using SIMD
30
+ if ( prevWasSimd )
31
+ {
32
+ // Console.WriteLine("Triggering Negative adjustment!");
33
+ // for (int i = 0; i <= 3; i++)
34
+ // {
35
+ // if (i == 0){continue;}; // we don't want to double count the current byte
36
+ // byte candidateByte = buf[0 - i];
37
+ // foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
38
+ // // if (i==0 & foundLeadingBytes){break;};// We dont want to
39
+ // // TODO: written like this for readability, I know it's ugly so this needs to be rewritten
40
+
41
+ // if (foundLeadingBytes)
42
+ // {
43
+
44
+ // Console.WriteLine("Negative adjstment:Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
45
+ // // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
46
+
47
+ // // adjustment to avoid double counting
48
+ // if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
49
+ // {
50
+ // // Console.WriteLine("Found 2 byte");
51
+ // TempUtf16CodeUnitCountAdjustment -= 1;
52
+ // }
53
+ // if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
54
+ // {
55
+ // // Console.WriteLine("Found 3 byte");
56
+ // TempUtf16CodeUnitCountAdjustment -= 2;
57
+ // }
58
+ // if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
59
+ // {
60
+ // // Console.WriteLine("Found 4 byte");
61
+ // TempUtf16CodeUnitCountAdjustment -= 2;
62
+ // TempScalarCountAdjustment -= 1;
63
+ // }
64
+ // // break;
65
+ // }
66
+ // }
67
+ }
68
+
26
69
27
- // this is the generic function called when there is an error:
28
- // TODO: adjust for double counting iff there is an error eg invalidpointerbyte != length
29
- // Even with no errors, it sometime double counts, why.. ? because it goes back even further
30
- // even though the scalar doesnt thread
31
- // adjust for double counting
32
70
// for (int i = 0; i <= howFarBack; i++)
33
71
// {
34
72
// if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior
@@ -713,7 +751,7 @@ public static class UTF8
713
751
else // Contains non-ASCII characters, we need to do non-trivial processing
714
752
{
715
753
Console . WriteLine ( "--Found non-ascii:triggering SIMD routine at " + processedLength + "bytes" ) ;
716
- prevWasSimd = true ;
754
+ prevWasSimd = true ; // consider moving this somewhere else
717
755
718
756
// Use SubtractSaturate to effectively compare if bytes in block are greater than markers.
719
757
// TODO:integrate this better with the rest of the code
@@ -766,11 +804,19 @@ public static class UTF8
766
804
Console . WriteLine ( "-----Error path!!" ) ;
767
805
TailScalarCodeUnitCountAdjustment = 0 ;
768
806
TailUtf16CodeUnitCountAdjustment = 0 ;
807
+ int off = 32 ;
808
+
809
+ // if (processedLength <32) // not enough bytes to load into SIMD!
810
+ // {
811
+ // // off = 0;
812
+ // prevWasSimd = false; // there was no previous op at all, let alone SIMD one
813
+ // }
769
814
770
- // TODO :I cant remember why I pu an off that does the same thing here but look intit
771
- // int off = processedLength >= 32 ? processedLength: processedLength;
815
+
816
+ // int off = processedLength >= 32 ? processedLength: 0; // we check if there
817
+ // without this there is an overflow if
772
818
// byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
773
- byte * invalidBytePointer = SimdUnicode . UTF8 . RewindAndValidateWithErrors ( 3 , pInputBuffer + processedLength , inputLength - processedLength , ref TailUtf16CodeUnitCountAdjustment , ref TailScalarCodeUnitCountAdjustment ) ;
819
+ byte * invalidBytePointer = SimdUnicode . UTF8 . RewindAndValidateWithErrors ( 3 , pInputBuffer + processedLength , inputLength - processedLength , ref TailUtf16CodeUnitCountAdjustment , ref TailScalarCodeUnitCountAdjustment , prevWasSimd ) ;
774
820
775
821
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment ;
776
822
scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment ;
0 commit comments