Skip to content

Commit f3f2f9d

Browse files
committed
incomplete test working
1 parent cbf004d commit f3f2f9d

File tree

2 files changed

+167
-26
lines changed

2 files changed

+167
-26
lines changed

src/UTF8.cs

Lines changed: 164 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -24,38 +24,133 @@ public static class UTF8
2424
int extraLen = 0;
2525
bool foundLeadingBytes = false;
2626

27+
// this is the generic function called when there is an error:
2728
// TODO: adjust for double counting iff there is an error eg invalidpointerbyte != length
2829
// Even with no errors, it sometimes double counts. Why? Because it goes back even further
2930
// even though the scalar doesnt thread
3031
// adjust for double counting
3132
// for (int i = 0; i <= howFarBack; i++)
32-
for (int i = 0; i <= howFarBack; i++)
33+
// {
34+
// if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior
35+
// // TODO: written like this for readability, I know its ugly so this needs to be rewritten
36+
// byte candidateByte = buf[0 - i];
37+
// foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
38+
// if (foundLeadingBytes)
39+
// {
40+
41+
// Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
42+
// // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
43+
44+
// // adjustment to avoid double counting
45+
// if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
46+
// {
47+
// // Console.WriteLine("Found 2 byte");
48+
// TempUtf16CodeUnitCountAdjustment += 1;
49+
// }
50+
// if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
51+
// {
52+
// // Console.WriteLine("Found 3 byte");
53+
// TempUtf16CodeUnitCountAdjustment += 2;
54+
// }
55+
// if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
56+
// {
57+
// // Console.WriteLine("Found 4 byte");
58+
// TempUtf16CodeUnitCountAdjustment += 2;
59+
// TempScalarCountAdjustment += 1;
60+
// }
61+
// break;
62+
// }
63+
// }
64+
65+
for (int i = 0; i <= howFarBack; i++)
66+
{
67+
Console.WriteLine("backup stat:" + i);
68+
byte candidateByte = buf[0 - i];
69+
foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
70+
if (foundLeadingBytes)
71+
{
72+
buf -= i;
73+
extraLen = i;
74+
Console.WriteLine(howFarBack);
75+
Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
76+
77+
// Console.WriteLine("Backed up " + extraLen + 1 + " bytes");
78+
break;
79+
}
80+
}
81+
82+
83+
if (!foundLeadingBytes)
84+
{
85+
return buf - howFarBack;
86+
}
87+
88+
utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment;
89+
scalarCountAdjustment += TempScalarCountAdjustment;
90+
91+
int TailUtf16CodeUnitCountAdjustment = 0;
92+
int TailScalarCountAdjustment = 0;
93+
94+
// Now buf points to the start of a UTF-8 sequence or the start of the buffer.
95+
// Validate from this new start point with the adjusted length.
96+
97+
// TODO: figure out why calling SIMD here breaks the tests filter. This just breaks stuff?!
98+
byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment);
99+
100+
utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment;
101+
scalarCountAdjustment += TailScalarCountAdjustment;
102+
103+
Console.WriteLine("rewind utf16 Doublecount adjustment(Temp):" + TempUtf16CodeUnitCountAdjustment);
104+
Console.WriteLine("scalarcount adjstment after rewind:" + TempScalarCountAdjustment);
105+
Console.WriteLine(" ");
106+
Console.WriteLine("rewinds utf16 count(done by GetPointerToFirstInvalidByteScalar):" + TailUtf16CodeUnitCountAdjustment);
107+
Console.WriteLine("scalarcount after rewind(ditto):" + TailScalarCountAdjustment);
108+
109+
return invalidBytePointer;
110+
}
111+
112+
// I separate this function as for the tail, we know that there has been no error thus far: but remember the SIMD
113+
// function calculates
114+
public unsafe static byte* RewindAndValidateWithErrorsRemaining(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment)
115+
{
116+
Console.WriteLine("--Rewind Validate with Errors Remaining");
117+
Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0'));
118+
119+
int TempUtf16CodeUnitCountAdjustment = 0;
120+
int TempScalarCountAdjustment = 0;
121+
122+
int extraLen = 0;
123+
bool foundLeadingBytes = false;
124+
125+
for (int i = 0; i <= 3; i++)
33126
{
34127
if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior
128+
// TODO: written like this for readability, I know its ugly so this needs to be rewritten
35129
byte candidateByte = buf[0 - i];
130+
Console.WriteLine("Checking Byte:" + candidateByte.ToString("X2"));
131+
36132
foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
37133
if (foundLeadingBytes)
38134
{
39-
40-
// Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
41-
Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
135+
Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
136+
// Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
42137

43138
// adjustment to avoid double counting
44139
if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
45140
{
46-
// Console.WriteLine("Found 2 byte");
47-
TempUtf16CodeUnitCountAdjustment += 1;
141+
Console.WriteLine("Found 2 byte");
142+
TempUtf16CodeUnitCountAdjustment -= 1;
48143
}
49144
if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
50145
{
51-
// Console.WriteLine("Found 3 byte");
52-
TempUtf16CodeUnitCountAdjustment += 2;
146+
Console.WriteLine("Found 3 byte");
147+
TempUtf16CodeUnitCountAdjustment -= 2;
53148
}
54149
if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
55150
{
56-
// Console.WriteLine("Found 4 byte");
57-
TempUtf16CodeUnitCountAdjustment += 2;
58-
TempScalarCountAdjustment += 1;
151+
Console.WriteLine("Found 4 byte");
152+
TempUtf16CodeUnitCountAdjustment -= 2;
153+
TempScalarCountAdjustment -= 1;
59154
}
60155
break;
61156
}
@@ -73,7 +168,6 @@ public static class UTF8
73168
Console.WriteLine(howFarBack);
74169
Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
75170

76-
77171
// Console.WriteLine("Backed up " + extraLen + 1 + " bytes");
78172
break;
79173
}
@@ -109,6 +203,7 @@ public static class UTF8
109203
return invalidBytePointer;
110204
}
111205

206+
112207
public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
113208
{
114209

@@ -511,15 +606,57 @@ public static class UTF8
511606
Vector256<byte> v0f = Vector256.Create((byte)0x0F);
512607
Vector256<byte> v80 = Vector256.Create((byte)0x80);
513608

609+
bool prevWasSimd = false;
610+
514611
for (; processedLength + 32 <= inputLength; processedLength += 32)
515612
{
613+
614+
615+
616+
// TODO: there is a problem with the fastpath : namely that if it is followed by a vector with all ascii,
617+
// there is a gap where
618+
// this is because
619+
// Now we have 2 choices : either still use prev3 to count dutf and check if there is a gap here OR
620+
//
516621
Vector256<byte> currentBlock = Avx.LoadVector256(pInputBuffer + processedLength);
517622

518623
int mask = Avx2.MoveMask(currentBlock);
519624
if (mask == 0)
520625
{
521626
// We have an ASCII block, no need to process it, but
522627
// we need to check if the previous block was incomplete.
628+
629+
if (prevWasSimd){ // recall that the non ascii simd checks counts the adjustment on prev3, hence we need to backtrack in case the
630+
// it was called
631+
Console.WriteLine("--prev was simd!");
632+
for(int k = 1; k <= 3 ; k++) // we dont want to double count the current byte
633+
{
634+
int candidateByte = pInputBuffer[processedLength - k];
635+
if ((candidateByte & 0b11000000) == 0b11000000)
636+
{
637+
{
638+
if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
639+
{
640+
TempUtf16CodeUnitCountAdjustment -= 1;
641+
}
642+
if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
643+
{
644+
TempUtf16CodeUnitCountAdjustment -= 2;
645+
}
646+
if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
647+
{
648+
Console.WriteLine("Found 4-byte");
649+
TempUtf16CodeUnitCountAdjustment -= 2;
650+
TempScalarCountAdjustment -= 1;
651+
}
652+
// break;
653+
654+
}
655+
}
656+
}
657+
658+
}
659+
523660
if (!Avx2.TestZ(prevIncomplete, prevIncomplete))
524661
{
525662

@@ -562,14 +699,15 @@ public static class UTF8
562699
// }
563700

564701

565-
// TODO this needs S
566702
return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment);
567703
}
568704
prevIncomplete = Vector256<byte>.Zero;
705+
prevWasSimd = false;
569706
}
570707
else // Contains non-ASCII characters, we need to do non-trivial processing
571708
{
572709
Console.WriteLine("--Found non-ascii:triggering SIMD routine at " + processedLength + "bytes");
710+
prevWasSimd = true;
573711

574712
// Use SubtractSaturate to effectively compare if bytes in block are greater than markers.
575713
// TODO:integrate this better with the rest of the code
@@ -616,6 +754,7 @@ public static class UTF8
616754
Vector256<byte> must23 = Avx2.Or(isThirdByte, isFourthByte);
617755
Vector256<byte> must23As80 = Avx2.And(must23, v80);
618756
Vector256<byte> error = Avx2.Xor(must23As80, sc);
757+
619758
if (!Avx2.TestZ(error, error)) //context: we are dealing with a 32 bit
620759
{
621760
Console.WriteLine("-----Error path!!");
@@ -676,9 +815,9 @@ public static class UTF8
676815

677816
for(int k = 0; k < 3; k++)
678817
{
679-
int candidateByte = pInputBuffer[processedLength + k];
818+
int candidateByte = pInputBuffer[processedLength + 32 + k];
680819
Console.WriteLine("Backing up " + k +" bytes");
681-
Console.WriteLine("CurrentByte after backing up:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
820+
Console.WriteLine("Byte after backing up:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
682821

683822
backedup = 3-k +1;
684823
// TODO:
@@ -743,21 +882,21 @@ public static class UTF8
743882
if (processedLength < inputLength)
744883
{
745884

746-
Console.WriteLine("----Process remaining Scalar");
747-
Console.WriteLine("processed length before:" + processedLength);
885+
Console.WriteLine("----Process remaining Scalar @ " + processedLength + "bytes");
886+
// Console.WriteLine("processed length before:" + processedLength);
748887
int overlapCount = 0;
749888

750889
// // We need to possibly backtrack to the start of the last code point
751-
while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
752-
{
753-
processedLength -= 1;
754-
overlapCount +=1;
755-
}
890+
// while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
891+
// {
892+
// processedLength -= 1;
893+
// overlapCount +=1;
894+
// }
756895

757896
Console.WriteLine("processed length after backtrack:" + processedLength);
758897

759898

760-
// TOCHECK:See if rewind is better here
899+
// PERFORMANCE TOCHECK:See if rewind is better here
761900
// for(int k = 0; k < overlapCount; k++)
762901
// {
763902
// // There is no error here hence the loop is straightforward and we avoid double counting every byte
@@ -785,8 +924,8 @@ public static class UTF8
785924
Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment);
786925

787926

788-
byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment);
789-
// byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(3,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
927+
// byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment);
928+
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrorsRemaining(32,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
790929
if (invalidBytePointer != pInputBuffer + inputLength)
791930
{
792931
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment;

test/UTF8ValidationTests.cs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ namespace tests;
99
using BenchmarkDotNet.Disassemblers;
1010
using Iced.Intel;
1111

12-
// TODO: add test for unterminated sequeqce happeqiqg at SIMD transition
12+
// TODO: refine test for unterminated sequence happening at SIMD transition
13+
// TODO: The various tests do not formally take into account the scenario where vector is all ASCII
14+
1315
public unsafe class Utf8SIMDValidationTests
1416
{
1517

0 commit comments

Comments
 (0)