Skip to content

Commit 4802a10

Browse files
committed
Noerreavx test really working this time
1 parent c5e4004 commit 4802a10

File tree

1 file changed

+78
-61
lines changed

1 file changed

+78
-61
lines changed

src/UTF8.cs

Lines changed: 78 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ public static class UTF8
111111

112112
// I separate this function because, for the tail, we know that there has been no error thus far; but remember the SIMD
113113
// function calculates
114-
public unsafe static byte* RewindAndValidateWithErrorsRemaining(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment)
114+
public unsafe static byte* RewindAndValidateWithErrorsRemaining(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment,bool prevWasUnterminated = false)
115115
{
116116
Console.WriteLine("--Rewind Validate with Errors Remaining");
117117
Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0'));
@@ -122,40 +122,43 @@ public static class UTF8
122122
int extraLen = 0;
123123
bool foundLeadingBytes = false;
124124

125-
for (int i = 0; i <= 3; i++)
126-
{
127-
if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior
128-
// TODO: written like this for readability, I know its ugly so this needs to be rewritten
129-
byte candidateByte = buf[0 - i];
130-
Console.WriteLine("Checking Byte:" + candidateByte.ToString("X2"));
131-
132-
foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
133-
if (foundLeadingBytes)
125+
// This was created in the context of incomplete tests: namely a gap is created when the SIMD vector is followed by a processremainingscalar
126+
if (!prevWasUnterminated) //
127+
{
128+
for (int i = 0; i <= 3; i++)
134129
{
135-
Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
136-
// Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
130+
if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior
131+
// TODO: written like this for readability, I know its ugly so this needs to be rewritten
132+
byte candidateByte = buf[0 - i];
133+
Console.WriteLine("Checking Byte:" + candidateByte.ToString("X2"));
137134

138-
// adjustment to avoid double counting
139-
if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
140-
{
141-
Console.WriteLine("Found 2 byte");
142-
TempUtf16CodeUnitCountAdjustment -= 1;
143-
}
144-
if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
145-
{
146-
Console.WriteLine("Found 3 byte");
147-
TempUtf16CodeUnitCountAdjustment -= 2;
148-
}
149-
if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
135+
foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
136+
if (foundLeadingBytes)
150137
{
151-
Console.WriteLine("Found 4 byte");
152-
TempUtf16CodeUnitCountAdjustment -= 2;
153-
TempScalarCountAdjustment -= 1;
138+
Console.WriteLine("Double counting.Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
139+
// Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
140+
141+
// adjustment to avoid double counting
142+
if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
143+
{
144+
Console.WriteLine("Found 2 byte");
145+
TempUtf16CodeUnitCountAdjustment -= 1;
146+
}
147+
if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
148+
{
149+
Console.WriteLine("Found 3 byte");
150+
TempUtf16CodeUnitCountAdjustment -= 2;
151+
}
152+
if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
153+
{
154+
Console.WriteLine("Found 4 byte");
155+
TempUtf16CodeUnitCountAdjustment -= 2;
156+
TempScalarCountAdjustment -= 1;
157+
}
158+
break;
154159
}
155-
break;
156160
}
157161
}
158-
159162
for (int i = 0; i <= howFarBack; i++)
160163
{
161164
Console.WriteLine("backup stat:" + i);
@@ -501,6 +504,7 @@ public static class UTF8
501504
int TailUtf16CodeUnitCountAdjustment = 0;
502505

503506
bool prevWasSimd = false;
507+
bool prevWasUnterminated = false;
504508

505509

506510
if (pInputBuffer == null || inputLength <= 0)
@@ -836,39 +840,42 @@ public static class UTF8
836840

837841
Console.WriteLine("Backed up " + backedup +" bytes");
838842

839-
for(int k = backedup; k < 3 ; k++)
840-
{
841-
int candidateByte = pInputBuffer[processedLength - k];
842-
if ((candidateByte & 0b11000000) == 0b11000000)
843-
{
844-
// TODO:The SIMD code backs up by at most 3, but recall that the count is now done on prev3
845-
// This has the advantage that we KNOW that the bytes here are valid instead of having to take into account the error
846-
// if (k != 0)
847-
{
848-
if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
849-
{
850-
TempUtf16CodeUnitCountAdjustment += 1;
851-
}
852-
if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
853-
{
854-
TempUtf16CodeUnitCountAdjustment += 2;
855-
}
856-
if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
857-
{
858-
Console.WriteLine("Found 4-byte");
859-
TempUtf16CodeUnitCountAdjustment += 2;
860-
TempScalarCountAdjustment += 1;
861-
}
862-
// break;
843+
// for(int k = backedup; k < 3 ; k++)
844+
// {
845+
// int candidateByte = pInputBuffer[processedLength - k];
846+
// if ((candidateByte & 0b11000000) == 0b11000000)
847+
// {
848+
// // TODO:The SIMD code backs up by at most 3, but recall that the count is now done on prev3
849+
// // This has the advantage that we KNOW that the bytes here are valid instead of having to take into account the error
850+
// // if (k != 0)
851+
// {
852+
// if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
853+
// {
854+
// TempUtf16CodeUnitCountAdjustment += 1;
855+
// }
856+
// if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
857+
// {
858+
// TempUtf16CodeUnitCountAdjustment += 2;
859+
// }
860+
// if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
861+
// {
862+
// Console.WriteLine("Found 4-byte");
863+
// TempUtf16CodeUnitCountAdjustment += 2;
864+
// TempScalarCountAdjustment += 1;
865+
// }
866+
// // break;
863867

864-
}
865-
}
866-
}
868+
// }
869+
// }
870+
// }
867871

868872

869873
Console.WriteLine("TempUTF16:"+ TempUtf16CodeUnitCountAdjustment);
870874
Console.WriteLine("TempScalar:"+ TempScalarCountAdjustment);
871875
Console.WriteLine("-----------------");
876+
877+
prevWasUnterminated = true;
878+
prevWasSimd = true;
872879
}
873880
}
874881
}
@@ -927,7 +934,7 @@ public static class UTF8
927934

928935

929936
// byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment);
930-
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrorsRemaining(32,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
937+
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrorsRemaining(32,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment,prevWasUnterminated);
931938
if (invalidBytePointer != pInputBuffer + inputLength)
932939
{
933940
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment;
@@ -939,28 +946,38 @@ public static class UTF8
939946

940947
Console.WriteLine("TempUTF16 after tail remaining check:"+ TempUtf16CodeUnitCountAdjustment);
941948
Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment);
949+
// prevWasSimd = false;
950+
951+
}
952+
else
953+
if (processedLength == inputLength && prevWasSimd){ // without this there is a 3 byte gap at the end
954+
Console.Write("Closing in the gap\n");
942955

943-
} else if (processedLength == inputLength && prevWasSimd){
944-
for(int k = 0; k < 3; k++)
956+
for(int k = 0; k <= 3; k++)
945957
{
946-
// There is no error here hence the loop is straigthforward and we avoid double counting every byte
958+
959+
// There is no error here, hence the loop is straightforward and we avoid double counting every byte
947960
int candidateByte = pInputBuffer[processedLength - k];
948961
if ((candidateByte & 0b11000000) == 0b11000000)
949962
{
950963
if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
951964
{
965+
Console.Write("Found 2 byte \n");
966+
952967
TempUtf16CodeUnitCountAdjustment -= 1;
953968
}
954969
if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
955970
{
971+
Console.Write("Found 3 byte \n");
956972
TempUtf16CodeUnitCountAdjustment -= 2;
957973
}
958974
if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
959975
{
976+
Console.Write("Found 4 byte \n");
960977
TempUtf16CodeUnitCountAdjustment -= 2;
961978
TempScalarCountAdjustment -= 1;
962979
}
963-
break;
980+
// break;
964981
}
965982
}
966983
}

0 commit comments

Comments
 (0)