Skip to content

Commit cbf004d

Browse files
committed
Incomplete test progress (only scalar count working)
1 parent 92b7e3b commit cbf004d

File tree

2 files changed

+69
-44
lines changed

2 files changed

+69
-44
lines changed

src/UTF8.cs

Lines changed: 62 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ public static class UTF8
1616
public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment)
1717
{
1818
Console.WriteLine("--Rewind Validate with Errors");
19+
Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0'));
20+
1921
int TempUtf16CodeUnitCountAdjustment = 0;
2022
int TempScalarCountAdjustment = 0;
2123

@@ -26,47 +28,53 @@ public static class UTF8
2628
// Even with no errors, it sometimes double counts. Why? Because it goes back even further
2729
// even though the scalar doesnt thread
2830
// adjust for double counting
29-
// for (int i = 0; i <= howFarBack; i++)
30-
// {
31-
// byte candidateByte = buf[0 - i];
32-
// foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
33-
// if (foundLeadingBytes)
34-
// {
35-
// // if (i == 0) {break;}
36-
// // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
37-
// Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
38-
39-
// // adjustment to avoid double counting
40-
// if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
41-
// {
42-
// // Console.WriteLine("Found 2 byte");
43-
// TempUtf16CodeUnitCountAdjustment += 1;
44-
// }
45-
// if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
46-
// {
47-
// // Console.WriteLine("Found 3 byte");
48-
// TempUtf16CodeUnitCountAdjustment += 2;
49-
// }
50-
// if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
51-
// {
52-
// // Console.WriteLine("Found 4 byte");
53-
// TempUtf16CodeUnitCountAdjustment += 2;
54-
// TempScalarCountAdjustment += 1;
55-
// }
56-
// break;
57-
// }
58-
// }
31+
// for (int i = 0; i <= howFarBack; i++)
32+
for (int i = 0; i <= howFarBack; i++)
33+
{
34+
if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior
35+
byte candidateByte = buf[0 - i];
36+
foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
37+
if (foundLeadingBytes)
38+
{
39+
40+
// Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
41+
Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
42+
43+
// adjustment to avoid double counting
44+
if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
45+
{
46+
// Console.WriteLine("Found 2 byte");
47+
TempUtf16CodeUnitCountAdjustment += 1;
48+
}
49+
if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
50+
{
51+
// Console.WriteLine("Found 3 byte");
52+
TempUtf16CodeUnitCountAdjustment += 2;
53+
}
54+
if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
55+
{
56+
// Console.WriteLine("Found 4 byte");
57+
TempUtf16CodeUnitCountAdjustment += 2;
58+
TempScalarCountAdjustment += 1;
59+
}
60+
break;
61+
}
62+
}
5963

6064
for (int i = 0; i <= howFarBack; i++)
6165
{
66+
Console.WriteLine("backup stat:" + i);
6267
byte candidateByte = buf[0 - i];
6368
foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
6469
if (foundLeadingBytes)
6570
{
6671
buf -= i;
6772
extraLen = i;
6873
Console.WriteLine(howFarBack);
69-
Console.WriteLine("Backed up " + i + 1 + " bytes");
74+
Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
75+
76+
77+
// Console.WriteLine("Backed up " + extraLen + 1 + " bytes");
7078
break;
7179
}
7280
}
@@ -663,21 +671,31 @@ public static class UTF8
663671
// Console.WriteLine("incomplete utf16 count", incompleteUtf16CodeUnitPreventDoubleCounting);
664672
int backedup= 0;
665673

674+
int currentByte = pInputBuffer[processedLength];
675+
Console.WriteLine("CurrentByte:" + Convert.ToString(currentByte, 2).PadLeft(8, '0'));
676+
666677
for(int k = 0; k < 3; k++)
667678
{
668679
int candidateByte = pInputBuffer[processedLength + k];
680+
Console.WriteLine("Backing up " + k +" bytes");
681+
Console.WriteLine("CurrentByte after backing up:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
682+
683+
backedup = 3-k +1;
684+
// TODO:
685+
// the odd-looking + 1 is so I don't have to add an else branch to the conditional below
686+
// This is less readable; there may be a more elegant way to write it, but I am taking the convenient path for now
687+
669688
if ((candidateByte & 0b11000000) == 0b11000000)
670689
{
671-
backedup = 3-k;
672-
Console.WriteLine("Backing up " + backedup +" bytes");
673-
674690
// Whatever you do, do not delete this
675691
processedLength += k;
676692
break;
677693
}
678694
}
679695

680-
for(int k = backedup; k < 3; k++)
696+
Console.WriteLine("Backed up " + backedup +" bytes");
697+
698+
for(int k = backedup; k < 3 ; k++)
681699
{
682700
int candidateByte = pInputBuffer[processedLength - k];
683701
if ((candidateByte & 0b11000000) == 0b11000000)
@@ -726,20 +744,20 @@ public static class UTF8
726744
{
727745

728746
Console.WriteLine("----Process remaining Scalar");
729-
// Console.WriteLine("processed length before:" + processedLength);
747+
Console.WriteLine("processed length before:" + processedLength);
730748
int overlapCount = 0;
731749

732750
// // We need to possibly backtrack to the start of the last code point
733751
while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
734752
{
735753
processedLength -= 1;
736-
// overlapCount +=1;
754+
overlapCount +=1;
737755
}
738756

739-
// Console.WriteLine("processed length after:" + processedLength);
757+
Console.WriteLine("processed length after backtrack:" + processedLength);
740758

741759

742-
// Best use rewind I think
760+
// TOCHECK:See if rewind is better here
743761
// for(int k = 0; k < overlapCount; k++)
744762
// {
745763
// // There is no error here, hence the loop is straightforward and we avoid double counting every byte
@@ -763,11 +781,12 @@ public static class UTF8
763781
// }
764782
// }
765783

766-
// Console.WriteLine("TempUTF16 before tail remaining check:"+ TempUtf16CodeUnitCountAdjustment);
767-
// Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment);
784+
Console.WriteLine("TempUTF16 before tail remaining check:"+ TempUtf16CodeUnitCountAdjustment);
785+
Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment);
768786

769787

770788
byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment);
789+
// byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(3,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
771790
if (invalidBytePointer != pInputBuffer + inputLength)
772791
{
773792
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment;
@@ -777,8 +796,8 @@ public static class UTF8
777796
return invalidBytePointer;
778797
}
779798

780-
// Console.WriteLine("TempUTF16 after tail remaining check:"+ TempUtf16CodeUnitCountAdjustment);
781-
// Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment);
799+
Console.WriteLine("TempUTF16 after tail remaining check:"+ TempUtf16CodeUnitCountAdjustment);
800+
Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment);
782801

783802
}
784803

test/UTF8ValidationTests.cs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -921,6 +921,12 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
921921
Console.Write($"{bytes[i]:X2} ");
922922
Console.ResetColor();
923923
}
924+
else if (i % (chunkSize * 2) == 0) // print green every 256 bytes
925+
{
926+
Console.ForegroundColor = ConsoleColor.Green;
927+
Console.Write($"{bytes[i]:X2} ");
928+
Console.ResetColor();
929+
}
924930
else
925931
{
926932
Console.Write($"{bytes[i]:X2} ");
@@ -1408,7 +1414,7 @@ public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDele
14081414
try
14091415
{
14101416
Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}.");
1411-
// Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}.");
1417+
Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}.");
14121418
}
14131419
catch (Exception)
14141420
{

0 commit comments

Comments
 (0)