Skip to content

Commit f27117b

Browse files
committed
more expressive validateCount test
1 parent dcbf949 commit f27117b

File tree

2 files changed

+151
-81
lines changed

2 files changed

+151
-81
lines changed

src/UTF8.cs

Lines changed: 35 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -9,34 +9,32 @@ namespace SimdUnicode
99
public static class UTF8
1010
{
1111

12-
public unsafe static byte* RewindAndValidateWithErrors(int priorBytes, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment)
12+
public unsafe static byte* RewindAndValidateWithErrors(int offset, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment)
1313
{
1414

1515
int TempUtf16CodeUnitCountAdjustment = 0;
1616
int TempScalarCountAdjustment = 0;
1717

18-
int TailUtf16CodeUnitCountAdjustment = 0;
19-
int TailScalarCountAdjustment = 0;
20-
21-
int howFarBack = priorBytes;
18+
int howFarBack = offset;
2219
int extraLen = 0;
2320
bool foundLeadingBytes = false;
2421
for (int i = 0; i <= howFarBack; i++)
2522
{
26-
byte b = buf[0 - i];
27-
foundLeadingBytes = (b & 0b11000000) != 0b10000000;
23+
byte candidateByte = buf[0 - i];
24+
foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
2825
if (foundLeadingBytes)
2926
{
3027

31-
if ((b & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
28+
// adjustment to avoid double counting
29+
if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
3230
{
3331
TempUtf16CodeUnitCountAdjustment += 1;
3432
}
35-
if ((b & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
33+
if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
3634
{
3735
TempUtf16CodeUnitCountAdjustment += 2;
3836
}
39-
if ((b & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
37+
if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
4038
{
4139
TempUtf16CodeUnitCountAdjustment += 2;
4240
TempScalarCountAdjustment += 1;
@@ -48,26 +46,26 @@ public static class UTF8
4846
}
4947
}
5048

51-
utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment;
52-
scalarCountAdjustment += TempScalarCountAdjustment;
53-
5449

5550
if (!foundLeadingBytes)
5651
{
5752
return buf - howFarBack;
5853
}
5954

55+
utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment;
56+
scalarCountAdjustment += TempScalarCountAdjustment;
6057

61-
58+
int TailUtf16CodeUnitCountAdjustment = 0;
59+
int TailScalarCountAdjustment = 0;
6260

6361
// Now buf points to the start of a UTF-8 sequence or the start of the buffer.
6462
// Validate from this new start point with the adjusted length.
65-
byte* invalidByte = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment);
63+
byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment);
6664

67-
utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment;
68-
scalarCountAdjustment = TailScalarCountAdjustment;
65+
utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment;
66+
scalarCountAdjustment += TailScalarCountAdjustment;
6967

70-
return invalidByte;
68+
return invalidBytePointer;
7169
}
7270

7371
public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippedBytes,
@@ -594,24 +592,6 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
594592
{
595593
// Use SubtractSaturate to effectively compare if bytes in block are greater than markers.
596594

597-
// Identify start of 4-byte sequences.
598-
// Vector256<byte> isFourByteStart = Avx2.SubtractSaturate(currentBlock, fourthByte);
599-
// int fourByteMask = Avx2.MoveMask(isFourByteStart);
600-
// uint fourByteCount = Popcnt.PopCount((uint)fourByteMask);
601-
602-
// // Identify start of 3-byte and 4-byte sequences.
603-
// Vector256<byte> isThreeByteStart = Avx2.SubtractSaturate(currentBlock, thirdByte);
604-
// int threeByteMask = Avx2.MoveMask(isThreeByteStart);
605-
// uint threeByteCount = Popcnt.PopCount((uint)threeByteMask);
606-
607-
// // Calculate only 3-byte sequence count by excluding 4-byte sequences.
608-
// // uint threeByteCount = threeOrFourByteCount - fourByteCount;
609-
610-
// // Identify start of 2-byte,3 or 4 bytes sequences.
611-
// Vector256<byte> isTwoByteStart = Avx2.SubtractSaturate(currentBlock, secondByte);
612-
// int twoByteMask = Avx2.MoveMask(isTwoByteStart);
613-
// uint twoByteCount = Popcnt.PopCount((uint)twoByteMask);
614-
615595
// Detect start of 4-byte sequences.
616596
Vector256<byte> isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte);
617597
uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence));
@@ -628,11 +608,6 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
628608
uint threeByteCount = threeBytePlusCount - fourByteCount; // Isolate 3-byte starts by subtracting 4-byte starts.
629609
uint twoByteCount = twoBytePlusCount - threeBytePlusCount; // Isolate 2-byte starts by subtracting 3-byte and 4-byte starts.
630610

631-
// Calculate only 2-byte sequence count by excluding 3-byte and 4-byte sequences.
632-
// uint pureTwoByteCount = twoByteCount - threeOrFourByteCount;
633-
634-
// Console.WriteLine("2byte count:" + twoByteCount);
635-
636611
// Adjustments
637612
TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2;
638613
TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount;
@@ -655,15 +630,29 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
655630
Vector256<byte> must23 = Avx2.Or(isThirdByte, isFourthByte);
656631
Vector256<byte> must23As80 = Avx2.And(must23, v80);
657632
Vector256<byte> error = Avx2.Xor(must23As80, sc);
658-
if (!Avx2.TestZ(error, error))
633+
if (!Avx2.TestZ(error, error)) // context: we are dealing with a 32-byte block
659634
{
660635
Console.WriteLine("-----Error path!!");
661-
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
662-
scalarCountAdjustment = TempScalarCountAdjustment;
636+
TailScalarCodeUnitCountAdjustment =0;
637+
TailUtf16CodeUnitCountAdjustment =0;
638+
663639

664-
int off = processedLength >= 32 ? processedLength - 32 : processedLength;
640+
int off = processedLength >= 32 ? processedLength - 32 : 0;//processedLength;
641+
// Console.WriteLine(off);
665642
// return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
666-
return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment);
643+
// byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment);
644+
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
645+
646+
// Adjustments not to double count
647+
TempUtf16CodeUnitCountAdjustment += (int)fourByteCount * 2;
648+
TempUtf16CodeUnitCountAdjustment += (int)twoByteCount;
649+
TempUtf16CodeUnitCountAdjustment += (int)threeByteCount *2;
650+
TempScalarCountAdjustment += (int)fourByteCount;
651+
652+
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment +TailUtf16CodeUnitCountAdjustment;
653+
scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment;
654+
655+
return invalidBytePointer;
667656

668657
}
669658
prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue);

test/UTF8ValidationTests.cs

Lines changed: 116 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -798,16 +798,55 @@ public void Invalid0xf50xffAvx2()
798798
}
799799

800800
// Prints both hexadecimal and binary representations of a byte array
801-
static void PrintHexAndBinary(byte[] bytes)
801+
// static void PrintHexAndBinary(byte[] bytes)
802+
// {
803+
// // Convert to hexadecimal
804+
// string hexRepresentation = BitConverter.ToString(bytes).Replace("-", " ");
805+
// Console.WriteLine($"Hex: {hexRepresentation}");
806+
807+
// // Convert to binary
808+
// string binaryRepresentation = string.Join(" ", Array.ConvertAll(bytes, byteValue => Convert.ToString(byteValue, 2).PadLeft(8, '0')));
809+
// Console.WriteLine($"Binary: {binaryRepresentation}");
810+
// }
811+
812+
static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
813+
{
814+
// Convert to hexadecimal
815+
Console.Write("Hex: ");
816+
for (int i = 0; i < bytes.Length; i++)
802817
{
803-
// Convert to hexadecimal
804-
string hexRepresentation = BitConverter.ToString(bytes).Replace("-", " ");
805-
Console.WriteLine($"Hex: {hexRepresentation}");
818+
if (i == highlightIndex)
819+
{
820+
Console.ForegroundColor = ConsoleColor.Red;
821+
Console.Write($"{bytes[i]:X2} ");
822+
Console.ResetColor();
823+
}
824+
else
825+
{
826+
Console.Write($"{bytes[i]:X2} ");
827+
}
828+
}
829+
Console.WriteLine(); // New line for readability
806830

807-
// Convert to binary
808-
string binaryRepresentation = string.Join(" ", Array.ConvertAll(bytes, byteValue => Convert.ToString(byteValue, 2).PadLeft(8, '0')));
809-
Console.WriteLine($"Binary: {binaryRepresentation}");
831+
// Convert to binary
832+
Console.Write("Binary: ");
833+
for (int i = 0; i < bytes.Length; i++)
834+
{
835+
string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0');
836+
if (i == highlightIndex)
837+
{
838+
Console.ForegroundColor = ConsoleColor.Red;
839+
Console.Write($"{binaryString} ");
840+
Console.ResetColor();
841+
}
842+
else
843+
{
844+
Console.Write($"{binaryString} ");
845+
}
810846
}
847+
Console.WriteLine(); // New line for readability
848+
}
849+
811850

812851

813852
public void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate)
@@ -1255,43 +1294,85 @@ private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg
12551294

12561295

12571296

1258-
public void ValidateCount(byte[] utf8,Utf8ValidationDelegate utf8ValidationDelegate, Range range = default)
1259-
{
1260-
int DotnetUtf16Adjustment, DotnetScalarCountAdjustment;
1261-
int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment;
1297+
// public void ValidateCount(byte[] utf8,Utf8ValidationDelegate utf8ValidationDelegate, Range range = default)
1298+
// {
1299+
// int DotnetUtf16Adjustment, DotnetScalarCountAdjustment;
1300+
// int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment;
1301+
1302+
// var isDefaultRange = range.Equals(default(Range));
1303+
// var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range);
1304+
1305+
// unsafe
1306+
// {
1307+
// fixed (byte* pInput = utf8)
1308+
// {
1309+
// byte* startPtr = pInput + offset;
1310+
// // Invoke the method under test.
1311+
1312+
// DotnetUtf16Adjustment= 0;
1313+
// DotnetScalarCountAdjustment= 0;
1314+
// DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment);
1315+
1316+
// SimdUnicodeUtf16Adjustment= 0;
1317+
// SimdUnicodeScalarCountAdjustment= 0;
1318+
// utf8ValidationDelegate(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment);
1319+
1320+
// // Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment);
1321+
// // Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment);
1322+
1323+
// // Console.WriteLine("Lenght:" + utf8.Length);
1324+
// // Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment);
1325+
// // Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment);
1326+
// // Console.WriteLine("___________________________________________________");
1327+
1328+
// Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}.");
1329+
// Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}.");
1330+
// }
1331+
// }
1332+
// // }
1333+
// }
12621334

1263-
var isDefaultRange = range.Equals(default(Range));
1264-
var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range);
1335+
public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default)
1336+
{
1337+
int DotnetUtf16Adjustment, DotnetScalarCountAdjustment;
1338+
int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment;
12651339

1266-
unsafe
1267-
{
1268-
fixed (byte* pInput = utf8)
1269-
{
1270-
byte* startPtr = pInput + offset;
1271-
// Invoke the method under test.
1340+
var isDefaultRange = range.Equals(default(Range));
1341+
var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range);
12721342

1273-
DotnetUtf16Adjustment= 0;
1274-
DotnetScalarCountAdjustment= 0;
1275-
DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment);
1343+
unsafe
1344+
{
1345+
fixed (byte* pInput = utf8)
1346+
{
1347+
byte* startPtr = pInput + offset;
12761348

1277-
SimdUnicodeUtf16Adjustment= 0;
1278-
SimdUnicodeScalarCountAdjustment= 0;
1279-
utf8ValidationDelegate(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment);
1349+
DotnetUtf16Adjustment = 0;
1350+
DotnetScalarCountAdjustment = 0;
1351+
DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment);
12801352

1281-
// Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment);
1282-
// Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment);
1353+
SimdUnicodeUtf16Adjustment = 0;
1354+
SimdUnicodeScalarCountAdjustment = 0;
1355+
byte* simdResult = utf8ValidationDelegate(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment);
12831356

1284-
// Console.WriteLine("Lenght:" + utf8.Length);
1285-
// Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment);
1286-
// Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment);
1287-
// Console.WriteLine("___________________________________________________");
1357+
// Determine the index of the invalid byte if simdResult doesn't point to the end.
1358+
int failureIndex = simdResult != pInput + length ? (int)(simdResult - pInput) : -1;
12881359

1289-
Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}.");
1290-
Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}.");
1291-
}
1360+
try
1361+
{
1362+
Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}.");
1363+
Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}.");
12921364
}
1293-
// }
1365+
catch (Exception)
1366+
{
1367+
// Upon failure, print the utf8 array for inspection
1368+
Console.WriteLine("Assertion failed. Inspecting utf8 array:");
1369+
PrintHexAndBinary(utf8,failureIndex);
1370+
throw; // Re-throw the exception to preserve the failure state
1371+
}
1372+
}
12941373
}
1374+
}
1375+
12951376

12961377
[Fact]
12971378
[Trait("Category", "Scalar")]

0 commit comments

Comments
 (0)