Skip to content

Commit 7d90fde

Browse files
committed
Cleanup + better ShortTest
1 parent b5445bb commit 7d90fde

File tree

3 files changed

+38
-164
lines changed

3 files changed

+38
-164
lines changed

README.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,17 @@ To run specific tests, it is helpful to use the filter parameter:
4343

4444

4545
```
46-
dotnet test --filter Ascii
46+
dotnet test --filter TooShortErrorAVX
4747
```
4848

49+
Or, to run only the tests that belong to a specific category, filter on the `Category` trait:
50+
51+
```
52+
dotnet test --filter "Category=scalar"
53+
```
54+
55+
56+
4957
## Running Benchmarks
5058

5159
To run the benchmarks, run the following command:

src/UTF8.cs

Lines changed: 18 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -190,16 +190,10 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
190190
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
191191
scalarCountAdjustment = TempScalarCountAdjustment;
192192
return pInputBuffer + pos; } // Too short
193-
// if (pInputBuffer[pos + 3] < 0b10000000) {
194-
// TempUtf16CodeUnitCountAdjustment -= 1;
195-
// } else {
196-
// TempUtf16CodeUnitCountAdjustment -= 2;
197-
// }
198193
TempUtf16CodeUnitCountAdjustment -= 2;
199194
}
200195
else if ((firstByte & 0b11111000) == 0b11110000)
201-
{ // 0b11110000
202-
196+
{
203197
nextPos = pos + 4;
204198
if (nextPos > inputLength) {
205199
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
@@ -226,9 +220,6 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
226220
return pInputBuffer + pos; }
227221
TempUtf16CodeUnitCountAdjustment -= 2;
228222
TempScalarCountAdjustment -= 1;
229-
230-
231-
232223
}
233224
else
234225
{
@@ -525,36 +516,9 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
525516
Vector256<byte> thirdByte = Vector256.Create((byte)(0b11100000u - 0x80));
526517
Vector256<byte> fourthByte = Vector256.Create((byte)(0b11110000u - 0x80));
527518

528-
// // Mask for the lower and upper parts of the vector
529-
// Vector128<byte> lowerMask = Vector128.Create(
530-
// 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
531-
// 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF).AsByte();
532-
533-
// Vector128<byte> upperMask = Vector128.Create(
534-
// 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
535-
// 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00).AsByte();
536-
537-
// // Combine lower and upper masks into a Vector256<byte>
538-
// Vector256<byte> mask = Vector256.Create(lowerMask, upperMask);
539-
540-
// // Apply the mask to zero out the last 3 bytes of each vector
541-
// Vector256<byte> secondByteMasked = Avx2.And(secondByte, mask);
542-
// Vector256<byte> thirdByteMasked = Avx2.And(thirdByte, mask);
543-
// Vector256<byte> fourthByteMasked = Avx2.And(fourthByte, mask);
544-
545-
546519
Vector256<byte> v0f = Vector256.Create((byte)0x0F);
547520
Vector256<byte> v80 = Vector256.Create((byte)0x80);
548521

549-
// Vector to identify bytes right before the start of a 4-byte sequence in UTF-8.
550-
// Vector256<byte> beforeFourByteMarker = Vector256.Create((byte)(0xF0 - 1));
551-
// // Vector to identify bytes right before the start of a 3-byte sequence in UTF-8.
552-
// Vector256<byte> beforeThreeByteMarker = Vector256.Create((byte)(0xE0 - 1));
553-
// // Vector to identify bytes right before the start of a 2-byte sequence in UTF-8.
554-
// Vector256<byte> beforeTwoByteMarker = Vector256.Create((byte)(0xC0 - 1));
555-
556-
557-
558522
for (; processedLength + 32 <= inputLength; processedLength += 32)
559523
{
560524
Vector256<byte> currentBlock = Avx.LoadVector256(pInputBuffer + processedLength);
@@ -601,34 +565,26 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
601565
}
602566
}
603567
}
604-
// else{ off = processedLength;}
605568

606-
// return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
607569
return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment);
608570
}
609571
prevIncomplete = Vector256<byte>.Zero;
610572
}
611573
else // Contains non-ASCII characters, we need to do non-trivial processing
612574
{
613-
// Use SubtractSaturate to effectively compare if bytes in block are greater than markers.
575+
// Saturating subtraction of a marker leaves the high bit set only in bytes that are >= marker + 0x80, so the MoveMask/PopCount below counts the bytes at or above each lead-byte threshold.
614576

615-
// Detect start of 4-byte sequences.
616-
Vector256<byte> isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte);
617-
uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence));
618-
619-
// Detect start of 3-byte sequences (including those that start 4-byte sequences).
620-
Vector256<byte> isStartOf3OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, thirdByte);
621-
uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf3OrMoreByteSequence));
622-
623-
// Detect start of 2-byte sequences (including those that start 3-byte and 4-byte sequences).
624-
Vector256<byte> isStartOf2OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, secondByte);
625-
uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf2OrMoreByteSequence));
626-
627-
// Calculate counts by isolating each type.
628-
uint threeByteCount = threeBytePlusCount - fourByteCount; // Isolate 3-byte starts by subtracting 4-byte starts.
629-
uint twoByteCount = twoBytePlusCount - threeBytePlusCount; // Isolate 2-byte starts by subtracting 3-byte and 4-byte starts.
577+
Vector256<byte> isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte);
578+
Vector256<byte> isStartOf3OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, thirdByte);
579+
Vector256<byte> isStartOf2OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, secondByte);
630580

581+
uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf2OrMoreByteSequence));
582+
uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf3OrMoreByteSequence));
583+
uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence));
631584

585+
// Calculate counts by isolating each type.
586+
uint threeByteCount = threeBytePlusCount - fourByteCount; // Isolate 3-byte starts by subtracting 4-byte starts.
587+
uint twoByteCount = twoBytePlusCount - threeBytePlusCount; // Isolate 2-byte starts by subtracting 3-byte and 4-byte starts.
632588

633589
Vector256<byte> shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21);
634590
prevInputBlock = currentBlock;
@@ -652,67 +608,24 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
652608
TailScalarCodeUnitCountAdjustment =0;
653609
TailUtf16CodeUnitCountAdjustment =0;
654610

655-
656-
int off = processedLength >= 32 ? processedLength : processedLength;
657-
658-
// Console.WriteLine("This is off :" + off);
659-
// return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
660-
// byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment);
661-
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
662-
663-
// byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer,processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment);
664-
// Adjustments not to double count
665-
// TempUtf16CodeUnitCountAdjustment += (int)fourByteCount * 2;
666-
// TempUtf16CodeUnitCountAdjustment += (int)twoByteCount;
667-
// TempUtf16CodeUnitCountAdjustment += (int)threeByteCount *2;
668-
// TempScalarCountAdjustment += (int)fourByteCount;
611+
int off = processedLength >= 32 ? processedLength: processedLength;
612+
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
669613

670614
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment +TailUtf16CodeUnitCountAdjustment;
671615
scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment;
672616

673-
674-
675617
return invalidBytePointer;
676618

677619
}
678-
// Adjustments
679-
TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2;
680-
TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount;
681-
TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2;
682-
TempScalarCountAdjustment -= (int)fourByteCount;
620+
// Adjustments
621+
TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2;
622+
TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount;
623+
TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2;
624+
TempScalarCountAdjustment -= (int)fourByteCount;
683625

684626
prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue);
685627
}
686628
}
687-
688-
if (!Avx2.TestZ(prevIncomplete, prevIncomplete))
689-
{
690-
691-
// Console.WriteLine("----Checkpoint 2:SIMD rewind");
692-
// We have an unterminated sequence.
693-
processedLength -= 3;
694-
for(int k = 0; k < 3; k++)
695-
{
696-
697-
int candidateByte = pInputBuffer[processedLength + k];
698-
if ((candidateByte & 0b11000000) == 0b11000000)
699-
{
700-
if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
701-
{
702-
TempUtf16CodeUnitCountAdjustment += 1;
703-
}
704-
if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
705-
{
706-
TempUtf16CodeUnitCountAdjustment += 2;
707-
}
708-
if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
709-
{
710-
TempUtf16CodeUnitCountAdjustment += 2;
711-
TempScalarCountAdjustment += 1;
712-
}
713-
}
714-
}
715-
}
716629
}
717630
}
718631

@@ -750,8 +663,6 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
750663
TempUtf16CodeUnitCountAdjustment += 2;
751664
TempScalarCountAdjustment += 1;
752665
}
753-
754-
// processedLength += k;
755666
break;
756667
}
757668
}

test/UTF8ValidationTests.cs

Lines changed: 11 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -435,9 +435,19 @@ public void TooShortError(Utf8ValidationDelegate utf8ValidationDelegate)
435435
{
436436
byte oldByte = utf8[i];
437437
utf8[i] = 0b11100000; // Forcing a too short error
438+
try
439+
{
438440
Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate));
439441
Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate));
440-
ValidateCount(utf8,utf8ValidationDelegate);
442+
ValidateCount(utf8,utf8ValidationDelegate); // Ensure you want to call this here, it seems unrelated to exception handling.
443+
}
444+
catch (Xunit.Sdk.XunitException)
445+
{
446+
Console.WriteLine($"Assertion failed at index: {i}");
447+
PrintHexAndBinary(utf8, i);
448+
utf8[i] = oldByte; // Restore the original byte
449+
throw; // Rethrow the exception to fail the test.
450+
}
441451
utf8[i] = oldByte; // Restore the original byte
442452
}
443453
}
@@ -800,56 +810,6 @@ public void Invalid0xf50xffAvx2()
800810
Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
801811
}
802812

803-
// Prints both hexadecimal and binary representations of a byte array
804-
// static void PrintHexAndBinary(byte[] bytes)
805-
// {
806-
// // Convert to hexadecimal
807-
// string hexRepresentation = BitConverter.ToString(bytes).Replace("-", " ");
808-
// Console.WriteLine($"Hex: {hexRepresentation}");
809-
810-
// // Convert to binary
811-
// string binaryRepresentation = string.Join(" ", Array.ConvertAll(bytes, byteValue => Convert.ToString(byteValue, 2).PadLeft(8, '0')));
812-
// Console.WriteLine($"Binary: {binaryRepresentation}");
813-
// }
814-
815-
// static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
816-
// {
817-
// // Convert to hexadecimal
818-
// Console.Write("Hex: ");
819-
// for (int i = 0; i < bytes.Length; i++)
820-
// {
821-
// if (i == highlightIndex)
822-
// {
823-
// Console.ForegroundColor = ConsoleColor.Red;
824-
// Console.Write($"{bytes[i]:X2} ");
825-
// Console.ResetColor();
826-
// }
827-
// else
828-
// {
829-
// Console.Write($"{bytes[i]:X2} ");
830-
// }
831-
// }
832-
// Console.WriteLine(); // New line for readability
833-
834-
// // Convert to binary
835-
// Console.Write("Binary: ");
836-
// for (int i = 0; i < bytes.Length; i++)
837-
// {
838-
// string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0');
839-
// if (i == highlightIndex)
840-
// {
841-
// Console.ForegroundColor = ConsoleColor.Red;
842-
// Console.Write($"{binaryString} ");
843-
// Console.ResetColor();
844-
// }
845-
// else
846-
// {
847-
// Console.Write($"{binaryString} ");
848-
// }
849-
// }
850-
// Console.WriteLine(); // New line for readability
851-
// }
852-
853813
static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
854814
{
855815
int chunkSize = 16; // 128 bits = 16 bytes
@@ -905,13 +865,8 @@ public void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate)
905865
{
906866
foreach (int outputLength in outputLengths)
907867
{
908-
909-
910-
Console.WriteLine("Outputlength:" + outputLength);
911868
for (int trial = 0; trial < NumTrials; trial++)
912869
{
913-
Console.WriteLine("trial:",trial);
914-
915870
byte[] utf8 = generator.Generate(outputLength).ToArray();
916871

917872
for (int i = 0; i < utf8.Length; i++)

0 commit comments

Comments
 (0)