Skip to content

Commit 006738c

Browse files
committed
LongErrorAVX working + cleaner
1 parent be79615 commit 006738c

File tree

3 files changed

+97
-30
lines changed

3 files changed

+97
-30
lines changed

benchmark/Benchmark.cs

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -183,7 +183,7 @@ public unsafe void SIMDUtf8ValidationRealData()
183183
{
184184
if (allLinesUtf8 != null)
185185
{
186-
RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByte);
186+
// RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByte);
187187
}
188188
}
189189

@@ -223,15 +223,15 @@ public unsafe void SIMDUtf8ValidationRealDataArm64()
223223
RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
224224
}
225225
}
226-
[Benchmark]
227-
[BenchmarkCategory("avx")]
228-
public unsafe void SIMDUtf8ValidationRealDataAvx2()
229-
{
230-
if (allLinesUtf8 != null)
231-
{
232-
RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
233-
}
234-
}
226+
// [Benchmark]
227+
// [BenchmarkCategory("avx")]
228+
// public unsafe void SIMDUtf8ValidationRealDataAvx2()
229+
// {
230+
// if (allLinesUtf8 != null)
231+
// {
232+
// RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
233+
// }
234+
// }
235235
[Benchmark]
236236
[BenchmarkCategory("sse")]
237237
public unsafe void SIMDUtf8ValidationRealDataSse()

src/UTF8.cs

Lines changed: 26 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -18,23 +18,28 @@ public static class UTF8
1818
int extraLen = 0;
1919
bool foundLeadingBytes = false;
2020

21-
for (int i = 0; i < howFarBack; i++)
21+
for (int i = 0; i <= howFarBack; i++)
2222
{
2323
byte candidateByte = buf[0 - i];
2424
foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
2525
if (foundLeadingBytes)
2626
{
27+
if (i == 0) {break;}
28+
Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
2729
// adjustment to avoid double counting
2830
if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
2931
{
32+
Console.WriteLine("Found 2 byte");
3033
TempUtf16CodeUnitCountAdjustment += 1;
3134
}
3235
if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
3336
{
37+
Console.WriteLine("Found 3 byte");
3438
TempUtf16CodeUnitCountAdjustment += 2;
3539
}
3640
if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
3741
{
42+
Console.WriteLine("Found 4 byte");
3843
TempUtf16CodeUnitCountAdjustment += 2;
3944
TempScalarCountAdjustment += 1;
4045
}
@@ -74,8 +79,11 @@ public static class UTF8
7479
utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment;
7580
scalarCountAdjustment += TailScalarCountAdjustment;
7681

77-
Console.WriteLine("utf16count after rewint:" + utf16CodeUnitCountAdjustment);
78-
Console.WriteLine("scalarcount after rewint:" + scalarCountAdjustment);
82+
Console.WriteLine("utf16count after rewint(Temp):" + TempUtf16CodeUnitCountAdjustment);
83+
Console.WriteLine("scalarcount after rewint:" + TempScalarCountAdjustment);
84+
85+
Console.WriteLine("utf16count after rewint(Scalar):" + TailUtf16CodeUnitCountAdjustment);
86+
Console.WriteLine("scalarcount after rewint:" + TailScalarCountAdjustment);
7987

8088
return invalidBytePointer;
8189
}
@@ -620,11 +628,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
620628
uint threeByteCount = threeBytePlusCount - fourByteCount; // Isolate 3-byte starts by subtracting 4-byte starts.
621629
uint twoByteCount = twoBytePlusCount - threeBytePlusCount; // Isolate 2-byte starts by subtracting 3-byte and 4-byte starts.
622630

623-
// Adjustments
624-
TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2;
625-
TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount;
626-
TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2;
627-
TempScalarCountAdjustment -= (int)fourByteCount;
631+
628632

629633
Vector256<byte> shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21);
630634
prevInputBlock = currentBlock;
@@ -649,17 +653,19 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
649653
TailUtf16CodeUnitCountAdjustment =0;
650654

651655

652-
int off = processedLength >= 32 ? processedLength - 32 : 0;//processedLength;
653-
// Console.WriteLine(off);
656+
int off = processedLength >= 32 ? processedLength : 0;//processedLength;
657+
658+
Console.WriteLine("This is off :" + off);
654659
// return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
655660
// byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment);
656661
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
657662

663+
// byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer,processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment);
658664
// Adjustments not to double count
659-
TempUtf16CodeUnitCountAdjustment += (int)fourByteCount * 2;
660-
TempUtf16CodeUnitCountAdjustment += (int)twoByteCount;
661-
TempUtf16CodeUnitCountAdjustment += (int)threeByteCount *2;
662-
TempScalarCountAdjustment += (int)fourByteCount;
665+
// TempUtf16CodeUnitCountAdjustment += (int)fourByteCount * 2;
666+
// TempUtf16CodeUnitCountAdjustment += (int)twoByteCount;
667+
// TempUtf16CodeUnitCountAdjustment += (int)threeByteCount *2;
668+
// TempScalarCountAdjustment += (int)fourByteCount;
663669

664670
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment +TailUtf16CodeUnitCountAdjustment;
665671
scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment;
@@ -669,6 +675,12 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
669675
return invalidBytePointer;
670676

671677
}
678+
// Adjustments
679+
TempUtf16CodeUnitCountAdjustment -= (int)fourByteCount * 2;
680+
TempUtf16CodeUnitCountAdjustment -= (int)twoByteCount;
681+
TempUtf16CodeUnitCountAdjustment -= (int)threeByteCount *2;
682+
TempScalarCountAdjustment -= (int)fourByteCount;
683+
672684
prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue);
673685
}
674686
}

test/UTF8ValidationTests.cs

Lines changed: 61 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -387,6 +387,7 @@ public void BadHeaderBitsScalar()
387387
{
388388
BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
389389
}
390+
390391

391392
// TODO:Uncomment when SSE is updated
392393
// [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
@@ -487,7 +488,9 @@ public void TooShortErrorAVX()
487488
public void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate)
488489
{
489490

490-
int[] outputLengths = { 128, 256, 512, 1024 }; // Example lengths
491+
// int[] outputLengths = { 128, 256, 512, 1024 }; // Example lengths
492+
// int[] outputLengths = { 128, 256,345, 512,968, 1024, 1000 }; // Example lengths
493+
491494

492495
foreach (int outputLength in outputLengths)
493496
{
@@ -809,12 +812,55 @@ public void Invalid0xf50xffAvx2()
809812
// Console.WriteLine($"Binary: {binaryRepresentation}");
810813
// }
811814

812-
static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
815+
// static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
816+
// {
817+
// // Convert to hexadecimal
818+
// Console.Write("Hex: ");
819+
// for (int i = 0; i < bytes.Length; i++)
820+
// {
821+
// if (i == highlightIndex)
822+
// {
823+
// Console.ForegroundColor = ConsoleColor.Red;
824+
// Console.Write($"{bytes[i]:X2} ");
825+
// Console.ResetColor();
826+
// }
827+
// else
828+
// {
829+
// Console.Write($"{bytes[i]:X2} ");
830+
// }
831+
// }
832+
// Console.WriteLine(); // New line for readability
833+
834+
// // Convert to binary
835+
// Console.Write("Binary: ");
836+
// for (int i = 0; i < bytes.Length; i++)
837+
// {
838+
// string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0');
839+
// if (i == highlightIndex)
840+
// {
841+
// Console.ForegroundColor = ConsoleColor.Red;
842+
// Console.Write($"{binaryString} ");
843+
// Console.ResetColor();
844+
// }
845+
// else
846+
// {
847+
// Console.Write($"{binaryString} ");
848+
// }
849+
// }
850+
// Console.WriteLine(); // New line for readability
851+
// }
852+
853+
static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
813854
{
814-
// Convert to hexadecimal
855+
int chunkSize = 16; // 128 bits = 16 bytes
856+
857+
// Process each chunk for hexadecimal
815858
Console.Write("Hex: ");
816859
for (int i = 0; i < bytes.Length; i++)
817860
{
861+
if (i > 0 && i % chunkSize == 0)
862+
Console.WriteLine(); // New line after every 16 bytes
863+
818864
if (i == highlightIndex)
819865
{
820866
Console.ForegroundColor = ConsoleColor.Red;
@@ -825,13 +871,18 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
825871
{
826872
Console.Write($"{bytes[i]:X2} ");
827873
}
874+
875+
if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line
828876
}
829-
Console.WriteLine(); // New line for readability
877+
Console.WriteLine("\n"); // New line for readability and to separate hex from binary
830878

831-
// Convert to binary
879+
// Process each chunk for binary
832880
Console.Write("Binary: ");
833881
for (int i = 0; i < bytes.Length; i++)
834882
{
883+
if (i > 0 && i % chunkSize == 0)
884+
Console.WriteLine(); // New line after every 16 bytes
885+
835886
string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0');
836887
if (i == highlightIndex)
837888
{
@@ -843,19 +894,23 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
843894
{
844895
Console.Write($"{binaryString} ");
845896
}
897+
898+
if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line
846899
}
847900
Console.WriteLine(); // New line for readability
848901
}
849902

850903

851-
852904
public void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate)
853905
{
854906
foreach (int outputLength in outputLengths)
855907
{
856908

909+
910+
Console.WriteLine("Outputlength:" + outputLength);
857911
for (int trial = 0; trial < NumTrials; trial++)
858912
{
913+
Console.WriteLine("trial:",trial);
859914

860915
byte[] utf8 = generator.Generate(outputLength).ToArray();
861916

0 commit comments

Comments
 (0)