Skip to content

Commit e27c85f

Browse files
committed
cleanup + more expressive tests
1 parent d784815 commit e27c85f

File tree

2 files changed

+37
-132
lines changed

2 files changed

+37
-132
lines changed

src/UTF8.cs

Lines changed: 24 additions & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ namespace SimdUnicode
1010
public static class UTF8
1111
{
1212

13-
// //debug helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index
13+
1414
static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
1515
{
1616
int chunkSize = 16; // 128 bits = 16 bytes
@@ -78,20 +78,20 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
7878

7979
public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment)
8080
{
81-
// // Console.WriteLine("CALLING REWIND");//debug
81+
8282
int extraLen = 0;
8383
bool foundLeadingBytes = false;
8484

8585
for (int i = 0; i <= howFarBack; i++)
8686
{
8787
byte candidateByte = buf[0 - i];
8888
foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
89-
// Console.WriteLine($"Rewinding byte to offset {-i}: {candidateByte:X2}");//debug
90-
// Console.WriteLine(foundLeadingBytes);//debug
89+
90+
9191

9292
if (foundLeadingBytes)
9393
{
94-
// Console.WriteLine("Found leading byte");//debug
94+
9595
buf -= i;
9696
break;
9797
}
@@ -257,8 +257,8 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
257257
{
258258
if ((pInputBuffer[-i] & 0b11000000) != 0b10000000)
259259
{
260-
// string binaryString = Convert.ToString(pInputBuffer[-i], 2).PadLeft(8, '0');//debug
261-
// Console.WriteLine($"Stopping at byte {binaryString}"); //debug
260+
261+
262262
break;
263263
}
264264
contbyteadjust -= 1;
@@ -278,40 +278,18 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
278278

279279
public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte)
280280
{
281-
// Console.WriteLine("---------"); //debug
282-
// Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug
281+
282+
283283
int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte;
284284
int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte;
285285
int utfadjust = -2 * n4 - 2 * n3 - n2;
286286
int scalaradjust = -n4;
287287

288-
// Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug
288+
289289

290290
return (utfadjust, scalaradjust);
291291
}
292292

293-
// public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) //todo: add an extra bool parameter 'TooLongErroronEdge' which defaults to false
294-
// {
295-
// // Calculate the total bytes from start_point to processedLength
296-
// int totalbyte = processedLength - start_point;
297-
// int adjusttotalbyte = 0, backedupByHowMuch = 0, adjustascii = 0, adjustcont = 0, adjustn4 = 0;
298-
299-
// // Adjust the length to include a complete character, if necessary
300-
// if (totalbyte > 0)
301-
// {
302-
// (adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength);
303-
// }
304-
305-
// // Pseudocode:
306-
// // if 'TooLongErroronEdge' bool is true then
307-
// // then substract (remove) adjustascii, adjustcont, adjustn4 from their respective counterpart in the following function:
308-
309-
// var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4 , contbytes , totalbyte + adjusttotalbyte);
310-
311-
312-
// return (utfadjust, scalaradjust);
313-
// }
314-
315293
public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes, bool TooLongErroronEdge = false)
316294
{
317295
// Calculate the total bytes from start_point to processedLength
@@ -324,10 +302,8 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
324302
(adjusttotalbyte, backedupByHowMuch, adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength);
325303
}
326304

327-
// Adjust the counters if 'TooLongErroronEdge' is true
328305
if (TooLongErroronEdge)
329306
{
330-
// If you can figure out why this makes a difference,youre golden
331307
asciibytes += adjustascii;
332308
contbytes += adjustcont;
333309
n4 += adjustn4;
@@ -494,9 +470,9 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
494470

495471
public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
496472
{
497-
// Console.ForegroundColor = ConsoleColor.Blue; //debug
498-
// Console.WriteLine("-------------------------------------");//debug
499-
// Console.ResetColor();//debug
473+
474+
475+
500476

501477
int processedLength = 0;
502478
int TempUtf16CodeUnitCountAdjustment= 0 ;
@@ -674,78 +650,29 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
674650
Vector256<byte> byte_1_low = Avx2.Shuffle(shuf2, (prev1 & v0f)); // takes the 0000 XXXX part of the previous part
675651
Vector256<byte> byte_2_high = Avx2.Shuffle(shuf3, Avx2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); // takes the XXXX 0000 part of the current byte
676652
Vector256<byte> sc = Avx2.And(Avx2.And(byte_1_high, byte_1_low), byte_2_high);
677-
678-
// Create a span from the Vector256<byte>
679-
// Console.WriteLine("");
680-
// Span<byte> byteSpan = MemoryMarshal.Cast<Vector256<byte>, byte>(MemoryMarshal.CreateSpan(ref sc, 1));
681-
// byte[] scbytes = byteSpan.ToArray();
682-
// PrintHexAndBinary(scbytes);55555555555555555
683-
684653
Vector256<byte> prev2 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 2));
685654
Vector256<byte> prev3 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 3));
686655
Vector256<byte> isThirdByte = Avx2.SubtractSaturate(prev2, thirdByte);
687656
Vector256<byte> isFourthByte = Avx2.SubtractSaturate(prev3, fourthByte);
688657
Vector256<byte> must23 = Avx2.Or(isThirdByte, isFourthByte);
689658
Vector256<byte> must23As80 = Avx2.And(must23, v80);
690659
Vector256<byte> error = Avx2.Xor(must23As80, sc);
691-
// if (!Avx2.TestZ(error, error))
692-
// {
693-
// Console.WriteLine($"--Error! @ {processedLength} bytes");//debug
694-
695-
// int off = processedLength >= 32 ? processedLength - 32 : processedLength;
696-
// byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
697-
698-
// utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment;
699-
// scalarCountAdjustment = TailScalarCodeUnitCountAdjustment;
700-
701-
// // We need to take care of eg
702-
// // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
703-
// // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011
704-
// // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge
705-
// // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100
706-
// // In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
707-
// // Normally , if there is an error, this does not cause an issue: most erronous utf-8 unit will not be counted
708-
// // but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10-----
709-
// // the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too much
710-
// // If this error arrive at the edge of 2 simd vector, that is where problem abound: the rewind scalar function will backup
711-
712-
// // so in short , we want to solve this error while at the same time not disturbing anything else
713-
// // we know that there is a continuation on the edge eg at the 64 byte, we need te check that
714-
// // *TODO:Fill code here *
715-
// // Peudocode for now
716-
// // if invalidbyte is of typo 10XX XXXX & invalidbyto pointer % 32 byte == 0 then
717-
// // pass on true to the
718660

719661

720-
// int totalbyteasciierror = processedLength - start_point;
721-
// var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes);
722-
723-
// utf16CodeUnitCountAdjustment += utfadjustasciierror;
724-
// scalarCountAdjustment += scalaradjustasciierror;
725-
726-
// TailScalarCodeUnitCountAdjustment =0;
727-
// TailUtf16CodeUnitCountAdjustment =0;
728-
729-
730-
731-
// return invalidBytePointer;
732-
// }
733-
734662
if (!Avx2.TestZ(error, error))
735663
{
736-
// Console.WriteLine($"--Error! @ {processedLength} bytes");//debug
664+
737665

738666
int off = processedLength > 32 ? processedLength - 32 : processedLength;// this does not backup if processedlength = 32
739-
// int off = processedLength >= 32 ? processedLength - 32 : processedLength; original/main algorithm working
740667

741-
// Console.WriteLine($"Offset backup by: {off}");//debug
668+
742669
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
743670
bool TooLongErroronEdge = false;
744671

745672
utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment;
746673
scalarCountAdjustment = TailScalarCodeUnitCountAdjustment;
747674

748-
// Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCodeUnitCountAdjustment}");//debug
675+
749676

750677
// We need to take care of eg
751678
// 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
@@ -756,26 +683,21 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
756683
// Normally, if there is an error, this does not cause an issue: most erroneous UTF-8 units will not be counted
757684
// but it is an issue in the too-long case: take for example (1111---- 10----- 10----- 10-----) 10-----
758685
// the part between parentheses will be counted as valid and thus scalaradjust will be incremented one time too many
759-
// If this error arrive at the edge of 2 simd vector, that is where problem abound: the rewind scalar function will backup
760-
761-
// so in short , we want to solve this error while at the same time not disturbing anything else
762-
// we know that there is a continuation on the edge eg at the 64 byte, we need te check that
763-
// *TODO:Fill code here *
764-
// Peudocode for now
765-
// if invalidbyte is of typo 10XX XXXX & invalidbyto pointer % 32 byte == 0 then
766-
// pass on true to the
686+
// If this error occurs at the boundary between two SIMD vectors, that is where problems arise
767687

768688
// Calculate the offset of the invalid byte pointer from the start of the input buffer
769689
ulong offsetFromStart = (ulong)(invalidBytePointer - pInputBuffer);
770690

771691
// Classify the invalid byte: is it a continuation byte, and does it sit exactly at pInputBuffer + processedLength?
692+
772693
bool isContinuationByte = (invalidBytePointer[0] & 0xC0) == 0x80;
694+
773695
bool isOneByteAfterProcessedLength = (invalidBytePointer == pInputBuffer + processedLength);
774696

775-
// if (isContinuationByte && isAtBoundary && isOneByteAfterProcessedLength)// this alone creates false positives
697+
776698
if (isContinuationByte && isOneByteAfterProcessedLength)
777699
{
778-
// Console.WriteLine("Triggering TooLongErrorOnEdge adjustment");//debug
700+
779701
TooLongErroronEdge = true;
780702
}
781703

@@ -797,31 +719,11 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
797719
var (totalbyteadjustment, i,tempascii, tempcont, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32);
798720

799721
processedLength -= i;
800-
n4 += tempn4;// this is + because the adjustment function returns something negative already
722+
n4 += tempn4;
801723
contbytes +=tempcont;
802-
// Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug
803-
}
804-
805-
806-
807-
808-
809-
// Vector256<byte> contbyto = Vector256.Create((byte)(0b11000000u - 0x80));
810-
// Vector256<byte> isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte);
811-
// Vector256<byte> isStartOf3OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, thirdByte);
812-
// Vector256<byte> isStartOf2OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, secondByte);
813-
814-
// uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf2OrMoreByteSequence));
815-
// uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf3OrMoreByteSequence));
816-
// uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence));
817724

725+
}
818726

819-
// No errors! Updating the variables we keep track of
820-
// We use one instruction (MoveMask) to update ncon, plus one arithmetic operation.
821-
822-
// contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc)); // this actually counts the number of 2 consecutive continuous bytes
823-
// Placeholder until andether way to do with contbyte is found
824-
825727
Vector256<byte> top2bits = Vector256.Create((byte)0b11000000); // Mask to isolate the two most significant bits
826728
Vector256<byte> contbytemask = Vector256.Create((byte)0b10000000); // The expected pattern for continuation bytes: 10xxxxxx
827729

@@ -843,9 +745,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
843745
asciibytes += (int)(32 - Popcnt.PopCount((uint)mask));
844746
}
845747

846-
847-
848-
849748
// There are 2 possible scenarios here : either
850749
// A) it arrives flush on the border, i.e. it doesn't need to be processed further
851750
// B) There are some bytes remaining, in which case we need to call the scalar function
@@ -862,11 +761,11 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
862761
// We have processed all the blocks using SIMD, we need to process the remaining bytes.
863762
// Process the remaining bytes with the scalar function
864763

865-
866764
// worst possible case is 4 bytes, where we need to backtrack 3 bytes
867765
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
868766
if (processedLength < inputLength)
869767
{
768+
870769
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(32,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
871770
if (invalidBytePointer != pInputBuffer + inputLength)
872771
{

0 commit comments

Comments
 (0)