simdutf
diff --git a/‎src/UTF8.cs
Lines changed: 24 additions & 125 deletions b/‎src/UTF8.cs
Lines changed: 24 additions & 125 deletions
@@ -10,7 +10,7 @@ namespace SimdUnicode
     public static class UTF8
     {
 
-//         //debug helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index 
+
 static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
 {
     int chunkSize = 16; // 128 bits = 16 bytes
@@ -78,20 +78,20 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
 
         public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment)
         {
-//             // Console.WriteLine("CALLING REWIND");//debug
+
             int extraLen = 0;
             bool foundLeadingBytes = false;
 
             for (int i = 0; i <= howFarBack; i++)
             {
                 byte candidateByte = buf[0 - i];
                 foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
-//                 Console.WriteLine($"Rewinding byte to offset {-i}: {candidateByte:X2}");//debug
-//                 Console.WriteLine(foundLeadingBytes);//debug
+
+
 
                 if (foundLeadingBytes)
                 {  
-//                     Console.WriteLine("Found leading byte");//debug       
+
                     buf -= i;
                     break;
                 }
@@ -257,8 +257,8 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
             {
                 if ((pInputBuffer[-i] & 0b11000000) != 0b10000000)
                 {
-//                     string binaryString = Convert.ToString(pInputBuffer[-i], 2).PadLeft(8, '0');//debug
-//                     Console.WriteLine($"Stopping at byte {binaryString}"); //debug
+
+
                     break;
                 }
                 contbyteadjust -= 1;
@@ -278,40 +278,18 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
 
         public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte)
         {
-//             Console.WriteLine("---------"); //debug
-//             Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug
+
+
             int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte;
             int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte;
             int utfadjust = -2 * n4 - 2 * n3 - n2;
             int scalaradjust = -n4;
 
-//             Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2  + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug
+
 
             return (utfadjust, scalaradjust);
         }
 
-        // public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) //todo: add an extra bool parameter 'TooLongErroronEdge' which defaults to false
-        // {
-        //     // Calculate the total bytes from start_point to processedLength
-        //     int totalbyte = processedLength - start_point;
-        //     int adjusttotalbyte = 0, backedupByHowMuch = 0, adjustascii = 0, adjustcont = 0, adjustn4 = 0;
-
-        //     // Adjust the length to include a complete character, if necessary
-        //     if (totalbyte > 0)
-        //     {
-        //         (adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength);
-        //     }
-
-        //     // Pseudocode:
-        //     // if 'TooLongErroronEdge' bool is true then 
-        //     // then substract (remove) adjustascii, adjustcont, adjustn4 from their respective counterpart in the following function:
-
-        //     var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4 , contbytes , totalbyte + adjusttotalbyte);
-
-
-        //     return (utfadjust, scalaradjust);
-        // }
-
         public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes, bool TooLongErroronEdge = false)
         {
             // Calculate the total bytes from start_point to processedLength
@@ -324,10 +302,8 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
                 (adjusttotalbyte, backedupByHowMuch, adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength);
             }
 
-            // Adjust the counters if 'TooLongErroronEdge' is true
             if (TooLongErroronEdge)
             {
-                // If you can figure out why this makes a difference,youre golden
                 asciibytes += adjustascii;
                 contbytes += adjustcont;
                 n4 += adjustn4;
@@ -494,9 +470,9 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
 
         public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
         {
-//             Console.ForegroundColor = ConsoleColor.Blue;            //debug
-//             Console.WriteLine("-------------------------------------");//debug
-//             Console.ResetColor();//debug
+
+
+
 
             int processedLength = 0;
             int TempUtf16CodeUnitCountAdjustment= 0 ;
@@ -674,78 +650,29 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
                             Vector256<byte> byte_1_low = Avx2.Shuffle(shuf2, (prev1 & v0f)); // takes the 0000 XXXX part of the previous part
                             Vector256<byte> byte_2_high = Avx2.Shuffle(shuf3, Avx2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); // takes the XXXX 0000 part of the current byte
                             Vector256<byte> sc = Avx2.And(Avx2.And(byte_1_high, byte_1_low), byte_2_high);
-
-                                    // Create a span from the Vector256<byte>
-                                // Console.WriteLine("");
-                                // Span<byte> byteSpan = MemoryMarshal.Cast<Vector256<byte>, byte>(MemoryMarshal.CreateSpan(ref sc, 1));
-                                // byte[] scbytes = byteSpan.ToArray();
-                                // PrintHexAndBinary(scbytes);55555555555555555
-                            
                             Vector256<byte> prev2 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 2));
                             Vector256<byte> prev3 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 3));
                             Vector256<byte> isThirdByte = Avx2.SubtractSaturate(prev2, thirdByte);
                             Vector256<byte> isFourthByte = Avx2.SubtractSaturate(prev3, fourthByte);
                             Vector256<byte> must23 = Avx2.Or(isThirdByte, isFourthByte);
                             Vector256<byte> must23As80 = Avx2.And(must23, v80);
                             Vector256<byte> error = Avx2.Xor(must23As80, sc);
-                            // if (!Avx2.TestZ(error, error))
-                            // {
-//                                 Console.WriteLine($"--Error! @ {processedLength} bytes");//debug
-
-                            //     int off = processedLength >= 32 ? processedLength - 32 : processedLength;
-                            //     byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
-
-                            //     utf16CodeUnitCountAdjustment =  TailUtf16CodeUnitCountAdjustment;
-                            //     scalarCountAdjustment = TailScalarCodeUnitCountAdjustment;
-
-                            //     // We need to take care of eg
-                            //     // 11011110  10101101  11110000  10101101  10101111  10011111  11010111  10101000  11001101  10111001  11010100  10000111  11101111  10010000  10000000  11110011 
-                            //     // 10110100  10101100  10100111  11100100  10101011  10011111  11101111  10100010  10110010  11011100  10100000  00100010  11110000  10011001  10101011  10000011 
-                            //     // 10000000  10100010  11101110  10010101  10101001  11010100  10100111  11110000  10101001  10011101  10011011  11100100  10101011  10010111  11100110  10011001 <= Too long error @ 32 byte edge 
-                            //     // 10010000  11101111  10111111  10010110  11001010  10000000  11000111  10100010  11110010  10111100  10111011  10010100  11101001  10001011  10000110  11110100 
-                            //     // In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
-                            //     // Normally , if there is an error, this does not cause an issue: most erronous utf-8 unit will not be counted
-                            //     // but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10-----  
-                            //     // the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too much
-                            //     // If this error arrive at the edge of 2 simd vector, that is where problem abound: the rewind scalar function will backup
-
-                            //     // so in short , we want to solve this error while at the same time not disturbing anything else
-                            //     // we  know that there is a continuation on the edge eg at the 64 byte, we need te check that
-                            //     // *TODO:Fill code here *
-                            //     // Peudocode for now
-                            //     // if invalidbyte is of typo 10XX XXXX & invalidbyto pointer % 32 byte == 0 then
-                            //     // pass on true to the 
 
 
-                            //     int totalbyteasciierror = processedLength - start_point;                                
-                            //     var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes);
-
-                            //     utf16CodeUnitCountAdjustment += utfadjustasciierror;
-                            //     scalarCountAdjustment += scalaradjustasciierror;
-
-                            //     TailScalarCodeUnitCountAdjustment =0;
-                            //     TailUtf16CodeUnitCountAdjustment =0;
-
-
-
-                            //     return invalidBytePointer;
-                            // }
-
                             if (!Avx2.TestZ(error, error))
                             {
-//                                 Console.WriteLine($"--Error! @ {processedLength} bytes");//debug
+
 
                                 int off = processedLength > 32 ? processedLength - 32 : processedLength;// this does not back up if processedLength == 32
-                                // int off = processedLength >= 32 ? processedLength - 32 : processedLength; original/main algorithm working
 
-//                                 Console.WriteLine($"Offset backup by: {off}");//debug
+
                                 byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
                                 bool TooLongErroronEdge = false;
 
                                 utf16CodeUnitCountAdjustment =  TailUtf16CodeUnitCountAdjustment;
                                 scalarCountAdjustment = TailScalarCodeUnitCountAdjustment;
 
-//                                 Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCodeUnitCountAdjustment}");//debug
+
 
                                 // We need to take care of eg
                                 // 11011110  10101101  11110000  10101101  10101111  10011111  11010111  10101000  11001101  10111001  11010100  10000111  11101111  10010000  10000000  11110011 
@@ -756,26 +683,21 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
                                 // Normally, an error does not cause an issue here: most erroneous UTF-8 units are not counted.
                                 // The too-long case is the exception: if you take, for example, (1111---- 10----- 10----- 10-----) 10-----
                                 // the part between parentheses is counted as valid, and thus scalaradjust is incremented once too often.
-                                // If this error arrive at the edge of 2 simd vector, that is where problem abound: the rewind scalar function will backup
-
-                                // so in short , we want to solve this error while at the same time not disturbing anything else
-                                // we  know that there is a continuation on the edge eg at the 64 byte, we need te check that
-                                // *TODO:Fill code here *
-                                // Peudocode for now
-                                // if invalidbyte is of typo 10XX XXXX & invalidbyto pointer % 32 byte == 0 then
-                                // pass on true to the 
+                                // If this error arrives at the boundary between two SIMD vectors, that is where problems arise
 
                                     // Calculate the offset of the invalid byte pointer from the start of the input buffer
                                 ulong offsetFromStart = (ulong)(invalidBytePointer - pInputBuffer);
 
                                 // Debugging output
+
                                 bool isContinuationByte = (invalidBytePointer[0] & 0xC0) == 0x80;
+
                                 bool isOneByteAfterProcessedLength = (invalidBytePointer == pInputBuffer + processedLength);
 
-                                // if (isContinuationByte && isAtBoundary && isOneByteAfterProcessedLength)// this alone creates false positives
+
                                 if (isContinuationByte && isOneByteAfterProcessedLength)
                                 {
-//                                     Console.WriteLine("Triggering TooLongErrorOnEdge adjustment");//debug
+
                                     TooLongErroronEdge = true; 
                                 }
 
@@ -797,31 +719,11 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
                                 var (totalbyteadjustment, i,tempascii, tempcont, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32);
 
                                 processedLength -= i;
-                                n4 += tempn4;// this is + because the adjustment function returns something negative already
+                                n4 += tempn4;
                                 contbytes +=tempcont;
-//                                 Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug
-                            }
-
-
-                              
-                              
-                              
-                            // Vector256<byte> contbyto = Vector256.Create((byte)(0b11000000u - 0x80));
-                            // Vector256<byte> isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte);
-                            // Vector256<byte> isStartOf3OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, thirdByte);
-                            // Vector256<byte> isStartOf2OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, secondByte);
-
-                            // uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf2OrMoreByteSequence));
-                            // uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf3OrMoreByteSequence));
-                            // uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence));
 
+                            }
 
-                            // No errors! Updating the variables we keep track of
-                            // We use one instruction (MoveMask) to update ncon, plus one arithmetic operation.
-                            
-                            // contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc)); // this actually counts the number of 2 consecutive continuous bytes
-                            // Placeholder until andether way to do with contbyte is found
-                            
                             Vector256<byte> top2bits = Vector256.Create((byte)0b11000000); // Mask to isolate the two most significant bits
                             Vector256<byte> contbytemask = Vector256.Create((byte)0b10000000);        // The expected pattern for continuation bytes: 10xxxxxx
 
@@ -843,9 +745,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
                         asciibytes += (int)(32 - Popcnt.PopCount((uint)mask));
                     }
 
-
-
-                    
                     // There are two possible scenarios here: either
                     //  A)  processing ends flush on the border, i.e. nothing further needs to be processed, or
                     //  B)  some bytes remain, in which case we need to call the scalar function
@@ -862,11 +761,11 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
             // We have processed all the blocks using SIMD, we need to process the remaining bytes.
             // Process the remaining bytes with the scalar function
 
-
             // The worst possible case is a 4-byte sequence, where we need to backtrack 3 bytes:
             // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
             if (processedLength < inputLength)
             {
+                
                 byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(32,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
                 if (invalidBytePointer != pInputBuffer + inputLength)
                 {