Incomplete test progress (only scalar count working)

Nick-Nuon · Nick-Nuon · commit cbf004dd01cc · 2024-04-23T09:59:45.000-04:00
diff --git a/src/UTF8.cs b/src/UTF8.cs
@@ -16,6 +16,8 @@ public static class UTF8
         public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment)
         {
             Console.WriteLine("--Rewind Validate with Errors");
+            Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0'));
+
             int TempUtf16CodeUnitCountAdjustment = 0;
             int TempScalarCountAdjustment = 0;
 
@@ -26,47 +28,53 @@ public static class UTF8
             // Even with no errors, it sometime double counts, why.. ? because it goes back even further
             // even though the scalar doesnt thread 
             // adjust for  double counting
-            // for (int i = 0; i <= howFarBack; i++)
-            // {
-            //     byte candidateByte = buf[0 - i];
-            //     foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
-            //     if (foundLeadingBytes)
-            //     {
-            //         // if (i == 0) {break;}
-            //         // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
-            //         Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
-
-            //         // adjustment to avoid double counting 
-            //         if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
-            //         {
-            //             // Console.WriteLine("Found 2 byte");
-            //             TempUtf16CodeUnitCountAdjustment += 1; 
-            //         }
-            //         if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
-            //         {
-            //             // Console.WriteLine("Found 3 byte");
-            //             TempUtf16CodeUnitCountAdjustment += 2; 
-            //         }
-            //         if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
-            //         {
-            //             // Console.WriteLine("Found 4 byte");
-            //             TempUtf16CodeUnitCountAdjustment += 2;
-            //             TempScalarCountAdjustment += 1;
-            //         }
-            //         break;
-            //     }
-            // }
+            // for (int i = 0; i <= howFarBack; i++) 
+            for (int i = 0; i <= howFarBack; i++) 
+            {
+                if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior
+                byte candidateByte = buf[0 - i];
+                foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
+                if (foundLeadingBytes)
+                {
+
+                    // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
+                    Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
+
+                    // adjustment to avoid double counting 
+                    if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
+                    {
+                        // Console.WriteLine("Found 2 byte");
+                        TempUtf16CodeUnitCountAdjustment += 1; 
+                    }
+                    if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
+                    {
+                        // Console.WriteLine("Found 3 byte");
+                        TempUtf16CodeUnitCountAdjustment += 2; 
+                    }
+                    if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
+                    {
+                        // Console.WriteLine("Found 4 byte");
+                        TempUtf16CodeUnitCountAdjustment += 2;
+                        TempScalarCountAdjustment += 1;
+                    }
+                    break;
+                }
+            }
 
             for (int i = 0; i <= howFarBack; i++)
             {
+                Console.WriteLine("backup stat:" + i);
                 byte candidateByte = buf[0 - i];
                 foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
                 if (foundLeadingBytes)
                 {         
                     buf -= i;
                     extraLen = i;
                     Console.WriteLine(howFarBack);
-                    Console.WriteLine("Backed up " + i + 1 + " bytes");
+                    Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
+
+
+                    // Console.WriteLine("Backed up " + extraLen + 1 + " bytes");
                     break;
                 }
             }
@@ -663,21 +671,31 @@ public static class UTF8
                                 // Console.WriteLine("incomplete utf16 count", incompleteUtf16CodeUnitPreventDoubleCounting);
                                 int backedup= 0;
 
+                                int currentByte = pInputBuffer[processedLength];
+                                Console.WriteLine("CurrentByte:" + Convert.ToString(currentByte, 2).PadLeft(8, '0'));
+
                                 for(int k = 0; k < 3; k++)
                                 {
                                     int candidateByte = pInputBuffer[processedLength + k];
+                                    Console.WriteLine("Backing up " + k +" bytes");
+                                    Console.WriteLine("CurrentByte after backing up:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
+
+                                    backedup = 3-k +1;
+                                    // TODO:
+                                    // the weird + 1 is so I don't have to add an else to the conditional below;
+                                    // it is less readable, and there might be a more elegant way to rewrite it, but I am taking the path of convenience for now
+
                                     if ((candidateByte & 0b11000000) == 0b11000000)
                                     {
-                                        backedup = 3-k;
-                                        Console.WriteLine("Backing up " + backedup +" bytes");
-
                                         // Whatever you do, do not delete this
                                         processedLength += k;
                                         break;
                                     }
                                 }
 
-                                for(int k = backedup; k < 3; k++)
+                                Console.WriteLine("Backed up " + backedup +" bytes");
+
+                                for(int k = backedup; k < 3 ; k++)
                                 {
                                     int candidateByte = pInputBuffer[processedLength - k];
                                     if ((candidateByte & 0b11000000) == 0b11000000)
@@ -726,20 +744,20 @@ public static class UTF8
             {
 
                 Console.WriteLine("----Process remaining Scalar");
-                // Console.WriteLine("processed length before:" + processedLength);
+                Console.WriteLine("processed length before:" + processedLength);
                 int overlapCount = 0;
 
                 // // We need to possibly backtrack to the start of the last code point
                 while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
                 {
                     processedLength -= 1;
-                    // overlapCount +=1;
+                    overlapCount +=1;
                 }                
                 
-                // Console.WriteLine("processed length after:" + processedLength);
+                Console.WriteLine("processed length after backtrack:" + processedLength);
 
 
-                // Best use rewind I think
+                // TOCHECK:See if rewind is better here
                 // for(int k = 0; k < overlapCount; k++)
                 // {
                 // // There is no error here hence the loop is straigthforward and we avoid double counting every byte                     
@@ -763,11 +781,12 @@ public static class UTF8
                 //     }
                 // }
 
-                // Console.WriteLine("TempUTF16 before tail remaining check:"+ TempUtf16CodeUnitCountAdjustment);
-                // Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment);
+                Console.WriteLine("TempUTF16 before tail remaining check:"+ TempUtf16CodeUnitCountAdjustment);
+                Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment);
 
 
                 byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment);
+                // byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(3,pInputBuffer + processedLength, inputLength - processedLength,ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
                 if (invalidBytePointer != pInputBuffer + inputLength)
                 {
                     utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment;
@@ -777,8 +796,8 @@ public static class UTF8
                     return invalidBytePointer;
                 }
 
-                // Console.WriteLine("TempUTF16 after tail remaining check:"+ TempUtf16CodeUnitCountAdjustment);
-                // Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment);
+                Console.WriteLine("TempUTF16 after tail remaining check:"+ TempUtf16CodeUnitCountAdjustment);
+                Console.WriteLine("TempScalar '' '' '':"+ TempScalarCountAdjustment);
 
             }
 
diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs
@@ -921,6 +921,12 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
             Console.Write($"{bytes[i]:X2} ");
             Console.ResetColor();
         }
+        else if (i % (chunkSize * 2) == 0) // print green every 256 bytes
+        {
+            Console.ForegroundColor = ConsoleColor.Green;
+            Console.Write($"{bytes[i]:X2} ");
+            Console.ResetColor();
+        }
         else
         {
             Console.Write($"{bytes[i]:X2} ");
@@ -1408,7 +1414,7 @@ public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDele
             try
             {
                 Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}.");
-                // Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}.");
+                Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}.");
             }
             catch (Exception)
             {

Original file line number	Diff line number	Diff line change
`@@ -921,6 +921,12 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)`
`921`	`921`	`Console.Write($"{bytes[i]:X2} ");`
`922`	`922`	`Console.ResetColor();`
`923`	`923`	`}`
	`924`	`+ else if (i % (chunkSize * 2) == 0) // print green every 256 bytes`
	`925`	`+ {`
	`926`	`+ Console.ForegroundColor = ConsoleColor.Green;`
	`927`	`+ Console.Write($"{bytes[i]:X2} ");`
	`928`	`+ Console.ResetColor();`
	`929`	`+ }`
`924`	`930`	`else`
`925`	`931`	`{`
`926`	`932`	`Console.Write($"{bytes[i]:X2} ");`
`@@ -1408,7 +1414,7 @@ public void ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDele`
`1408`	`1414`	`try`
`1409`	`1415`	`{`
`1410`	`1416`	`Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}.");`
`1411`		`- // Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}.");`
	`1417`	`+ Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}.");`
`1412`	`1418`	`}`
`1413`	`1419`	`catch (Exception)`
`1414`	`1420`	`{`