Skip to content

Commit 73ecbf0

Browse files
committed
Some buggy attempts in comments
1 parent 4802a10 commit 73ecbf0

File tree

2 files changed

+74
-13
lines changed

2 files changed

+74
-13
lines changed

src/UTF8.cs

Lines changed: 56 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ public static class UTF8
1313
static Func<byte, string> byteToBinaryString = b => Convert.ToString(b, 2).PadLeft(8, '0');
1414

1515

16-
public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment)
16+
public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment,bool prevWasSimd=false)
1717
{
1818
Console.WriteLine("--Rewind Validate with Errors");
1919
Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0'));
@@ -23,12 +23,50 @@ public static class UTF8
2323

2424
int extraLen = 0;
2525
bool foundLeadingBytes = false;
26+
// Console.WriteLine(prevWasSimd);
27+
28+
// adjust for filling in gap
29+
// If an error is found, since we start counting the adjustments on prev3, a gap is left that needs to be counted in case the previous operation was using SIMD
30+
if (prevWasSimd)
31+
{
32+
// Console.WriteLine("Triggering Negative adjustment!");
33+
// for (int i = 0; i <= 3; i++)
34+
// {
35+
// if (i == 0){continue;}; // we don't want to double count the current byte
36+
// byte candidateByte = buf[0 - i];
37+
// foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
38+
// // if (i==0 & foundLeadingBytes){break;};// We dont want to
39+
// // TODO: written like this for readability, I know its ugly so this needs to be rewritten
40+
41+
// if (foundLeadingBytes)
42+
// {
43+
44+
// Console.WriteLine("Negative adjstment:Found leading byte at:" + i + ",Byte:" + candidateByte.ToString("X2"));
45+
// // Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
46+
47+
// // adjustment to avoid double counting
48+
// if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
49+
// {
50+
// // Console.WriteLine("Found 2 byte");
51+
// TempUtf16CodeUnitCountAdjustment -= 1;
52+
// }
53+
// if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
54+
// {
55+
// // Console.WriteLine("Found 3 byte");
56+
// TempUtf16CodeUnitCountAdjustment -= 2;
57+
// }
58+
// if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
59+
// {
60+
// // Console.WriteLine("Found 4 byte");
61+
// TempUtf16CodeUnitCountAdjustment -= 2;
62+
// TempScalarCountAdjustment -= 1;
63+
// }
64+
// // break;
65+
// }
66+
// }
67+
}
68+
2669

27-
// this is the generic function called when there is an error:
28-
// TODO: adjust for double counting iff there is an error eg invalidpointerbyte != length
29-
// Even with no errors, it sometime double counts, why.. ? because it goes back even further
30-
// even though the scalar doesnt thread
31-
// adjust for double counting
3270
// for (int i = 0; i <= howFarBack; i++)
3371
// {
3472
// if (i==0){continue;};// we dont want to miss out on counting the current byte, only to avoid double counting what may have been counted prior
@@ -713,7 +751,7 @@ public static class UTF8
713751
else // Contains non-ASCII characters, we need to do non-trivial processing
714752
{
715753
Console.WriteLine("--Found non-ascii:triggering SIMD routine at " + processedLength + "bytes");
716-
prevWasSimd = true;
754+
prevWasSimd = true; // consider moving this somewhere else
717755

718756
// Use SubtractSaturate to effectively compare if bytes in block are greater than markers.
719757
// TODO:integrate this better with the rest of the code
@@ -766,11 +804,19 @@ public static class UTF8
766804
Console.WriteLine("-----Error path!!");
767805
TailScalarCodeUnitCountAdjustment =0;
768806
TailUtf16CodeUnitCountAdjustment =0;
807+
int off= 32;
808+
809+
// if (processedLength <32) // not enough bytes to load into SIMD!
810+
// {
811+
// // off = 0;
812+
// prevWasSimd = false; // there was no previous op at all, let alone SIMD one
813+
// }
769814

770-
// TODO :I cant remember why I pu an off that does the same thing here but look intit
771-
// int off = processedLength >= 32 ? processedLength: processedLength;
815+
816+
// int off = processedLength >= 32 ? processedLength: 0; // we check if there
817+
// without this there is an overflow if
772818
// byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
773-
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(3, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
819+
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(3, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment,prevWasSimd);
774820

775821
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment +TailUtf16CodeUnitCountAdjustment;
776822
scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment;

test/UTF8ValidationTests.cs

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ namespace tests;
1111

1212
// TODO: refine test for unterminated sequence happening at SIMD transition
1313
// TODO: The various tests do not formally take into account the scenario where vector is all ASCII
14+
// TODO?: Test if the error is in the first vector?
1415

1516
public unsafe class Utf8SIMDValidationTests
1617
{
@@ -475,9 +476,23 @@ public void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate)
475476
{
476477
byte oldByte = utf8[i];
477478
utf8[i] = 0b11111000; // Forcing a header bits error
478-
Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate));
479-
Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate));
480-
ValidateCount(utf8,utf8ValidationDelegate);
479+
// Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate));
480+
// Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate));
481+
// ValidateCount(utf8,utf8ValidationDelegate);
482+
try
483+
{
484+
Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate));
485+
Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate));
486+
ValidateCount(utf8,utf8ValidationDelegate); // Ensure you want to call this here, it seems unrelated to exception handling.
487+
}
488+
catch (Xunit.Sdk.XunitException)
489+
{
490+
Console.WriteLine($"Assertion failed at index: {i}");
491+
PrintHexAndBinary(utf8, i);
492+
utf8[i] = oldByte; // Restore the original byte
493+
throw; // Rethrow the exception to fail the test.
494+
}
495+
481496
utf8[i] = oldByte; // Restore the original byte
482497
}
483498
}

0 commit comments

Comments
 (0)