Skip to content

Commit 0c758c9

Browse files
committed
temporary cont byte fix
1 parent d73ffc3 commit 0c758c9

File tree

1 file changed

+124
-20
lines changed

1 file changed

+124
-20
lines changed

src/UTF8.cs

Lines changed: 124 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,70 @@ namespace SimdUnicode
1010
public static class UTF8
1111
{
1212

13+
// Debugging helper: dumps `bytes` to the console twice, first as hexadecimal and then as binary,
// 16 bytes (128 bits) per row. The byte at `highlightIndex` is written in red; any other byte whose
// index is a multiple of 32 is written in green. The default highlightIndex of -1 never equals a
// loop index, so by default nothing is written in red.
static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
{
    const int bytesPerRow = 16;                // 128 bits = 16 bytes
    const int greenInterval = bytesPerRow * 2; // a green marker every 32 bytes

    // Writes one already-formatted cell, coloured according to the index of the byte it represents.
    void WriteCell(int index, string cell)
    {
        ConsoleColor? tint = null;
        if (index == highlightIndex)
        {
            tint = ConsoleColor.Red; // the highlight takes precedence over the green marker
        }
        else if (index % greenInterval == 0)
        {
            tint = ConsoleColor.Green;
        }

        if (tint.HasValue)
        {
            Console.ForegroundColor = tint.Value;
            Console.Write(cell);
            Console.ResetColor();
        }
        else
        {
            Console.Write(cell);
        }
    }

    // Writes every byte through `format`, starting a new console line before each row but the first.
    void WriteRows(Func<byte, string> format)
    {
        for (int index = 0; index < bytes.Length; index++)
        {
            bool firstOfLaterRow = index > 0 && index % bytesPerRow == 0;
            if (firstOfLaterRow)
            {
                Console.WriteLine(); // New line after every 16 bytes
            }

            WriteCell(index, format(bytes[index]));

            bool lastOfRow = (index + 1) % bytesPerRow == 0;
            if (!lastOfRow)
            {
                Console.Write(" "); // extra separator between bytes, omitted at the end of a row
            }
        }
    }

    // Hexadecimal view.
    Console.Write("Hex: ");
    WriteRows(value => $"{value:X2} ");
    Console.WriteLine("\n"); // blank line separating the hex view from the binary view

    // Binary view: each byte as eight binary digits, zero-padded on the left.
    Console.Write("Binary: ");
    WriteRows(value =>
    {
        string binaryString = Convert.ToString(value, 2).PadLeft(8, '0');
        return $"{binaryString} ";
    });
    Console.WriteLine(); // terminate the last row
}
75+
76+
1377
static Func<byte, string> byteToBinaryString = b => Convert.ToString(b, 2).PadLeft(8, '0');//for debugging
1478

1579
public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment)
@@ -188,38 +252,40 @@ public static class UTF8
188252
// Walks backward from pInputBuffer[-1], stepping over up to four continuation bytes (10xxxxxx),
// and classifies the byte it stops on. Returns, in order:
//   totalbyteadjustment - (sequence length announced by the stop byte) minus (bytes backed up); 0 for ASCII
//   backedupByHowMuch   - how many bytes before pInputBuffer the stop byte sits
//   ascii               - -1 when the stop byte is ASCII, otherwise 0
//   contbyte            - minus the number of continuation bytes stepped over
//   n4                  - -1 when the stop byte is treated as a four-byte lead, otherwise 0
// NOTE(review): if all four bytes before pInputBuffer are continuation bytes the walk stops at 5,
// pInputBuffer[-5] is read, and the first element can be negative - confirm callers rule this out.
public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,int contbyte,int n4) adjustmentFactor(byte* pInputBuffer)
{
    int backedUp = 1;
    int continuationAdjustment = 0;

    // Find the first non-continuation byte, working backward.
    while (backedUp <= 4 && (pInputBuffer[-backedUp] & 0b11000000) == 0b10000000)
    {
        continuationAdjustment -= 1;
        backedUp++;
    }

    byte stopByte = pInputBuffer[-backedUp];

    // 0xxxxxxx: ASCII. The original note says backedUp must be 1 here.
    if ((stopByte & 0b10000000) == 0)
    {
        return (0, backedUp, -1, continuationAdjustment, 0);
    }

    // 110xxxxx: two-byte lead. backedUp is 1 or 2; at 1 one byte of the sequence is missing.
    if ((stopByte & 0b11100000) == 0b11000000)
    {
        return (2 - backedUp, backedUp, 0, continuationAdjustment, 0);
    }

    // 1110xxxx: three-byte lead. backedUp is 1, 2 or 3; 3 - backedUp bytes are missing.
    if ((stopByte & 0b11110000) == 0b11100000)
    {
        return (3 - backedUp, backedUp, 0, continuationAdjustment, 0);
    }

    // Anything else is treated as a four-byte lead (11110xxx); this is assumed, not checked.
    // backedUp is expected to be 1..4, leaving 4 - backedUp bytes missing.
    return (4 - backedUp, backedUp, 0, continuationAdjustment, -1);
}
210277

211278
// Derives the two final adjustments from the four running totals.
// The two intermediate values are the solution (for n2, n3) of the linear system
//     totalbyte = asciibytes + 2*n2 + 3*n3 + 4*n4
//     contbytes =              n2 + 2*n3 + 3*n4
// Returns utfadjust = -2*n4 - 2*n3 - n2 (which simplifies algebraically to n4 - contbytes)
// and scalaradjust = -n4. The inputs and the results are also written to the console for debugging.
public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte)
{
    string inputReport = "CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte;
    Console.WriteLine("---------"); //debug
    Console.WriteLine(inputReport); //debug

    int threeByteCount = asciibytes - 2 * n4 + 2 * contbytes - totalbyte;
    int twoByteCount = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte;

    (int utfadjust, int scalaradjust) adjustments = (-2 * n4 - 2 * threeByteCount - twoByteCount, -n4);

    string outputReport = "CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + threeByteCount + " n2: " + twoByteCount + " utfadjust:" + adjustments.utfadjust + " scalaradjust:" + adjustments.scalaradjust;
    Console.WriteLine(outputReport); //debug

    return adjustments;
}
225291

@@ -395,7 +461,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
395461

396462
public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
397463
{
398-
Console.WriteLine("-------------------------------------");
464+
Console.ForegroundColor = ConsoleColor.Blue; //debug
465+
Console.WriteLine("-------------------------------------");//debug
466+
Console.ResetColor();//debug
467+
399468
int processedLength = 0;
400469
int TempUtf16CodeUnitCountAdjustment= 0 ;
401470
int TempScalarCountAdjustment = 0;
@@ -568,10 +637,17 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
568637
Vector256<byte> prev1 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 1));
569638
// Vector256.Shuffle vs Avx2.Shuffle
570639
// https://github.com/dotnet/runtime/blob/1400c1e7a888ea1e710e5c08d55c800e0b04bf8a/docs/coding-guidelines/vectorization-guidelines.md#vector256shuffle-vs-avx2shuffle
571-
Vector256<byte> byte_1_high = Avx2.Shuffle(shuf1, Avx2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
572-
Vector256<byte> byte_1_low = Avx2.Shuffle(shuf2, (prev1 & v0f));
573-
Vector256<byte> byte_2_high = Avx2.Shuffle(shuf3, Avx2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
640+
Vector256<byte> byte_1_high = Avx2.Shuffle(shuf1, Avx2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);// takes the XXXX 0000 part of the previous byte
641+
Vector256<byte> byte_1_low = Avx2.Shuffle(shuf2, (prev1 & v0f)); // takes the 0000 XXXX part of the previous byte
642+
Vector256<byte> byte_2_high = Avx2.Shuffle(shuf3, Avx2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); // takes the XXXX 0000 part of the current byte
574643
Vector256<byte> sc = Avx2.And(Avx2.And(byte_1_high, byte_1_low), byte_2_high);
644+
645+
// Create a span from the Vector256<byte>
646+
// Console.WriteLine("");
647+
// Span<byte> byteSpan = MemoryMarshal.Cast<Vector256<byte>, byte>(MemoryMarshal.CreateSpan(ref sc, 1));
648+
// byte[] scbytes = byteSpan.ToArray();
649+
// PrintHexAndBinary(scbytes);
650+
575651
Vector256<byte> prev2 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 2));
576652
Vector256<byte> prev3 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 3));
577653
Vector256<byte> isThirdByte = Avx2.SubtractSaturate(prev2, thirdByte);
@@ -581,7 +657,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
581657
Vector256<byte> error = Avx2.Xor(must23As80, sc);
582658
if (!Avx2.TestZ(error, error))
583659
{
584-
Console.WriteLine("--Error!");
660+
Console.WriteLine($"--Error! @ {processedLength} bytes");//debug
585661
int totalbyteasciierror = processedLength - start_point;
586662
var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes);
587663

@@ -608,25 +684,53 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
608684
var (totalbyteadjustment, i,tempascii, tempcont, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32);
609685

610686
processedLength -= i;
611-
n4 += tempn4;
687+
n4 += tempn4;// this is + because the adjustment function returns something negative already
612688
contbytes +=tempcont;
613-
Console.WriteLine($"Unterminated! Backing up by {i}");
614-
689+
Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug
615690
}
616691

692+
693+
694+
695+
696+
// Vector256<byte> contbyto = Vector256.Create((byte)(0b11000000u - 0x80));
697+
// Vector256<byte> isStartOf4ByteSequence = Avx2.SubtractSaturate(currentBlock, fourthByte);
698+
// Vector256<byte> isStartOf3OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, thirdByte);
699+
// Vector256<byte> isStartOf2OrMoreByteSequence = Avx2.SubtractSaturate(currentBlock, secondByte);
700+
701+
// uint twoBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf2OrMoreByteSequence));
702+
// uint threeBytePlusCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf3OrMoreByteSequence));
703+
// uint fourByteCount = Popcnt.PopCount((uint)Avx2.MoveMask(isStartOf4ByteSequence));
704+
705+
617706
// No errors! Updating the variables we keep track of
618707
// We use one instruction (MoveMask) to update ncon, plus one arithmetic operation.
619-
contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc));
708+
709+
// contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc)); // this actually counts the number of 2 consecutive continuation bytes
710+
// Placeholder until another way to deal with contbyte is found
711+
712+
Vector256<byte> top2bits = Vector256.Create((byte)0b11000000); // Mask to isolate the two most significant bits
713+
Vector256<byte> contbytemask = Vector256.Create((byte)0b10000000); // The expected pattern for continuation bytes: 10xxxxxx
620714

715+
// Apply the mask and compare
716+
Vector256<byte> maskedData = Avx2.And(currentBlock, top2bits);
717+
Vector256<byte> compareResult = Avx2.CompareEqual(maskedData, contbytemask);
718+
// Move mask to get integer representation
719+
contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(compareResult));
720+
721+
722+
621723
// We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation.
622724
n4 += (int)Popcnt.PopCount((uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte)));
623725
}
726+
727+
// important: we just update asciibytes if there was no error.
728+
// We count the number of ascii bytes in the block using just some simple arithmetic
729+
// and no expensive operation:
624730
asciibytes += (int)(32 - Popcnt.PopCount((uint)mask));
625731
}
626732

627-
// important: we just update asciibytes if there was no error.
628-
// We count the number of ascii bytes in the block using just some simple arithmetic
629-
// and no expensive operation:
733+
630734

631735

632736
// There are 2 possible scenarios here : either

0 commit comments

Comments
 (0)