Skip to content

Commit c22c649

Browse files
committed
cleanup
1 parent 1d27d6f commit c22c649

File tree

3 files changed

+120
-204
lines changed

3 files changed

+120
-204
lines changed

README.md

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,6 @@ dotnet test
2929

3030
To get a list of available tests, enter the command:
3131

32-
```
33-
dotnet test --list-tests | cut -d '(' -f 1 | uniq
34-
```
35-
36-
For a far more verbose output:
37-
3832
```
3933
dotnet test --list-tests
4034
```

src/UTF8.cs

Lines changed: 23 additions & 153 deletions
Original file line numberDiff line numberDiff line change
@@ -10,72 +10,6 @@ namespace SimdUnicode
1010
public static class UTF8
1111
{
1212

13-
14-
static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
15-
{
16-
int chunkSize = 16; // 128 bits = 16 bytes
17-
18-
// Process each chunk for hexadecimal
19-
Console.Write("Hex: ");
20-
for (int i = 0; i < bytes.Length; i++)
21-
{
22-
if (i > 0 && i % chunkSize == 0)
23-
Console.WriteLine(); // New line after every 16 bytes
24-
25-
if (i == highlightIndex)
26-
{
27-
Console.ForegroundColor = ConsoleColor.Red;
28-
Console.Write($"{bytes[i]:X2} ");
29-
Console.ResetColor();
30-
}
31-
else if (i % (chunkSize * 2) == 0) // print green every 256 bytes
32-
{
33-
Console.ForegroundColor = ConsoleColor.Green;
34-
Console.Write($"{bytes[i]:X2} ");
35-
Console.ResetColor();
36-
}
37-
else
38-
{
39-
Console.Write($"{bytes[i]:X2} ");
40-
}
41-
42-
if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line
43-
}
44-
Console.WriteLine("\n"); // New line for readability and to separate hex from binary
45-
46-
// Process each chunk for binary
47-
Console.Write("Binary: ");
48-
for (int i = 0; i < bytes.Length; i++)
49-
{
50-
if (i > 0 && i % chunkSize == 0)
51-
Console.WriteLine(); // New line after every 16 bytes
52-
53-
string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0');
54-
if (i == highlightIndex)
55-
{
56-
Console.ForegroundColor = ConsoleColor.Red;
57-
Console.Write($"{binaryString} ");
58-
Console.ResetColor();
59-
}
60-
else if (i % (chunkSize * 2) == 0) // print green every 256 bytes
61-
{
62-
Console.ForegroundColor = ConsoleColor.Green;
63-
Console.Write($"{binaryString} ");
64-
Console.ResetColor();
65-
}
66-
else
67-
{
68-
Console.Write($"{binaryString} ");
69-
}
70-
71-
if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line
72-
}
73-
Console.WriteLine(); // New line for readability
74-
}
75-
76-
77-
static Func<byte, string> byteToBinaryString = b => Convert.ToString(b, 2).PadLeft(8, '0');//for debugging
78-
7913
// prevents double counting in case there is a toolong error on the edge
8014
public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byte headerByte)
8115
{
@@ -92,7 +26,6 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
9226
// Check if the header byte belongs to a 4-byte UTF-8 character
9327
else if ((headerByte & 0b11111000) == 0b11110000)
9428
{
95-
9629
return (2, 1);
9730
}
9831
// Otherwise, it's a 1-byte character or continuation byte
@@ -107,10 +40,7 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
10740
bool foundLeadingBytes = false;
10841

10942
// Print the byte value at the buf pointer
110-
byte* PinputPlusProcessedlength = buf;
111-
112-
113-
43+
byte* PinputPlusProcessedlength = buf;
11444
int TooLongErroronEdgeUtfadjust = 0;
11545
int TooLongErroronEdgeScalaradjust = 0;
11646

@@ -119,8 +49,6 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
11949
byte candidateByte = buf[0 - i];
12050
foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
12151

122-
123-
12452
if (foundLeadingBytes)
12553
{
12654

@@ -140,27 +68,26 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
14068
int TailScalarCountAdjustment = 0;
14169

14270
byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment);
143-
// Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCountAdjustment}");
144-
145-
bool isContinuationByte = (invalidBytePointer[0] & 0xC0) == 0x80;
146-
bool isOneByteAfterProcessedLength = (invalidBytePointer == PinputPlusProcessedlength);
147-
148-
149-
150-
// // Print the byte value at the invalidBytePointer
151-
15271

72+
// We need to handle inputs such as the following (the byte of interest is marked with asterisks):
73+
// 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
74+
// 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 *11110000* 10011001 10101011 10000011
75+
// 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge
76+
// 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100
77+
// Without the following check, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
78+
// Normally, if there is an error, this does not cause an issue: most erroneous UTF-8 units will not be counted
79+
// but it does cause an issue for a too-long error: take for example (1111---- 10----- 10----- 10-----) 10-----
80+
// the part between parentheses will be counted as valid, and thus scalaradjust/utfadjust will be incremented one time too many
15381

82+
bool isContinuationByte = (invalidBytePointer[0] & 0xC0) == 0x80;
83+
bool isOnEdge = (invalidBytePointer == PinputPlusProcessedlength);
15484

155-
if (isContinuationByte && isOneByteAfterProcessedLength)
85+
if (isContinuationByte && isOnEdge)
15686
{
157-
15887
utf16CodeUnitCountAdjustment += TooLongErroronEdgeUtfadjust;
15988
scalarCountAdjustment += TooLongErroronEdgeScalaradjust;
160-
16189
}
16290

163-
16491
utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment;
16592
scalarCountAdjustment += TailScalarCountAdjustment;
16693

@@ -295,7 +222,7 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
295222
const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
296223

297224
// Assuming that a valid UTF-8 sequence ends at pInputBuffer,
298-
// computes how many bytes are needed (eg what type of byte) to complete the last character. also counts the number of n4, n2 and ascii affected
225+
// computes how many bytes are needed to complete the last character. Also counts the number of n4, n2 and ascii affected
299226
// This will return 1, 2, 3. If the whole byte sequence is valid UTF-8,
300227
// and this function returns returnedvalue>0, then the bytes at pInputBuffer[0],
301228
// ... pInputBuffer[returnedvalue - 1] should be continuation bytes.
@@ -309,8 +236,6 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
309236
{
310237
if ((pInputBuffer[-i] & 0b11000000) != 0b10000000)
311238
{
312-
313-
314239
break;
315240
}
316241
contbyteadjust -= 1;
@@ -330,19 +255,15 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
330255

331256
public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte)
332257
{
333-
334-
335258
int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte;
336259
int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte;
337260
int utfadjust = -2 * n4 - 2 * n3 - n2;
338261
int scalaradjust = -n4;
339262

340-
341-
342263
return (utfadjust, scalaradjust);
343264
}
344265

345-
public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes, bool TooLongErroronEdge = false)
266+
public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes)
346267
{
347268
// Calculate the total bytes from start_point to processedLength
348269
int totalbyte = processedLength - start_point;
@@ -353,21 +274,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
353274
{
354275
(adjusttotalbyte, backedupByHowMuch, adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength);
355276
}
356-
357-
// if (TooLongErroronEdge)
358-
// {
359-
// asciibytes += adjustascii;
360-
// contbytes += adjustcont;
361-
// n4 += adjustn4;
362-
// }
363-
364277
var (utfadjust, scalaradjust) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyte + adjusttotalbyte);
365-
366278
return (utfadjust, scalaradjust);
367279
}
368280

369-
370-
371281
public unsafe static byte* GetPointerToFirstInvalidByteSse(byte* pInputBuffer, int inputLength)
372282
{
373283

@@ -522,10 +432,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
522432

523433
public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
524434
{
525-
526-
527-
528-
529435
int processedLength = 0;
530436
int TempUtf16CodeUnitCountAdjustment= 0 ;
531437
int TempScalarCountAdjustment = 0;
@@ -678,7 +584,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
678584
//
679585
if (!Avx2.TestZ(prevIncomplete, prevIncomplete))
680586
{
681-
// TODO : this path is not explicitly tested, write tests
587+
// Note/todo : this path is not yet explicitly tested
682588
int totalbyteasciierror = processedLength - start_point;
683589
var (utfadjustasciierror, scalaradjustasciierror) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror);
684590

@@ -713,49 +619,13 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
713619

714620
if (!Avx2.TestZ(error, error))
715621
{
716-
717-
718622
int off = processedLength > 32 ? processedLength - 32 : processedLength;// this does not backup ff processedlength = 32
719-
720-
721623
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
722-
bool TooLongErroronEdge = false;
723-
724624
utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment;
725625
scalarCountAdjustment = TailScalarCodeUnitCountAdjustment;
726626

727-
728-
729-
// We need to take care of eg
730-
// 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
731-
// 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011
732-
// 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge
733-
// 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100
734-
// In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
735-
// Normally , if there is an error, this does not cause an issue: most erronous utf-8 unit will not be counted
736-
// but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10-----
737-
// the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too much
738-
// If this error arrive at the edge of 2 simd vector, that is where problem abound
739-
740-
// Calculate the offset of the invalid byte pointer from the start of the input buffer
741-
ulong offsetFromStart = (ulong)(invalidBytePointer - pInputBuffer);
742-
743-
// Debugging output
744-
745-
bool isContinuationByte = (invalidBytePointer[0] & 0xC0) == 0x80;
746-
747-
bool isOneByteAfterProcessedLength = (invalidBytePointer == pInputBuffer + processedLength);
748-
749-
750-
if (isContinuationByte && isOneByteAfterProcessedLength)
751-
{
752-
753-
// TooLongErroronEdge = true;
754-
}
755-
756-
757627
int totalbyteasciierror = processedLength - start_point;
758-
var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes,TooLongErroronEdge);
628+
var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes);
759629

760630
utf16CodeUnitCountAdjustment += utfadjustasciierror;
761631
scalarCountAdjustment += scalaradjustasciierror;
@@ -769,13 +639,17 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
769639
{
770640
// We have an unterminated sequence.
771641
var (totalbyteadjustment, i,tempascii, tempcont, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32);
772-
773642
processedLength -= i;
774643
n4 += tempn4;
775644
contbytes +=tempcont;
776-
777645
}
778646

647+
// (Nick Nuon) The counts for continuation bytes can probably be optimized:
648+
// The draft had something like this line:
649+
// contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc));
650+
// this actually counts the number of 2 consecutive continuation bytes
651+
// As a slow but temporary fix, I used something that is guaranteed to work regardless:
652+
779653
Vector256<byte> top2bits = Vector256.Create((byte)0b11000000); // Mask to isolate the two most significant bits
780654
Vector256<byte> contbytemask = Vector256.Create((byte)0b10000000); // The expected pattern for continuation bytes: 10xxxxxx
781655

@@ -797,10 +671,6 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
797671
asciibytes += (int)(32 - Popcnt.PopCount((uint)mask));
798672
}
799673

800-
// There are 2 possible scenarios here : either
801-
// A) it arrives flush en the border. eg it doesnt need to be processed further
802-
// B) There is some bytes remaining in which case we need to call the scalar functien
803-
// Either way we need to calculate n2,n3 and update the utf16adjust and scalar adjust
804674
int totalbyte = processedLength - start_point;
805675
var (utf16adjust, scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4, contbytes, totalbyte);
806676

0 commit comments

Comments
 (0)