Skip to content

Commit 1cee9d6

Browse files
committed
save game + longerror fix attempt + error around rewind length (I think)
1 parent f6e40c8 commit 1cee9d6

File tree

2 files changed

+168
-43
lines changed

2 files changed

+168
-43
lines changed

src/UTF8.cs

Lines changed: 142 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ namespace SimdUnicode
1010
public static class UTF8
1111
{
1212

13-
// helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index
13+
// debug helper: prints a green byte every 32 bytes and a red byte at a given index
1414
static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
1515
{
1616
int chunkSize = 16; // 128 bits = 16 bytes
@@ -78,19 +78,20 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
7878

7979
public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment)
8080
{
81-
82-
int TempUtf16CodeUnitCountAdjustment = 0;
83-
int TempScalarCountAdjustment = 0;
84-
81+
// Console.WriteLine("CALLING REWIND");
8582
int extraLen = 0;
8683
bool foundLeadingBytes = false;
8784

8885
for (int i = 0; i <= howFarBack; i++)
8986
{
9087
byte candidateByte = buf[0 - i];
9188
foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000;
89+
Console.WriteLine($"Rewinding byte to offset {-i}: {candidateByte:X2}");
90+
Console.WriteLine(foundLeadingBytes);
91+
9292
if (foundLeadingBytes)
93-
{
93+
{
94+
Console.WriteLine("Found leading byte");
9495
buf -= i;
9596
break;
9697
}
@@ -101,13 +102,12 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
101102
return buf - howFarBack;
102103
}
103104

104-
utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment;
105-
scalarCountAdjustment += TempScalarCountAdjustment;
106-
107105
int TailUtf16CodeUnitCountAdjustment = 0;
108106
int TailScalarCountAdjustment = 0;
109107

110108
byte* invalidBytePointer = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment);
109+
// Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCountAdjustment}");
110+
111111

112112
utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment;
113113
scalarCountAdjustment += TailScalarCountAdjustment;
@@ -219,7 +219,7 @@ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
219219
}
220220
else
221221
{
222-
// we may have a continuation
222+
// we may have a continuation or a too-long error
223223
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
224224
scalarCountAdjustment = TempScalarCountAdjustment;
225225
return pInputBuffer + pos;
@@ -257,12 +257,11 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
257257
{
258258
if ((pInputBuffer[-i] & 0b11000000) != 0b10000000)
259259
{
260-
string binaryString = Convert.ToString(pInputBuffer[-i], 2).PadLeft(8, '0');
261-
// Console.WriteLine($"Stopping at byte {binaryString}"); //debug
260+
string binaryString = Convert.ToString(pInputBuffer[-i], 2).PadLeft(8, '0');//debug
261+
Console.WriteLine($"Stopping at byte {binaryString}"); //debug
262262
break;
263263
}
264264
contbyteadjust -= 1;
265-
266265
}
267266
if ((pInputBuffer[-i] & 0b10000000) == 0) {
268267
return (0,i,-1,contbyteadjust,0); // We must have that i == 1
@@ -279,19 +278,41 @@ public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,in
279278

280279
public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte)
281280
{
282-
// Console.WriteLine("---------"); //debug
283-
// Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug
281+
Console.WriteLine("---------"); //debug
282+
Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's input debug. This is ascii count:" + asciibytes + " n4: " + n4 + " contbytes:" + contbytes + " totalbytes:" + totalbyte);//debug
284283
int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte;
285284
int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte;
286285
int utfadjust = -2 * n4 - 2 * n3 - n2;
287286
int scalaradjust = -n4;
288287

289-
// Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug
288+
Console.WriteLine("CalculateN2N3FinalSIMDAdjustments's output debug. This is n3 count:" + n3 + " n2: " + n2 + " utfadjust:" + utfadjust + " scalaradjust:" + scalaradjust);//debug
290289

291290
return (utfadjust, scalaradjust);
292291
}
293292

294-
public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes)
293+
// public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) //todo: add an extra bool parameter 'TooLongErroronEdge' which defaults to false
294+
// {
295+
// // Calculate the total bytes from start_point to processedLength
296+
// int totalbyte = processedLength - start_point;
297+
// int adjusttotalbyte = 0, backedupByHowMuch = 0, adjustascii = 0, adjustcont = 0, adjustn4 = 0;
298+
299+
// // Adjust the length to include a complete character, if necessary
300+
// if (totalbyte > 0)
301+
// {
302+
// (adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength);
303+
// }
304+
305+
// // Pseudocode:
306+
// // if 'TooLongErroronEdge' bool is true then
307+
// // then substract (remove) adjustascii, adjustcont, adjustn4 from their respective counterpart in the following function:
308+
309+
// var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4 , contbytes , totalbyte + adjusttotalbyte);
310+
311+
312+
// return (utfadjust, scalaradjust);
313+
// }
314+
315+
public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes, bool TooLongErroronEdge = false)
295316
{
296317
// Calculate the total bytes from start_point to processedLength
297318
int totalbyte = processedLength - start_point;
@@ -300,17 +321,25 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
300321
// Adjust the length to include a complete character, if necessary
301322
if (totalbyte > 0)
302323
{
303-
(adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength);
324+
(adjusttotalbyte, backedupByHowMuch, adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength);
304325
}
305326

306-
// var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes + adjustascii, n4 + adjustn4, contbytes + adjustcont, totalbyte + adjusttotalbyte);
307-
var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes, n4 , contbytes , totalbyte + adjusttotalbyte);
327+
// Adjust the counters if 'TooLongErroronEdge' is true
328+
if (TooLongErroronEdge)
329+
{
330+
// If you can figure out why this makes a difference, you're golden
331+
asciibytes += adjustascii;
332+
contbytes += adjustcont;
333+
n4 += adjustn4;
334+
}
308335

336+
var (utfadjust, scalaradjust) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyte + adjusttotalbyte);
309337

310338
return (utfadjust, scalaradjust);
311339
}
312340

313341

342+
314343
public unsafe static byte* GetPointerToFirstInvalidByteSse(byte* pInputBuffer, int inputLength)
315344
{
316345

@@ -465,9 +494,9 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
465494

466495
public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
467496
{
468-
// Console.ForegroundColor = ConsoleColor.Blue; //debug
469-
// Console.WriteLine("-------------------------------------");//debug
470-
// Console.ResetColor();//debug
497+
Console.ForegroundColor = ConsoleColor.Blue; //debug
498+
Console.WriteLine("-------------------------------------");//debug
499+
Console.ResetColor();//debug
471500

472501
int processedLength = 0;
473502
int TempUtf16CodeUnitCountAdjustment= 0 ;
@@ -659,23 +688,100 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
659688
Vector256<byte> must23 = Avx2.Or(isThirdByte, isFourthByte);
660689
Vector256<byte> must23As80 = Avx2.And(must23, v80);
661690
Vector256<byte> error = Avx2.Xor(must23As80, sc);
662-
if (!Avx2.TestZ(error, error))
663-
{
664-
// Console.WriteLine($"--Error! @ {processedLength} bytes");//debug
665-
int totalbyteasciierror = processedLength - start_point;
666-
var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes);
691+
// if (!Avx2.TestZ(error, error))
692+
// {
693+
// Console.WriteLine($"--Error! @ {processedLength} bytes");//debug
667694

668-
utf16CodeUnitCountAdjustment = utfadjustasciierror;
669-
scalarCountAdjustment = scalaradjustasciierror;
695+
// int off = processedLength >= 32 ? processedLength - 32 : processedLength;
696+
// byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
697+
698+
// utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment;
699+
// scalarCountAdjustment = TailScalarCodeUnitCountAdjustment;
700+
701+
// // We need to take care of eg
702+
// // 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
703+
// // 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011
704+
// // 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge
705+
// // 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100
706+
// // In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
707+
// // Normally , if there is an error, this does not cause an issue: most erronous utf-8 unit will not be counted
708+
// // but it is in the case of too long as if you take for example (1111---- 10----- 10----- 10-----) 10-----
709+
// // the part between parentheses will be counted as valid and thus scalaradjust will be incremented once too much
710+
// // If this error arrive at the edge of 2 simd vector, that is where problem abound: the rewind scalar function will backup
711+
712+
// // so in short , we want to solve this error while at the same time not disturbing anything else
713+
// // we know that there is a continuation on the edge eg at the 64 byte, we need te check that
714+
// // *TODO:Fill code here *
715+
// // Peudocode for now
716+
// // if invalidbyte is of typo 10XX XXXX & invalidbyto pointer % 32 byte == 0 then
717+
// // pass on true to the
718+
719+
720+
// int totalbyteasciierror = processedLength - start_point;
721+
// var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes);
722+
723+
// utf16CodeUnitCountAdjustment += utfadjustasciierror;
724+
// scalarCountAdjustment += scalaradjustasciierror;
725+
726+
// TailScalarCodeUnitCountAdjustment =0;
727+
// TailUtf16CodeUnitCountAdjustment =0;
670728

671-
TailScalarCodeUnitCountAdjustment =0;
672-
TailUtf16CodeUnitCountAdjustment =0;
729+
730+
731+
// return invalidBytePointer;
732+
// }
733+
734+
if (!Avx2.TestZ(error, error))
735+
{
736+
Console.WriteLine($"--Error! @ {processedLength} bytes");//debug
673737

674738
int off = processedLength >= 32 ? processedLength - 32 : processedLength;
675739
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment,ref TailScalarCodeUnitCountAdjustment);
740+
bool TooLongErroronEdge = false;
741+
742+
utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment;
743+
scalarCountAdjustment = TailScalarCodeUnitCountAdjustment;
744+
745+
Console.WriteLine($"RewindScalarValidation's function utf16adjust:{TailUtf16CodeUnitCountAdjustment}, scalaradjust:{TailScalarCodeUnitCountAdjustment}");
746+
747+
// We need to take care of cases such as:
748+
// 11011110 10101101 11110000 10101101 10101111 10011111 11010111 10101000 11001101 10111001 11010100 10000111 11101111 10010000 10000000 11110011
749+
// 10110100 10101100 10100111 11100100 10101011 10011111 11101111 10100010 10110010 11011100 10100000 00100010 11110000 10011001 10101011 10000011
750+
// 10000000 10100010 11101110 10010101 10101001 11010100 10100111 11110000 10101001 10011101 10011011 11100100 10101011 10010111 11100110 10011001 <= Too long error @ 32 byte edge
751+
// 10010000 11101111 10111111 10010110 11001010 10000000 11000111 10100010 11110010 10111100 10111011 10010100 11101001 10001011 10000110 11110100
752+
// In this edge case, the 11110000 byte is erroneously double counted: the SIMD procedure counts it once, then it is counted again by the scalar function
753+
// Normally, if there is an error, this does not cause an issue: most erroneous UTF-8 units will not be counted
754+
// but it does in the too-long case: take for example (1111---- 10------ 10------ 10------) 10------
755+
// the part between parentheses will be counted as valid, and thus scalaradjust will be incremented one time too many
756+
// If this error arrives at the edge between two SIMD vectors, that is where problems abound: the rewind scalar function will back up
757+
758+
// So in short, we want to solve this error while at the same time not disturbing anything else
759+
// we know that there is a continuation byte on the edge, e.g. at the 64th byte; we need to check that
760+
// *TODO:Fill code here *
761+
// Pseudocode for now
762+
// if the invalid byte is of type 10XX XXXX and the invalid byte pointer offset % 32 == 0 then
763+
// pass TooLongErroronEdge = true on to calculateErrorPathadjust
764+
765+
// Calculate the offset of the invalid byte pointer from the start of the input buffer
766+
ulong offsetFromStart = (ulong)(invalidBytePointer - pInputBuffer);
767+
768+
// Conditions used to detect a too-long error sitting right on the processed-length edge
769+
bool isContinuationByte = (invalidBytePointer[0] & 0xC0) == 0x80;
770+
bool isOneByteAfterProcessedLength = (invalidBytePointer == pInputBuffer + processedLength);
771+
772+
// if (isContinuationByte && isAtBoundary && isOneByteAfterProcessedLength)// this alone creates false positives
773+
if (isContinuationByte && isOneByteAfterProcessedLength)
774+
{
775+
Console.WriteLine("Triggering TooLongErrorOnEdge adjustment");
776+
TooLongErroronEdge = true;
777+
}
676778

677-
utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment;
678-
scalarCountAdjustment += TailScalarCodeUnitCountAdjustment;
779+
780+
int totalbyteasciierror = processedLength - start_point;
781+
var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes,TooLongErroronEdge);
782+
783+
utf16CodeUnitCountAdjustment += utfadjustasciierror;
784+
scalarCountAdjustment += scalaradjustasciierror;
679785

680786
return invalidBytePointer;
681787
}
@@ -690,7 +796,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
690796
processedLength -= i;
691797
n4 += tempn4;// this is + because the adjustment function returns something negative already
692798
contbytes +=tempcont;
693-
// Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug
799+
Console.WriteLine($"Unterminated! @ {processedLength} Backing up by {i}"); //debug
694800
}
695801

696802

@@ -763,6 +869,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
763869
{
764870
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment;
765871
scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment;
872+
766873
// An invalid byte was found by the scalar function
767874
return invalidBytePointer;
768875
}

0 commit comments

Comments
 (0)