Skip to content

Commit 051e55b

Browse files
committed
Count no error test working
1 parent fe73718 commit 051e55b

File tree

1 file changed

+68
-48
lines changed

1 file changed

+68
-48
lines changed

src/UTF8.cs

Lines changed: 68 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -70,43 +70,6 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
7070
}
7171
}
7272

73-
74-
75-
// public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment, int skippedBytes = 0)
76-
// {
77-
// utf16CodeUnitCountAdjustment = 0;
78-
// scalarCountAdjustment = 0;
79-
80-
// // Call the original function first. Assuming GetPointerToFirstInvalidByteOriginal exists and does the primary checking.
81-
// byte* result = GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
82-
83-
// // If the adjustments are still 0 and there are skipped bytes to consider,
84-
// // loop through the skipped bytes and adjust the counts as needed.
85-
// if (utf16CodeUnitCountAdjustment == 0 && scalarCountAdjustment == 0 && skippedBytes > 0)
86-
// {
87-
// for (int i = 0; i < skippedBytes; i++)
88-
// {
89-
// byte currentByte = *(pInputBuffer + i);
90-
// if (currentByte >= 0xC0 && currentByte < 0xE0)
91-
// {
92-
// // 2-byte sequence
93-
// utf16CodeUnitCountAdjustment -= 1; // Adjust according to your logic
94-
// scalarCountAdjustment -= 1;
95-
// }
96-
// else if ((currentByte >= 0xE0 && currentByte < 0xF0) || (currentByte >= 0xF0))
97-
// {
98-
// // 3-byte or 4-byte sequence
99-
// utf16CodeUnitCountAdjustment -= 1; // This might need to be adjusted based on your specific logic for 3-byte and 4-byte sequences
100-
// scalarCountAdjustment -= 1;
101-
// }
102-
// // Adjust for other conditions as necessary
103-
// }
104-
// }
105-
106-
// return result; // Return the pointer from the original check
107-
// }
108-
109-
11073
public unsafe static byte* GetPointerToFirstInvalidByteScalar(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
11174
{
11275

@@ -400,6 +363,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
400363

401364
public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength,out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
402365
{
366+
Console.WriteLine("--------------------------Calling function----------------------------------");
403367
int processedLength = 0;
404368
int TempUtf16CodeUnitCountAdjustment= 0 ;
405369
int TempScalarCountAdjustment = 0;
@@ -551,10 +515,42 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
551515
if (!Avx2.TestZ(prevIncomplete, prevIncomplete))
552516
{
553517

518+
// TODO/think about: this path is not explicitly tested
519+
Console.WriteLine("----Checkpoint 1:All ASCII need rewind");
554520
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
555521
scalarCountAdjustment = TempScalarCountAdjustment;
556522

557-
int off = processedLength >= 3 ? processedLength - 3 : processedLength;
523+
// int off = processedLength >= 3 ? processedLength - 3 : processedLength;
524+
int off = 0;
525+
526+
if (processedLength >= 32 + 3){
527+
off = processedLength -32 - 3;
528+
int overlapCount =3;
529+
530+
for(int k = 0; k < overlapCount; k++)
531+
{
532+
533+
int candidateByte = pInputBuffer[processedLength + k];
534+
if ((candidateByte & 0b11000000) == 0b11000000)
535+
{
536+
if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
537+
{
538+
TempUtf16CodeUnitCountAdjustment += 1;
539+
}
540+
if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
541+
{
542+
TempUtf16CodeUnitCountAdjustment += 2;
543+
}
544+
if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
545+
{
546+
TempUtf16CodeUnitCountAdjustment += 2;
547+
TempScalarCountAdjustment += 1;
548+
}
549+
}
550+
}
551+
}
552+
else{ off = processedLength;}
553+
558554
return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
559555
}
560556
prevIncomplete = Vector256<byte>.Zero;
@@ -626,7 +622,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
626622
Vector256<byte> error = Avx2.Xor(must23As80, sc);
627623
if (!Avx2.TestZ(error, error))
628624
{
629-
625+
// TODO: add error handling for Code count
630626
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
631627
scalarCountAdjustment = TempScalarCountAdjustment;
632628

@@ -639,13 +635,15 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
639635

640636
if (!Avx2.TestZ(prevIncomplete, prevIncomplete))
641637
{
638+
639+
Console.WriteLine("----Checkpoint 2:SIMD rewind");
642640
// We have an unterminated sequence.
643641
processedLength -= 3;
644642
for(int k = 0; k < 3; k++)
645643
{
646644

647645
int candidateByte = pInputBuffer[processedLength + k];
648-
if ((pInputBuffer[processedLength + k] & 0b11000000) == 0b11000000)
646+
if ((candidateByte & 0b11000000) == 0b11000000)
649647
{
650648
if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
651649
{
@@ -660,15 +658,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
660658
TempUtf16CodeUnitCountAdjustment += 2;
661659
TempScalarCountAdjustment += 1;
662660
}
663-
664-
665-
processedLength += k;
666-
// break;
667-
668661
}
669-
670-
671-
672662
}
673663
}
674664
}
@@ -678,10 +668,40 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
678668
// Process the remaining bytes with the scalar function
679669
if (processedLength < inputLength)
680670
{
671+
672+
Console.WriteLine("----Process remaining Scalar");
673+
int overlapCount = 0;
674+
681675
// We may need to backtrack to the start of the last code point
682676
while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
683677
{
684678
processedLength -= 1;
679+
overlapCount +=1;
680+
}
681+
682+
for(int k = 0; k < overlapCount; k++)
683+
{
684+
685+
int candidateByte = pInputBuffer[processedLength + k];
686+
if ((candidateByte & 0b11000000) == 0b11000000)
687+
{
688+
if ((candidateByte & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
689+
{
690+
TempUtf16CodeUnitCountAdjustment += 1;
691+
}
692+
if ((candidateByte & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
693+
{
694+
TempUtf16CodeUnitCountAdjustment += 2;
695+
}
696+
if ((candidateByte & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
697+
{
698+
TempUtf16CodeUnitCountAdjustment += 2;
699+
TempScalarCountAdjustment += 1;
700+
}
701+
702+
// processedLength += k;
703+
break;
704+
}
685705
}
686706

687707
byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment);

0 commit comments

Comments
 (0)