Skip to content

Commit c61c83b

Browse files
committed
fixing count error attempt
1 parent 4919672 commit c61c83b

File tree

2 files changed

+108
-25
lines changed

2 files changed

+108
-25
lines changed

src/UTF8.cs

Lines changed: 51 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ public static class UTF8
1515
int TempUtf16CodeUnitCountAdjustment = 0;
1616
int TempScalarCountAdjustment = 0;
1717

18+
int TailUtf16CodeUnitCountAdjustment = 0;
19+
int TailScalarCountAdjustment = 0;
20+
1821
int howFarBack = priorBytes;
1922
int extraLen = 0;
2023
bool foundLeadingBytes = false;
@@ -24,27 +27,47 @@ public static class UTF8
2427
foundLeadingBytes = (b & 0b11000000) != 0b10000000;
2528
if (foundLeadingBytes)
2629
{
30+
31+
32+
if ((b & 0b11100000) == 0b11000000) // Start of a 2-byte sequence
33+
{
34+
TempUtf16CodeUnitCountAdjustment += 1;
35+
}
36+
if ((b & 0b11110000) == 0b11100000) // Start of a 3-byte sequence
37+
{
38+
TempUtf16CodeUnitCountAdjustment += 2;
39+
}
40+
if ((b & 0b11111000) == 0b11110000) // Start of a 4-byte sequence
41+
{
42+
TempUtf16CodeUnitCountAdjustment += 2;
43+
TempScalarCountAdjustment += 1;
44+
}
45+
46+
2747
buf -= i;
2848
extraLen = i;
2949
break;
3050
}
3151
}
52+
53+
utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment;
54+
scalarCountAdjustment += TempScalarCountAdjustment;
55+
56+
3257
if (!foundLeadingBytes)
3358
{
34-
utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment;
35-
scalarCountAdjustment += TempScalarCountAdjustment;
3659
return buf - howFarBack;
3760
}
3861

39-
// TODO : fix Count handling here
62+
4063

4164

4265
// Now buf points to the start of a UTF-8 sequence or the start of the buffer.
4366
// Validate from this new start point with the adjusted length.
44-
byte* invalidByte = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TempUtf16CodeUnitCountAdjustment, out TempScalarCountAdjustment);
67+
byte* invalidByte = GetPointerToFirstInvalidByteScalar(buf, len + extraLen,out TailUtf16CodeUnitCountAdjustment, out TailScalarCountAdjustment);
4568

46-
utf16CodeUnitCountAdjustment += TempUtf16CodeUnitCountAdjustment;
47-
scalarCountAdjustment += TempScalarCountAdjustment;
69+
utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment;
70+
scalarCountAdjustment += TailScalarCountAdjustment;
4871

4972
return invalidByte;
5073
}
@@ -220,6 +243,8 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
220243
{
221244

222245
int processedLength = 0;
246+
int TempUtf16CodeUnitCountAdjustment= 0 ;
247+
int TempScalarCountAdjustment = 0;
223248

224249
if (pInputBuffer == null || inputLength <= 0)
225250
{
@@ -309,7 +334,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
309334
// return pInputBuffer + processedLength;
310335

311336
// Console.WriteLine("not ascii");
312-
return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
337+
return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref TempUtf16CodeUnitCountAdjustment,ref TempScalarCountAdjustment);
313338
}
314339
prevIncomplete = Vector128<byte>.Zero;
315340
}
@@ -331,7 +356,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
331356
Vector128<byte> error = Sse2.Xor(must23As80, sc);
332357
if (Sse2.MoveMask(error) != 0)
333358
{
334-
return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
359+
return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref TempUtf16CodeUnitCountAdjustment,ref TempScalarCountAdjustment);
335360
}
336361
prevIncomplete = Sse2.SubtractSaturate(currentBlock, maxValue);
337362
}
@@ -527,12 +552,12 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
527552
{
528553

529554
// TODO/think about: this path is not explicitly tested
530-
// Console.WriteLine("----Checkpoint 1:All ASCII need rewind");
555+
Console.WriteLine("----Checkpoint 1:All ASCII need rewind");
531556
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
532557
scalarCountAdjustment = TempScalarCountAdjustment;
533558

534559
// int off = processedLength >= 3 ? processedLength - 3 : processedLength;
535-
int off = 0;
560+
int off = processedLength;
536561

537562
if (processedLength >= 32 + 3){
538563
off = processedLength -32 - 3;
@@ -560,9 +585,10 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
560585
}
561586
}
562587
}
563-
else{ off = processedLength;}
588+
// else{ off = processedLength;}
564589

565-
return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
590+
// return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
591+
return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment);
566592
}
567593
prevIncomplete = Vector256<byte>.Zero;
568594
}
@@ -633,12 +659,14 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
633659
Vector256<byte> error = Avx2.Xor(must23As80, sc);
634660
if (!Avx2.TestZ(error, error))
635661
{
636-
// TODO: add error handling for Code count
662+
Console.WriteLine("-----Error path!!");
637663
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
638664
scalarCountAdjustment = TempScalarCountAdjustment;
639665

640666
int off = processedLength >= 32 ? processedLength - 32 : processedLength;
641-
return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
667+
// return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
668+
return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment);
669+
642670
}
643671
prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue);
644672
}
@@ -647,7 +675,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
647675
if (!Avx2.TestZ(prevIncomplete, prevIncomplete))
648676
{
649677

650-
// Console.WriteLine("----Checkpoint 2:SIMD rewind");
678+
Console.WriteLine("----Checkpoint 2:SIMD rewind");
651679
// We have an unterminated sequence.
652680
processedLength -= 3;
653681
for(int k = 0; k < 3; k++)
@@ -680,7 +708,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
680708
if (processedLength < inputLength)
681709
{
682710

683-
// Console.WriteLine("----Process remaining Scalar");
711+
Console.WriteLine("----Process remaining Scalar");
684712
int overlapCount = 0;
685713

686714
// // We need to possibly backtrack to the start of the last code point
@@ -736,6 +764,11 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
736764
{
737765
int processedLength = 0;
738766

767+
int TempUtf16CodeUnitCountAdjustment= 0 ;
768+
int TempScalarCountAdjustment = 0;
769+
770+
int utf16CodeUnitCountAdjustment=0, scalarCountAdjustment=0;
771+
739772
if (pInputBuffer == null || inputLength <= 0)
740773
{
741774
return pInputBuffer;
@@ -817,7 +850,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
817850
// we need to check if the previous block was incomplete.
818851
if (AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0)
819852
{
820-
return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
853+
return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment);
821854
}
822855
prevIncomplete = Vector128<byte>.Zero;
823856
}
@@ -839,7 +872,7 @@ public unsafe static void AdjustForSkippedBytes(byte* pInputBuffer,// int skippe
839872
Vector128<byte> error = AdvSimd.Xor(must23As80, sc);
840873
if (AdvSimd.Arm64.MaxAcross(error).ToScalar() != 0)
841874
{
842-
return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
875+
return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength,ref utf16CodeUnitCountAdjustment,ref scalarCountAdjustment);
843876
}
844877
prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue);
845878
}

test/UTF8ValidationTests.cs

Lines changed: 57 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -871,13 +871,7 @@ public void TooLargeErrorScalar()
871871

872872
[Fact]
873873
[Trait("Category", "avx")]
874-
public void TooLargeErrorAVX()
875-
{
876-
TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
877-
}
878-
879-
[Fact]
880-
public void TooLargeErrorAvx2()
874+
public void TooLargeErrorAvx()
881875
{
882876
TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
883877
}
@@ -1299,6 +1293,62 @@ public void ValidateCount(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg
12991293
// }
13001294
}
13011295

1296+
[Fact]
1297+
[Trait("Category", "Scalar")]
1298+
public void DotnetUTF16Count()
1299+
{
1300+
int[] outputLengths = { 10, 15, 11,12 ,15,15,1, 3, 5, 8, 10, 12, 15, 18 };
1301+
int DotnetUtf16Adjustment, DotnetScalarCountAdjustment;
1302+
int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment;
1303+
1304+
1305+
foreach (int outputLength in outputLengths)
1306+
{
1307+
// Generate a UTF-8 sequence of outputLength units, presumed to be valid.
1308+
// byte[] utf8 = generator.Generate(howManyUnits: 11, byteCountInUnit: 3).ToArray();
1309+
byte[] utf8 = generator.Generate(howManyUnits: outputLength).ToArray();
1310+
PrintHexAndBinary(utf8);
1311+
var (offset, length) = (0, utf8.Length);
1312+
1313+
unsafe
1314+
{
1315+
fixed (byte* pInput = utf8)
1316+
{
1317+
byte* startPtr = pInput + offset;
1318+
// Invoke the method under test.
1319+
1320+
DotnetUtf16Adjustment= 0;
1321+
DotnetScalarCountAdjustment= 0;
1322+
DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out DotnetUtf16Adjustment, out DotnetScalarCountAdjustment);
1323+
1324+
SimdUnicodeUtf16Adjustment= 0;
1325+
SimdUnicodeScalarCountAdjustment= 0;
1326+
SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment);
1327+
1328+
Console.WriteLine("Lenght:" + utf8.Length);
1329+
1330+
Console.WriteLine("DotnetScalar:" + DotnetScalarCountAdjustment);
1331+
Console.WriteLine("OurScalar:" + SimdUnicodeScalarCountAdjustment);
1332+
1333+
Console.WriteLine("Dotnetutf16:" + DotnetUtf16Adjustment);
1334+
Console.WriteLine("Ourutf16:" + SimdUnicodeUtf16Adjustment);
1335+
Console.WriteLine("___________________________________________________");
1336+
1337+
1338+
Assert.True(DotnetUtf16Adjustment == SimdUnicodeUtf16Adjustment, $"Expected UTF16 Adjustment: {DotnetUtf16Adjustment}, but got: {SimdUnicodeUtf16Adjustment}.");
1339+
Assert.True(DotnetScalarCountAdjustment == SimdUnicodeScalarCountAdjustment, $"Expected Scalar Count Adjustment: {DotnetScalarCountAdjustment}, but got: {SimdUnicodeScalarCountAdjustment}.");
1340+
1341+
1342+
1343+
1344+
// If your generator creates specific patterns or the utility calculates these adjustments differently,
1345+
// you'll need to adjust the expected values accordingly.
1346+
}
1347+
}
1348+
}
1349+
}
1350+
1351+
13021352
}
13031353

13041354

0 commit comments

Comments
 (0)