Skip to content

Commit b9ff7c3

Browse files
committed
save game
1 parent 7b77dc1 commit b9ff7c3

File tree

2 files changed

+38
-24
lines changed

2 files changed

+38
-24
lines changed

src/UTF8.cs

Lines changed: 35 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ public static class UTF8
1414

1515
public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len,ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment)
1616
{
17-
// Console.WriteLine("--Rewind Validate with Errors");
18-
// Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0'));
17+
Console.WriteLine("-Rewind Validate with Errors");
18+
Console.WriteLine("current Byte:" + Convert.ToString(buf[0], 2).PadLeft(8, '0'));
1919

2020
int TempUtf16CodeUnitCountAdjustment = 0;
2121
int TempScalarCountAdjustment = 0;
@@ -33,7 +33,7 @@ public static class UTF8
3333
buf -= i;
3434
// extraLen = i; // a measure of how far we've backed up, only useful for debugging
3535
// Console.WriteLine(howFarBack);
36-
// Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
36+
Console.WriteLine("Found leading byte at:" + i + ",Byte:" + Convert.ToString(candidateByte, 2).PadLeft(8, '0'));
3737

3838
// Console.WriteLine("Backed up " + extraLen + 1 + " bytes");
3939
break;
@@ -56,12 +56,6 @@ public static class UTF8
5656
utf16CodeUnitCountAdjustment += TailUtf16CodeUnitCountAdjustment;
5757
scalarCountAdjustment += TailScalarCountAdjustment;
5858

59-
// Console.WriteLine("rewind utf16 Doublecount adjustment(Temp):" + TempUtf16CodeUnitCountAdjustment);
60-
// Console.WriteLine("scalarcount adjstment after rewind:" + TempScalarCountAdjustment);
61-
// Console.WriteLine(" ");
62-
// Console.WriteLine("rewinds utf16 count(done by GetPointerToFirstInvalidByteScalar):" + TailUtf16CodeUnitCountAdjustment);
63-
// Console.WriteLine("scalarcount after rewind(ditto):" + TailScalarCountAdjustment);
64-
6559
return invalidBytePointer;
6660
}
6761

@@ -199,8 +193,7 @@ public static class UTF8
199193
// ... pInputBuffer[returnedvalue - 1] should be continuation bytes.
200194
// Note that this function is unsafe, and it is the caller's responsibility
201195
// to ensure that we can read at least 4 bytes before pInputBuffer.
202-
// (Nick Nuon added 7th may) there is an addenum labeled important in the mock PR however I think we can treat unterminated as
203-
public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,int n2,int n4) adjustmentFactor(byte* pInputBuffer) {
196+
public unsafe static (int totalbyteadjustment,int backedupByHowMuch,int ascii,int contbyte,int n4) adjustmentFactor(byte* pInputBuffer) {
204197
// Find the first non-continuation byte, working backward.
205198
int i = 1;
206199
for (; i <= 4; i++)
@@ -254,15 +247,15 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
254247
{
255248
// Calculate the total bytes from start_point to processedLength
256249
int totalbyte = processedLength - start_point;
257-
int adjusttotalbyte = 0, backedupByHowMuch = 0, adjustascii = 0, adjustn2 = 0, adjustn4 = 0;
250+
int adjusttotalbyte = 0, backedupByHowMuch = 0, adjustascii = 0, adjustcont = 0, adjustn4 = 0;
258251

259252
// Adjust the length to include a complete character, if necessary
260253
if (totalbyte > 0)
261254
{
262-
(adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustn2, adjustn4) = adjustmentFactor(pInputBuffer + processedLength);
255+
(adjusttotalbyte, backedupByHowMuch ,adjustascii, adjustcont, adjustn4) = adjustmentFactor(pInputBuffer + processedLength);
263256
}
264257

265-
var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes + adjustascii, n4 + adjustn4, contbytes + adjustn2, totalbyte + adjusttotalbyte);
258+
var (utfadjust,scalaradjust) = CalculateN2N3FinalSIMDAdjustments( asciibytes + adjustascii, n4 + adjustn4, contbytes + adjustcont, totalbyte + adjusttotalbyte);
266259

267260
// Return the calculated UTF-16 code unit and scalar count adjustments
268261
return (utfadjust, scalaradjust);
@@ -432,6 +425,19 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
432425

433426
int TailScalarCodeUnitCountAdjustment = 0;
434427
int TailUtf16CodeUnitCountAdjustment = 0;
428+
bool lastSIMDisIncomplete = false;
429+
// This is to solve a specific problem, where we have an unterminated SIMD vector followed by a call to the scalar rewind function:
430+
// As an example, say I have this sequence of bytes, where every line represents 16 bytes:
431+
// 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
432+
// 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 11101100 10001001 10011000 11001011 <=== This SIMD vector is unterminated, thus it has to back up
433+
// 10100100 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
434+
// 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
435+
// By default, if there is an unterminated SIMD vector, it assumes that the next vector is SIMD,
436+
// and does not count the backed-up bytes (in this case the "11101100 10001001 10011000")
437+
// However, in case there aren't enough bytes to fill in, a gap is created as (??????)
438+
// A call to the adjustment vector has to be made and this is the value that holds whether this call is made or not.
439+
// It is somewhat questionable to create one extra variable just for that but I felt that I needed to separate what worked and what was tacked on later as clearly as possible
440+
435441

436442
if (pInputBuffer == null || inputLength <= 0)
437443
{
@@ -566,6 +572,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
566572
int contbytes = 0; // number of continuation bytes in the block
567573
int n4 = 0; // number of 4-byte sequences that start in this block
568574
// int totalbyte = 0, n3 = 0, n2 = 0;
575+
569576

570577

571578

@@ -654,10 +661,16 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
654661
// We have an unterminated sequence.
655662
Console.WriteLine("---Unterminated seq--- at " + processedLength + "bytes");
656663

657-
var (totalbyteadjustment, i,tempascii, tempn2, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32);
664+
665+
var (totalbyteadjustment, i,tempascii, tempcont, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32);
666+
667+
Console.WriteLine("this is n4 adjusted by the adjustmentfactor function :" + tempn4 + " contbyte: " + contbytes);
668+
6
658669
processedLength -= i;
659670
n4 += tempn4;
660-
contbytes +=tempn2;
671+
contbytes +=tempcont;
672+
673+
lastSIMDisIncomplete = true;
661674

662675
// // Console.WriteLine("TempUTF16:"+ TempUtf16CodeUnitCountAdjustment);
663676
// // Console.WriteLine("TempScalar:"+ TempScalarCountAdjustment);
@@ -667,13 +680,17 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
667680
// No errors! Updating the variables we keep track of
668681
// We use one instruction (MoveMask) to update contbytes, plus one arithmetic operation.
669682
contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc));
670-
Console.WriteLine("this is contbytes" + contbytes) ;
683+
684+
671685

672686
// We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation.
673687
n4 += (int)Popcnt.PopCount((uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte)));
688+
Console.WriteLine("No error has been detected! Adding contbytes: " + (int)Popcnt.PopCount((uint)Avx2.MoveMask(sc)) + "Adding n4: " + (int)Popcnt.PopCount((uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte))));
689+
Console.WriteLine(" this is the accumulated contbytes" + contbytes + " and n4:" + n4) ; // debug
674690
}
675691
asciibytes += (int)(32 - Popcnt.PopCount((uint)mask));// TODO(Nick Nuon): simplify this expression
676692

693+
677694
}
678695

679696
// important: we just update asciibytes if there was no error.
@@ -690,6 +707,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
690707

691708
utf16CodeUnitCountAdjustment = utf16adjust;
692709
scalarCountAdjustment = scalaradjust;
710+
693711
}
694712

695713

test/UTF8ValidationTests.cs

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,7 @@ namespace tests;
99
using BenchmarkDotNet.Disassemblers;
1010
using Iced.Intel;
1111

12-
// TODO: refine test for unterminated sequeqce happening at SIMD transition
1312
// TODO: The various tests do not formally take into account the scenario where vector is all ASCII
14-
// TODO?: Test if the error is in the first vector?
15-
// TODO:fix NoError,Ingomplete (some of the tests are wrong)
1613

1714
public unsafe class Utf8SIMDValidationTests
1815
{
@@ -258,7 +255,7 @@ public void NoError(Utf8ValidationDelegate utf8ValidationDelegate)
258255
try
259256
{
260257
Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}");
261-
Assert.True(InvalidateUtf8(utf8, outputLength,utf8ValidationDelegate));
258+
Assert.True(InvalidateUtf8(utf8, utf8.Length,utf8ValidationDelegate));
262259
ValidateCount(utf8,utf8ValidationDelegate);
263260
}
264261
catch (Xunit.Sdk.XunitException)
@@ -398,19 +395,18 @@ public void NoErrorIncompleteAt256Vector(Utf8ValidationDelegate utf8ValidationDe
398395
List<byte> secondbyte = generator.Generate(1,secondcodeLength);
399396
singlebytes.AddRange(secondbyte);
400397

401-
int incompleteLocation = 127 - rand.Next(1,firstcodeLength + secondcodeLength);
398+
int incompleteLocation = 127 - rand.Next(1,firstcodeLength + secondcodeLength - 1);
402399
allAscii.InsertRange(incompleteLocation,singlebytes);
403400

404401
var utf8 = allAscii.ToArray();
405-
Console.WriteLine("---------------New trial");
406402
// PrintHexAndBinary(utf8,incompleteLocation);
407403

408404
bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate);
409405
string utf8HexString = BitConverter.ToString(utf8).Replace("-", " ");
410406
try
411407
{
412408
Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}");
413-
Assert.True(InvalidateUtf8(utf8, outputLength,utf8ValidationDelegate));
409+
Assert.True(InvalidateUtf8(utf8, utf8.Length,utf8ValidationDelegate));
414410
ValidateCount(utf8,utf8ValidationDelegate);
415411
}
416412
catch (Xunit.Sdk.XunitException)

0 commit comments

Comments
 (0)