Skip to content

Commit 889cd7e

Browse files
committed
not bad :-)
1 parent 3ff8eb7 commit 889cd7e

File tree

8 files changed

+238
-429
lines changed

8 files changed

+238
-429
lines changed

src/Ascii.cs

Lines changed: 76 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -136,83 +136,111 @@ public static unsafe bool SIMDIsAscii(this ReadOnlySpan<char> s)
136136

137137
/// <summary>
/// Entry point that forwards to the implementation matching the instruction
/// set reported as supported: ARM64 AdvSimd first, then AVX2, then SSE2, and
/// finally the portable scalar routine.
/// </summary>
/// <param name="pBuffer">Pointer to the first byte to examine.</param>
/// <param name="bufferLength">Number of bytes available at <paramref name="pBuffer"/>.</param>
/// <returns>Whatever the selected implementation returns for the same arguments.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bufferLength)
{
    // TODO: Add support for other architectures, e.g. an AVX-512 path guarded by
    // Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported.
    if (AdvSimd.Arm64.IsSupported)
    {
        return GetIndexOfFirstNonAsciiByteArm64(pBuffer, bufferLength);
    }
    else if (Avx2.IsSupported)
    {
        return GetIndexOfFirstNonAsciiByteAvx2(pBuffer, bufferLength);
    }
    else if (Sse2.IsSupported)
    {
        return GetIndexOfFirstNonAsciiByteSse2(pBuffer, bufferLength);
    }
    else
    {
        // No vector ISA available: plain byte loop.
        return GetIndexOfFirstNonAsciiByteScalar(pBuffer, bufferLength);
    }
}
162+
163+
164+
/// <summary>
/// ARM64 AdvSimd implementation: scans 16 bytes at a time and hands the rest
/// of the buffer to the scalar routine as soon as a block contains a byte
/// above 127, or when fewer than 16 bytes remain.
/// </summary>
/// <param name="pBuffer">Pointer to the first byte to examine.</param>
/// <param name="bufferLength">Number of bytes available at <paramref name="pBuffer"/>.</param>
/// <returns>
/// The number of bytes skipped by the vector loop plus the value returned by
/// <c>GetIndexOfFirstNonAsciiByteScalar</c> for the remaining bytes.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe nuint GetIndexOfFirstNonAsciiByteArm64(byte* pBuffer, nuint bufferLength)
{
    byte* start = pBuffer;
    byte* limit = pBuffer + bufferLength;

    while (pBuffer + 16 <= limit)
    {
        Vector128<byte> chunk = AdvSimd.LoadVector128(pBuffer);

        // The horizontal maximum exceeds 127 exactly when some lane has its top bit set.
        if (AdvSimd.Arm64.MaxAcross(chunk).ToScalar() > 127)
        {
            break;
        }

        pBuffer += 16;
    }

    // Either a non-ASCII block was hit or only a short tail is left; in both
    // cases the scalar routine locates the exact position from here on.
    nuint skipped = (nuint)(pBuffer - start);
    return skipped + GetIndexOfFirstNonAsciiByteScalar(pBuffer, (nuint)(limit - pBuffer));
}
187+
/// <summary>
/// SSE2 implementation: scans 16 bytes at a time, using the per-byte sign-bit
/// mask to pinpoint the first byte with its top bit set; any tail shorter
/// than 16 bytes is delegated to the scalar routine.
/// </summary>
/// <param name="pBuffer">Pointer to the first byte to examine.</param>
/// <param name="bufferLength">Number of bytes available at <paramref name="pBuffer"/>.</param>
/// <returns>
/// The offset of the first byte whose top bit is set when one is found by the
/// vector loop; otherwise the number of bytes consumed by the vector loop
/// plus the value returned by <c>GetIndexOfFirstNonAsciiByteScalar</c> for the tail.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe nuint GetIndexOfFirstNonAsciiByteSse2(byte* pBuffer, nuint bufferLength)
{
    byte* start = pBuffer;
    byte* limit = pBuffer + bufferLength;

    while (pBuffer + 16 <= limit)
    {
        // One mask bit per byte, taken from that byte's most significant bit.
        int highBits = Sse2.MoveMask(Sse2.LoadVector128((sbyte*)pBuffer).AsByte());
        if (highBits != 0)
        {
            // The lowest set bit corresponds to the earliest offending byte in the block.
            nuint blockOffset = (nuint)(pBuffer - start);
            return blockOffset + (nuint)BitOperations.TrailingZeroCount(highBits);
        }

        pBuffer += 16;
    }

    // Fewer than 16 bytes remain: finish with the scalar routine and add the
    // number of bytes already consumed by the vector loop.
    nuint vectorized = (nuint)(pBuffer - start);
    return vectorized + GetIndexOfFirstNonAsciiByteScalar(pBuffer, (nuint)(limit - pBuffer));
}
184210

211+
185212
/// <summary>
/// AVX2 implementation: scans 32 bytes at a time, using the per-byte sign-bit
/// mask to pinpoint the first byte with its top bit set; any tail shorter
/// than 32 bytes is delegated to the scalar routine.
/// </summary>
/// <param name="pBuffer">Pointer to the first byte to examine.</param>
/// <param name="bufferLength">Number of bytes available at <paramref name="pBuffer"/>.</param>
/// <returns>
/// The offset of the first byte whose top bit is set when one is found by the
/// vector loop; otherwise the number of bytes consumed by the vector loop
/// plus the value returned by <c>GetIndexOfFirstNonAsciiByteScalar</c> for the tail.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe nuint GetIndexOfFirstNonAsciiByteAvx2(byte* pBuffer, nuint bufferLength)
{
    byte* start = pBuffer;
    byte* limit = pBuffer + bufferLength;

    while (pBuffer + 32 <= limit)
    {
        // One mask bit per byte, taken from that byte's most significant bit.
        int highBits = Avx2.MoveMask(Avx.LoadVector256((sbyte*)pBuffer).AsByte());
        if (highBits != 0)
        {
            // The lowest set bit corresponds to the earliest offending byte in the block.
            nuint blockOffset = (nuint)(pBuffer - start);
            return blockOffset + (nuint)BitOperations.TrailingZeroCount(highBits);
        }

        pBuffer += 32;
    }

    // Fewer than 32 bytes remain: finish with the scalar routine and add the
    // number of bytes already consumed by the vector loop.
    nuint vectorized = (nuint)(pBuffer - start);
    return vectorized + GetIndexOfFirstNonAsciiByteScalar(pBuffer, (nuint)(limit - pBuffer));
}
237+
238+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
239+
public static unsafe nuint GetIndexOfFirstNonAsciiByteScalar(byte* pBuffer, nuint bufferLength)
240+
{
241+
byte* pCurrent = pBuffer;
242+
byte* pBufferEnd = pBuffer + bufferLength;
243+
216244
// Process the tail byte-by-byte
217245
for (; pCurrent < pBufferEnd; pCurrent++)
218246
{

src/UTF16.cs

Lines changed: 0 additions & 9 deletions
This file was deleted.

src/UTF8.cs

Lines changed: 162 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
using System;
22
using System.Runtime.Intrinsics;
33
using System.Runtime.Intrinsics.X86;
4+
using System.Runtime.Intrinsics.Arm;
45
using System.Runtime.CompilerServices;
56

67
namespace SimdUnicode
@@ -253,6 +254,8 @@ public static class UTF8
253254
Vector256<byte> shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21);
254255
prevInputBlock = currentBlock;
255256
Vector256<byte> prev1 = Avx2.AlignRight(prevInputBlock, shuffled, (byte)(16 - 1));
257+
// Vector256.Shuffle vs Avx2.Shuffle
258+
// https://github.com/dotnet/runtime/blob/1400c1e7a888ea1e710e5c08d55c800e0b04bf8a/docs/coding-guidelines/vectorization-guidelines.md#vector256shuffle-vs-avx2shuffle
256259
Vector256<byte> byte_1_high = Avx2.Shuffle(shuf1, Avx2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
257260
Vector256<byte> byte_1_low = Avx2.Shuffle(shuf2, (prev1 & v0f));
258261
Vector256<byte> byte_2_high = Avx2.Shuffle(shuf3, Avx2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
@@ -304,16 +307,171 @@ public static class UTF8
304307
return pInputBuffer + inputLength;
305308
}
306309

310+
/// <summary>
/// ARM64 AdvSimd UTF-8 validator. Inputs longer than 128 bytes are scanned
/// with 128-bit vectors (a 64-byte-at-a-time ASCII prefix skip followed by a
/// 16-byte-at-a-time nibble-lookup validation); everything else, including
/// the tail left over by the vector loop, goes through the scalar validator.
/// </summary>
/// <param name="pInputBuffer">Pointer to the first byte of the input.</param>
/// <param name="inputLength">Number of bytes available at <paramref name="pInputBuffer"/>.</param>
/// <returns>
/// <paramref name="pInputBuffer"/> itself when it is null or
/// <paramref name="inputLength"/> is not positive; otherwise the pointer
/// produced by <c>RewindAndValidateWithErrors</c> or
/// <c>GetPointerToFirstInvalidByteScalar</c> when one of them is consulted,
/// or <c>pInputBuffer + inputLength</c> when the vector loop consumed the
/// whole input without flagging anything.
/// </returns>
public unsafe static byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength)
{
    int processedLength = 0;

    // True when the last block seen by the vector loop ended in the middle of
    // a multi-byte sequence; the tail handling below must then re-examine it.
    bool endsIncomplete = false;

    if (pInputBuffer == null || inputLength <= 0)
    {
        return pInputBuffer;
    }
    if (inputLength > 128)
    {
        // We skip any ASCII characters at the start of the buffer, 64 bytes at a time.
        int asciirun = 0;
        for (; asciirun + 64 <= inputLength; asciirun += 64)
        {
            Vector128<byte> block1 = AdvSimd.LoadVector128(pInputBuffer + asciirun);
            Vector128<byte> block2 = AdvSimd.LoadVector128(pInputBuffer + asciirun + 16);
            Vector128<byte> block3 = AdvSimd.LoadVector128(pInputBuffer + asciirun + 32);
            Vector128<byte> block4 = AdvSimd.LoadVector128(pInputBuffer + asciirun + 48);
            Vector128<byte> or = AdvSimd.Or(AdvSimd.Or(block1, block2), AdvSimd.Or(block3, block4));

            // A maximum above 127 means at least one of the 64 bytes has its top bit set.
            if (AdvSimd.Arm64.MaxAcross(or).ToScalar() > 127)
            {
                break;
            }
        }
        processedLength = asciirun;

        if (processedLength + 32 < inputLength)
        {
            // We still have work to do!
            Vector128<byte> prevInputBlock = Vector128<byte>.Zero;

            // Per-lane thresholds: only the last three lanes can produce a non-zero
            // saturating difference, i.e. a byte >= 0xF0 / 0xE0 / 0xC0 at distance
            // 3 / 2 / 1 from the end of the block (a sequence that runs past the block).
            Vector128<byte> maxValue = Vector128.Create(
                    255, 255, 255, 255, 255, 255, 255, 255,
                    255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
            Vector128<byte> prevIncomplete = AdvSimd.SubtractSaturate(prevInputBlock, maxValue);

            // Lookup table indexed by the high nibble of the previous byte.
            Vector128<byte> shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
                    TOO_SHORT | OVERLONG_2,
                    TOO_SHORT,
                    TOO_SHORT | OVERLONG_3 | SURROGATE,
                    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);

            // Lookup table indexed by the low nibble of the previous byte.
            Vector128<byte> shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
                    CARRY | OVERLONG_2,
                    CARRY,
                    CARRY,
                    CARRY | TOO_LARGE,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000);

            // Lookup table indexed by the high nibble of the current byte.
            Vector128<byte> shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
                    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);

            Vector128<byte> thirdByte = Vector128.Create((byte)(0b11100000u - 0x80));
            Vector128<byte> fourthByte = Vector128.Create((byte)(0b11110000u - 0x80));
            Vector128<byte> v0f = Vector128.Create((byte)0x0F);
            Vector128<byte> v80 = Vector128.Create((byte)0x80);

            // Performance note: we could process 64 bytes at a time for better speed in some cases.
            for (; processedLength + 16 <= inputLength; processedLength += 16)
            {
                Vector128<byte> currentBlock = AdvSimd.LoadVector128(pInputBuffer + processedLength);

                // FIX: the test was inverted ("> 127"), which sent non-ASCII blocks
                // down the "nothing to validate" path. A block is pure ASCII when
                // its largest byte is at most 127.
                if (AdvSimd.Arm64.MaxAcross(currentBlock).ToScalar() <= 127)
                {
                    // We have an ASCII block, no need to process it, but
                    // we need to check if the previous block was incomplete.
                    if (AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0)
                    {
                        return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
                    }
                    prevIncomplete = Vector128<byte>.Zero;
                }
                else
                {
                    // Contains non-ASCII characters, we need to do non-trivial processing.
                    // prevN holds, for each lane, the byte N positions earlier in the stream
                    // (borrowing the trailing bytes of the previous block).
                    Vector128<byte> prev1 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 1));
                    Vector128<byte> byte_1_high = Vector128.Shuffle(shuf1, AdvSimd.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
                    Vector128<byte> byte_1_low = Vector128.Shuffle(shuf2, (prev1 & v0f));
                    Vector128<byte> byte_2_high = Vector128.Shuffle(shuf3, AdvSimd.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);

                    // An error class survives only if all three lookups agree on it.
                    Vector128<byte> sc = AdvSimd.And(AdvSimd.And(byte_1_high, byte_1_low), byte_2_high);
                    Vector128<byte> prev2 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 2));
                    Vector128<byte> prev3 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 3));
                    prevInputBlock = currentBlock;

                    // Top bit set where the byte two back is >= 0xE0, or three back is >= 0xF0,
                    // i.e. where the current byte is required to be a continuation byte.
                    Vector128<byte> isThirdByte = AdvSimd.SubtractSaturate(prev2, thirdByte);
                    Vector128<byte> isFourthByte = AdvSimd.SubtractSaturate(prev3, fourthByte);
                    Vector128<byte> must23 = AdvSimd.Or(isThirdByte, isFourthByte);
                    Vector128<byte> must23As80 = AdvSimd.And(must23, v80);

                    // NOTE(review): the XOR presumably cancels the TWO_CONTS bit where two
                    // consecutive continuation bytes are legitimate - confirm that TWO_CONTS is 0x80.
                    Vector128<byte> error = AdvSimd.Xor(must23As80, sc);
                    if (AdvSimd.Arm64.MaxAcross(error).ToScalar() != 0)
                    {
                        return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
                    }
                    prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue);
                }
            }

            // FIX: remember whether the vector-processed region ends inside a multi-byte
            // sequence. Previously this state was dropped, so a truncated sequence at the
            // very end of the input (length a multiple of 16), or one followed by a
            // non-continuation byte in the tail, was never reported.
            endsIncomplete = AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0;
        }
    }

    // We have processed all the blocks using SIMD, we need to process the remaining bytes
    // with the scalar function (and re-check a dangling sequence, if any).
    if (processedLength < inputLength || endsIncomplete)
    {
        if (endsIncomplete)
        {
            // At least one 16-byte block was consumed, so stepping back stays in bounds.
            // This lands on the last vector-processed byte, which belongs to the
            // unfinished sequence; the loop below then walks back to its lead byte.
            processedLength -= 1;
        }

        // We need to possibly backtrack to the start of the last code point
        // worst possible case is 4 bytes, where we need to backtrack 3 bytes
        // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
        // (as a signed byte, a value <= -65 is exactly the 0x80..0xBF continuation range).
        for (int step = 0; step < 3; step++)
        {
            if (processedLength <= 0 || (sbyte)pInputBuffer[processedLength] > -65)
            {
                break;
            }
            processedLength -= 1;
        }

        // The scalar result is returned as-is: it is either the first invalid byte
        // or, when the remainder is valid, equal to pInputBuffer + inputLength.
        return SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength);
    }

    return pInputBuffer + inputLength;
}
307454
/// <summary>
/// Entry point that forwards to the UTF-8 validator matching the instruction
/// set reported as supported: ARM64 AdvSimd first, then AVX2, otherwise the
/// scalar implementation.
/// </summary>
/// <param name="pInputBuffer">Pointer to the first byte of the input.</param>
/// <param name="inputLength">Number of bytes available at <paramref name="pInputBuffer"/>.</param>
/// <returns>Whatever the selected implementation returns for the same arguments.</returns>
public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength)
{
    // TODO add support for other ISAs, e.g. an AVX-512 path
    // (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported) and an SSE2 path.
    if (AdvSimd.Arm64.IsSupported)
    {
        return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength);
    }
    else if (Avx2.IsSupported)
    {
        return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength);
    }
    else
    {
        return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength);
    }
}
318476

319477
}

0 commit comments

Comments
 (0)