Skip to content

Commit ce70834

Browse files
author
Daniel Lemire
committed
adding sse
1 parent 90b8a60 commit ce70834

File tree

3 files changed

+186
-26
lines changed

3 files changed

+186
-26
lines changed

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,18 @@ You can print the content of a vector register like so:
8888
v.CopyTo(b);
8989
Console.WriteLine(Convert.ToHexString(b));
9090
}
91+
public static void ToString(Vector128<byte> v)
92+
{
93+
Span<byte> b = stackalloc byte[16];
94+
v.CopyTo(b);
95+
Console.WriteLine(Convert.ToHexString(b));
96+
}
9197
```
9298

99+
## Performance tips
100+
101+
- Be careful: `Vector128.Shuffle` is not the same as `Ssse3.Shuffle`, nor is it the same as `Avx2.Shuffle`. Prefer the hardware-specific intrinsics (`Ssse3.Shuffle`, `Avx2.Shuffle`).
102+
93103
## More reading
94104

95105

benchmark/Benchmark.cs

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,9 @@ private class Config : ManualConfig
6262
{
6363
public Config()
6464
{
65-
AddColumn(new Speed());
66-
67-
65+
AddColumn(new Speed());
66+
67+
6868
if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64)
6969
{
7070
Console.WriteLine("ARM64 system detected.");
@@ -77,27 +77,27 @@ public Config()
7777
{
7878
Console.WriteLine("X64 system detected (Intel, AMD,...) with AVX-512 support.");
7979
AddFilter(new AnyCategoriesFilter(["avx512", "avx", "sse", "scalar", "runtime"]));
80-
}
80+
}
8181
else if (Avx2.IsSupported)
8282
{
8383
Console.WriteLine("X64 system detected (Intel, AMD,...) with AVX2 support.");
84-
AddFilter(new AnyCategoriesFilter(["avx", "sse", "scalar", "runtime"]));
85-
}
86-
else if (Sse42.IsSupported)
84+
AddFilter(new AnyCategoriesFilter(["avx", "sse", "scalar", "runtime"]));
85+
}
86+
else if (Ssse3.IsSupported)
8787
{
8888
Console.WriteLine("X64 system detected (Intel, AMD,...) with Sse4.2 support.");
89-
AddFilter(new AnyCategoriesFilter(["sse", "scalar", "runtime"]));
90-
}
91-
else
89+
AddFilter(new AnyCategoriesFilter(["sse", "scalar", "runtime"]));
90+
}
91+
else
9292
{
9393
Console.WriteLine("X64 system detected (Intel, AMD,...) without relevant SIMD support.");
94-
AddFilter(new AnyCategoriesFilter(["scalar", "runtime"]));
94+
AddFilter(new AnyCategoriesFilter(["scalar", "runtime"]));
9595
}
9696
}
9797
else
9898
{
99-
AddFilter(new AnyCategoriesFilter(["scalar", "runtime"]));
100-
99+
AddFilter(new AnyCategoriesFilter(["scalar", "runtime"]));
100+
101101
}
102102

103103
}
@@ -214,8 +214,6 @@ public unsafe void SIMDUtf8ValidationRealDataAvx2()
214214
RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
215215
}
216216
}
217-
/*
218-
// TODO: enable this benchmark when the SSE implementation is ready
219217
[Benchmark]
220218
[BenchmarkCategory("sse")]
221219
public unsafe void SIMDUtf8ValidationRealDataSse()
@@ -224,7 +222,7 @@ public unsafe void SIMDUtf8ValidationRealDataSse()
224222
{
225223
RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
226224
}
227-
}*/
225+
}
228226
/*
229227
// TODO: enable this benchmark when the AVX-512 implementation is ready
230228
[Benchmark]

src/UTF8.cs

Lines changed: 162 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,160 @@ public static class UTF8
128128
const byte OVERLONG_4 = 1 << 6;
129129
const byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
130130

131+
/// <summary>
/// Validates a UTF-8 buffer 16 bytes at a time using SSE2/SSSE3 intrinsics and
/// hands any remaining tail to the scalar validator.
/// </summary>
/// <param name="pInputBuffer">Pointer to the first byte to validate.</param>
/// <param name="inputLength">Number of bytes available at <paramref name="pInputBuffer"/>.</param>
/// <returns>
/// <paramref name="pInputBuffer"/> itself when the pointer is null or the length is not positive;
/// <c>pInputBuffer + inputLength</c> when no error is detected; otherwise the pointer produced by
/// <c>RewindAndValidateWithErrors</c> or <c>GetPointerToFirstInvalidByteScalar</c>
/// (presumably the first invalid byte; confirm against those helpers).
/// </returns>
/// <remarks>
/// Only SSE2 and SSSE3 instructions are used, so that the routine is safe to call on any CPU
/// where <c>Ssse3.IsSupported</c> is true, even when AVX is unavailable.
/// </remarks>
public unsafe static byte* GetPointerToFirstInvalidByteSse(byte* pInputBuffer, int inputLength)
{
    int processedLength = 0;

    // True when the last block checked by the SIMD loop ends with a lead byte whose
    // continuation bytes would fall after the end of that block.
    bool lastBlockIncomplete = false;

    if (pInputBuffer == null || inputLength <= 0)
    {
        return pInputBuffer;
    }
    if (inputLength > 128)
    {
        // We skip any ASCII characters at the start of the buffer, 64 bytes at a time.
        int asciirun = 0;
        for (; asciirun + 64 <= inputLength; asciirun += 64)
        {
            // Sse2 loads (not Avx loads): the caller only guarantees SSSE3 support.
            Vector128<byte> block1 = Sse2.LoadVector128(pInputBuffer + asciirun);
            Vector128<byte> block2 = Sse2.LoadVector128(pInputBuffer + asciirun + 16);
            Vector128<byte> block3 = Sse2.LoadVector128(pInputBuffer + asciirun + 32);
            Vector128<byte> block4 = Sse2.LoadVector128(pInputBuffer + asciirun + 48);

            // A set most-significant bit in any byte means a non-ASCII byte is present.
            Vector128<byte> or = Sse2.Or(Sse2.Or(block1, block2), Sse2.Or(block3, block4));
            if (Sse2.MoveMask(or) != 0)
            {
                break;
            }
        }
        processedLength = asciirun;

        if (processedLength + 16 < inputLength)
        {
            // We still have work to do!
            Vector128<byte> prevInputBlock = Vector128<byte>.Zero;

            // Saturating subtraction against these thresholds leaves a non-zero byte only when
            // one of the last three bytes of a block is a lead byte that needs more
            // continuation bytes than remain in the block.
            Vector128<byte> maxValue = Vector128.Create(
                    255, 255, 255, 255, 255, 255, 255, 255,
                    255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1);
            Vector128<byte> prevIncomplete = Sse2.SubtractSaturate(prevInputBlock, maxValue);

            // Lookup table indexed by the high nibble of the previous byte.
            Vector128<byte> shuf1 = Vector128.Create(TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                    TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS,
                    TOO_SHORT | OVERLONG_2,
                    TOO_SHORT,
                    TOO_SHORT | OVERLONG_3 | SURROGATE,
                    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4);

            // Lookup table indexed by the low nibble of the previous byte.
            Vector128<byte> shuf2 = Vector128.Create(CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4,
                    CARRY | OVERLONG_2,
                    CARRY,
                    CARRY,
                    CARRY | TOO_LARGE,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000);

            // Lookup table indexed by the high nibble of the current byte.
            Vector128<byte> shuf3 = Vector128.Create(TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | OVERLONG_4,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
                    TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE,
                    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT);

            Vector128<byte> thirdByte = Vector128.Create((byte)(0b11100000u - 0x80));
            Vector128<byte> fourthByte = Vector128.Create((byte)(0b11110000u - 0x80));
            Vector128<byte> v0f = Vector128.Create((byte)0x0F);
            Vector128<byte> v80 = Vector128.Create((byte)0x80);

            for (; processedLength + 16 <= inputLength; processedLength += 16)
            {
                Vector128<byte> currentBlock = Sse2.LoadVector128(pInputBuffer + processedLength);

                int mask = Sse2.MoveMask(currentBlock);
                if (mask == 0)
                {
                    // We have an ASCII block, no need to process it, but
                    // we need to check if the previous block was incomplete.
                    if (Sse2.MoveMask(prevIncomplete) != 0)
                    {
                        return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
                    }
                    prevIncomplete = Vector128<byte>.Zero;
                }
                else
                {
                    // Contains non-ASCII characters, we need to do non-trivial processing.
                    // prevN holds, for every position, the byte located N positions earlier.
                    Vector128<byte> prev1 = Ssse3.AlignRight(currentBlock, prevInputBlock, (byte)(16 - 1));
                    Vector128<byte> byte_1_high = Ssse3.Shuffle(shuf1, Sse2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
                    Vector128<byte> byte_1_low = Ssse3.Shuffle(shuf2, (prev1 & v0f));
                    Vector128<byte> byte_2_high = Ssse3.Shuffle(shuf3, Sse2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
                    Vector128<byte> sc = Sse2.And(Sse2.And(byte_1_high, byte_1_low), byte_2_high);
                    Vector128<byte> prev2 = Ssse3.AlignRight(currentBlock, prevInputBlock, (byte)(16 - 2));
                    Vector128<byte> prev3 = Ssse3.AlignRight(currentBlock, prevInputBlock, (byte)(16 - 3));
                    prevInputBlock = currentBlock;
                    Vector128<byte> isThirdByte = Sse2.SubtractSaturate(prev2, thirdByte);
                    Vector128<byte> isFourthByte = Sse2.SubtractSaturate(prev3, fourthByte);
                    Vector128<byte> must23 = Sse2.Or(isThirdByte, isFourthByte);
                    Vector128<byte> must23As80 = Sse2.And(must23, v80);
                    Vector128<byte> error = Sse2.Xor(must23As80, sc);
                    if (Sse2.MoveMask(error) != 0)
                    {
                        return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
                    }
                    prevIncomplete = Sse2.SubtractSaturate(currentBlock, maxValue);
                }
            }

            // Remember whether the final SIMD block ended in the middle of a sequence: the
            // loop above only detects that situation when it goes on to examine a following block.
            lastBlockIncomplete = Sse2.MoveMask(prevIncomplete) != 0;
        }
    }

    // We have processed all the blocks using SIMD, we need to process the remaining bytes.
    if (lastBlockIncomplete)
    {
        // At least one 16-byte block was processed, so processedLength >= 16 here.
        // Step back over the three bytes that may hold the unfinished lead byte so that the
        // scalar validator re-examines it, even when no bytes remain after the last block.
        processedLength -= 3;
    }

    // Process the remaining bytes with the scalar function
    if (processedLength < inputLength)
    {
        // We need to possibly backtrack to the start of the last code point
        // worst possible case is 4 bytes, where we need to backtrack 3 bytes
        // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
        for (int step = 0; step < 3; step++)
        {
            // (sbyte) value <= -65 identifies a continuation byte (0x80..0xBF).
            if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
            {
                processedLength -= 1;
            }
            else
            {
                break;
            }
        }

        // The scalar routine's result is returned as is: either the position it reports as
        // invalid, or pInputBuffer + inputLength when it finds nothing wrong.
        return SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength);
    }

    return pInputBuffer + inputLength;
}
283+
284+
131285
public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength)
132286
{
133287
int processedLength = 0;
@@ -461,16 +615,14 @@ public static class UTF8
461615
{
462616
return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength);
463617
}
464-
// TODO add support for other ISAs
465-
//if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported)
466-
//{
467-
// return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength);
468-
//GetPointerToFirstInvalidByteAvx2
469-
//}
470-
//if (Sse42.IsSupported)
471-
//{
472-
// return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength);
473-
//}
618+
/*if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported)
619+
{
620+
return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength);
621+
}*/
622+
if (Ssse3.IsSupported)
623+
{
624+
return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength);
625+
}
474626
return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength);
475627
}
476628

0 commit comments

Comments
 (0)