Skip to content

Commit a6b5c64

Browse files
Release 2.1.0
### Known Issues - half8 "equals" and "not equals" operators don't conform to the IEEE 754 standard - Unity has not yet reacted to my bug-report in regards to their "half" implementation ### Fixes - fixed burst compilation error triggered by "Sse4_1.blend_epi16" when compiling for SSE2 due to fallback code not using a constant value for "imm8" - fixed incorrect CPU feature checks for quarter vector type-conversion code when compiling for SSE2 - fixed "tzcnt" implementations (were completely broken) - fixed scalar (single value and C# fallback) "lzcnt" implementations for (s)byte and (u)short values and (u)long4 vectors ### Additions - added "ulong countbits(void* ptr, ulong bytes)", which counts the number of 1-bits in a given block of memory, using Wojciech Mula's SIMD population count algorithm - added high performance and/or SIMD "gcd" a.k.a. greatest common divisor functions for (u)int, (u)long and all integer vector types, which always return unsigned types and vectors - added high performance and/or SIMD "lcm" a.k.a. 
least common multiple functions for (u)int, (u)long and all integer vector types, which always return unsigned types and vectors - added high performance and/or SIMD "intsqrt" - integer square root (floor(sqrt(x))) functions for all integer- and integer vector types, with the functions for signed integers throwing an ArgumentOutOfRangeException in case a value is negative ### Improvements - performance improvements of "avg" functions for signed integer vectors - added SIMD implementations of the "transpose" functions for all matrix types - added SSE4 and SSE2 fallback code for variable bitshifts ("shl", "shrl" and "shra") - added SSE2 fallback code for (s)byte vector-by-vector division and modulo operations - added SSE2 fallback code for "all_dif" for (s)byte16, (u)short8 and (u)int8 vectors - added SSE2 fallback code for typecasting, propagating through the entire library - added SSE2 fallback code for "addsub" and "subadd" functions - bitmask32 and bitmask64 now allow for masks to be up to 32 and 64 bits wide, respectively ### Changes - renamed "BurstCompilerException" to "CPUFeatureCheckException" - "shl", "shrl" and "shra" now have undefined behavior when bitshifting any value beyond [0, 8 * sizeof(integer_type)] for performance reasons and because of differences between SSE, AVX and managed C# ### Fixed Oversights - added "shl", "shrl" and "shra" functions for (s)byte and (u)short vectors - added "ror" and "rol" (varying per element) functions for (s)byte and (u)short vectors - added "compareto" functions for all vector types except half- and quarter vectors - added "all_dif" functions for (s)byte32 vectors - added vshr/l and vror/l functions for (s)byte32 and (u)short16 vectors
1 parent 2970a15 commit a6b5c64

File tree

142 files changed

+19088
-6549
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

142 files changed

+19088
-6549
lines changed

Runtime/Functions/Arithmetic/Add-Subtract.cs

Lines changed: 46 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ public static float2 addsub(float2 a, float2 b)
1818
}
1919
else
2020
{
21-
return new float2(a.x + b.x, a.y - b.y);
21+
return a + math.select(b, -b, new bool2(false, true));
2222
}
2323
}
2424

@@ -32,7 +32,7 @@ public static float3 addsub(float3 a, float3 b)
3232
}
3333
else
3434
{
35-
return new float3(a.x + b.x, a.y - b.y, a.z + b.z);
35+
return a + math.select(b, -b, new bool3(false, true, false));
3636
}
3737
}
3838

@@ -46,7 +46,7 @@ public static float4 addsub(float4 a, float4 b)
4646
}
4747
else
4848
{
49-
return new float4(a.x + b.x, a.y - b.y, a.z + b.z, a.w - b.w);
49+
return a + math.select(b, -b, new bool4(false, true, false, true));
5050
}
5151
}
5252

@@ -75,7 +75,7 @@ public static double2 addsub(double2 a, double2 b)
7575
}
7676
else
7777
{
78-
return new double2(a.x + b.x, a.y - b.y);
78+
return a + math.select(b, -b, new bool2(false, true));
7979
}
8080
}
8181

@@ -89,7 +89,7 @@ public static double3 addsub(double3 a, double3 b)
8989
}
9090
else
9191
{
92-
return new double3(a.x + b.x, a.y - b.y, a.z + b.z);
92+
return a + math.select(b, -b, new bool3(false, true, false));
9393
}
9494
}
9595

@@ -103,7 +103,7 @@ public static double4 addsub(double4 a, double4 b)
103103
}
104104
else
105105
{
106-
return new double4(a.x + b.x, a.y - b.y, a.z + b.z, a.w - b.w);
106+
return a + math.select(b, -b, new bool4(false, true, false, true));
107107
}
108108
}
109109

@@ -118,7 +118,7 @@ public static byte2 addsub(byte2 a, byte2 b)
118118
}
119119
else
120120
{
121-
return new byte2((byte)(a.x + b.x), (byte)(a.y - b.y));
121+
return a + select(b, (byte2)(-(sbyte2)b), new bool2(false, true));
122122
}
123123
}
124124

@@ -132,7 +132,7 @@ public static byte3 addsub(byte3 a, byte3 b)
132132
}
133133
else
134134
{
135-
return new byte3((byte)(a.x + b.x), (byte)(a.y - b.y), (byte)(a.z + b.z));
135+
return a + select(b, (byte3)(-(sbyte3)b), new bool3(false, true, false));
136136
}
137137
}
138138

@@ -146,7 +146,7 @@ public static byte4 addsub(byte4 a, byte4 b)
146146
}
147147
else
148148
{
149-
return new byte4((byte)(a.x + b.x), (byte)(a.y - b.y), (byte)(a.z + b.z), (byte)(a.w - b.w));
149+
return a + select(b, (byte4)(-(sbyte4)b), new bool4(false, true, false, true));
150150
}
151151
}
152152

@@ -160,7 +160,7 @@ public static byte8 addsub(byte8 a, byte8 b)
160160
}
161161
else
162162
{
163-
return new byte8((byte)(a.x0 + b.x0), (byte)(a.x1 - b.x1), (byte)(a.x2 + b.x2), (byte)(a.x3 - b.x3), (byte)(a.x4 + b.x4), (byte)(a.x5 - b.x5), (byte)(a.x6 + b.x6), (byte)(a.x7 - b.x7));
163+
return a + select(b, (byte8)(-(sbyte8)b), new bool8(false, true, false, true, false, true, false, true));
164164
}
165165
}
166166

@@ -174,7 +174,7 @@ public static byte16 addsub(byte16 a, byte16 b)
174174
}
175175
else
176176
{
177-
return new byte16((byte)(a.x0 + b.x0), (byte)(a.x1 - b.x1), (byte)(a.x2 + b.x2), (byte)(a.x3 - b.x3), (byte)(a.x4 + b.x4), (byte)(a.x5 - b.x5), (byte)(a.x6 + b.x6), (byte)(a.x7 - b.x7), (byte)(a.x8 + b.x8), (byte)(a.x9 - b.x9), (byte)(a.x10 + b.x10), (byte)(a.x11 - b.x11), (byte)(a.x12 + b.x12), (byte)(a.x13 - b.x13), (byte)(a.x14 + b.x14), (byte)(a.x15 - b.x15));
177+
return a + select(b, (byte16)(-(sbyte16)b), new bool16(false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true));
178178
}
179179
}
180180

@@ -203,7 +203,7 @@ public static sbyte2 addsub(sbyte2 a, sbyte2 b)
203203
}
204204
else
205205
{
206-
return new sbyte2((sbyte)(a.x + b.x), (sbyte)(a.y - b.y));
206+
return a + select(b, -b, new bool2(false, true));
207207
}
208208
}
209209

@@ -217,7 +217,7 @@ public static sbyte3 addsub(sbyte3 a, sbyte3 b)
217217
}
218218
else
219219
{
220-
return new sbyte3((sbyte)(a.x + b.x), (sbyte)(a.y - b.y), (sbyte)(a.z + b.z));
220+
return a + select(b, -b, new bool3(false, true, false));
221221
}
222222
}
223223

@@ -231,7 +231,7 @@ public static sbyte4 addsub(sbyte4 a, sbyte4 b)
231231
}
232232
else
233233
{
234-
return new sbyte4((sbyte)(a.x + b.x), (sbyte)(a.y - b.y), (sbyte)(a.z + b.z), (sbyte)(a.w - b.w));
234+
return a + select(b, -b, new bool4(false, true, false, true));
235235
}
236236
}
237237

@@ -245,7 +245,7 @@ public static sbyte8 addsub(sbyte8 a, sbyte8 b)
245245
}
246246
else
247247
{
248-
return new sbyte8((sbyte)(a.x0 + b.x0), (sbyte)(a.x1 - b.x1), (sbyte)(a.x2 + b.x2), (sbyte)(a.x3 - b.x3), (sbyte)(a.x4 + b.x4), (sbyte)(a.x5 - b.x5), (sbyte)(a.x6 + b.x6), (sbyte)(a.x7 - b.x7));
248+
return a + select(b, -b, new bool8(false, true, false, true, false, true, false, true));
249249
}
250250
}
251251

@@ -259,7 +259,7 @@ public static sbyte16 addsub(sbyte16 a, sbyte16 b)
259259
}
260260
else
261261
{
262-
return new sbyte16((sbyte)(a.x0 + b.x0), (sbyte)(a.x1 - b.x1), (sbyte)(a.x2 + b.x2), (sbyte)(a.x3 - b.x3), (sbyte)(a.x4 + b.x4), (sbyte)(a.x5 - b.x5), (sbyte)(a.x6 + b.x6), (sbyte)(a.x7 - b.x7), (sbyte)(a.x8 + b.x8), (sbyte)(a.x9 - b.x9), (sbyte)(a.x10 + b.x10), (sbyte)(a.x11 - b.x11), (sbyte)(a.x12 + b.x12), (sbyte)(a.x13 - b.x13), (sbyte)(a.x14 + b.x14), (sbyte)(a.x15 - b.x15));
262+
return a + select(b, -b, new bool16(false, true, false, true, false, true, false, true, false, true, false, true, false, true, false, true));
263263
}
264264
}
265265

@@ -288,7 +288,7 @@ public static ushort2 addsub(ushort2 a, ushort2 b)
288288
}
289289
else
290290
{
291-
return new ushort2((ushort)(a.x + b.x), (ushort)(a.y - b.y));
291+
return a + select(b, (ushort2)(-(short2)b), new bool2(false, true));
292292
}
293293
}
294294

@@ -302,7 +302,7 @@ public static ushort3 addsub(ushort3 a, ushort3 b)
302302
}
303303
else
304304
{
305-
return new ushort3((ushort)(a.x + b.x), (ushort)(a.y - b.y), (ushort)(a.z + b.z));
305+
return a + select(b, (ushort3)(-(short3)b), new bool3(false, true, false));
306306
}
307307
}
308308

@@ -316,7 +316,7 @@ public static ushort4 addsub(ushort4 a, ushort4 b)
316316
}
317317
else
318318
{
319-
return new ushort4((ushort)(a.x + b.x), (ushort)(a.y - b.y), (ushort)(a.z + b.z), (ushort)(a.w - b.w));
319+
return a + select(b, (ushort4)(-(short4)b), new bool4(false, true, false, true));
320320
}
321321
}
322322

@@ -330,7 +330,7 @@ public static ushort8 addsub(ushort8 a, ushort8 b)
330330
}
331331
else
332332
{
333-
return new ushort8((ushort)(a.x0 + b.x0), (ushort)(a.x1 - b.x1), (ushort)(a.x2 + b.x2), (ushort)(a.x3 - b.x3), (ushort)(a.x4 + b.x4), (ushort)(a.x5 - b.x5), (ushort)(a.x6 + b.x6), (ushort)(a.x7 - b.x7));
333+
return a + select(b, (ushort8)(-(short8)b), new bool8(false, true, false, true, false, true, false, true));
334334
}
335335
}
336336

@@ -359,7 +359,7 @@ public static short2 addsub(short2 a, short2 b)
359359
}
360360
else
361361
{
362-
return new short2((short)(a.x + b.x), (short)(a.y - b.y));
362+
return a + select(b, -b, new bool2(false, true));
363363
}
364364
}
365365

@@ -373,7 +373,7 @@ public static short3 addsub(short3 a, short3 b)
373373
}
374374
else
375375
{
376-
return new short3((short)(a.x + b.x), (short)(a.y - b.y), (short)(a.z + b.z));
376+
return a + select(b, -b, new bool3(false, true, false));
377377
}
378378
}
379379

@@ -387,7 +387,7 @@ public static short4 addsub(short4 a, short4 b)
387387
}
388388
else
389389
{
390-
return new short4((short)(a.x + b.x), (short)(a.y - b.y), (short)(a.z + b.z), (short)(a.w - b.w));
390+
return a + select(b, -b, new bool4(false, true, false, true));
391391
}
392392
}
393393

@@ -401,7 +401,7 @@ public static short8 addsub(short8 a, short8 b)
401401
}
402402
else
403403
{
404-
return new short8((short)(a.x0 + b.x0), (short)(a.x1 - b.x1), (short)(a.x2 + b.x2), (short)(a.x3 - b.x3), (short)(a.x4 + b.x4), (short)(a.x5 - b.x5), (short)(a.x6 + b.x6), (short)(a.x7 - b.x7));
404+
return a + select(b, -b, new bool8(false, true, false, true, false, true, false, true));
405405
}
406406
}
407407

@@ -432,7 +432,7 @@ public static uint2 addsub(uint2 a, uint2 b)
432432
}
433433
else
434434
{
435-
return new uint2(a.x + b.x, a.y - b.y);
435+
return a + math.select(b, (uint2)(-(int2)b), new bool2(false, true));
436436
}
437437
}
438438

@@ -448,7 +448,7 @@ public static uint3 addsub(uint3 a, uint3 b)
448448
}
449449
else
450450
{
451-
return new uint3(a.x + b.x, a.y - b.y, a.z + b.z);
451+
return a + math.select(b, (uint3)(-(int3)b), new bool3(false, true, false));
452452
}
453453
}
454454

@@ -464,7 +464,7 @@ public static uint4 addsub(uint4 a, uint4 b)
464464
}
465465
else
466466
{
467-
return new uint4(a.x + b.x, a.y - b.y, a.z + b.z, a.w - b.w);
467+
return a + math.select(b, (uint4)(-(int4)b), new bool4(false, true, false, true));
468468
}
469469
}
470470

@@ -495,7 +495,7 @@ public static int2 addsub(int2 a, int2 b)
495495
}
496496
else
497497
{
498-
return new int2(a.x + b.x, a.y - b.y);
498+
return a + math.select(b, -b, new bool2(false, true));
499499
}
500500
}
501501

@@ -511,7 +511,7 @@ public static int3 addsub(int3 a, int3 b)
511511
}
512512
else
513513
{
514-
return new int3(a.x + b.x, a.y - b.y, a.z + b.z);
514+
return a + math.select(b, -b, new bool3(false, true, false));
515515
}
516516
}
517517

@@ -527,7 +527,7 @@ public static int4 addsub(int4 a, int4 b)
527527
}
528528
else
529529
{
530-
return new int4(a.x + b.x, a.y - b.y, a.z + b.z, a.w - b.w);
530+
return a + math.select(b, -b, new bool4(false, true, false, true));
531531
}
532532
}
533533

@@ -552,7 +552,14 @@ public static ulong2 addsub(ulong2 a, ulong2 b)
552552
{
553553
if (Sse2.IsSse2Supported)
554554
{
555-
return a + Mask.BlendEpi16(b, default(v128) - b, 0b1111_0000);
555+
if (Sse4_1.IsSse41Supported)
556+
{
557+
return a + Sse4_1.blend_epi16(b, default(v128) - b, 0b1111_0000);
558+
}
559+
else
560+
{
561+
return a + Mask.BlendEpi16_SSE2(b, default(v128) - b, 0b1111_0000);
562+
}
556563
}
557564
else
558565
{
@@ -595,7 +602,14 @@ public static long2 addsub(long2 a, long2 b)
595602
{
596603
if (Sse2.IsSse2Supported)
597604
{
598-
return a + Mask.BlendEpi16(b, -b, 0b1111_0000);
605+
if (Sse4_1.IsSse41Supported)
606+
{
607+
return a + Sse4_1.blend_epi16(b, -b, 0b1111_0000);
608+
}
609+
else
610+
{
611+
return a + Mask.BlendEpi16_SSE2(b, -b, 0b1111_0000);
612+
}
599613
}
600614
else
601615
{

0 commit comments

Comments
 (0)