Skip to content

Commit c34e2cf

Browse files
committed
Use _mm_set1_epi{32,64x} to init mask in x86-64 [cz]asum
for skylake kernels. This is the same method as used in [sd]asum. _mm_set1_epi64x was commented out for zasum, but it has the advantage of avoiding possible undefined behaviour (using an uninitialized variable), which gets optimized out by NVHPC and icx. The new code works fine with those compilers. For GCC 12.3 the generated code is identical: no matter which method you use, the compiler optimizes the code into a compile-time constant, so there is no performance benefit in using _mm_cmpeq_epi8, since the corresponding instruction (VPCMPEQB) isn't actually generated!
1 parent 22aa401 commit c34e2cf

File tree

2 files changed

+8
-13
lines changed

2 files changed

+8
-13
lines changed

kernel/x86_64/casum_microk_skylakex-2.c

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,9 @@
22
#ifdef __NVCOMPILER
33
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
44
#endif
5-
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
5+
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
66

7-
#if (!(defined(__NVCOMPILER) ))
8-
//&& NVCOMPVERS < 2309))
7+
#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
98

109
#define HAVE_CASUM_KERNEL 1
1110

@@ -21,15 +20,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
2120

2221
if (n2 < 64) {
2322
__m128 accum_10, accum_11, accum_12, accum_13;
24-
__m128 abs_mask1 = abs_mask1;
23+
__m128 abs_mask1;
2524

2625
accum_10 = _mm_setzero_ps();
2726
accum_11 = _mm_setzero_ps();
2827
accum_12 = _mm_setzero_ps();
2928
accum_13 = _mm_setzero_ps();
3029

31-
abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
32-
abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1);
30+
abs_mask1 = (__m128)_mm_set1_epi32(0x7fffffff);
3331

3432
_mm_prefetch(&x1[0], _MM_HINT_T0);
3533

kernel/x86_64/zasum_microk_skylakex-2.c

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,9 @@
22
#ifdef __NVCOMPILER
33
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
44
#endif
5-
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
5+
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
66

7-
#if (!(defined(__NVCOMPILER) ))
8-
//&& NVCOMPVERS < 2309))
7+
#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
98

109
#define HAVE_ZASUM_KERNEL 1
1110

@@ -22,16 +21,14 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
2221

2322
if (n2 < 32) {
2423
__m128d accum_10, accum_11, accum_12, accum_13;
25-
__m128d abs_mask1 = abs_mask1;
24+
__m128d abs_mask1;
2625

2726
accum_10 = _mm_setzero_pd();
2827
accum_11 = _mm_setzero_pd();
2928
accum_12 = _mm_setzero_pd();
3029
accum_13 = _mm_setzero_pd();
3130

32-
// abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
33-
abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
34-
abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1);
31+
abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
3532

3633
_mm_prefetch(&x1[0], _MM_HINT_T0);
3734
if (n2 >= 16){

0 commit comments

Comments
 (0)