Skip to content

Commit 2ea65ba

Browse files
authored
Merge pull request #4330 from bartoldeman/asum-init-mask
Use _mm_set1_epi{32,64x} to init mask in x86-64 [cz]asum
2 parents 864c65b + c34e2cf commit 2ea65ba

File tree

2 files changed

+8
-13
lines changed

2 files changed

+8
-13
lines changed

kernel/x86_64/casum_microk_skylakex-2.c

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,9 @@
22
#ifdef __NVCOMPILER
33
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
44
#endif
5-
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
5+
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
66

7-
#if (!(defined(__NVCOMPILER) ))
8-
//&& NVCOMPVERS < 2309))
7+
#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
98

109
#define HAVE_CASUM_KERNEL 1
1110

@@ -21,15 +20,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
2120

2221
if (n2 < 64) {
2322
__m128 accum_10, accum_11, accum_12, accum_13;
24-
__m128 abs_mask1 = abs_mask1;
23+
__m128 abs_mask1;
2524

2625
accum_10 = _mm_setzero_ps();
2726
accum_11 = _mm_setzero_ps();
2827
accum_12 = _mm_setzero_ps();
2928
accum_13 = _mm_setzero_ps();
3029

31-
abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
32-
abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1);
30+
abs_mask1 = (__m128)_mm_set1_epi32(0x7fffffff);
3331

3432
_mm_prefetch(&x1[0], _MM_HINT_T0);
3533

kernel/x86_64/zasum_microk_skylakex-2.c

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,9 @@
22
#ifdef __NVCOMPILER
33
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
44
#endif
5-
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)))
5+
#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
66

7-
#if (!(defined(__NVCOMPILER) ))
8-
//&& NVCOMPVERS < 2309))
7+
#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
98

109
#define HAVE_ZASUM_KERNEL 1
1110

@@ -22,16 +21,14 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
2221

2322
if (n2 < 32) {
2423
__m128d accum_10, accum_11, accum_12, accum_13;
25-
__m128d abs_mask1 = abs_mask1;
24+
__m128d abs_mask1;
2625

2726
accum_10 = _mm_setzero_pd();
2827
accum_11 = _mm_setzero_pd();
2928
accum_12 = _mm_setzero_pd();
3029
accum_13 = _mm_setzero_pd();
3130

32-
// abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
33-
abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
34-
abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1);
31+
abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
3532

3633
_mm_prefetch(&x1[0], _MM_HINT_T0);
3734
if (n2 >= 16){

0 commit comments

Comments
 (0)