|
| 1 | +#include "common.h" |
| 2 | + |
/*
 * ABS_K(v): absolute value of v, defined only if nothing earlier supplied it.
 * The argument is expanded more than once, so pass expressions without
 * side effects.
 */
#if !defined(ABS_K)
#define ABS_K(v) (((v) > 0) ? (v) : -(v))
#endif
| 6 | + |
| 7 | +#if defined(SKYLAKEX) |
| 8 | +#include "casum_microk_skylakex-2.c" |
| 9 | +#endif |
| 10 | + |
| 11 | +#ifndef HAVE_CASUM_KERNEL |
| 12 | +static FLOAT casum_kernel(BLASLONG n, FLOAT *x1) |
| 13 | +{ |
| 14 | + |
| 15 | + BLASLONG i=0; |
| 16 | + BLASLONG n_8 = n & -8; |
| 17 | + FLOAT *x = x1; |
| 18 | + FLOAT temp0, temp1, temp2, temp3; |
| 19 | + FLOAT temp4, temp5, temp6, temp7; |
| 20 | + FLOAT sum0 = 0.0; |
| 21 | + FLOAT sum1 = 0.0; |
| 22 | + FLOAT sum2 = 0.0; |
| 23 | + FLOAT sum3 = 0.0; |
| 24 | + FLOAT sum4 = 0.0; |
| 25 | + |
| 26 | + while (i < n_8) { |
| 27 | + temp0 = ABS_K(x[0]); |
| 28 | + temp1 = ABS_K(x[1]); |
| 29 | + temp2 = ABS_K(x[2]); |
| 30 | + temp3 = ABS_K(x[3]); |
| 31 | + temp4 = ABS_K(x[4]); |
| 32 | + temp5 = ABS_K(x[5]); |
| 33 | + temp6 = ABS_K(x[6]); |
| 34 | + temp7 = ABS_K(x[7]); |
| 35 | + |
| 36 | + sum0 += temp0; |
| 37 | + sum1 += temp1; |
| 38 | + sum2 += temp2; |
| 39 | + sum3 += temp3; |
| 40 | + |
| 41 | + sum0 += temp4; |
| 42 | + sum1 += temp5; |
| 43 | + sum2 += temp6; |
| 44 | + sum3 += temp7; |
| 45 | + |
| 46 | + x+=8; |
| 47 | + i+=4; |
| 48 | + } |
| 49 | + |
| 50 | + while (i < n) { |
| 51 | + sum4 += (ABS_K(x1[0]) + ABS_K(x1[1])); |
| 52 | + x1 += 2; |
| 53 | + i++; |
| 54 | + } |
| 55 | + |
| 56 | + return sum0+sum1+sum2+sum3+sum4; |
| 57 | +} |
| 58 | + |
| 59 | +#endif |
| 60 | + |
| 61 | +static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) |
| 62 | +{ |
| 63 | + BLASLONG i = 0; |
| 64 | + BLASLONG ip = 0; |
| 65 | + BLASLONG inc_x2; |
| 66 | + FLOAT sumf = 0.0; |
| 67 | + |
| 68 | + if (n <= 0 || inc_x <= 0) return(sumf); |
| 69 | + if (inc_x == 1) { |
| 70 | + sumf = casum_kernel(n, x); |
| 71 | + } |
| 72 | + else { |
| 73 | + inc_x2 = 2 * inc_x; |
| 74 | + |
| 75 | + while (i < n) { |
| 76 | + sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]); |
| 77 | + ip += inc_x2; |
| 78 | + i++; |
| 79 | + } |
| 80 | + } |
| 81 | + |
| 82 | + return(sumf); |
| 83 | +} |
| 84 | + |
| 85 | +#if defined(SMP) |
| 86 | +static int asum_thread_function(BLASLONG n, |
| 87 | + BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, |
| 88 | + FLOAT *x, BLASLONG inc_x, |
| 89 | + FLOAT * dummy3, BLASLONG dummy4, |
| 90 | + FLOAT * result, BLASLONG dummy5) |
| 91 | +{ |
| 92 | + *(FLOAT *) result = asum_compute(n, x, inc_x); |
| 93 | + return 0; |
| 94 | +} |
| 95 | + |
| 96 | +extern int blas_level1_thread_with_value(int mode, |
| 97 | + BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, |
| 98 | + void *a, BLASLONG lda, |
| 99 | + void *b, BLASLONG ldb, |
| 100 | + void *c, BLASLONG ldc, |
| 101 | + int (*function)(), |
| 102 | + int nthread); |
| 103 | +#endif |
| 104 | + |
| 105 | +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) |
| 106 | +{ |
| 107 | +#if defined(SMP) |
| 108 | + int nthreads; |
| 109 | + FLOAT dummy_alpha[2]; |
| 110 | +#endif |
| 111 | + FLOAT sumf = 0.0; |
| 112 | + |
| 113 | +#if defined(SMP) |
| 114 | + int num_cpu = num_cpu_avail(1); |
| 115 | + if (n <= 10000 || inc_x <= 0) |
| 116 | + nthreads = 1; |
| 117 | + else |
| 118 | + nthreads = num_cpu < n/10000 ? num_cpu : n/10000; |
| 119 | + |
| 120 | + if (nthreads == 1) { |
| 121 | + sumf = asum_compute(n, x, inc_x); |
| 122 | + } |
| 123 | + else { |
| 124 | + int mode, i; |
| 125 | + char result[MAX_CPU_NUMBER * sizeof(double) *2]; |
| 126 | + FLOAT *ptr; |
| 127 | +#if !defined(DOUBLE) |
| 128 | + mode = BLAS_SINGLE | BLAS_COMPLEX; |
| 129 | +#else |
| 130 | + mode = BLAS_DOUBLE | BLAS_COMPLEX; |
| 131 | +#endif |
| 132 | + blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, |
| 133 | + NULL, 0, result, 0, (void *)asum_thread_function, nthreads); |
| 134 | + ptr = (FLOAT *)result; |
| 135 | + for (i = 0; i < nthreads; i++) { |
| 136 | + sumf += (*ptr); |
| 137 | + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); |
| 138 | + } |
| 139 | + } |
| 140 | +#else |
| 141 | + sumf = asum_compute(n, x, inc_x); |
| 142 | +#endif |
| 143 | + return(sumf); |
| 144 | +} |
0 commit comments