|
30 | 30 | #include <clc/shared/clc_max.h>
|
31 | 31 | #include <math/clc_remainder.h>
|
32 | 32 |
|
33 |
| -_CLC_DEF _CLC_OVERLOAD float __clc_fmod(float x, float y) |
34 |
| -{ |
35 |
| - int ux = as_int(x); |
36 |
| - int ax = ux & EXSIGNBIT_SP32; |
37 |
| - float xa = as_float(ax); |
38 |
| - int sx = ux ^ ax; |
39 |
| - int ex = ax >> EXPSHIFTBITS_SP32; |
40 |
| - |
41 |
| - int uy = as_int(y); |
42 |
| - int ay = uy & EXSIGNBIT_SP32; |
43 |
| - float ya = as_float(ay); |
44 |
| - int ey = ay >> EXPSHIFTBITS_SP32; |
45 |
| - |
46 |
| - float xr = as_float(0x3f800000 | (ax & 0x007fffff)); |
47 |
| - float yr = as_float(0x3f800000 | (ay & 0x007fffff)); |
48 |
| - int c; |
49 |
| - int k = ex - ey; |
50 |
| - |
51 |
| - while (k > 0) { |
52 |
| - c = xr >= yr; |
53 |
| - xr -= c ? yr : 0.0f; |
54 |
| - xr += xr; |
55 |
| - --k; |
56 |
| - } |
57 |
| - |
| 33 | +_CLC_DEF _CLC_OVERLOAD float __clc_fmod(float x, float y) { |
| 34 | + int ux = as_int(x); |
| 35 | + int ax = ux & EXSIGNBIT_SP32; |
| 36 | + float xa = as_float(ax); |
| 37 | + int sx = ux ^ ax; |
| 38 | + int ex = ax >> EXPSHIFTBITS_SP32; |
| 39 | + |
| 40 | + int uy = as_int(y); |
| 41 | + int ay = uy & EXSIGNBIT_SP32; |
| 42 | + float ya = as_float(ay); |
| 43 | + int ey = ay >> EXPSHIFTBITS_SP32; |
| 44 | + |
| 45 | + float xr = as_float(0x3f800000 | (ax & 0x007fffff)); |
| 46 | + float yr = as_float(0x3f800000 | (ay & 0x007fffff)); |
| 47 | + int c; |
| 48 | + int k = ex - ey; |
| 49 | + |
| 50 | + while (k > 0) { |
58 | 51 | c = xr >= yr;
|
59 | 52 | xr -= c ? yr : 0.0f;
|
| 53 | + xr += xr; |
| 54 | + --k; |
| 55 | + } |
60 | 56 |
|
61 |
| - int lt = ex < ey; |
62 |
| - |
63 |
| - xr = lt ? xa : xr; |
64 |
| - yr = lt ? ya : yr; |
| 57 | + c = xr >= yr; |
| 58 | + xr -= c ? yr : 0.0f; |
65 | 59 |
|
| 60 | + int lt = ex < ey; |
66 | 61 |
|
67 |
| - float s = as_float(ey << EXPSHIFTBITS_SP32); |
68 |
| - xr *= lt ? 1.0f : s; |
| 62 | + xr = lt ? xa : xr; |
| 63 | + yr = lt ? ya : yr; |
69 | 64 |
|
70 |
| - c = ax == ay; |
71 |
| - xr = c ? 0.0f : xr; |
| 65 | + float s = as_float(ey << EXPSHIFTBITS_SP32); |
| 66 | + xr *= lt ? 1.0f : s; |
72 | 67 |
|
73 |
| - xr = as_float(sx ^ as_int(xr)); |
| 68 | + c = ax == ay; |
| 69 | + xr = c ? 0.0f : xr; |
74 | 70 |
|
75 |
| - c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | ay == 0; |
76 |
| - xr = c ? as_float(QNANBITPATT_SP32) : xr; |
| 71 | + xr = as_float(sx ^ as_int(xr)); |
77 | 72 |
|
78 |
| - return xr; |
| 73 | + c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | |
| 74 | + ay == 0; |
| 75 | + xr = c ? as_float(QNANBITPATT_SP32) : xr; |
79 | 76 |
|
| 77 | + return xr; |
80 | 78 | }
|
81 | 79 | _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_fmod, float, float);
|
82 | 80 |
|
83 | 81 | #ifdef cl_khr_fp64
|
84 |
| -_CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) |
85 |
| -{ |
86 |
| - ulong ux = as_ulong(x); |
87 |
| - ulong ax = ux & ~SIGNBIT_DP64; |
88 |
| - ulong xsgn = ux ^ ax; |
89 |
| - double dx = as_double(ax); |
90 |
| - int xexp = convert_int(ax >> EXPSHIFTBITS_DP64); |
91 |
| - int xexp1 = 11 - (int) __clc_clz(ax & MANTBITS_DP64); |
92 |
| - xexp1 = xexp < 1 ? xexp1 : xexp; |
93 |
| - |
94 |
| - ulong uy = as_ulong(y); |
95 |
| - ulong ay = uy & ~SIGNBIT_DP64; |
96 |
| - double dy = as_double(ay); |
97 |
| - int yexp = convert_int(ay >> EXPSHIFTBITS_DP64); |
98 |
| - int yexp1 = 11 - (int) __clc_clz(ay & MANTBITS_DP64); |
99 |
| - yexp1 = yexp < 1 ? yexp1 : yexp; |
100 |
| - |
101 |
| - // First assume |x| > |y| |
102 |
| - |
103 |
| - // Set ntimes to the number of times we need to do a |
104 |
| - // partial remainder. If the exponent of x is an exact multiple |
105 |
| - // of 53 larger than the exponent of y, and the mantissa of x is |
106 |
| - // less than the mantissa of y, ntimes will be one too large |
107 |
| - // but it doesn't matter - it just means that we'll go round |
108 |
| - // the loop below one extra time. |
109 |
| - int ntimes = __clc_max(0, (xexp1 - yexp1) / 53); |
110 |
| - double w = ldexp(dy, ntimes * 53); |
111 |
| - w = ntimes == 0 ? dy : w; |
112 |
| - double scale = ntimes == 0 ? 1.0 : 0x1.0p-53; |
113 |
| - |
114 |
| - // Each time round the loop we compute a partial remainder. |
115 |
| - // This is done by subtracting a large multiple of w |
116 |
| - // from x each time, where w is a scaled up version of y. |
117 |
| - // The subtraction must be performed exactly in quad |
118 |
| - // precision, though the result at each stage can |
119 |
| - // fit exactly in a double precision number. |
120 |
| - int i; |
121 |
| - double t, v, p, pp; |
122 |
| - |
123 |
| - for (i = 0; i < ntimes; i++) { |
124 |
| - // Compute integral multiplier |
125 |
| - t = __clc_trunc(dx / w); |
126 |
| - |
127 |
| - // Compute w * t in quad precision |
128 |
| - p = w * t; |
129 |
| - pp = fma(w, t, -p); |
130 |
| - |
131 |
| - // Subtract w * t from dx |
132 |
| - v = dx - p; |
133 |
| - dx = v + (((dx - v) - p) - pp); |
134 |
| - |
135 |
| - // If t was one too large, dx will be negative. Add back one w. |
136 |
| - dx += dx < 0.0 ? w : 0.0; |
137 |
| - |
138 |
| - // Scale w down by 2^(-53) for the next iteration |
139 |
| - w *= scale; |
140 |
| - } |
141 |
| - |
142 |
| - // One more time |
143 |
| - // Variable todd says whether the integer t is odd or not |
144 |
| - t = __clc_floor(dx / w); |
145 |
| - long lt = (long)t; |
146 |
| - int todd = lt & 1; |
147 |
| - |
| 82 | +_CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) { |
| 83 | + ulong ux = as_ulong(x); |
| 84 | + ulong ax = ux & ~SIGNBIT_DP64; |
| 85 | + ulong xsgn = ux ^ ax; |
| 86 | + double dx = as_double(ax); |
| 87 | + int xexp = convert_int(ax >> EXPSHIFTBITS_DP64); |
| 88 | + int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64); |
| 89 | + xexp1 = xexp < 1 ? xexp1 : xexp; |
| 90 | + |
| 91 | + ulong uy = as_ulong(y); |
| 92 | + ulong ay = uy & ~SIGNBIT_DP64; |
| 93 | + double dy = as_double(ay); |
| 94 | + int yexp = convert_int(ay >> EXPSHIFTBITS_DP64); |
| 95 | + int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64); |
| 96 | + yexp1 = yexp < 1 ? yexp1 : yexp; |
| 97 | + |
| 98 | + // First assume |x| > |y| |
| 99 | + |
| 100 | + // Set ntimes to the number of times we need to do a |
| 101 | + // partial remainder. If the exponent of x is an exact multiple |
| 102 | + // of 53 larger than the exponent of y, and the mantissa of x is |
| 103 | + // less than the mantissa of y, ntimes will be one too large |
| 104 | + // but it doesn't matter - it just means that we'll go round |
| 105 | + // the loop below one extra time. |
| 106 | + int ntimes = __clc_max(0, (xexp1 - yexp1) / 53); |
| 107 | + double w = ldexp(dy, ntimes * 53); |
| 108 | + w = ntimes == 0 ? dy : w; |
| 109 | + double scale = ntimes == 0 ? 1.0 : 0x1.0p-53; |
| 110 | + |
| 111 | + // Each time round the loop we compute a partial remainder. |
| 112 | + // This is done by subtracting a large multiple of w |
| 113 | + // from x each time, where w is a scaled up version of y. |
| 114 | + // The subtraction must be performed exactly in quad |
| 115 | + // precision, though the result at each stage can |
| 116 | + // fit exactly in a double precision number. |
| 117 | + int i; |
| 118 | + double t, v, p, pp; |
| 119 | + |
| 120 | + for (i = 0; i < ntimes; i++) { |
| 121 | + // Compute integral multiplier |
| 122 | + t = __clc_trunc(dx / w); |
| 123 | + |
| 124 | + // Compute w * t in quad precision |
148 | 125 | p = w * t;
|
149 | 126 | pp = fma(w, t, -p);
|
| 127 | + |
| 128 | + // Subtract w * t from dx |
150 | 129 | v = dx - p;
|
151 | 130 | dx = v + (((dx - v) - p) - pp);
|
152 |
| - i = dx < 0.0; |
153 |
| - todd ^= i; |
154 |
| - dx += i ? w : 0.0; |
155 | 131 |
|
156 |
| - // At this point, dx lies in the range [0,dy) |
157 |
| - double ret = as_double(xsgn ^ as_ulong(dx)); |
158 |
| - dx = as_double(ax); |
| 132 | + // If t was one too large, dx will be negative. Add back one w. |
| 133 | + dx += dx < 0.0 ? w : 0.0; |
| 134 | + |
| 135 | + // Scale w down by 2^(-53) for the next iteration |
| 136 | + w *= scale; |
| 137 | + } |
| 138 | + |
| 139 | + // One more time |
| 140 | + // Variable todd says whether the integer t is odd or not |
| 141 | + t = __clc_floor(dx / w); |
| 142 | + long lt = (long)t; |
| 143 | + int todd = lt & 1; |
| 144 | + |
| 145 | + p = w * t; |
| 146 | + pp = fma(w, t, -p); |
| 147 | + v = dx - p; |
| 148 | + dx = v + (((dx - v) - p) - pp); |
| 149 | + i = dx < 0.0; |
| 150 | + todd ^= i; |
| 151 | + dx += i ? w : 0.0; |
| 152 | + |
| 153 | + // At this point, dx lies in the range [0,dy) |
| 154 | + double ret = as_double(xsgn ^ as_ulong(dx)); |
| 155 | + dx = as_double(ax); |
159 | 156 |
|
160 |
| - // Now handle |x| == |y| |
161 |
| - int c = dx == dy; |
162 |
| - t = as_double(xsgn); |
163 |
| - ret = c ? t : ret; |
| 157 | + // Now handle |x| == |y| |
| 158 | + int c = dx == dy; |
| 159 | + t = as_double(xsgn); |
| 160 | + ret = c ? t : ret; |
164 | 161 |
|
165 |
| - // Next, handle |x| < |y| |
166 |
| - c = dx < dy; |
167 |
| - ret = c ? x : ret; |
| 162 | + // Next, handle |x| < |y| |
| 163 | + c = dx < dy; |
| 164 | + ret = c ? x : ret; |
168 | 165 |
|
169 |
| - // We don't need anything special for |x| == 0 |
| 166 | + // We don't need anything special for |x| == 0 |
170 | 167 |
|
171 |
| - // |y| is 0 |
172 |
| - c = dy == 0.0; |
173 |
| - ret = c ? as_double(QNANBITPATT_DP64) : ret; |
| 168 | + // |y| is 0 |
| 169 | + c = dy == 0.0; |
| 170 | + ret = c ? as_double(QNANBITPATT_DP64) : ret; |
174 | 171 |
|
175 |
| - // y is +-Inf, NaN |
176 |
| - c = yexp > BIASEDEMAX_DP64; |
177 |
| - t = y == y ? x : y; |
178 |
| - ret = c ? t : ret; |
| 172 | + // y is +-Inf, NaN |
| 173 | + c = yexp > BIASEDEMAX_DP64; |
| 174 | + t = y == y ? x : y; |
| 175 | + ret = c ? t : ret; |
179 | 176 |
|
180 |
| - // x is +=Inf, NaN |
181 |
| - c = xexp > BIASEDEMAX_DP64; |
182 |
| - ret = c ? as_double(QNANBITPATT_DP64) : ret; |
| 177 | + // x is +=Inf, NaN |
| 178 | + c = xexp > BIASEDEMAX_DP64; |
| 179 | + ret = c ? as_double(QNANBITPATT_DP64) : ret; |
183 | 180 |
|
184 |
| - return ret; |
| 181 | + return ret; |
185 | 182 | }
|
186 |
| -_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_fmod, double, double); |
| 183 | +_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_fmod, double, |
| 184 | + double); |
187 | 185 | #endif
|
0 commit comments