 //
 //===----------------------------------------------------------------------===//
 
+#include <clc/shared/clc_vstore.h>
 #include <libspirv/spirv.h>
 
-#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+#define __CLC_BODY "vstore.inc"
+#include <clc/integer/gentype.inc>
 
-#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
-  typedef PRIM_TYPE less_aligned_##ADDR_SPACE##PRIM_TYPE \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstore(PRIM_TYPE vec, size_t offset, \
-                                                 ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE *)(&mem[offset])) = \
-        vec; \
-  } \
- \
-  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstoren( \
-      PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
-           *)(&mem[2 * offset])) = vec; \
-  } \
- \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstoren( \
-      PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
-           *)(&mem[3 * offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1); \
-    mem[3 * offset + 2] = vec.s2; \
-  } \
- \
-  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstoren( \
-      PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
-           *)(&mem[4 * offset])) = vec; \
-  } \
- \
-  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstoren( \
-      PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
-           *)(&mem[8 * offset])) = vec; \
-  } \
- \
-  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstoren( \
-      PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
-           *)(&mem[16 * offset])) = vec; \
-  }
-
-#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-#define VSTORE_VECTORIZE_GENERIC VSTORE_VECTORIZE
-#else
-// The generic address space isn't available, so make the macro do nothing
-#define VSTORE_VECTORIZE_GENERIC(X,Y)
-#endif
-
-#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \
-  VSTORE_VECTORIZE_GENERIC(__CLC_SCALAR___CLC_GENTYPE, __generic)
-
-VSTORE_ADDR_SPACES(char)
-VSTORE_ADDR_SPACES(uchar)
-VSTORE_ADDR_SPACES(short)
-VSTORE_ADDR_SPACES(ushort)
-VSTORE_ADDR_SPACES(int)
-VSTORE_ADDR_SPACES(uint)
-VSTORE_ADDR_SPACES(long)
-VSTORE_ADDR_SPACES(ulong)
-VSTORE_ADDR_SPACES(float)
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-VSTORE_ADDR_SPACES(double)
-#endif
-
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-VSTORE_ADDR_SPACES(half)
-#endif
-
-/* vstore_half are legal even without cl_khr_fp16 */
-#if __clang_major__ < 6
-#define DECLARE_HELPER(STYPE, AS, builtin) \
-  void __clc_vstore_half_##STYPE##_helper##AS(STYPE, AS half *);
-#else
-#define DECLARE_HELPER(STYPE, AS, __builtin) \
-  _CLC_DEF void __clc_vstore_half_##STYPE##_helper##AS(STYPE s, AS half *d) { \
-    __builtin(s, d); \
-  }
-#endif
-
-DECLARE_HELPER(float, __private, __builtin_store_halff);
-DECLARE_HELPER(float, __global, __builtin_store_halff);
-DECLARE_HELPER(float, __local, __builtin_store_halff);
-#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-DECLARE_HELPER(float, __generic, __builtin_store_halff);
-#endif
-
-#ifdef cl_khr_fp64
-DECLARE_HELPER(double, __private, __builtin_store_half);
-DECLARE_HELPER(double, __global, __builtin_store_half);
-DECLARE_HELPER(double, __local, __builtin_store_half);
-#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-DECLARE_HELPER(double, __generic, __builtin_store_half);
-#endif
-#endif
-
-#define VEC_STORE1(STYPE, AS, val, ROUNDF) \
-  __clc_vstore_half_##STYPE##_helper##AS(ROUNDF(val), &mem[offset++]);
-
-#define VEC_STORE2(STYPE, AS, val, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.lo, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.hi, ROUNDF)
-#define VEC_STORE3(STYPE, AS, val, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.s0, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.s1, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.s2, ROUNDF)
-#define VEC_STORE4(STYPE, AS, val, ROUNDF) \
-  VEC_STORE2(STYPE, AS, val.lo, ROUNDF) \
-  VEC_STORE2(STYPE, AS, val.hi, ROUNDF)
-#define VEC_STORE8(STYPE, AS, val, ROUNDF) \
-  VEC_STORE4(STYPE, AS, val.lo, ROUNDF) \
-  VEC_STORE4(STYPE, AS, val.hi, ROUNDF)
-#define VEC_STORE16(STYPE, AS, val, ROUNDF) \
-  VEC_STORE8(STYPE, AS, val.lo, ROUNDF) \
-  VEC_STORE8(STYPE, AS, val.hi, ROUNDF)
-
-#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, VEC_SUFFIX) \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstore_half##VEC_SUFFIX( \
-      TYPE vec, size_t offset, AS half *mem) { \
-    offset *= VEC_SIZE; \
-    VEC_STORE##VEC_SIZE(STYPE, AS, vec, __clc_noop) \
-  } \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstorea_half##VEC_SUFFIX( \
-      TYPE vec, size_t offset, AS half *mem) { \
-    offset *= OFFSET; \
-    VEC_STORE##VEC_SIZE(STYPE, AS, vec, __clc_noop) \
-  }
-
-#define __FUNC_ROUND_CASE(CASE, VEC_SIZE, STYPE, AS, ROUNDF) \
-  case CASE: \
-    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) break;
-
-#define __FUNC_ROUND(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, VEC_SUFFIX) \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstore_half##VEC_SUFFIX##_r( \
-      TYPE vec, size_t offset, AS half *mem, unsigned int round_mode) { \
-    offset *= VEC_SIZE; \
-    switch (round_mode) { \
-      __FUNC_ROUND_CASE(SPV_RTE, VEC_SIZE, STYPE, AS, __clc_rte) \
-      __FUNC_ROUND_CASE(SPV_RTZ, VEC_SIZE, STYPE, AS, __clc_rtz) \
-      __FUNC_ROUND_CASE(SPV_RTP, VEC_SIZE, STYPE, AS, __clc_rtp) \
-      __FUNC_ROUND_CASE(SPV_RTN, VEC_SIZE, STYPE, AS, __clc_rtn) \
-    default: \
-      break; \
-    } \
-  } \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstorea_half##VEC_SUFFIX##_r( \
-      TYPE vec, size_t offset, AS half *mem, unsigned int round_mode) { \
-    offset *= OFFSET; \
-    switch (round_mode) { \
-      __FUNC_ROUND_CASE(SPV_RTE, VEC_SIZE, STYPE, AS, __clc_rte) \
-      __FUNC_ROUND_CASE(SPV_RTZ, VEC_SIZE, STYPE, AS, __clc_rtz) \
-      __FUNC_ROUND_CASE(SPV_RTP, VEC_SIZE, STYPE, AS, __clc_rtp) \
-      __FUNC_ROUND_CASE(SPV_RTN, VEC_SIZE, STYPE, AS, __clc_rtn) \
-    default: \
-      break; \
-    } \
-  }
-
-_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x) { return x; }
-_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) {
-  /* Remove the lower 13 bits to make sure the number is rounded down */
-  int mask = 0xffffe000;
-  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
-  /* Denormals cannot be flushed, and they use a different bit for rounding */
-  if (exp < -14)
-    mask <<= __spirv_ocl_s_min(-(exp + 14), 10);
-  /* RTZ does not produce Inf for large numbers */
-  if (__spirv_ocl_fabs(x) > 65504.0f && !__spirv_IsInf(x))
-    return __spirv_ocl_copysign(65504.0f, x);
-  /* Handle NaN corner case */
-  if (__spirv_IsNan(x))
-    return x;
-  return __clc_as_float(__clc_as_uint(x) & mask);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) {
-  const float inf = __spirv_ocl_copysign(INFINITY, x);
-  /* Set the lower 13 bits */
-  int mask = (1 << 13) - 1;
-  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
-  /* Denormals cannot be flushed, and they use a different bit for rounding */
-  if (exp < -14)
-    mask = (1 << (13 + __spirv_ocl_s_min(-(exp + 14), 10))) - 1;
-  /* Handle NaN corner case */
-  if (__spirv_IsNan(x))
-    return x;
-  const float next =
-      __spirv_ocl_nextafter(__clc_as_float(__clc_as_uint(x) | mask), inf);
-  return ((__clc_as_uint(x) & mask) == 0) ? x : next;
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) {
-  return ((__clc_as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) {
-  return ((__clc_as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) {
-  /* Mantissa + implicit bit */
-  const uint mantissa = (__clc_as_uint(x) & 0x7fffff) | (1u << 23);
-  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
-  int shift = 13;
-  if (exp < -14) {
-    /* The default assumes the lower 13 bits are rounded,
-     * but it might be more for denormals.
-     * Shifting beyond last == 0b, and qr == 00b is not necessary */
-    shift += __spirv_ocl_s_min(-(exp + 14), 15);
-  }
-  int mask = (1 << shift) - 1;
-  const uint grs = mantissa & mask;
-  const uint last = mantissa & (1 << shift);
-  /* The IEEE round-up rule is: grs > 100b, or grs == 100b and last == 1.
-   * exp > 15 should round to inf. */
-  bool roundup = (grs > (1 << (shift - 1))) ||
-                 (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
-  return roundup ? __clc_rti(x) : __clc_rtz(x);
-}
-
-#ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x) { return x; }
-_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x) {
-  /* Remove the lower 42 bits to make sure the number is rounded down */
-  ulong mask = 0xfffffc0000000000UL;
-  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
-  /* Denormals cannot be flushed, and they use a different bit for rounding */
-  if (exp < -14)
-    mask <<= __spirv_ocl_s_min(-(exp + 14), 10);
-  /* RTZ does not produce Inf for large numbers */
-  if (__spirv_ocl_fabs(x) > 65504.0 && !__spirv_IsInf(x))
-    return __spirv_ocl_copysign(65504.0, x);
-  /* Handle NaN corner case */
-  if (__spirv_IsNan(x))
-    return x;
-  return __clc_as_double(__clc_as_ulong(x) & mask);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x) {
-  const double inf = __spirv_ocl_copysign((double)INFINITY, x);
-  /* Set the lower 42 bits */
-  long mask = (1UL << 42UL) - 1UL;
-  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
-  /* Denormals cannot be flushed, and they use a different bit for rounding */
-  if (exp < -14)
-    mask = (1UL << (42UL + __spirv_ocl_s_min(-(exp + 14), 10))) - 1;
-  /* Handle NaN corner case */
-  if (__spirv_IsNan(x))
-    return x;
-  const double next =
-      __spirv_ocl_nextafter(__clc_as_double(__clc_as_ulong(x) | mask), inf);
-  return ((__clc_as_ulong(x) & mask) == 0) ? x : next;
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x) {
-  return ((__clc_as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x)
-                                                           : __clc_rti(x);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x) {
-  return ((__clc_as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x)
-                                                           : __clc_rtz(x);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x) {
-  /* Mantissa + implicit bit */
-  const ulong mantissa = (__clc_as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
-  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
-  int shift = 42;
-  if (exp < -14) {
-    /* The default assumes the lower 42 bits are rounded,
-     * but it might be more for denormals.
-     * Shifting beyond last == 0b, and qr == 00b is not necessary */
-    shift += __spirv_ocl_s_min(-(exp + 14), 15);
-  }
-  ulong mask = (1UL << shift) - 1UL;
-  const ulong grs = mantissa & mask;
-  const ulong last = mantissa & (1UL << shift);
-  /* The IEEE round-up rule is: grs > 100b, or grs == 100b and last == 1.
-   * exp > 15 should round to inf. */
-  bool roundup = (grs > (1UL << (shift - 1UL))) ||
-                 (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
-  return roundup ? __clc_rti(x) : __clc_rtz(x);
-}
-#endif
-
-#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, VEC_SUFFIX) \
-  __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, VEC_SUFFIX) \
-  __FUNC_ROUND(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, VEC_SUFFIX)
-
-#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, VEC_SUFFIX) \
-  __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, VEC_SUFFIX)
-
-#define FUNC_SCALAR(VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
-  __XFUNC(, VEC_SIZE, OFFSET, TYPE, STYPE, AS, )
-
-#define __CLC_BODY "vstore_half.inc"
+#define __CLC_BODY "vstore.inc"
 #include <clc/math/gentype.inc>
-
-#undef __CLC_BODY
-#undef FUNC
-#undef __XFUNC
-#undef __FUNC
-#undef VEC_LOAD16
-#undef VEC_LOAD8
-#undef VEC_LOAD4
-#undef VEC_LOAD3
-#undef VEC_LOAD2
-#undef VEC_LOAD1
-#undef DECLARE_HELPER
-#undef VSTORE_ADDR_SPACES
-#undef VSTORE_VECTORIZE
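
The deleted rounding helpers are the subtlest part of this file: before the actual half store, they snap a float onto the half-precision grid by manipulating the low mantissa bits directly (13 bits separate float's 23-bit from half's 10-bit mantissa). Below is a minimal host-side sketch of the round-toward-zero case, assuming IEEE-754 binary32; it paraphrases the deleted __clc_rtz, not the shared CLC implementation that replaces it, and the names as_uint and rtz_to_half_grid are local to the sketch.

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t as_uint(float x) {
  uint32_t u;
  memcpy(&u, &x, sizeof u); /* bit-cast, the host stand-in for __clc_as_uint */
  return u;
}

static float as_float(uint32_t u) {
  float x;
  memcpy(&x, &u, sizeof x);
  return x;
}

/* Round toward zero onto the half grid: clear the 13 mantissa bits a half
 * cannot hold (more when the result is a half denormal), and clamp finite
 * values above 65504.0f (HALF_MAX), since RTZ never overflows to infinity. */
static float rtz_to_half_grid(float x) {
  uint32_t mask = 0xffffe000u;
  const int exp = (int)((as_uint(x) >> 23) & 0xff) - 127;
  if (exp < -14) { /* half denormal: even fewer mantissa bits survive */
    int extra = -(exp + 14);
    mask <<= (extra < 10) ? extra : 10;
  }
  if (fabsf(x) > 65504.0f && !isinf(x))
    return copysignf(65504.0f, x);
  if (isnan(x))
    return x;
  return as_float(as_uint(x) & mask);
}

int main(void) {
  /* 1 + 2^-10 is the smallest half step above 1.0; one float ulp more
   * must round back down to it under RTZ. */
  float h = 1.0f + 0x1p-10f;
  printf("%.10f -> %.10f\n", nextafterf(h, 2.0f),
         rtz_to_half_grid(nextafterf(h, 2.0f)));
  printf("70000 -> %.1f (clamped to HALF_MAX)\n", rtz_to_half_grid(70000.0f));
  return 0;
}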
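Both the deleted VSTORE_VECTORIZE macros and the shared __clc_vstore implementation they delegate to (pulled in via the __CLC_BODY/gentype.inc re-inclusion pattern) rest on one trick: a typedef that keeps a vector's layout but lowers its alignment to that of a single element, so a vstore to an element-aligned pointer compiles to a legal store. The sketch below reproduces that trick in plain C with GCC/Clang vector extensions; vstore4_int is an invented name for illustration, and the actual vstore.inc body is not part of this diff.

#include <stddef.h>
#include <stdio.h>

typedef int int4 __attribute__((vector_size(16)));
/* Same layout as int4, but only int-aligned: stores through this type
 * never assume the natural 16-byte vector alignment. On typedefs, the
 * aligned attribute may lower alignment, which is the whole point here. */
typedef int4 less_aligned_int4 __attribute__((aligned(sizeof(int))));

/* vstore4(vec, offset, mem) semantics: write vec to mem[4*offset] ..
 * mem[4*offset + 3], where mem is only guaranteed int alignment. */
static void vstore4_int(int4 vec, size_t offset, int *mem) {
  *(less_aligned_int4 *)(&mem[4 * offset]) = vec;
}

int main(void) {
  int buf[9] = {0};
  int4 v = {1, 2, 3, 4};
  vstore4_int(v, 1, buf + 1); /* destination buf+5: int- but not vector-aligned */
  for (int i = 0; i < 9; ++i)
    printf("%d ", buf[i]); /* prints: 0 0 0 0 0 1 2 3 4 */
  printf("\n");
  return 0;
}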