
Commit e5a13e9

[libspirv] Implement SPIR-V vstore builtins via CLC (#19142)
These implementations were recently added upstream, and the OpenCL vstore builtins use them. Aligning ourselves with those implementations reduces the amount of nearly identical code. There are changes to the generated builtins, but they appear harmless, at least for native-cpu. The differences ultimately stem from the different implementations of the various relational functions used in the rounding-mode helpers; some comparisons and branch targets have been inverted, for example.
1 parent b5871ee commit e5a13e9
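
With this change the vstore.cl body shrinks to the handful of includes shown in the diff below; the per-type store definitions now come from the shared CLC vstore body expanded over every gentype. As a rough orientation for readers unfamiliar with the libclc layering, here is a minimal sketch of the kind of forwarding this relies on. It assumes a CLC entry point spelled __clc_vstore2 as declared by <clc/shared/clc_vstore.h>, and the FORWARD_VSTORE2 macro is purely illustrative; the real definitions live in the vstore.inc body, which this diff only references.

/* Hypothetical sketch, not code from this commit: a SPIR-V vstore builtin
 * forwarding to a shared CLC implementation. The __clc_vstore2 name and the
 * FORWARD_VSTORE2 macro are assumptions made for illustration only. */
#include <clc/shared/clc_vstore.h>
#include <libspirv/spirv.h>

#define FORWARD_VSTORE2(TYPE)                                                  \
  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstoren(                             \
      TYPE##2 data, size_t offset, __global TYPE *mem) {                       \
    /* Delegate to the CLC layer instead of open-coding the store. */          \
    __clc_vstore2(data, offset, mem);                                          \
  }

FORWARD_VSTORE2(int)
FORWARD_VSTORE2(float)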

File tree

3 files changed: +139 -352 lines changed


libclc/libspirv/lib/generic/shared/vstore.cl

Lines changed: 4 additions & 313 deletions
@@ -6,320 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <clc/shared/clc_vstore.h>
 #include <libspirv/spirv.h>
 
-#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+#define __CLC_BODY "vstore.inc"
+#include <clc/integer/gentype.inc>
 
-#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \
-  typedef PRIM_TYPE less_aligned_##ADDR_SPACE##PRIM_TYPE \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstore(PRIM_TYPE vec, size_t offset, \
-                                                 ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE *)(&mem[offset])) = \
-        vec; \
-  } \
- \
-  typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstoren( \
-      PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
-           *)(&mem[2 * offset])) = vec; \
-  } \
- \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstoren( \
-      PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \
-           *)(&mem[3 * offset])) = (PRIM_TYPE##2)(vec.s0, vec.s1); \
-    mem[3 * offset + 2] = vec.s2; \
-  } \
- \
-  typedef PRIM_TYPE##4 less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstoren( \
-      PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##4 \
-           *)(&mem[4 * offset])) = vec; \
-  } \
- \
-  typedef PRIM_TYPE##8 less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstoren( \
-      PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##8 \
-           *)(&mem[8 * offset])) = vec; \
-  } \
- \
-  typedef PRIM_TYPE##16 less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
-      __attribute__((aligned(sizeof(PRIM_TYPE)))); \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstoren( \
-      PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \
-    *((ADDR_SPACE less_aligned_##ADDR_SPACE##PRIM_TYPE##16 \
-           *)(&mem[16 * offset])) = vec; \
-  }
-
-#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-#define VSTORE_VECTORIZE_GENERIC VSTORE_VECTORIZE
-#else
-// The generic address space isn't available, so make the macro do nothing
-#define VSTORE_VECTORIZE_GENERIC(X,Y)
-#endif
-
-#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \
-  VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \
-  VSTORE_VECTORIZE_GENERIC(__CLC_SCALAR___CLC_GENTYPE, __generic)
-
-VSTORE_ADDR_SPACES(char)
-VSTORE_ADDR_SPACES(uchar)
-VSTORE_ADDR_SPACES(short)
-VSTORE_ADDR_SPACES(ushort)
-VSTORE_ADDR_SPACES(int)
-VSTORE_ADDR_SPACES(uint)
-VSTORE_ADDR_SPACES(long)
-VSTORE_ADDR_SPACES(ulong)
-VSTORE_ADDR_SPACES(float)
-
-#ifdef cl_khr_fp64
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-VSTORE_ADDR_SPACES(double)
-#endif
-
-#ifdef cl_khr_fp16
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-VSTORE_ADDR_SPACES(half)
-#endif
-
-/* vstore_half are legal even without cl_khr_fp16 */
-#if __clang_major__ < 6
-#define DECLARE_HELPER(STYPE, AS, builtin) \
-  void __clc_vstore_half_##STYPE##_helper##AS(STYPE, AS half *);
-#else
-#define DECLARE_HELPER(STYPE, AS, __builtin) \
-  _CLC_DEF void __clc_vstore_half_##STYPE##_helper##AS(STYPE s, AS half *d) { \
-    __builtin(s, d); \
-  }
-#endif
-
-DECLARE_HELPER(float, __private, __builtin_store_halff);
-DECLARE_HELPER(float, __global, __builtin_store_halff);
-DECLARE_HELPER(float, __local, __builtin_store_halff);
-#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-DECLARE_HELPER(float, __generic, __builtin_store_halff);
-#endif
-
-#ifdef cl_khr_fp64
-DECLARE_HELPER(double, __private, __builtin_store_half);
-DECLARE_HELPER(double, __global, __builtin_store_half);
-DECLARE_HELPER(double, __local, __builtin_store_half);
-#if _CLC_DISTINCT_GENERIC_AS_SUPPORTED
-DECLARE_HELPER(double, __generic, __builtin_store_half);
-#endif
-#endif
-
-#define VEC_STORE1(STYPE, AS, val, ROUNDF) \
-  __clc_vstore_half_##STYPE##_helper##AS(ROUNDF(val), &mem[offset++]);
-
-#define VEC_STORE2(STYPE, AS, val, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.lo, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.hi, ROUNDF)
-#define VEC_STORE3(STYPE, AS, val, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.s0, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.s1, ROUNDF) \
-  VEC_STORE1(STYPE, AS, val.s2, ROUNDF)
-#define VEC_STORE4(STYPE, AS, val, ROUNDF) \
-  VEC_STORE2(STYPE, AS, val.lo, ROUNDF) \
-  VEC_STORE2(STYPE, AS, val.hi, ROUNDF)
-#define VEC_STORE8(STYPE, AS, val, ROUNDF) \
-  VEC_STORE4(STYPE, AS, val.lo, ROUNDF) \
-  VEC_STORE4(STYPE, AS, val.hi, ROUNDF)
-#define VEC_STORE16(STYPE, AS, val, ROUNDF) \
-  VEC_STORE8(STYPE, AS, val.lo, ROUNDF) \
-  VEC_STORE8(STYPE, AS, val.hi, ROUNDF)
-
-#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, VEC_SUFFIX) \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstore_half##VEC_SUFFIX( \
-      TYPE vec, size_t offset, AS half *mem) { \
-    offset *= VEC_SIZE; \
-    VEC_STORE##VEC_SIZE(STYPE, AS, vec, __clc_noop) \
-  } \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstorea_half##VEC_SUFFIX( \
-      TYPE vec, size_t offset, AS half *mem) { \
-    offset *= OFFSET; \
-    VEC_STORE##VEC_SIZE(STYPE, AS, vec, __clc_noop) \
-  }
-
-#define __FUNC_ROUND_CASE(CASE, VEC_SIZE, STYPE, AS, ROUNDF) \
-  case CASE: \
-    VEC_STORE##VEC_SIZE(STYPE, AS, vec, ROUNDF) break;
-
-#define __FUNC_ROUND(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, VEC_SUFFIX) \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstore_half##VEC_SUFFIX##_r( \
-      TYPE vec, size_t offset, AS half *mem, unsigned int round_mode) { \
-    offset *= VEC_SIZE; \
-    switch (round_mode) { \
-    __FUNC_ROUND_CASE(SPV_RTE, VEC_SIZE, STYPE, AS, __clc_rte) \
-    __FUNC_ROUND_CASE(SPV_RTZ, VEC_SIZE, STYPE, AS, __clc_rtz) \
-    __FUNC_ROUND_CASE(SPV_RTP, VEC_SIZE, STYPE, AS, __clc_rtp) \
-    __FUNC_ROUND_CASE(SPV_RTN, VEC_SIZE, STYPE, AS, __clc_rtn) \
-    default: \
-      break; \
-    } \
-  } \
-  _CLC_OVERLOAD _CLC_DEF void __spirv_ocl_vstorea_half##VEC_SUFFIX##_r( \
-      TYPE vec, size_t offset, AS half *mem, unsigned int round_mode) { \
-    offset *= OFFSET; \
-    switch (round_mode) { \
-    __FUNC_ROUND_CASE(SPV_RTE, VEC_SIZE, STYPE, AS, __clc_rte) \
-    __FUNC_ROUND_CASE(SPV_RTZ, VEC_SIZE, STYPE, AS, __clc_rtz) \
-    __FUNC_ROUND_CASE(SPV_RTP, VEC_SIZE, STYPE, AS, __clc_rtp) \
-    __FUNC_ROUND_CASE(SPV_RTN, VEC_SIZE, STYPE, AS, __clc_rtn) \
-    default: \
-      break; \
-    } \
-  }
-
-_CLC_DEF _CLC_OVERLOAD float __clc_noop(float x) { return x; }
-_CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) {
-  /* Remove lower 13 bits to make sure the number is rounded down */
-  int mask = 0xffffe000;
-  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask <<= __spirv_ocl_s_min(-(exp + 14), 10);
-  /* RTZ does not produce Inf for large numbers */
-  if (__spirv_ocl_fabs(x) > 65504.0f && !__spirv_IsInf(x))
-    return __spirv_ocl_copysign(65504.0f, x);
-  /* Handle nan corner case */
-  if (__spirv_IsNan(x))
-    return x;
-  return __clc_as_float(__clc_as_uint(x) & mask);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) {
-  const float inf = __spirv_ocl_copysign(INFINITY, x);
-  /* Set lower 13 bits */
-  int mask = (1 << 13) - 1;
-  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask = (1 << (13 + __spirv_ocl_s_min(-(exp + 14), 10))) - 1;
-  /* Handle nan corner case */
-  if (__spirv_IsNan(x))
-    return x;
-  const float next = __spirv_ocl_nextafter(__clc_as_float(__clc_as_uint(x) | mask), inf);
-  return ((__clc_as_uint(x) & mask) == 0) ? x : next;
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) {
-  return ((__clc_as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) {
-  return ((__clc_as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
-}
-_CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) {
-  /* Mantisa + implicit bit */
-  const uint mantissa = (__clc_as_uint(x) & 0x7fffff) | (1u << 23);
-  const int exp = (__clc_as_uint(x) >> 23 & 0xff) - 127;
-  int shift = 13;
-  if (exp < -14) {
-    /* The default assumes lower 13 bits are rounded,
-     * but it might be more for denormals.
-     * Shifting beyond last == 0b, and qr == 00b is not necessary */
-    shift += __spirv_ocl_s_min(-(exp + 14), 15);
-  }
-  int mask = (1 << shift) - 1;
-  const uint grs = mantissa & mask;
-  const uint last = mantissa & (1 << shift);
-  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
-   * exp > 15 should round to inf. */
-  bool roundup = (grs > (1 << (shift - 1))) ||
-                 (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
-  return roundup ? __clc_rti(x) : __clc_rtz(x);
-}
-
-#ifdef cl_khr_fp64
-_CLC_DEF _CLC_OVERLOAD double __clc_noop(double x) { return x; }
-_CLC_DEF _CLC_OVERLOAD double __clc_rtz(double x) {
-  /* Remove lower 42 bits to make sure the number is rounded down */
-  ulong mask = 0xfffffc0000000000UL;
-  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask <<= __spirv_ocl_s_min(-(exp + 14), 10);
-  /* RTZ does not produce Inf for large numbers */
-  if (__spirv_ocl_fabs(x) > 65504.0 && !__spirv_IsInf(x))
-    return __spirv_ocl_copysign(65504.0, x);
-  /* Handle nan corner case */
-  if (__spirv_IsNan(x))
-    return x;
-  return __clc_as_double(__clc_as_ulong(x) & mask);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rti(double x) {
-  const double inf = __spirv_ocl_copysign((double)INFINITY, x);
-  /* Set lower 42 bits */
-  long mask = (1UL << 42UL) - 1UL;
-  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
-  /* Denormals cannot be flushed, and they use different bit for rounding */
-  if (exp < -14)
-    mask = (1UL << (42UL + __spirv_ocl_s_min(-(exp + 14), 10))) - 1;
-  /* Handle nan corner case */
-  if (__spirv_IsNan(x))
-    return x;
-  const double next = __spirv_ocl_nextafter(__clc_as_double(__clc_as_ulong(x) | mask), inf);
-  return ((__clc_as_ulong(x) & mask) == 0) ? x : next;
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rtn(double x) {
-  return ((__clc_as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rtz(x)
-                                                           : __clc_rti(x);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rtp(double x) {
-  return ((__clc_as_ulong(x) & 0x8000000000000000UL) == 0) ? __clc_rti(x)
-                                                           : __clc_rtz(x);
-}
-_CLC_DEF _CLC_OVERLOAD double __clc_rte(double x) {
-  /* Mantisa + implicit bit */
-  const ulong mantissa = (__clc_as_ulong(x) & 0xfffffffffffff) | (1UL << 52);
-  const int exp = (__clc_as_ulong(x) >> 52 & 0x7ff) - 1023;
-  int shift = 42;
-  if (exp < -14) {
-    /* The default assumes lower 13 bits are rounded,
-     * but it might be more for denormals.
-     * Shifting beyond last == 0b, and qr == 00b is not necessary */
-    shift += __spirv_ocl_s_min(-(exp + 14), 15);
-  }
-  ulong mask = (1UL << shift) - 1UL;
-  const ulong grs = mantissa & mask;
-  const ulong last = mantissa & (1UL << shift);
-  /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
-   * exp > 15 should round to inf. */
-  bool roundup = (grs > (1UL << (shift - 1UL))) ||
-                 (grs == (1UL << (shift - 1UL)) && last != 0) || (exp > 15);
-  return roundup ? __clc_rti(x) : __clc_rtz(x);
-}
-#endif
-
-#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, VEC_SUFFIX) \
-  __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, VEC_SUFFIX) \
-  __FUNC_ROUND(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, VEC_SUFFIX)
-
-#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, VEC_SUFFIX) \
-  __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, STYPE, AS, VEC_SUFFIX)
-
-#define FUNC_SCALAR(VEC_SIZE, OFFSET, TYPE, STYPE, AS) \
-  __XFUNC(, VEC_SIZE, OFFSET, TYPE, STYPE, AS, )
-
-#define __CLC_BODY "vstore_half.inc"
+#define __CLC_BODY "vstore.inc"
 #include <clc/math/gentype.inc>
-#undef __CLC_BODY
-#undef FUNC
-#undef __XFUNC
-#undef __FUNC
-#undef VEC_LOAD16
-#undef VEC_LOAD8
-#undef VEC_LOAD4
-#undef VEC_LOAD3
-#undef VEC_LOAD2
-#undef VEC_LOAD1
-#undef DECLARE_HELPER
-#undef VSTORE_ADDR_SPACES
-#undef VSTORE_VECTORIZE
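
On the "inverted comparisons and branch targets" the commit message mentions: the removed rounding helpers above call __spirv_IsNan and __spirv_IsInf, while the shared CLC helpers use their own relational functions, and two equivalent formulations of such a predicate routinely lower to opposite comparison polarity. The following is a hypothetical illustration only; neither function is taken from this commit or from the CLC sources.

/* Hypothetical illustration: two equivalent ways an isnan-style helper can be
 * written. They agree on every input, but typically compile to inverted
 * comparisons/branches, which is the kind of harmless difference in the
 * generated builtins that the commit message describes. */
_CLC_DEF int isnan_by_self_compare(float x) {
  return x != x; /* only NaN compares unequal to itself */
}

_CLC_DEF int isnan_by_bit_test(float x) {
  /* NaN: exponent bits all set and a non-zero mantissa */
  uint bits = __clc_as_uint(x) & 0x7fffffffu;
  return bits > 0x7f800000u;
}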
