Skip to content

Commit 18ed668

Browse files
author
Alexander Khokhlov
committed
Added normalization, reduction and atomic premitieves
1 parent 688c66a commit 18ed668

File tree

3 files changed

+346
-87
lines changed

3 files changed

+346
-87
lines changed

CLW/CL/CLW.cl

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,21 @@ THE SOFTWARE.
2323
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
2424
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
2525

26+
// --------------------- CONSTANTS ------------------------
27+
// add neutral elements
28+
__constant int neutral_add_int = 0;
29+
__constant float neutral_add_float = 0;
30+
__constant float3 neutral_add_float3 = (float3)(0.0, 0.0, 0.0);
31+
// max neutral elements
32+
__constant int neutral_max_int = INT_MIN;
33+
__constant float neutral_max_float = FLT_MIN;
34+
__constant float3 neutral_max_float3 = (float3)(FLT_MIN, FLT_MIN, FLT_MIN);
35+
// min neutral elements
36+
__constant int neutral_min_int = INT_MAX;
37+
__constant float neutral_min_float = FLT_MAX;
38+
__constant float3 neutral_min_float3 = (float3)(FLT_MAX, FLT_MAX, FLT_MAX);
39+
40+
__constant float epsilon = .00001f;
2641

2742
// --------------------- HELPERS ------------------------
2843
//#define INT_MAX 0x7FFFFFFF
@@ -1492,3 +1507,196 @@ __kernel void segmented_distribute_part_sum_int_nocut(
14921507
}
14931508
}
14941509
}
1510+
1511+
// --------------------- ATOMIC OPERTIONS ------------------------
1512+
1513+
#define DEFINE_ATOMIC(operation)\
1514+
inline void atomic_##operation##_float(volatile __global float* addr, float value)\
1515+
{\
1516+
union{\
1517+
unsigned int u32;\
1518+
float f32;\
1519+
} next, expected, current;\
1520+
current.f32 = *addr;\
1521+
do{\
1522+
expected.f32 = current.f32;\
1523+
next.f32 = operation(expected.f32, value);\
1524+
current.u32 = atomic_cmpxchg((volatile __global unsigned int *)addr,\
1525+
expected.u32, next.u32);\
1526+
} while (current.u32 != expected.u32);\
1527+
}
1528+
1529+
#define DEFINE_ATOMIC_FLOAT3(operation)\
1530+
inline void atomic_##operation##_float3(volatile __global float3* addr, float3 value)\
1531+
{\
1532+
volatile __global float* p = (volatile __global float*)addr;\
1533+
atomic_##operation##_float(p, value.x);\
1534+
atomic_##operation##_float(p + 1, value.y);\
1535+
atomic_##operation##_float(p + 2, value.z);\
1536+
}
1537+
1538+
inline void atomic_max_int(volatile __global int* addr, int value)
1539+
{
1540+
atomic_max(addr, value);
1541+
}
1542+
1543+
inline void atomic_min_int(volatile __global int* addr, int value)
1544+
{
1545+
atomic_min(addr, value);
1546+
}
1547+
1548+
// --------------------- HELPERS ------------------------
1549+
1550+
#define DEFINE_ASSIGN_OPERATOR(type)\
1551+
inline void assign_##type(__local type* addr, type value)\
1552+
{\
1553+
*addr = value;\
1554+
}
1555+
1556+
inline void assign_float3(__local float3* addr, float3 value)
1557+
{
1558+
(*addr).xyz = value.xyz;
1559+
}
1560+
1561+
inline int divide_int(int dividend, int divider)
1562+
{
1563+
return dividend / (divider != 0 ? divider : 1);
1564+
}
1565+
1566+
inline float divide_float(float dividend, float divider)
1567+
{
1568+
return dividend / (fabs(divider) > epsilon ? divider : 1.f);
1569+
}
1570+
1571+
inline float3 divide_float3(float3 dividend, float3 divider)
1572+
{
1573+
return (float3)(divide_float(dividend.x, divider.x),
1574+
divide_float(dividend.y, divider.y),
1575+
divide_float(dividend.z, divider.z));
1576+
}
1577+
1578+
// --------------------- REDUCTION ------------------------
1579+
1580+
#define DEFINE_REDUCTION(bin_op, type)\
1581+
__kernel void reduction_##bin_op##_##type(__global type* buffer,\
1582+
int buf_count,\
1583+
__local type* shared_mem,\
1584+
__global type* out)\
1585+
{\
1586+
int global_id = get_global_id(0);\
1587+
int group_id = get_group_id(0);\
1588+
int local_id = get_local_id(0);\
1589+
int group_size = get_local_size(0);\
1590+
\
1591+
if (global_id < buf_count)\
1592+
{\
1593+
assign_##type(shared_mem + local_id, buffer[global_id]);\
1594+
}\
1595+
else\
1596+
{\
1597+
assign_##type(shared_mem + local_id, neutral_##bin_op##_##type);\
1598+
}\
1599+
\
1600+
if (global_id == 0)\
1601+
{\
1602+
*out = neutral_##bin_op##_##type;\
1603+
}\
1604+
\
1605+
barrier(CLK_LOCAL_MEM_FENCE);\
1606+
for (int i = group_size / 2; i > 0; i >>= 1)\
1607+
{\
1608+
if (local_id < i)\
1609+
{\
1610+
assign_##type(shared_mem + local_id,\
1611+
bin_op(shared_mem[local_id], shared_mem[local_id + i]));\
1612+
}\
1613+
barrier(CLK_LOCAL_MEM_FENCE);\
1614+
}\
1615+
\
1616+
if (local_id == 0)\
1617+
{\
1618+
atomic_##bin_op##_##type(out, shared_mem[0]);\
1619+
}\
1620+
}
1621+
1622+
// --------------------- NORMALIZATION ------------------------
1623+
1624+
#define DEFINE_BUFFER_NORMALIZATION(type)\
1625+
__kernel void buffer_normalization_##type(__global type* input,\
1626+
__global type* output,\
1627+
int buffer_count,\
1628+
__local type* shared_mem,\
1629+
__global type* auxiliary_buf)\
1630+
{\
1631+
int global_id = get_global_id(0);\
1632+
int group_id = get_group_id(0);\
1633+
int local_id = get_local_id(0);\
1634+
int group_size = get_local_size(0);\
1635+
\
1636+
__local type* min_buffer = shared_mem;\
1637+
__local type* max_buffer = shared_mem + group_size;\
1638+
\
1639+
if (global_id < buffer_count)\
1640+
{\
1641+
min_buffer[local_id] = input[global_id];\
1642+
max_buffer[local_id] = input[global_id];\
1643+
}\
1644+
else\
1645+
{\
1646+
min_buffer[local_id] = neutral_min_##type;\
1647+
max_buffer[local_id] = neutral_max_##type;\
1648+
}\
1649+
\
1650+
if (global_id == 0)\
1651+
{\
1652+
auxiliary_buf[0] = neutral_min_##type;\
1653+
auxiliary_buf[1] = neutral_max_##type;\
1654+
}\
1655+
\
1656+
barrier(CLK_LOCAL_MEM_FENCE);\
1657+
\
1658+
for (int i = group_size / 2; i > 0; i >>= 1)\
1659+
{\
1660+
if (local_id < i)\
1661+
{\
1662+
assign_##type(min_buffer + local_id,\
1663+
min(min_buffer[local_id], min_buffer[local_id + i]));\
1664+
assign_##type(max_buffer + local_id,\
1665+
max(max_buffer[local_id], max_buffer[local_id + i]));\
1666+
}\
1667+
barrier(CLK_LOCAL_MEM_FENCE);\
1668+
}\
1669+
\
1670+
if (local_id == 0)\
1671+
{\
1672+
atomic_min_##type(auxiliary_buf, min_buffer[0]);\
1673+
atomic_max_##type(auxiliary_buf + 1, max_buffer[0]);\
1674+
}\
1675+
\
1676+
barrier(CLK_LOCAL_MEM_FENCE);\
1677+
type diff = auxiliary_buf[1] - auxiliary_buf[0];\
1678+
if (global_id < buffer_count)\
1679+
{\
1680+
output[global_id] = divide_##type(input[global_id], diff);\
1681+
}\
1682+
}
1683+
1684+
// Do not change the order
1685+
DEFINE_ATOMIC(min)
1686+
DEFINE_ATOMIC(max)
1687+
DEFINE_ATOMIC_FLOAT3(min)
1688+
DEFINE_ATOMIC_FLOAT3(max)
1689+
1690+
DEFINE_ASSIGN_OPERATOR(int)
1691+
DEFINE_ASSIGN_OPERATOR(float)
1692+
1693+
DEFINE_REDUCTION(min, int)
1694+
DEFINE_REDUCTION(min, float)
1695+
DEFINE_REDUCTION(min, float3)
1696+
DEFINE_REDUCTION(max, int)
1697+
DEFINE_REDUCTION(max, float)
1698+
DEFINE_REDUCTION(max, float3)
1699+
1700+
DEFINE_BUFFER_NORMALIZATION(int)
1701+
DEFINE_BUFFER_NORMALIZATION(float)
1702+
DEFINE_BUFFER_NORMALIZATION(float3)

0 commit comments

Comments
 (0)