Skip to content

Commit 0144348

Browse files
author
Alexander Khokhlov
committed
Fixed primitives
1 parent 18ed668 commit 0144348

File tree

3 files changed

+170
-126
lines changed

3 files changed

+170
-126
lines changed

CLW/CL/CLW.cl

Lines changed: 10 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -1545,41 +1545,11 @@ inline void atomic_min_int(volatile __global int* addr, int value)
15451545
atomic_min(addr, value);
15461546
}
15471547

1548-
// --------------------- HELPERS ------------------------
1549-
1550-
#define DEFINE_ASSIGN_OPERATOR(type)\
1551-
inline void assign_##type(__local type* addr, type value)\
1552-
{\
1553-
*addr = value;\
1554-
}
1555-
1556-
inline void assign_float3(__local float3* addr, float3 value)
1557-
{
1558-
(*addr).xyz = value.xyz;
1559-
}
1560-
1561-
inline int divide_int(int dividend, int divider)
1562-
{
1563-
return dividend / (divider != 0 ? divider : 1);
1564-
}
1565-
1566-
inline float divide_float(float dividend, float divider)
1567-
{
1568-
return dividend / (fabs(divider) > epsilon ? divider : 1.f);
1569-
}
1570-
1571-
inline float3 divide_float3(float3 dividend, float3 divider)
1572-
{
1573-
return (float3)(divide_float(dividend.x, divider.x),
1574-
divide_float(dividend.y, divider.y),
1575-
divide_float(dividend.z, divider.z));
1576-
}
1577-
15781548
// --------------------- REDUCTION ------------------------
15791549

15801550
#define DEFINE_REDUCTION(bin_op, type)\
15811551
__kernel void reduction_##bin_op##_##type(__global type* buffer,\
1582-
int buf_count,\
1552+
int count,\
15831553
__local type* shared_mem,\
15841554
__global type* out)\
15851555
{\
@@ -1588,27 +1558,21 @@ __kernel void reduction_##bin_op##_##type(__global type* buffer,\
15881558
int local_id = get_local_id(0);\
15891559
int group_size = get_local_size(0);\
15901560
\
1591-
if (global_id < buf_count)\
1561+
if (global_id < count)\
15921562
{\
1593-
assign_##type(shared_mem + local_id, buffer[global_id]);\
1563+
*(shared_mem + local_id) = buffer[global_id];\
15941564
}\
15951565
else\
15961566
{\
1597-
assign_##type(shared_mem + local_id, neutral_##bin_op##_##type);\
1598-
}\
1599-
\
1600-
if (global_id == 0)\
1601-
{\
1602-
*out = neutral_##bin_op##_##type;\
1567+
*(shared_mem + local_id) = neutral_##bin_op##_##type;\
16031568
}\
16041569
\
16051570
barrier(CLK_LOCAL_MEM_FENCE);\
16061571
for (int i = group_size / 2; i > 0; i >>= 1)\
16071572
{\
16081573
if (local_id < i)\
16091574
{\
1610-
assign_##type(shared_mem + local_id,\
1611-
bin_op(shared_mem[local_id], shared_mem[local_id + i]));\
1575+
*(shared_mem + local_id) = bin_op(shared_mem[local_id], shared_mem[local_id + i]);\
16121576
}\
16131577
barrier(CLK_LOCAL_MEM_FENCE);\
16141578
}\
@@ -1624,60 +1588,14 @@ __kernel void reduction_##bin_op##_##type(__global type* buffer,\
16241588
#define DEFINE_BUFFER_NORMALIZATION(type)\
16251589
__kernel void buffer_normalization_##type(__global type* input,\
16261590
__global type* output,\
1627-
int buffer_count,\
1628-
__local type* shared_mem,\
1629-
__global type* auxiliary_buf)\
1591+
int count,\
1592+
type max,\
1593+
type min)\
16301594
{\
16311595
int global_id = get_global_id(0);\
1632-
int group_id = get_group_id(0);\
1633-
int local_id = get_local_id(0);\
1634-
int group_size = get_local_size(0);\
1635-
\
1636-
__local type* min_buffer = shared_mem;\
1637-
__local type* max_buffer = shared_mem + group_size;\
1638-
\
1639-
if (global_id < buffer_count)\
1596+
if (global_id < count)\
16401597
{\
1641-
min_buffer[local_id] = input[global_id];\
1642-
max_buffer[local_id] = input[global_id];\
1643-
}\
1644-
else\
1645-
{\
1646-
min_buffer[local_id] = neutral_min_##type;\
1647-
max_buffer[local_id] = neutral_max_##type;\
1648-
}\
1649-
\
1650-
if (global_id == 0)\
1651-
{\
1652-
auxiliary_buf[0] = neutral_min_##type;\
1653-
auxiliary_buf[1] = neutral_max_##type;\
1654-
}\
1655-
\
1656-
barrier(CLK_LOCAL_MEM_FENCE);\
1657-
\
1658-
for (int i = group_size / 2; i > 0; i >>= 1)\
1659-
{\
1660-
if (local_id < i)\
1661-
{\
1662-
assign_##type(min_buffer + local_id,\
1663-
min(min_buffer[local_id], min_buffer[local_id + i]));\
1664-
assign_##type(max_buffer + local_id,\
1665-
max(max_buffer[local_id], max_buffer[local_id + i]));\
1666-
}\
1667-
barrier(CLK_LOCAL_MEM_FENCE);\
1668-
}\
1669-
\
1670-
if (local_id == 0)\
1671-
{\
1672-
atomic_min_##type(auxiliary_buf, min_buffer[0]);\
1673-
atomic_max_##type(auxiliary_buf + 1, max_buffer[0]);\
1674-
}\
1675-
\
1676-
barrier(CLK_LOCAL_MEM_FENCE);\
1677-
type diff = auxiliary_buf[1] - auxiliary_buf[0];\
1678-
if (global_id < buffer_count)\
1679-
{\
1680-
output[global_id] = divide_##type(input[global_id], diff);\
1598+
output[global_id] = input[global_id] / (max - min);\
16811599
}\
16821600
}
16831601

@@ -1687,9 +1605,6 @@ DEFINE_ATOMIC(max)
16871605
DEFINE_ATOMIC_FLOAT3(min)
16881606
DEFINE_ATOMIC_FLOAT3(max)
16891607

1690-
DEFINE_ASSIGN_OPERATOR(int)
1691-
DEFINE_ASSIGN_OPERATOR(float)
1692-
16931608
DEFINE_REDUCTION(min, int)
16941609
DEFINE_REDUCTION(min, float)
16951610
DEFINE_REDUCTION(min, float3)

CLW/CLWParallelPrimitives.cpp

Lines changed: 144 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ THE SOFTWARE.
3939
#define NUM_SEG_SCAN_ELEMS_PER_WI 1
4040
#define NUM_SCAN_ELEMS_PER_WG (WG_SIZE * NUM_SCAN_ELEMS_PER_WI)
4141
#define NUM_SEG_SCAN_ELEMS_PER_WG (WG_SIZE * NUM_SEG_SCAN_ELEMS_PER_WI)
42-
#define NORMALIZATION_CACHE (2)
4342

4443
CLWParallelPrimitives::CLWParallelPrimitives(CLWContext context, char const* buildopts)
4544
: context_(context)
@@ -921,65 +920,181 @@ void CLWParallelPrimitives::ReclaimTempBuffer(std::map<size_t, CLWBuffer<T> > co
921920
collection[buffer.GetElementCount()] = buffer;
922921
}
923922

923+
CLWEvent CLWParallelPrimitives::Copy(unsigned int deviceIdx, CLWBuffer<cl_int> input, CLWBuffer<cl_int> output, int numElems)
924+
{
925+
int ELEMS_PER_WI = 4;
926+
int GROUP_BLOCK_SIZE = (WG_SIZE * ELEMS_PER_WI);
927+
int NUM_BLOCKS = (numElems + GROUP_BLOCK_SIZE - 1) / GROUP_BLOCK_SIZE;
928+
929+
CLWKernel copyKernel = program_.GetKernel("copy");
930+
931+
copyKernel.SetArg(0, input);
932+
copyKernel.SetArg(1, numElems);
933+
copyKernel.SetArg(2, output);
934+
935+
return context_.Launch1D(0, NUM_BLOCKS * WG_SIZE, WG_SIZE, copyKernel);
936+
}
937+
938+
const float epsilon = 0.001f;
939+
924940
template <class T>
925-
CLWEvent CLWParallelPrimitives::Normalize(const char* kernelName, unsigned int deviceIdx, CLWBuffer<T> input, CLWBuffer<T> output, int numElems, int groupSize, CLWBuffer<T> cache)
941+
T CLWParallelPrimitives::GetMaxNum()
942+
{
943+
return std::numeric_limits<T>::max();
944+
}
945+
946+
947+
template <>
948+
cl_float3 CLWParallelPrimitives::GetMaxNum<cl_float3>()
949+
{
950+
cl_float3 val;
951+
952+
val.s[0] = std::numeric_limits<float>::max();
953+
val.s[1] = std::numeric_limits<float>::max();
954+
val.s[2] = std::numeric_limits<float>::max();
955+
956+
return val;
957+
}
958+
959+
template <class T>
960+
T CLWParallelPrimitives::GetMinNum()
961+
{
962+
return std::numeric_limits<T>::min();
963+
}
964+
965+
template <>
966+
cl_float3 CLWParallelPrimitives::GetMinNum<cl_float3>()
967+
{
968+
cl_float3 val;
969+
970+
val.s[0] = std::numeric_limits<float>::min();
971+
val.s[1] = std::numeric_limits<float>::min();
972+
val.s[2] = std::numeric_limits<float>::min();
973+
974+
return val;
975+
}
976+
977+
template <class T>
978+
CLWEvent CLWParallelPrimitives::Reduction(const char* kernelName,
979+
unsigned int deviceIdx,
980+
CLWBuffer<T> input,
981+
int numElems,
982+
CLWBuffer<T> out)
926983
{
927-
assert(groupSize);
928984
assert(input.GetElementCount() >= numElems);
929-
assert(output.GetElementCount() >= numElems);
930985

931-
int ELEMS_PER_WI = 4;
932-
int GROUP_BLOCK_SIZE = (groupSize * ELEMS_PER_WI);
933-
int NUM_BLOCKS = (numElems + GROUP_BLOCK_SIZE - 1) / GROUP_BLOCK_SIZE;
986+
int NUM_BLOCKS = (int)((numElems + WG_SIZE - 1) / WG_SIZE);
934987

935-
CLWKernel normalizeKernel = program_.GetKernel(kernelName);
988+
CLWKernel reductionKernel = program_.GetKernel(kernelName);
936989

937990
int argc = 0;
938991

939-
normalizeKernel.SetArg(argc++, input);
940-
normalizeKernel.SetArg(argc++, output);
941-
normalizeKernel.SetArg(argc++, numElems);
942-
normalizeKernel.SetArg(argc++, SharedMemory(2 * sizeof(T) * groupSize));
943-
normalizeKernel.SetArg(argc++, cache);
992+
reductionKernel.SetArg(argc++, input);
993+
reductionKernel.SetArg(argc++, numElems);
994+
reductionKernel.SetArg(argc++, SharedMemory(sizeof(T) * WG_SIZE));
995+
reductionKernel.SetArg(argc++, out);
944996

945-
return context_.Launch1D(deviceIdx, NUM_BLOCKS * groupSize, groupSize, normalizeKernel);
997+
return context_.Launch1D(deviceIdx, NUM_BLOCKS * WG_SIZE, WG_SIZE, reductionKernel);
946998
}
947999

948-
CLWEvent CLWParallelPrimitives::Copy(unsigned int deviceIdx, CLWBuffer<cl_int> input, CLWBuffer<cl_int> output, int numElems)
1000+
template <class T>
1001+
CLWEvent CLWParallelPrimitives::Normalize(const char* normalizeKernelName,
1002+
const char* minReductionKernelName,
1003+
const char* maxReductionKernelName,
1004+
unsigned int deviceIdx,
1005+
CLWBuffer<T> input,
1006+
CLWBuffer<T> output,
1007+
int numElems,
1008+
CLWBuffer<T> cache)
9491009
{
950-
int ELEMS_PER_WI = 4;
951-
int GROUP_BLOCK_SIZE = (WG_SIZE * ELEMS_PER_WI);
952-
int NUM_BLOCKS = (numElems + GROUP_BLOCK_SIZE - 1) / GROUP_BLOCK_SIZE;
1010+
assert(input.GetElementCount() >= numElems);
1011+
assert(output.GetElementCount() >= numElems);
9531012

954-
CLWKernel copyKernel = program_.GetKernel("copy");
1013+
int NUM_BLOCKS = (int)((numElems + WG_SIZE - 1) / WG_SIZE);
9551014

956-
copyKernel.SetArg(0, input);
957-
copyKernel.SetArg(1, numElems);
958-
copyKernel.SetArg(2, output);
1015+
T min = GetMaxNum<T>();
1016+
T max = GetMinNum<T>();
9591017

960-
return context_.Launch1D(0, NUM_BLOCKS * WG_SIZE, WG_SIZE, copyKernel);
1018+
context_.WriteBuffer<T>(deviceIdx, cache, &min, 1);
1019+
1020+
Reduction(minReductionKernelName,
1021+
0,
1022+
input,
1023+
numElems,
1024+
cache).Wait();
1025+
1026+
context_.ReadBuffer<T>(deviceIdx, cache, &min, 1).Wait();
1027+
context_.WriteBuffer<T>(deviceIdx, cache, &max, 1).Wait();
1028+
1029+
Reduction(maxReductionKernelName,
1030+
0,
1031+
input,
1032+
numElems,
1033+
cache).Wait();
1034+
1035+
context_.ReadBuffer<T>(deviceIdx, cache, &max, 1).Wait();
1036+
1037+
// launch normalization kernel
1038+
CLWKernel normalizeKernel = program_.GetKernel(normalizeKernelName);
1039+
1040+
int argc = 0;
1041+
1042+
normalizeKernel.SetArg(argc++, input);
1043+
normalizeKernel.SetArg(argc++, output);
1044+
normalizeKernel.SetArg(argc++, numElems);
1045+
normalizeKernel.SetArg(argc++, max);
1046+
normalizeKernel.SetArg(argc++, min);
1047+
1048+
return context_.Launch1D(deviceIdx, NUM_BLOCKS * WG_SIZE, WG_SIZE, normalizeKernel);
9611049
}
9621050

9631051
CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_int> input, CLWBuffer<cl_int> output, int numElems)
9641052
{
965-
CLWBuffer<cl_int> cache = GetTempIntBuffer(NORMALIZATION_CACHE);
966-
CLWEvent event = Normalize<cl_int>("buffer_normalization_int", deviceIdx, input, output, numElems, WG_SIZE, cache);
1053+
CLWBuffer<cl_int> cache = GetTempIntBuffer(1);
1054+
1055+
CLWEvent event = Normalize("buffer_normalization_int",
1056+
"reduction_min_int",
1057+
"reduction_max_int",
1058+
deviceIdx,
1059+
input,
1060+
output,
1061+
numElems,
1062+
cache);
1063+
9671064
ReclaimTempIntBuffer(cache);
9681065
return event;
9691066
}
9701067

9711068
CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_float> input, CLWBuffer<cl_float> output, int numElems)
9721069
{
973-
CLWBuffer<cl_float> cache = GetTempFloatBuffer(NORMALIZATION_CACHE);
974-
CLWEvent event = Normalize<cl_float>("buffer_normalization_float", deviceIdx, input, output, numElems, WG_SIZE, cache);
1070+
CLWBuffer<cl_float> cache = GetTempFloatBuffer(1);
1071+
1072+
CLWEvent event = Normalize("buffer_normalization_float",
1073+
"reduction_min_float",
1074+
"reduction_max_float",
1075+
deviceIdx,
1076+
input,
1077+
output,
1078+
numElems,
1079+
cache);
1080+
9751081
ReclaimTempFloatBuffer(cache);
9761082
return event;
9771083
}
9781084

9791085
CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_float3> input, CLWBuffer<cl_float3> output, int numElems)
9801086
{
981-
CLWBuffer<cl_float3> cache = GetTempBuffer<cl_float3>(float3_BufferCache_, NORMALIZATION_CACHE);
982-
CLWEvent event = Normalize<cl_float3>("buffer_normalization_float3", deviceIdx, input, output, numElems, WG_SIZE, cache);
1087+
CLWBuffer<cl_float3> cache = GetTempBuffer<cl_float3>(float3_BufferCache_, 1);
1088+
1089+
CLWEvent event = Normalize("buffer_normalization_float3",
1090+
"reduction_min_float3",
1091+
"reduction_max_float3",
1092+
deviceIdx,
1093+
input,
1094+
output,
1095+
numElems,
1096+
cache);
1097+
9831098
ReclaimTempBuffer(float3_BufferCache_, cache);
9841099
return event;
9851100
}

CLW/CLWParallelPrimitives.h

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,12 +91,26 @@ class CLWParallelPrimitives
9191
void ReclaimTempBuffer(std::map<size_t, CLWBuffer<T>> collection, CLWBuffer<T> buffer);
9292

9393
template <class T>
94-
CLWEvent Normalize(const char* kernelName,
94+
CLWEvent Reduction(const char* kernelName,
95+
unsigned int deviceIdx,
96+
CLWBuffer<T> input,
97+
int numElems,
98+
CLWBuffer<T> out);
99+
100+
template <class T>
101+
T GetMaxNum();
102+
103+
template <class T>
104+
T GetMinNum();
105+
106+
template <class T>
107+
CLWEvent Normalize(const char* normalizeKernelName,
108+
const char* minReductionKernelName,
109+
const char* maxReductionKernelName,
95110
unsigned int deviceIdx,
96111
CLWBuffer<T> input,
97112
CLWBuffer<T> output,
98113
int numElems,
99-
int groupSize,
100114
CLWBuffer<T> cache);
101115

102116
CLWContext context_;

0 commit comments

Comments
 (0)