Skip to content

Commit 75a6990

Browse files
author
Alexander Khokhlov
committed
More code review coments fixed
1 parent dad044d commit 75a6990

File tree

3 files changed

+65
-63
lines changed

3 files changed

+65
-63
lines changed

CLW/CL/CLW.cl

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1511,14 +1511,15 @@ __kernel void segmented_distribute_part_sum_int_nocut(
15111511
// --------------------- ATOMIC OPERTIONS ------------------------
15121512

15131513
#define DEFINE_ATOMIC(operation)\
1514-
inline void atomic_##operation##_float(volatile __global float* addr, float value)\
1514+
__attribute__((always_inline)) void atomic_##operation##_float(volatile __global float* addr, float value)\
15151515
{\
15161516
union{\
15171517
unsigned int u32;\
15181518
float f32;\
15191519
} next, expected, current;\
15201520
current.f32 = *addr;\
1521-
do{\
1521+
do\
1522+
{\
15221523
expected.f32 = current.f32;\
15231524
next.f32 = operation(expected.f32, value);\
15241525
current.u32 = atomic_cmpxchg((volatile __global unsigned int *)addr,\
@@ -1527,20 +1528,20 @@ __kernel void segmented_distribute_part_sum_int_nocut(
15271528
}
15281529

15291530
#define DEFINE_ATOMIC_FLOAT3(operation)\
1530-
inline void atomic_##operation##_float3(volatile __global float3* addr, float3 value)\
1531+
__attribute__((always_inline)) void atomic_##operation##_float3(volatile __global float3* addr, float3 value)\
15311532
{\
15321533
volatile __global float* p = (volatile __global float*)addr;\
15331534
atomic_##operation##_float(p, value.x);\
15341535
atomic_##operation##_float(p + 1, value.y);\
15351536
atomic_##operation##_float(p + 2, value.z);\
15361537
}
15371538

1538-
inline void atomic_max_int(volatile __global int* addr, int value)
1539+
__attribute__((always_inline)) void atomic_max_int(volatile __global int* addr, int value)
15391540
{
15401541
atomic_max(addr, value);
15411542
}
15421543

1543-
inline void atomic_min_int(volatile __global int* addr, int value)
1544+
__attribute__((always_inline)) void atomic_min_int(volatile __global int* addr, int value)
15441545
{
15451546
atomic_min(addr, value);
15461547
}
@@ -1551,7 +1552,8 @@ inline void atomic_min_int(volatile __global int* addr, int value)
15511552
__kernel void reduction_##bin_op##_##type(__global type* buffer,\
15521553
int count,\
15531554
__local type* shared_mem,\
1554-
__global type* out)\
1555+
__global type* out,\
1556+
int /* in elements */ out_offset)\
15551557
{\
15561558
int global_id = get_global_id(0);\
15571559
int group_id = get_group_id(0);\
@@ -1569,7 +1571,7 @@ __kernel void reduction_##bin_op##_##type(__global type* buffer,\
15691571
barrier(CLK_LOCAL_MEM_FENCE);\
15701572
}\
15711573
if (local_id == 0)\
1572-
atomic_##bin_op##_##type(out, shared_mem[0]);\
1574+
atomic_##bin_op##_##type(out + out_offset, shared_mem[0]);\
15731575
}
15741576

15751577
// --------------------- NORMALIZATION ------------------------
@@ -1578,12 +1580,12 @@ __kernel void reduction_##bin_op##_##type(__global type* buffer,\
15781580
__kernel void buffer_normalization_##type(__global type* input,\
15791581
__global type* output,\
15801582
int count,\
1581-
type max,\
1582-
type min)\
1583+
__global type* storage)\
15831584
{\
1585+
type norm_coef = storage[0] - storage[1];\
15841586
int global_id = get_global_id(0);\
15851587
if (global_id < count)\
1586-
output[global_id] = input[global_id] / (max - min);\
1588+
output[global_id] = input[global_id] / norm_coef;\
15871589
}
15881590

15891591
// Do not change the order

CLW/CLWParallelPrimitives.cpp

Lines changed: 51 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -976,10 +976,11 @@ cl_float3 CLWParallelPrimitives::GetMinNum<cl_float3>()
976976

977977
template <class T>
978978
CLWEvent CLWParallelPrimitives::Reduction(const char* kernelName,
979-
unsigned int deviceIdx,
980-
CLWBuffer<T> input,
981-
int numElems,
982-
CLWBuffer<T> out)
979+
unsigned int deviceIdx,
980+
CLWBuffer<T> input,
981+
int numElems,
982+
CLWBuffer<T> out,
983+
int out_offset)
983984
{
984985
assert(input.GetElementCount() >= numElems);
985986

@@ -993,6 +994,7 @@ CLWEvent CLWParallelPrimitives::Reduction(const char* kernelName,
993994
reductionKernel.SetArg(argc++, numElems);
994995
reductionKernel.SetArg(argc++, SharedMemory(sizeof(T) * WG_SIZE));
995996
reductionKernel.SetArg(argc++, out);
997+
reductionKernel.SetArg(argc++, out_offset);
996998

997999
return context_.Launch1D(deviceIdx, NUM_BLOCKS * WG_SIZE, WG_SIZE, reductionKernel);
9981000
}
@@ -1015,24 +1017,22 @@ CLWEvent CLWParallelPrimitives::Normalize(const char* normalizeKernelName,
10151017
T min = GetMaxNum<T>();
10161018
T max = GetMinNum<T>();
10171019

1018-
context_.WriteBuffer<T>(deviceIdx, cache, &min, 1);
1019-
1020-
Reduction(minReductionKernelName,
1021-
0,
1022-
input,
1023-
numElems,
1024-
cache).Wait();
1020+
context_.WriteBuffer<T>(deviceIdx, cache, &max, 1);
1021+
context_.WriteBuffer<T>(deviceIdx, cache, &min, 1, 1);
10251022

1026-
context_.ReadBuffer<T>(deviceIdx, cache, &min, 1).Wait();
1027-
context_.WriteBuffer<T>(deviceIdx, cache, &max, 1).Wait();
1023+
Reduction(minReductionKernelName,
1024+
0,
1025+
input,
1026+
numElems,
1027+
cache,
1028+
1);
10281029

10291030
Reduction(maxReductionKernelName,
10301031
0,
10311032
input,
10321033
numElems,
1033-
cache).Wait();
1034-
1035-
context_.ReadBuffer<T>(deviceIdx, cache, &max, 1).Wait();
1034+
cache,
1035+
0);
10361036

10371037
// launch normalization kernel
10381038
CLWKernel normalizeKernel = program_.GetKernel(normalizeKernelName);
@@ -1042,49 +1042,48 @@ CLWEvent CLWParallelPrimitives::Normalize(const char* normalizeKernelName,
10421042
normalizeKernel.SetArg(argc++, input);
10431043
normalizeKernel.SetArg(argc++, output);
10441044
normalizeKernel.SetArg(argc++, numElems);
1045-
normalizeKernel.SetArg(argc++, max);
1046-
normalizeKernel.SetArg(argc++, min);
1045+
normalizeKernel.SetArg(argc++, cache);
10471046

10481047
return context_.Launch1D(deviceIdx, NUM_BLOCKS * WG_SIZE, WG_SIZE, normalizeKernel);
10491048
}
10501049

1051-
CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_int> input, CLWBuffer<cl_int> output, int numElems)
1052-
{
1053-
CLWBuffer<cl_int> cache = GetTempIntBuffer(1);
1054-
1055-
CLWEvent event = Normalize("buffer_normalization_int",
1056-
"reduction_min_int",
1057-
"reduction_max_int",
1058-
deviceIdx,
1059-
input,
1060-
output,
1061-
numElems,
1062-
cache);
1063-
1064-
ReclaimTempIntBuffer(cache);
1065-
return event;
1066-
}
1067-
1068-
CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_float> input, CLWBuffer<cl_float> output, int numElems)
1069-
{
1070-
CLWBuffer<cl_float> cache = GetTempFloatBuffer(1);
1071-
1072-
CLWEvent event = Normalize("buffer_normalization_float",
1073-
"reduction_min_float",
1074-
"reduction_max_float",
1075-
deviceIdx,
1076-
input,
1077-
output,
1078-
numElems,
1079-
cache);
1080-
1081-
ReclaimTempFloatBuffer(cache);
1082-
return event;
1083-
}
1050+
//CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_int> input, CLWBuffer<cl_int> output, int numElems)
1051+
//{
1052+
// CLWBuffer<cl_int> cache = GetTempIntBuffer(2);
1053+
//
1054+
// CLWEvent event = Normalize("buffer_normalization_int",
1055+
// "reduction_min_int",
1056+
// "reduction_max_int",
1057+
// deviceIdx,
1058+
// input,
1059+
// output,
1060+
// numElems,
1061+
// cache);
1062+
//
1063+
// ReclaimTempIntBuffer(cache);
1064+
// return event;
1065+
//}
1066+
//
1067+
//CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_float> input, CLWBuffer<cl_float> output, int numElems)
1068+
//{
1069+
// CLWBuffer<cl_float> cache = GetTempFloatBuffer(2);
1070+
//
1071+
// CLWEvent event = Normalize("buffer_normalization_float",
1072+
// "reduction_min_float",
1073+
// "reduction_max_float",
1074+
// deviceIdx,
1075+
// input,
1076+
// output,
1077+
// numElems,
1078+
// cache);
1079+
//
1080+
// ReclaimTempFloatBuffer(cache);
1081+
// return event;
1082+
//}
10841083

10851084
CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_float3> input, CLWBuffer<cl_float3> output, int numElems)
10861085
{
1087-
CLWBuffer<cl_float3> cache = GetTempBuffer<cl_float3>(float3_BufferCache_, 1);
1086+
CLWBuffer<cl_float3> cache = GetTempBuffer<cl_float3>(float3_BufferCache_, 2);
10881087

10891088
CLWEvent event = Normalize("buffer_normalization_float3",
10901089
"reduction_min_float3",

CLW/CLWParallelPrimitives.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,8 @@ class CLWParallelPrimitives
9595
unsigned int deviceIdx,
9696
CLWBuffer<T> input,
9797
int numElems,
98-
CLWBuffer<T> out);
98+
CLWBuffer<T> out,
99+
int /* in elements */out_offset = 0);
99100

100101
template <class T>
101102
T GetMaxNum();

0 commit comments

Comments
 (0)