@@ -976,10 +976,11 @@ cl_float3 CLWParallelPrimitives::GetMinNum<cl_float3>()
976
976
977
977
template <class T >
978
978
CLWEvent CLWParallelPrimitives::Reduction (const char * kernelName,
979
- unsigned int deviceIdx,
980
- CLWBuffer<T> input,
981
- int numElems,
982
- CLWBuffer<T> out)
979
+ unsigned int deviceIdx,
980
+ CLWBuffer<T> input,
981
+ int numElems,
982
+ CLWBuffer<T> out,
983
+ int out_offset)
983
984
{
984
985
assert (input.GetElementCount () >= numElems);
985
986
@@ -993,6 +994,7 @@ CLWEvent CLWParallelPrimitives::Reduction(const char* kernelName,
993
994
reductionKernel.SetArg (argc++, numElems);
994
995
reductionKernel.SetArg (argc++, SharedMemory (sizeof (T) * WG_SIZE));
995
996
reductionKernel.SetArg (argc++, out);
997
+ reductionKernel.SetArg (argc++, out_offset);
996
998
997
999
return context_.Launch1D (deviceIdx, NUM_BLOCKS * WG_SIZE, WG_SIZE, reductionKernel);
998
1000
}
@@ -1015,24 +1017,22 @@ CLWEvent CLWParallelPrimitives::Normalize(const char* normalizeKernelName,
1015
1017
T min = GetMaxNum<T>();
1016
1018
T max = GetMinNum<T>();
1017
1019
1018
- context_.WriteBuffer <T>(deviceIdx, cache, &min, 1 );
1019
-
1020
- Reduction (minReductionKernelName,
1021
- 0 ,
1022
- input,
1023
- numElems,
1024
- cache).Wait ();
1020
+ context_.WriteBuffer <T>(deviceIdx, cache, &max, 1 );
1021
+ context_.WriteBuffer <T>(deviceIdx, cache, &min, 1 , 1 );
1025
1022
1026
- context_.ReadBuffer <T>(deviceIdx, cache, &min, 1 ).Wait ();
1027
- context_.WriteBuffer <T>(deviceIdx, cache, &max, 1 ).Wait ();
1023
+ Reduction (minReductionKernelName,
1024
+ 0 ,
1025
+ input,
1026
+ numElems,
1027
+ cache,
1028
+ 1 );
1028
1029
1029
1030
Reduction (maxReductionKernelName,
1030
1031
0 ,
1031
1032
input,
1032
1033
numElems,
1033
- cache).Wait ();
1034
-
1035
- context_.ReadBuffer <T>(deviceIdx, cache, &max, 1 ).Wait ();
1034
+ cache,
1035
+ 0 );
1036
1036
1037
1037
// launch normalization kernel
1038
1038
CLWKernel normalizeKernel = program_.GetKernel (normalizeKernelName);
@@ -1042,49 +1042,48 @@ CLWEvent CLWParallelPrimitives::Normalize(const char* normalizeKernelName,
1042
1042
normalizeKernel.SetArg (argc++, input);
1043
1043
normalizeKernel.SetArg (argc++, output);
1044
1044
normalizeKernel.SetArg (argc++, numElems);
1045
- normalizeKernel.SetArg (argc++, max);
1046
- normalizeKernel.SetArg (argc++, min);
1045
+ normalizeKernel.SetArg (argc++, cache);
1047
1046
1048
1047
return context_.Launch1D (deviceIdx, NUM_BLOCKS * WG_SIZE, WG_SIZE, normalizeKernel);
1049
1048
}
1050
1049
1051
- CLWEvent CLWParallelPrimitives::Normalize (unsigned int deviceIdx, CLWBuffer<cl_int> input, CLWBuffer<cl_int> output, int numElems)
1052
- {
1053
- CLWBuffer<cl_int> cache = GetTempIntBuffer (1 );
1054
-
1055
- CLWEvent event = Normalize (" buffer_normalization_int" ,
1056
- " reduction_min_int" ,
1057
- " reduction_max_int" ,
1058
- deviceIdx,
1059
- input,
1060
- output,
1061
- numElems,
1062
- cache);
1063
-
1064
- ReclaimTempIntBuffer (cache);
1065
- return event;
1066
- }
1067
-
1068
- CLWEvent CLWParallelPrimitives::Normalize (unsigned int deviceIdx, CLWBuffer<cl_float> input, CLWBuffer<cl_float> output, int numElems)
1069
- {
1070
- CLWBuffer<cl_float> cache = GetTempFloatBuffer (1 );
1071
-
1072
- CLWEvent event = Normalize (" buffer_normalization_float" ,
1073
- " reduction_min_float" ,
1074
- " reduction_max_float" ,
1075
- deviceIdx,
1076
- input,
1077
- output,
1078
- numElems,
1079
- cache);
1080
-
1081
- ReclaimTempFloatBuffer (cache);
1082
- return event;
1083
- }
1050
+ // CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_int> input, CLWBuffer<cl_int> output, int numElems)
1051
+ // {
1052
+ // CLWBuffer<cl_int> cache = GetTempIntBuffer(2 );
1053
+ //
1054
+ // CLWEvent event = Normalize("buffer_normalization_int",
1055
+ // "reduction_min_int",
1056
+ // "reduction_max_int",
1057
+ // deviceIdx,
1058
+ // input,
1059
+ // output,
1060
+ // numElems,
1061
+ // cache);
1062
+ //
1063
+ // ReclaimTempIntBuffer(cache);
1064
+ // return event;
1065
+ // }
1066
+ //
1067
+ // CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_float> input, CLWBuffer<cl_float> output, int numElems)
1068
+ // {
1069
+ // CLWBuffer<cl_float> cache = GetTempFloatBuffer(2 );
1070
+ //
1071
+ // CLWEvent event = Normalize("buffer_normalization_float",
1072
+ // "reduction_min_float",
1073
+ // "reduction_max_float",
1074
+ // deviceIdx,
1075
+ // input,
1076
+ // output,
1077
+ // numElems,
1078
+ // cache);
1079
+ //
1080
+ // ReclaimTempFloatBuffer(cache);
1081
+ // return event;
1082
+ // }
1084
1083
1085
1084
CLWEvent CLWParallelPrimitives::Normalize (unsigned int deviceIdx, CLWBuffer<cl_float3> input, CLWBuffer<cl_float3> output, int numElems)
1086
1085
{
1087
- CLWBuffer<cl_float3> cache = GetTempBuffer<cl_float3>(float3_BufferCache_, 1 );
1086
+ CLWBuffer<cl_float3> cache = GetTempBuffer<cl_float3>(float3_BufferCache_, 2 );
1088
1087
1089
1088
CLWEvent event = Normalize (" buffer_normalization_float3" ,
1090
1089
" reduction_min_float3" ,
0 commit comments