@@ -39,7 +39,6 @@ THE SOFTWARE.
39
39
#define NUM_SEG_SCAN_ELEMS_PER_WI 1
40
40
#define NUM_SCAN_ELEMS_PER_WG (WG_SIZE * NUM_SCAN_ELEMS_PER_WI)
41
41
#define NUM_SEG_SCAN_ELEMS_PER_WG (WG_SIZE * NUM_SEG_SCAN_ELEMS_PER_WI)
42
- #define NORMALIZATION_CACHE (2 )
43
42
44
43
CLWParallelPrimitives::CLWParallelPrimitives (CLWContext context, char const * buildopts)
45
44
: context_(context)
@@ -921,65 +920,181 @@ void CLWParallelPrimitives::ReclaimTempBuffer(std::map<size_t, CLWBuffer<T> > co
921
920
collection[buffer.GetElementCount ()] = buffer;
922
921
}
923
922
923
+ CLWEvent CLWParallelPrimitives::Copy (unsigned int deviceIdx, CLWBuffer<cl_int> input, CLWBuffer<cl_int> output, int numElems)
924
+ {
925
+ int ELEMS_PER_WI = 4 ;
926
+ int GROUP_BLOCK_SIZE = (WG_SIZE * ELEMS_PER_WI);
927
+ int NUM_BLOCKS = (numElems + GROUP_BLOCK_SIZE - 1 ) / GROUP_BLOCK_SIZE;
928
+
929
+ CLWKernel copyKernel = program_.GetKernel (" copy" );
930
+
931
+ copyKernel.SetArg (0 , input);
932
+ copyKernel.SetArg (1 , numElems);
933
+ copyKernel.SetArg (2 , output);
934
+
935
+ return context_.Launch1D (0 , NUM_BLOCKS * WG_SIZE, WG_SIZE, copyKernel);
936
+ }
937
+
938
+ const float epsilon = 0 .001f ;
939
+
924
940
template <class T >
925
- CLWEvent CLWParallelPrimitives::Normalize (const char * kernelName, unsigned int deviceIdx, CLWBuffer<T> input, CLWBuffer<T> output, int numElems, int groupSize, CLWBuffer<T> cache)
941
+ T CLWParallelPrimitives::GetMaxNum ()
942
+ {
943
+ return std::numeric_limits<T>::max ();
944
+ }
945
+
946
+
947
+ template <>
948
+ cl_float3 CLWParallelPrimitives::GetMaxNum<cl_float3>()
949
+ {
950
+ cl_float3 val;
951
+
952
+ val.s [0 ] = std::numeric_limits<float >::max ();
953
+ val.s [1 ] = std::numeric_limits<float >::max ();
954
+ val.s [2 ] = std::numeric_limits<float >::max ();
955
+
956
+ return val;
957
+ }
958
+
959
+ template <class T >
960
+ T CLWParallelPrimitives::GetMinNum ()
961
+ {
962
+ return std::numeric_limits<T>::min ();
963
+ }
964
+
965
+ template <>
966
+ cl_float3 CLWParallelPrimitives::GetMinNum<cl_float3>()
967
+ {
968
+ cl_float3 val;
969
+
970
+ val.s [0 ] = std::numeric_limits<float >::min ();
971
+ val.s [1 ] = std::numeric_limits<float >::min ();
972
+ val.s [2 ] = std::numeric_limits<float >::min ();
973
+
974
+ return val;
975
+ }
976
+
977
+ template <class T >
978
+ CLWEvent CLWParallelPrimitives::Reduction (const char * kernelName,
979
+ unsigned int deviceIdx,
980
+ CLWBuffer<T> input,
981
+ int numElems,
982
+ CLWBuffer<T> out)
926
983
{
927
- assert (groupSize);
928
984
assert (input.GetElementCount () >= numElems);
929
- assert (output.GetElementCount () >= numElems);
930
985
931
- int ELEMS_PER_WI = 4 ;
932
- int GROUP_BLOCK_SIZE = (groupSize * ELEMS_PER_WI);
933
- int NUM_BLOCKS = (numElems + GROUP_BLOCK_SIZE - 1 ) / GROUP_BLOCK_SIZE;
986
+ int NUM_BLOCKS = (int )((numElems + WG_SIZE - 1 ) / WG_SIZE);
934
987
935
- CLWKernel normalizeKernel = program_.GetKernel (kernelName);
988
+ CLWKernel reductionKernel = program_.GetKernel (kernelName);
936
989
937
990
int argc = 0 ;
938
991
939
- normalizeKernel.SetArg (argc++, input);
940
- normalizeKernel.SetArg (argc++, output);
941
- normalizeKernel.SetArg (argc++, numElems);
942
- normalizeKernel.SetArg (argc++, SharedMemory (2 * sizeof (T) * groupSize));
943
- normalizeKernel.SetArg (argc++, cache);
992
+ reductionKernel.SetArg (argc++, input);
993
+ reductionKernel.SetArg (argc++, numElems);
994
+ reductionKernel.SetArg (argc++, SharedMemory (sizeof (T) * WG_SIZE));
995
+ reductionKernel.SetArg (argc++, out);
944
996
945
- return context_.Launch1D (deviceIdx, NUM_BLOCKS * groupSize, groupSize, normalizeKernel );
997
+ return context_.Launch1D (deviceIdx, NUM_BLOCKS * WG_SIZE, WG_SIZE, reductionKernel );
946
998
}
947
999
948
- CLWEvent CLWParallelPrimitives::Copy (unsigned int deviceIdx, CLWBuffer<cl_int> input, CLWBuffer<cl_int> output, int numElems)
1000
+ template <class T >
1001
+ CLWEvent CLWParallelPrimitives::Normalize (const char * normalizeKernelName,
1002
+ const char * minReductionKernelName,
1003
+ const char * maxReductionKernelName,
1004
+ unsigned int deviceIdx,
1005
+ CLWBuffer<T> input,
1006
+ CLWBuffer<T> output,
1007
+ int numElems,
1008
+ CLWBuffer<T> cache)
949
1009
{
950
- int ELEMS_PER_WI = 4 ;
951
- int GROUP_BLOCK_SIZE = (WG_SIZE * ELEMS_PER_WI);
952
- int NUM_BLOCKS = (numElems + GROUP_BLOCK_SIZE - 1 ) / GROUP_BLOCK_SIZE;
1010
+ assert (input.GetElementCount () >= numElems);
1011
+ assert (output.GetElementCount () >= numElems);
953
1012
954
- CLWKernel copyKernel = program_. GetKernel ( " copy " );
1013
+ int NUM_BLOCKS = ( int )((numElems + WG_SIZE - 1 ) / WG_SIZE );
955
1014
956
- copyKernel.SetArg (0 , input);
957
- copyKernel.SetArg (1 , numElems);
958
- copyKernel.SetArg (2 , output);
1015
+ T min = GetMaxNum<T>();
1016
+ T max = GetMinNum<T>();
959
1017
960
- return context_.Launch1D (0 , NUM_BLOCKS * WG_SIZE, WG_SIZE, copyKernel);
1018
+ context_.WriteBuffer <T>(deviceIdx, cache, &min, 1 );
1019
+
1020
+ Reduction (minReductionKernelName,
1021
+ 0 ,
1022
+ input,
1023
+ numElems,
1024
+ cache).Wait ();
1025
+
1026
+ context_.ReadBuffer <T>(deviceIdx, cache, &min, 1 ).Wait ();
1027
+ context_.WriteBuffer <T>(deviceIdx, cache, &max, 1 ).Wait ();
1028
+
1029
+ Reduction (maxReductionKernelName,
1030
+ 0 ,
1031
+ input,
1032
+ numElems,
1033
+ cache).Wait ();
1034
+
1035
+ context_.ReadBuffer <T>(deviceIdx, cache, &max, 1 ).Wait ();
1036
+
1037
+ // launch normalization kernel
1038
+ CLWKernel normalizeKernel = program_.GetKernel (normalizeKernelName);
1039
+
1040
+ int argc = 0 ;
1041
+
1042
+ normalizeKernel.SetArg (argc++, input);
1043
+ normalizeKernel.SetArg (argc++, output);
1044
+ normalizeKernel.SetArg (argc++, numElems);
1045
+ normalizeKernel.SetArg (argc++, max);
1046
+ normalizeKernel.SetArg (argc++, min);
1047
+
1048
+ return context_.Launch1D (deviceIdx, NUM_BLOCKS * WG_SIZE, WG_SIZE, normalizeKernel);
961
1049
}
962
1050
963
1051
// cl_int entry point: borrows a one-element temporary int buffer, forwards to
// the templated Normalize with the int kernel names, hands the buffer back and
// returns the event of the forwarded call.
CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_int> input, CLWBuffer<cl_int> output, int numElems)
{
    CLWBuffer<cl_int> scratch = GetTempIntBuffer(1);

    CLWEvent done = Normalize<cl_int>("buffer_normalization_int",
                                      "reduction_min_int",
                                      "reduction_max_int",
                                      deviceIdx, input, output, numElems, scratch);

    ReclaimTempIntBuffer(scratch);

    return done;
}
970
1067
971
1068
// cl_float entry point: borrows a one-element temporary float buffer, forwards
// to the templated Normalize with the float kernel names, hands the buffer back
// and returns the event of the forwarded call.
CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_float> input, CLWBuffer<cl_float> output, int numElems)
{
    CLWBuffer<cl_float> scratch = GetTempFloatBuffer(1);

    CLWEvent done = Normalize<cl_float>("buffer_normalization_float",
                                        "reduction_min_float",
                                        "reduction_max_float",
                                        deviceIdx, input, output, numElems, scratch);

    ReclaimTempFloatBuffer(scratch);

    return done;
}
978
1084
979
1085
// cl_float3 entry point: borrows a one-element temporary buffer from the
// float3 cache, forwards to the templated Normalize with the float3 kernel
// names, returns the buffer to the cache and returns the event of the
// forwarded call.
CLWEvent CLWParallelPrimitives::Normalize(unsigned int deviceIdx, CLWBuffer<cl_float3> input, CLWBuffer<cl_float3> output, int numElems)
{
    CLWBuffer<cl_float3> scratch = GetTempBuffer<cl_float3>(float3_BufferCache_, 1);

    CLWEvent done = Normalize<cl_float3>("buffer_normalization_float3",
                                         "reduction_min_float3",
                                         "reduction_max_float3",
                                         deviceIdx, input, output, numElems, scratch);

    ReclaimTempBuffer(float3_BufferCache_, scratch);

    return done;
}
0 commit comments