psychocoderHPC
diff --git a/src/libPMacc/include/cuSTL/algorithm/kernel/Foreach.hpp b/src/libPMacc/include/cuSTL/algorithm/kernel/Foreach.hpp
Lines changed: 2 additions & 2 deletions
diff --git a/src/libPMacc/include/cuSTL/algorithm/kernel/ForeachBlock.hpp b/src/libPMacc/include/cuSTL/algorithm/kernel/ForeachBlock.hpp
Lines changed: 7 additions & 3 deletions
diff --git a/src/libPMacc/include/cuSTL/algorithm/kernel/detail/ForeachKernel.hpp b/src/libPMacc/include/cuSTL/algorithm/kernel/detail/ForeachKernel.hpp
Lines changed: 4 additions & 2 deletions
diff --git a/src/libPMacc/include/cuSTL/algorithm/kernel/detail/SphericMapper.hpp b/src/libPMacc/include/cuSTL/algorithm/kernel/detail/SphericMapper.hpp
Lines changed: 42 additions & 18 deletions
diff --git a/src/libPMacc/include/cuSTL/algorithm/kernel/run-time/Foreach.hpp b/src/libPMacc/include/cuSTL/algorithm/kernel/run-time/Foreach.hpp
Lines changed: 6 additions & 2 deletions
diff --git a/src/libPMacc/include/eventSystem/tasks/TaskSetCurrentSizeOnDevice.hpp b/src/libPMacc/include/eventSystem/tasks/TaskSetCurrentSizeOnDevice.hpp
Lines changed: 18 additions & 6 deletions
diff --git a/src/libPMacc/include/eventSystem/tasks/TaskSetValue.hpp b/src/libPMacc/include/eventSystem/tasks/TaskSetValue.hpp
Lines changed: 44 additions & 20 deletions
@@ -63,10 +63,10 @@ namespace kernel
         /* ... */                                                                                           \
         BOOST_PP_REPEAT(N, SHIFT_CURSOR_ZONE, _)                                                            \
                                                                                                             \
-        dim3 blockDim(BlockDim::toRT().toDim3());                                                           \
+        auto blockDim = BlockDim::toRT();                                                                   \
         detail::SphericMapper<Zone::dim, BlockDim> mapper;                                                  \
         using namespace PMacc;                                                                              \
-        __cudaKernel(detail::kernelForeach)(mapper.cudaGridDim(p_zone.size), blockDim)                       \
+        PMACC_TYPEKERNEL(detail::kernelForeach)(mapper.cudaGridDim(p_zone.size), blockDim)                  \
                   /* c0_shifted, c1_shifted, ... */                                                         \
             (mapper, BOOST_PP_ENUM(N, SHIFTED_CURSOR, _), lambda::make_Functor(functor));                   \
     }
 
@@ -59,15 +59,19 @@ namespace detail
                         /* typename C0, typename C1, ... */                                                 \
 template<typename Mapper, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor>                            \
                                                 /* C0 c0, C1 c1, ... */                                     \
-__global__ void kernelForeachBlock(Mapper mapper, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), Functor functor)    \
+DINLINE void operator()(Mapper mapper, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), Functor functor) const         \
 {                                                                                                           \
     math::Int<Mapper::dim> cellIndex(mapper(blockIdx));                                                     \
          /* c0[cellIndex], c1[cellIndex], ... */                                                            \
     functor(BOOST_PP_ENUM(N, SHIFTACCESS_CURSOR, _));                                                       \
 }
 
+struct kernelForeachBlock // functor type replacing the former __global__ kernelForeachBlock; its operator() overloads come from the KERNEL_FOREACH repetition below
+{
+
 BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), KERNEL_FOREACH, _)
 
+}; // the struct is closed before the helper macros are #undef'd
 #undef KERNEL_FOREACH
 #undef SHIFTACCESS_CURSOR
 
@@ -87,10 +91,10 @@ BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), KERNEL_FOREA
         /* ... */                                                                                           \
         BOOST_PP_REPEAT(N, SHIFT_CURSOR_ZONE, _)                                                            \
                                                                                                             \
-        dim3 blockDim(ThreadBlock::toRT().toDim3());                                                        \
+        auto blockDim = ThreadBlock::toRT();                                                                \
         detail::SphericMapper<Zone::dim, BlockDim> mapper;                                                  \
         using namespace PMacc;                                                                              \
-        __cudaKernel(detail::kernelForeachBlock)(mapper.cudaGridDim(p_zone.size), blockDim)                  \
+        PMACC_TYPEKERNEL(detail::kernelForeachBlock)(mapper.cudaGridDim(p_zone.size), blockDim)             \
                     /* c0_shifted, c1_shifted, ... */                                                       \
             (mapper, BOOST_PP_ENUM(N, SHIFTED_CURSOR, _), lambda::make_Functor(functor));                   \
     }
 
@@ -42,15 +42,17 @@ namespace detail
 /*                        typename C0, ..., typename CN     */ \
 template<typename Mapper, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor> \
 /*                                          C0 c0, ..., CN cN   */ \
-__global__ void kernelForeach(Mapper mapper, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), Functor functor) \
+DINLINE void operator()(Mapper mapper, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), Functor functor) const \
 { \
     math::Int<Mapper::dim> cellIndex(mapper(blockIdx, threadIdx)); \
 /*          forward(c0[cellIndex]), ..., forward(cN[cellIndex])     */ \
     functor(BOOST_PP_ENUM(N, SHIFTACCESS_CURSOR, _)); \
 }
 
+struct kernelForeach // functor type replacing the former __global__ kernelForeach; its operator() overloads come from the KERNEL_FOREACH repetition below
+{
 BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), KERNEL_FOREACH, _)
-
+}; // the struct is closed before the helper macros are #undef'd
 #undef KERNEL_FOREACH
 #undef SHIFTACCESS_CURSOR
 
 
@@ -55,9 +55,14 @@ struct SphericMapper<1, BlockSize>
 {
     static constexpr int dim = 1;
 
-    dim3 cudaGridDim(const math::Size_t<1>& size) const
+    typename math::Size_t<3>::BaseType // return type changes from dim3 to the base type of math::Size_t<3>
+    cudaGridDim(const math::Size_t<1>& size) const // size: 1D extent, divided below by the compile-time BlockSize
     {
-        return dim3(size.x() / BlockSize::x::value, 1, 1);
+        return math::Size_t<3>( // constructed as Size_t<3>, returned as its BaseType
+            size.x() / BlockSize::x::value, // x block count; NOTE(review): not rounded up, presumably size.x() must be a multiple of BlockSize::x -- confirm
+            1u, // y is fixed to 1 for the 1D mapper
+            1u // z is fixed to 1 for the 1D mapper
+        );
     }
 
     HDINLINE
@@ -80,10 +85,14 @@ struct SphericMapper<2, BlockSize>
 {
     static constexpr int dim = 2;
 
-    dim3 cudaGridDim(const math::Size_t<2>& size) const
+    typename math::Size_t<3>::BaseType  // return type changes from dim3 to the base type of math::Size_t<3>
+    cudaGridDim(const math::Size_t<2>& size) const // size: 2D extent, divided below by the compile-time BlockSize
     {
-        return dim3(size.x() / BlockSize::x::value,
-                    size.y() / BlockSize::y::value, 1);
+        return math::Size_t<3>( // constructed as Size_t<3>, returned as its BaseType
+            size.x() / BlockSize::x::value, // x block count; NOTE(review): not rounded up, presumably size must be a multiple of BlockSize -- confirm
+            size.y() / BlockSize::y::value, // y block count, same division as x
+            1u // z is fixed to 1 for the 2D mapper
+         );
     }
 
     HDINLINE
@@ -107,11 +116,14 @@ struct SphericMapper<3, BlockSize>
 {
     static constexpr int dim = 3;
 
-    dim3 cudaGridDim(const math::Size_t<3>& size) const
+    typename math::Size_t<3>::BaseType  // return type changes from dim3 to the base type of math::Size_t<3>
+    cudaGridDim(const math::Size_t<3>& size) const // size: 3D extent, divided below by the compile-time BlockSize
     {
-        return dim3(size.x() / BlockSize::x::value,
-                    size.y() / BlockSize::y::value,
-                    size.z() / BlockSize::z::value);
+        return math::Size_t<3>( // constructed as Size_t<3>, returned as its BaseType
+            size.x() / BlockSize::x::value, // x block count; NOTE(review): not rounded up, presumably size must be a multiple of BlockSize -- confirm
+            size.y() / BlockSize::y::value, // y block count, same division as x
+            size.z() / BlockSize::z::value // z block count, same division as x
+        );
     }
 
     HDINLINE
@@ -136,9 +148,14 @@ struct SphericMapper<1, mpl::void_>
 {
     static constexpr int dim = 1;
 
-    dim3 cudaGridDim(const math::Size_t<1>& size, const math::Size_t<3>& blockDim) const
+    typename math::Size_t<3>::BaseType // return type changes from dim3 to the base type of math::Size_t<3>
+    cudaGridDim(const math::Size_t<1>& size, const math::Size_t<3>& blockDim) const // size: 1D extent; blockDim: run-time block size used as the divisor
     {
-        return dim3(size.x() / blockDim.x(), 1, 1);
+        return math::Size_t<3>( // constructed as Size_t<3>, returned as its BaseType
+            size.x() / blockDim.x(), // x block count; NOTE(review): not rounded up, presumably size.x() must be a multiple of blockDim.x() -- confirm
+            1u, // y is fixed to 1 for the 1D mapper
+            1u // z is fixed to 1 for the 1D mapper
+        );
     }
 
     DINLINE
@@ -161,10 +178,14 @@ struct SphericMapper<2, mpl::void_>
 {
     static constexpr int dim = 2;
 
-    dim3 cudaGridDim(const math::Size_t<2>& size, const math::Size_t<3>& blockDim) const
+    typename math::Size_t<3>::BaseType // return type changes from dim3 to the base type of math::Size_t<3>
+    cudaGridDim(const math::Size_t<2>& size, const math::Size_t<3>& blockDim) const // size: 2D extent; blockDim: run-time block size used as the divisor
     {
-        return dim3(size.x() / blockDim.x(),
-                    size.y() / blockDim.y(), 1);
+        return math::Size_t<3>( // constructed as Size_t<3>, returned as its BaseType
+            size.x() / blockDim.x(), // x block count; NOTE(review): not rounded up, presumably size must be a multiple of blockDim -- confirm
+            size.y() / blockDim.y(), // y block count, same division as x
+            1u // z is fixed to 1 for the 2D mapper; unsigned literal to match the other fixed components in this file
+        );
     }
 
     DINLINE
@@ -188,11 +209,14 @@ struct SphericMapper<3, mpl::void_>
 {
     static constexpr int dim = 3;
 
-    dim3 cudaGridDim(const math::Size_t<3>& size, const math::Size_t<3>& blockDim) const
+    typename math::Size_t<3>::BaseType  // return type changes from dim3 to the base type of math::Size_t<3>
+    cudaGridDim(const math::Size_t<3>& size, const math::Size_t<3>& blockDim) const // size: 3D extent; blockDim: run-time block size used as the divisor
     {
-        return dim3(size.x() / blockDim.x(),
-                    size.y() / blockDim.y(),
-                    size.z() / blockDim.z());
+        return math::Size_t<3>( // constructed as Size_t<3>, returned as its BaseType
+            size.x() / blockDim.x(), // x block count; NOTE(review): not rounded up, presumably size must be a multiple of blockDim -- confirm
+            size.y() / blockDim.y(), // y block count, same division as x
+            size.z() / blockDim.z() // z block count, same division as x
+        );
     }
 
     DINLINE
 
@@ -135,10 +135,14 @@ math::Size_t<DIM3> getBestCudaBlockDim(const math::Size_t<dim> gridDim)
         PMACC_VERIFY(this->_blockDim.y() <= cudaSpecs::MaxNumThreadsPerBlockDim::y::value);                         \
         PMACC_VERIFY(this->_blockDim.z() <= cudaSpecs::MaxNumThreadsPerBlockDim::z::value);                         \
                                                                                                                     \
-        dim3 blockDim(this->_blockDim.x(), this->_blockDim.y(), this->_blockDim.z());                               \
+        typename math::Size_t<3>::BaseType blockDim(                                                                \
+            this->_blockDim.x(),                                                                                    \
+            this->_blockDim.y(),                                                                                    \
+            this->_blockDim.z()                                                                                     \
+        );                                                                                                          \
         kernel::detail::SphericMapper<Zone::dim> mapper;                                                            \
         using namespace PMacc;                                                                                      \
-        __cudaKernel(kernel::detail::kernelForeach)(mapper.cudaGridDim(p_zone.size, this->_blockDim), blockDim)      \
+        PMACC_TYPEKERNEL(kernel::detail::kernelForeach)(mapper.cudaGridDim(p_zone.size, this->_blockDim), blockDim) \
                 /*   c0_shifted, ..., cN_shifted    */                                                              \
             (mapper, BOOST_PP_ENUM(N, SHIFTED_CURSOR, _), lambda::make_Functor(functor));                           \
     }
 
@@ -28,14 +28,18 @@
 #include "eventSystem/tasks/StreamTask.hpp"
 #include "eventSystem/events/kernelEvents.hpp"
 #include "dimensions/DataSpace.hpp"
+#include "nvidia/gpuEntryFunction.hpp"
 
 #include <cuda_runtime_api.h>
 #include <cuda.h>
 
-__global__ void kernelSetValueOnDeviceMemory(size_t* pointer, const size_t size)
+struct kernelSetValueOnDeviceMemory // functor replacing the former __global__ kernel of the same name
 {
-    *pointer = size;
-}
+    DINLINE void operator()(size_t* pointer, const size_t size) const // pointer: location that receives the value; size: value to store
+    {
+        *pointer = size; // plain store with no thread-index guard; the visible launch in setSize() uses <<<1, 1>>>
+    }
+};
 
 namespace PMacc
 {
@@ -83,9 +87,17 @@ class TaskSetCurrentSizeOnDevice : public StreamTask
 
     void setSize()
     {
-        kernelSetValueOnDeviceMemory
-            << < 1, 1, 0, this->getCudaStream() >> >
-            (destination->getCurrentSizeOnDevicePointer(), size);
+         auto sizePtr = destination->getCurrentSizeOnDevicePointer(); // pointer is fetched on the host and passed to the launch by value
+         nvidia::gpuEntryFunction<<< // entry kernel from nvidia/gpuEntryFunction.hpp (not shown); presumably it calls the functor given as first argument -- confirm
+            1, // grid size: one block
+            1, // block size: one thread
+            0, // no dynamic shared memory
+            this->getCudaStream() // stream the launch is enqueued on
+        >>>( 
+            kernelSetValueOnDeviceMemory{}, // functor that performs the store
+            sizePtr, // becomes the functor's pointer argument
+            size // becomes the functor's size argument
+        );
 
         activate();
     }
 
@@ -29,6 +29,7 @@
 #include "memory/boxes/DataBox.hpp"
 #include "eventSystem/EventSystem.hpp"
 #include "eventSystem/tasks/StreamTask.hpp"
+#include "nvidia/gpuEntryFunction.hpp"
 
 #include <boost/type_traits/remove_pointer.hpp>
 #include <boost/type_traits.hpp>
@@ -82,20 +83,22 @@ getValue(T_Type& value)
 
 }
 
-template <class DataBox, typename T_ValueType, typename Space>
-__global__ void kernelSetValue(DataBox data, const T_ValueType value, const Space size)
+struct kernelSetValue // functor form of the former __global__ kernelSetValue; the statements are unchanged, only wrapped in operator()
 {
-    const Space threadIndex(threadIdx);
-    const Space blockIndex(blockIdx);
-    const Space gridSize(blockDim);
-
-    Space idx(gridSize * blockIndex + threadIndex);
+    template <class DataBox, typename T_ValueType, typename Space>
+    DINLINE void operator()(DataBox data, const T_ValueType value, const Space size) const // data: destination box; value: passed through taskSetValueHelper::getValue before the store; size: extent used for the bounds check
+    {
+        const Space threadIndex(threadIdx);
+        const Space blockIndex(blockIdx);
+        const Space gridSize(blockDim); // despite its name this holds blockDim (threads per block), not the grid size
 
-    if (idx.x() >= size.x())
-        return;
-    data(idx) = taskSetValueHelper::getValue(value);
-}
+        Space idx(gridSize * blockIndex + threadIndex); // global index = blockDim * blockIdx + threadIdx
 
+        if (idx.x() >= size.x()) // only x is guarded; the visible launch sites round up only the x block count (256 threads per block)
+            return;
+        data(idx) = taskSetValueHelper::getValue(value); // store the value at the computed index
+    }
+};
 
 template <class TYPE, unsigned DIM>
 class DeviceBuffer;
@@ -177,13 +180,23 @@ class TaskSetValue<T_ValueType, T_dim, true> : public TaskSetValueBase<T_ValueTy
 
         if(area_size.productOfComponents() != 0)
         {
-            dim3 gridSize = area_size;
+            auto gridSize = area_size;
 
             /* line wise thread blocks*/
-            gridSize.x = ceil(double(gridSize.x) / 256.);
-
-            kernelSetValue<<<gridSize, 256, 0, this->getCudaStream()>>>
-                (this->destination->getDataBox(), this->value, area_size);
+            gridSize.x() = ceil(double(gridSize.x()) / 256.);
+
+            auto destBox = this->destination->getDataBox();
+            nvidia::gpuEntryFunction<<<
+                gridSize,
+                256,
+                0,
+                this->getCudaStream()
+            >>>( 
+                kernelSetValue{},
+                destBox, 
+                this->value, 
+                area_size 
+            );
         }
         this->activate();
     }
@@ -221,10 +234,10 @@ class TaskSetValue<T_ValueType, T_dim, false> : public TaskSetValueBase<T_ValueT
         const DataSpace<dim> area_size(this->destination->getCurrentDataSpace(current_size));
         if(area_size.productOfComponents() != 0)
         {
-            dim3 gridSize = area_size;
+            auto gridSize = area_size;
 
             /* line wise thread blocks*/
-            gridSize.x = ceil(double(gridSize.x) / 256.);
+            gridSize.x()= ceil(double(gridSize.x()) / 256.);
 
             ValueType* devicePtr = this->destination->getPointer();
 
@@ -234,8 +247,19 @@ class TaskSetValue<T_ValueType, T_dim, false> : public TaskSetValueBase<T_ValueT
             CUDA_CHECK(cudaMemcpyAsync(
                                        devicePtr, valuePointer_host, sizeof (ValueType),
                                        cudaMemcpyHostToDevice, this->getCudaStream()));
-            kernelSetValue<<<gridSize, 256, 0, this->getCudaStream()>>>
-                (this->destination->getDataBox(), devicePtr, area_size);
+
+            auto destBox = this->destination->getDataBox();
+            nvidia::gpuEntryFunction<<<
+                gridSize,
+                256,
+                0,
+                this->getCudaStream()
+            >>>( 
+                kernelSetValue{},
+                destBox, 
+                devicePtr, 
+                area_size 
+            );
         }
 
         this->activate();
Original file line number	Diff line number	Diff line change
`@@ -63,10 +63,10 @@ namespace kernel`
`63`	`63`	`/* ... */ \`
`64`	`64`	`BOOST_PP_REPEAT(N, SHIFT_CURSOR_ZONE, _) \`
`65`	`65`	`\`
`66`		`- dim3 blockDim(BlockDim::toRT().toDim3()); \`
	`66`	`+ auto blockDim = BlockDim::toRT(); \`
`67`	`67`	`detail::SphericMapper<Zone::dim, BlockDim> mapper; \`
`68`	`68`	`using namespace PMacc; \`
`69`		`- __cudaKernel(detail::kernelForeach)(mapper.cudaGridDim(p_zone.size), blockDim) \`
	`69`	`+ PMACC_TYPEKERNEL(detail::kernelForeach)(mapper.cudaGridDim(p_zone.size), blockDim) \`
`70`	`70`	`/* c0_shifted, c1_shifted, ... */ \`
`71`	`71`	`(mapper, BOOST_PP_ENUM(N, SHIFTED_CURSOR, _), lambda::make_Functor(functor)); \`
`72`	`72`	`}`
Original file line number	Diff line number	Diff line change
`@@ -55,9 +55,14 @@ struct SphericMapper<1, BlockSize>`
`55`	`55`	`{`
`56`	`56`	`static constexpr int dim = 1;`
`57`	`57`
`58`		`- dim3 cudaGridDim(const math::Size_t<1>& size) const`
	`58`	`+ typename math::Size_t<3>::BaseType`
	`59`	`+ cudaGridDim(const math::Size_t<1>& size) const`
`59`	`60`	`{`
`60`		`- return dim3(size.x() / BlockSize::x::value, 1, 1);`
	`61`	`+ return math::Size_t<3>(`
	`62`	`+ size.x() / BlockSize::x::value,`
	`63`	`+ 1u,`
	`64`	`+ 1u`
	`65`	`+ );`
`61`	`66`	`}`
`62`	`67`
`63`	`68`	`HDINLINE`
`@@ -80,10 +85,14 @@ struct SphericMapper<2, BlockSize>`
`80`	`85`	`{`
`81`	`86`	`static constexpr int dim = 2;`
`82`	`87`
`83`		`- dim3 cudaGridDim(const math::Size_t<2>& size) const`
	`88`	`+ typename math::Size_t<3>::BaseType`
	`89`	`+ cudaGridDim(const math::Size_t<2>& size) const`
`84`	`90`	`{`
`85`		`- return dim3(size.x() / BlockSize::x::value,`
`86`		`- size.y() / BlockSize::y::value, 1);`
	`91`	`+ return math::Size_t<3>(`
	`92`	`+ size.x() / BlockSize::x::value,`
	`93`	`+ size.y() / BlockSize::y::value,`
	`94`	`+ 1u`
	`95`	`+ );`
`87`	`96`	`}`
`88`	`97`
`89`	`98`	`HDINLINE`
`@@ -107,11 +116,14 @@ struct SphericMapper<3, BlockSize>`
`107`	`116`	`{`
`108`	`117`	`static constexpr int dim = 3;`
`109`	`118`
`110`		`- dim3 cudaGridDim(const math::Size_t<3>& size) const`
	`119`	`+ typename math::Size_t<3>::BaseType`
	`120`	`+ cudaGridDim(const math::Size_t<3>& size) const`
`111`	`121`	`{`
`112`		`- return dim3(size.x() / BlockSize::x::value,`
`113`		`- size.y() / BlockSize::y::value,`
`114`		`- size.z() / BlockSize::z::value);`
	`122`	`+ return math::Size_t<3>(`
	`123`	`+ size.x() / BlockSize::x::value,`
	`124`	`+ size.y() / BlockSize::y::value,`
	`125`	`+ size.z() / BlockSize::z::value`
	`126`	`+ );`
`115`	`127`	`}`
`116`	`128`
`117`	`129`	`HDINLINE`
`@@ -136,9 +148,14 @@ struct SphericMapper<1, mpl::void_>`
`136`	`148`	`{`
`137`	`149`	`static constexpr int dim = 1;`
`138`	`150`
`139`		`- dim3 cudaGridDim(const math::Size_t<1>& size, const math::Size_t<3>& blockDim) const`
	`151`	`+ typename math::Size_t<3>::BaseType`
	`152`	`+ cudaGridDim(const math::Size_t<1>& size, const math::Size_t<3>& blockDim) const`
`140`	`153`	`{`
`141`		`- return dim3(size.x() / blockDim.x(), 1, 1);`
	`154`	`+ return math::Size_t<3>(`
	`155`	`+ size.x() / blockDim.x(),`
	`156`	`+ 1u,`
	`157`	`+ 1u`
	`158`	`+ );`
`142`	`159`	`}`
`143`	`160`
`144`	`161`	`DINLINE`
`@@ -161,10 +178,14 @@ struct SphericMapper<2, mpl::void_>`
`161`	`178`	`{`
`162`	`179`	`static constexpr int dim = 2;`
`163`	`180`
`164`		`- dim3 cudaGridDim(const math::Size_t<2>& size, const math::Size_t<3>& blockDim) const`
	`181`	`+ typename math::Size_t<3>::BaseType`
	`182`	`+ cudaGridDim(const math::Size_t<2>& size, const math::Size_t<3>& blockDim) const`
`165`	`183`	`{`
`166`		`- return dim3(size.x() / blockDim.x(),`
`167`		`- size.y() / blockDim.y(), 1);`
	`184`	`+ return math::Size_t<3>(`
	`185`	`+ size.x() / blockDim.x(),`
	`186`	`+ size.y() / blockDim.y(),`
	`187`	`+ 1`
	`188`	`+ );`
`168`	`189`	`}`
`169`	`190`
`170`	`191`	`DINLINE`
`@@ -188,11 +209,14 @@ struct SphericMapper<3, mpl::void_>`
`188`	`209`	`{`
`189`	`210`	`static constexpr int dim = 3;`
`190`	`211`
`191`		`- dim3 cudaGridDim(const math::Size_t<3>& size, const math::Size_t<3>& blockDim) const`
	`212`	`+ typename math::Size_t<3>::BaseType`
	`213`	`+ cudaGridDim(const math::Size_t<3>& size, const math::Size_t<3>& blockDim) const`
`192`	`214`	`{`
`193`		`- return dim3(size.x() / blockDim.x(),`
`194`		`- size.y() / blockDim.y(),`
`195`		`- size.z() / blockDim.z());`
	`215`	`+ return math::Size_t<3>(`
	`216`	`+ size.x() / blockDim.x(),`
	`217`	`+ size.y() / blockDim.y(),`
	`218`	`+ size.z() / blockDim.z()`
	`219`	`+ );`
`196`	`220`	`}`
`197`	`221`
`198`	`222`	`DINLINE`