Skip to content

Commit 9a97a20

Browse files
[PMacc] transform kernel functions to functors
- use `PMACC_TYPEKERNEL` macro to start a kernel - transform global kernel function to a functor
1 parent 9a569f1 commit 9a97a20

File tree

15 files changed

+958
-838
lines changed

15 files changed

+958
-838
lines changed

src/libPMacc/include/cuSTL/algorithm/kernel/Foreach.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,10 @@ namespace kernel
6363
/* ... */ \
6464
BOOST_PP_REPEAT(N, SHIFT_CURSOR_ZONE, _) \
6565
\
66-
dim3 blockDim(BlockDim::toRT().toDim3()); \
66+
auto blockDim = BlockDim::toRT(); \
6767
detail::SphericMapper<Zone::dim, BlockDim> mapper; \
6868
using namespace PMacc; \
69-
__cudaKernel(detail::kernelForeach)(mapper.cudaGridDim(p_zone.size), blockDim) \
69+
PMACC_TYPEKERNEL(detail::kernelForeach)(mapper.cudaGridDim(p_zone.size), blockDim) \
7070
/* c0_shifted, c1_shifted, ... */ \
7171
(mapper, BOOST_PP_ENUM(N, SHIFTED_CURSOR, _), lambda::make_Functor(functor)); \
7272
}

src/libPMacc/include/cuSTL/algorithm/kernel/ForeachBlock.hpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -59,15 +59,19 @@ namespace detail
5959
/* typename C0, typename C1, ... */ \
6060
template<typename Mapper, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor> \
6161
/* C0 c0, C1 c1, ... */ \
62-
__global__ void kernelForeachBlock(Mapper mapper, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), Functor functor) \
62+
DINLINE void operator()(Mapper mapper, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), Functor functor) const \
6363
{ \
6464
math::Int<Mapper::dim> cellIndex(mapper(blockIdx)); \
6565
/* c0[cellIndex], c1[cellIndex], ... */ \
6666
functor(BOOST_PP_ENUM(N, SHIFTACCESS_CURSOR, _)); \
6767
}
6868

69+
struct kernelForeachBlock
70+
{
71+
6972
BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), KERNEL_FOREACH, _)
7073

74+
};
7175
#undef KERNEL_FOREACH
7276
#undef SHIFTACCESS_CURSOR
7377

@@ -87,10 +91,10 @@ BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), KERNEL_FOREA
8791
/* ... */ \
8892
BOOST_PP_REPEAT(N, SHIFT_CURSOR_ZONE, _) \
8993
\
90-
dim3 blockDim(ThreadBlock::toRT().toDim3()); \
94+
auto blockDim = ThreadBlock::toRT(); \
9195
detail::SphericMapper<Zone::dim, BlockDim> mapper; \
9296
using namespace PMacc; \
93-
__cudaKernel(detail::kernelForeachBlock)(mapper.cudaGridDim(p_zone.size), blockDim) \
97+
PMACC_TYPEKERNEL(detail::kernelForeachBlock)(mapper.cudaGridDim(p_zone.size), blockDim) \
9498
/* c0_shifted, c1_shifted, ... */ \
9599
(mapper, BOOST_PP_ENUM(N, SHIFTED_CURSOR, _), lambda::make_Functor(functor)); \
96100
}

src/libPMacc/include/cuSTL/algorithm/kernel/detail/ForeachKernel.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,17 @@ namespace detail
4242
/* typename C0, ..., typename CN */ \
4343
template<typename Mapper, BOOST_PP_ENUM_PARAMS(N, typename C), typename Functor> \
4444
/* C0 c0, ..., CN cN */ \
45-
__global__ void kernelForeach(Mapper mapper, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), Functor functor) \
45+
DINLINE void operator()(Mapper mapper, BOOST_PP_ENUM_BINARY_PARAMS(N, C, c), Functor functor) const \
4646
{ \
4747
math::Int<Mapper::dim> cellIndex(mapper(blockIdx, threadIdx)); \
4848
/* forward(c0[cellIndex]), ..., forward(cN[cellIndex]) */ \
4949
functor(BOOST_PP_ENUM(N, SHIFTACCESS_CURSOR, _)); \
5050
}
5151

52+
struct kernelForeach
53+
{
5254
BOOST_PP_REPEAT_FROM_TO(1, BOOST_PP_INC(FOREACH_KERNEL_MAX_PARAMS), KERNEL_FOREACH, _)
53-
55+
};
5456
#undef KERNEL_FOREACH
5557
#undef SHIFTACCESS_CURSOR
5658

src/libPMacc/include/cuSTL/algorithm/kernel/detail/SphericMapper.hpp

Lines changed: 42 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,14 @@ struct SphericMapper<1, BlockSize>
5555
{
5656
static constexpr int dim = 1;
5757

58-
dim3 cudaGridDim(const math::Size_t<1>& size) const
58+
typename math::Size_t<3>::BaseType
59+
cudaGridDim(const math::Size_t<1>& size) const
5960
{
60-
return dim3(size.x() / BlockSize::x::value, 1, 1);
61+
return math::Size_t<3>(
62+
size.x() / BlockSize::x::value,
63+
1u,
64+
1u
65+
);
6166
}
6267

6368
HDINLINE
@@ -80,10 +85,14 @@ struct SphericMapper<2, BlockSize>
8085
{
8186
static constexpr int dim = 2;
8287

83-
dim3 cudaGridDim(const math::Size_t<2>& size) const
88+
typename math::Size_t<3>::BaseType
89+
cudaGridDim(const math::Size_t<2>& size) const
8490
{
85-
return dim3(size.x() / BlockSize::x::value,
86-
size.y() / BlockSize::y::value, 1);
91+
return math::Size_t<3>(
92+
size.x() / BlockSize::x::value,
93+
size.y() / BlockSize::y::value,
94+
1u
95+
);
8796
}
8897

8998
HDINLINE
@@ -107,11 +116,14 @@ struct SphericMapper<3, BlockSize>
107116
{
108117
static constexpr int dim = 3;
109118

110-
dim3 cudaGridDim(const math::Size_t<3>& size) const
119+
typename math::Size_t<3>::BaseType
120+
cudaGridDim(const math::Size_t<3>& size) const
111121
{
112-
return dim3(size.x() / BlockSize::x::value,
113-
size.y() / BlockSize::y::value,
114-
size.z() / BlockSize::z::value);
122+
return math::Size_t<3>(
123+
size.x() / BlockSize::x::value,
124+
size.y() / BlockSize::y::value,
125+
size.z() / BlockSize::z::value
126+
);
115127
}
116128

117129
HDINLINE
@@ -136,9 +148,14 @@ struct SphericMapper<1, mpl::void_>
136148
{
137149
static constexpr int dim = 1;
138150

139-
dim3 cudaGridDim(const math::Size_t<1>& size, const math::Size_t<3>& blockDim) const
151+
typename math::Size_t<3>::BaseType
152+
cudaGridDim(const math::Size_t<1>& size, const math::Size_t<3>& blockDim) const
140153
{
141-
return dim3(size.x() / blockDim.x(), 1, 1);
154+
return math::Size_t<3>(
155+
size.x() / blockDim.x(),
156+
1u,
157+
1u
158+
);
142159
}
143160

144161
DINLINE
@@ -161,10 +178,14 @@ struct SphericMapper<2, mpl::void_>
161178
{
162179
static constexpr int dim = 2;
163180

164-
dim3 cudaGridDim(const math::Size_t<2>& size, const math::Size_t<3>& blockDim) const
181+
typename math::Size_t<3>::BaseType
182+
cudaGridDim(const math::Size_t<2>& size, const math::Size_t<3>& blockDim) const
165183
{
166-
return dim3(size.x() / blockDim.x(),
167-
size.y() / blockDim.y(), 1);
184+
return math::Size_t<3>(
185+
size.x() / blockDim.x(),
186+
size.y() / blockDim.y(),
187+
1
188+
);
168189
}
169190

170191
DINLINE
@@ -188,11 +209,14 @@ struct SphericMapper<3, mpl::void_>
188209
{
189210
static constexpr int dim = 3;
190211

191-
dim3 cudaGridDim(const math::Size_t<3>& size, const math::Size_t<3>& blockDim) const
212+
typename math::Size_t<3>::BaseType
213+
cudaGridDim(const math::Size_t<3>& size, const math::Size_t<3>& blockDim) const
192214
{
193-
return dim3(size.x() / blockDim.x(),
194-
size.y() / blockDim.y(),
195-
size.z() / blockDim.z());
215+
return math::Size_t<3>(
216+
size.x() / blockDim.x(),
217+
size.y() / blockDim.y(),
218+
size.z() / blockDim.z()
219+
);
196220
}
197221

198222
DINLINE

src/libPMacc/include/cuSTL/algorithm/kernel/run-time/Foreach.hpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,10 +135,14 @@ math::Size_t<DIM3> getBestCudaBlockDim(const math::Size_t<dim> gridDim)
135135
PMACC_VERIFY(this->_blockDim.y() <= cudaSpecs::MaxNumThreadsPerBlockDim::y::value); \
136136
PMACC_VERIFY(this->_blockDim.z() <= cudaSpecs::MaxNumThreadsPerBlockDim::z::value); \
137137
\
138-
dim3 blockDim(this->_blockDim.x(), this->_blockDim.y(), this->_blockDim.z()); \
138+
typename math::Size_t<3>::BaseType blockDim( \
139+
this->_blockDim.x(), \
140+
this->_blockDim.y(), \
141+
this->_blockDim.z() \
142+
); \
139143
kernel::detail::SphericMapper<Zone::dim> mapper; \
140144
using namespace PMacc; \
141-
__cudaKernel(kernel::detail::kernelForeach)(mapper.cudaGridDim(p_zone.size, this->_blockDim), blockDim) \
145+
PMACC_TYPEKERNEL(kernel::detail::kernelForeach)(mapper.cudaGridDim(p_zone.size, this->_blockDim), blockDim) \
142146
/* c0_shifted, ..., cN_shifted */ \
143147
(mapper, BOOST_PP_ENUM(N, SHIFTED_CURSOR, _), lambda::make_Functor(functor)); \
144148
}

src/libPMacc/include/eventSystem/tasks/TaskSetCurrentSizeOnDevice.hpp

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,18 @@
2828
#include "eventSystem/tasks/StreamTask.hpp"
2929
#include "eventSystem/events/kernelEvents.hpp"
3030
#include "dimensions/DataSpace.hpp"
31+
#include "nvidia/gpuEntryFunction.hpp"
3132

3233
#include <cuda_runtime_api.h>
3334
#include <cuda.h>
3435

35-
__global__ void kernelSetValueOnDeviceMemory(size_t* pointer, const size_t size)
36+
struct kernelSetValueOnDeviceMemory
3637
{
37-
*pointer = size;
38-
}
38+
DINLINE void operator()(size_t* pointer, const size_t size) const
39+
{
40+
*pointer = size;
41+
}
42+
};
3943

4044
namespace PMacc
4145
{
@@ -83,9 +87,17 @@ class TaskSetCurrentSizeOnDevice : public StreamTask
8387

8488
void setSize()
8589
{
86-
kernelSetValueOnDeviceMemory
87-
<< < 1, 1, 0, this->getCudaStream() >> >
88-
(destination->getCurrentSizeOnDevicePointer(), size);
90+
auto sizePtr = destination->getCurrentSizeOnDevicePointer();
91+
nvidia::gpuEntryFunction<<<
92+
1,
93+
1,
94+
0,
95+
this->getCudaStream()
96+
>>>(
97+
kernelSetValueOnDeviceMemory{},
98+
sizePtr,
99+
size
100+
);
89101

90102
activate();
91103
}

src/libPMacc/include/eventSystem/tasks/TaskSetValue.hpp

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "memory/boxes/DataBox.hpp"
3030
#include "eventSystem/EventSystem.hpp"
3131
#include "eventSystem/tasks/StreamTask.hpp"
32+
#include "nvidia/gpuEntryFunction.hpp"
3233

3334
#include <boost/type_traits/remove_pointer.hpp>
3435
#include <boost/type_traits.hpp>
@@ -82,20 +83,22 @@ getValue(T_Type& value)
8283

8384
}
8485

85-
template <class DataBox, typename T_ValueType, typename Space>
86-
__global__ void kernelSetValue(DataBox data, const T_ValueType value, const Space size)
86+
struct kernelSetValue
8787
{
88-
const Space threadIndex(threadIdx);
89-
const Space blockIndex(blockIdx);
90-
const Space gridSize(blockDim);
91-
92-
Space idx(gridSize * blockIndex + threadIndex);
88+
template <class DataBox, typename T_ValueType, typename Space>
89+
DINLINE void operator()(DataBox data, const T_ValueType value, const Space size) const
90+
{
91+
const Space threadIndex(threadIdx);
92+
const Space blockIndex(blockIdx);
93+
const Space gridSize(blockDim);
9394

94-
if (idx.x() >= size.x())
95-
return;
96-
data(idx) = taskSetValueHelper::getValue(value);
97-
}
95+
Space idx(gridSize * blockIndex + threadIndex);
9896

97+
if (idx.x() >= size.x())
98+
return;
99+
data(idx) = taskSetValueHelper::getValue(value);
100+
}
101+
};
99102

100103
template <class TYPE, unsigned DIM>
101104
class DeviceBuffer;
@@ -177,13 +180,23 @@ class TaskSetValue<T_ValueType, T_dim, true> : public TaskSetValueBase<T_ValueTy
177180

178181
if(area_size.productOfComponents() != 0)
179182
{
180-
dim3 gridSize = area_size;
183+
auto gridSize = area_size;
181184

182185
/* line wise thread blocks*/
183-
gridSize.x = ceil(double(gridSize.x) / 256.);
184-
185-
kernelSetValue<<<gridSize, 256, 0, this->getCudaStream()>>>
186-
(this->destination->getDataBox(), this->value, area_size);
186+
gridSize.x() = ceil(double(gridSize.x()) / 256.);
187+
188+
auto destBox = this->destination->getDataBox();
189+
nvidia::gpuEntryFunction<<<
190+
gridSize,
191+
256,
192+
0,
193+
this->getCudaStream()
194+
>>>(
195+
kernelSetValue{},
196+
destBox,
197+
this->value,
198+
area_size
199+
);
187200
}
188201
this->activate();
189202
}
@@ -221,10 +234,10 @@ class TaskSetValue<T_ValueType, T_dim, false> : public TaskSetValueBase<T_ValueT
221234
const DataSpace<dim> area_size(this->destination->getCurrentDataSpace(current_size));
222235
if(area_size.productOfComponents() != 0)
223236
{
224-
dim3 gridSize = area_size;
237+
auto gridSize = area_size;
225238

226239
/* line wise thread blocks*/
227-
gridSize.x = ceil(double(gridSize.x) / 256.);
240+
gridSize.x()= ceil(double(gridSize.x()) / 256.);
228241

229242
ValueType* devicePtr = this->destination->getPointer();
230243

@@ -234,8 +247,19 @@ class TaskSetValue<T_ValueType, T_dim, false> : public TaskSetValueBase<T_ValueT
234247
CUDA_CHECK(cudaMemcpyAsync(
235248
devicePtr, valuePointer_host, sizeof (ValueType),
236249
cudaMemcpyHostToDevice, this->getCudaStream()));
237-
kernelSetValue<<<gridSize, 256, 0, this->getCudaStream()>>>
238-
(this->destination->getDataBox(), devicePtr, area_size);
250+
251+
auto destBox = this->destination->getDataBox();
252+
nvidia::gpuEntryFunction<<<
253+
gridSize,
254+
256,
255+
0,
256+
this->getCudaStream()
257+
>>>(
258+
kernelSetValue{},
259+
destBox,
260+
devicePtr,
261+
area_size
262+
);
239263
}
240264

241265
this->activate();

0 commit comments

Comments
 (0)