@@ -215,49 +215,42 @@ template <typename _DataType, typename _ResultType>
215
215
class dpnp_trace_c_kernel ;
216
216
217
217
template <typename _DataType, typename _ResultType>
218
- void dpnp_trace_c (const void * array1_in, void * result1 , const size_t * shape_, const size_t ndim)
218
+ void dpnp_trace_c (const void * array1_in, void * result_in , const size_t * shape_, const size_t ndim)
219
219
{
220
- if (( array1_in == nullptr ) || (result1 == nullptr ) || ( shape_ == nullptr ) || ( ndim == 0 ) )
220
+ if (! array1_in || !result_in || ! shape_ || ! ndim)
221
221
{
222
222
return ;
223
223
}
224
224
225
- const _DataType* array_in = reinterpret_cast <const _DataType*>(array1_in);
226
- _ResultType* result = reinterpret_cast <_ResultType*>(result1);
225
+ const _DataType* input = reinterpret_cast <const _DataType*>(array1_in);
226
+ _ResultType* result = reinterpret_cast <_ResultType*>(result_in);
227
+ const size_t last_dim = shape_[ndim - 1 ];
227
228
228
- size_t size = 1 ;
229
- for (size_t i = 0 ; i < ndim - 1 ; ++i)
230
- {
231
- size *= shape_[i];
232
- }
233
-
234
- if (size == 0 )
229
+ const size_t size = std::accumulate (shape_, shape_ + (ndim - 1 ), 1 , std::multiplies<size_t >());
230
+ if (!size)
235
231
{
236
232
return ;
237
233
}
238
234
239
- size_t * shape = reinterpret_cast <size_t *>(dpnp_memory_alloc_c (ndim * sizeof (size_t )));
240
- auto memcpy_event = DPNP_QUEUE.memcpy (shape, shape_, ndim * sizeof (size_t ));
241
-
242
235
cl::sycl::range<1 > gws (size);
243
236
auto kernel_parallel_for_func = [=](auto index) {
244
237
size_t i = index[0 ];
245
- result[i] = 0 ;
246
- for (size_t j = 0 ; j < shape[ndim - 1 ]; ++j)
238
+ _ResultType acc = _ResultType (0 );
239
+
240
+ for (size_t j = 0 ; j < last_dim; ++j)
247
241
{
248
- result[i] += array_in [i * shape[ndim - 1 ] + j];
242
+ acc += input [i * last_dim + j];
249
243
}
244
+
245
+ result[i] = acc;
250
246
};
251
247
252
248
auto kernel_func = [&](cl::sycl::handler& cgh) {
253
- cgh.depends_on ({memcpy_event});
254
249
cgh.parallel_for <class dpnp_trace_c_kernel <_DataType, _ResultType>>(gws, kernel_parallel_for_func);
255
250
};
256
251
257
252
auto event = DPNP_QUEUE.submit (kernel_func);
258
253
event.wait ();
259
-
260
- dpnp_memory_free_c (shape);
261
254
}
262
255
263
256
template <typename _DataType>
0 commit comments