Skip to content

Commit d3a1fe5

Browse files
authored
Avoid extra allocation of memory (#679)
* Avoid extra allocation of memory in USM SYCL iterator
1 parent a152e41 commit d3a1fe5

File tree

3 files changed

+31
-44
lines changed

3 files changed

+31
-44
lines changed

dpnp/backend/kernels/dpnp_krnl_fft.cpp

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ void dpnp_fft_fft_c(const void* array1_in,
6767
long* output_shape_offsets = reinterpret_cast<long*>(dpnp_memory_alloc_c(shape_size * sizeof(long)));
6868
long* input_shape_offsets = reinterpret_cast<long*>(dpnp_memory_alloc_c(shape_size * sizeof(long)));
6969
// must be a thread local storage.
70-
long* xyz = reinterpret_cast<long*>(dpnp_memory_alloc_c(result_size * shape_size * sizeof(long)));
7170
long* axis_iterator = reinterpret_cast<long*>(dpnp_memory_alloc_c(result_size * shape_size * sizeof(long)));
7271

7372
get_shape_offsets_inkernel<long>(output_shape, shape_size, output_shape_offsets);
@@ -79,14 +78,14 @@ void dpnp_fft_fft_c(const void* array1_in,
7978

8079
double sum_real = 0.0;
8180
double sum_imag = 0.0;
82-
// need to replace these arrays by thread local storage
83-
long* xyz_thread = xyz + (output_id * shape_size);
81+
// need to replace this array by thread local storage
8482
long* axis_iterator_thread = axis_iterator + (output_id * shape_size);
8583

86-
get_xyz_by_id_inkernel(output_id, output_shape_offsets, shape_size, xyz_thread);
84+
size_t xyz_id;
8785
for (size_t i = 0; i < shape_size; ++i)
8886
{
89-
axis_iterator_thread[i] = xyz_thread[i];
87+
xyz_id = get_xyz_id_by_id_inkernel(output_id, output_shape_offsets, shape_size, i);
88+
axis_iterator_thread[i] = xyz_id;
9089
}
9190

9291
const long axis_length = input_boundarie;
@@ -114,7 +113,8 @@ void dpnp_fft_fft_c(const void* array1_in,
114113
}
115114
}
116115

117-
const size_t output_local_id = xyz_thread[axis];
116+
xyz_id = get_xyz_id_by_id_inkernel(output_id, output_shape_offsets, shape_size, axis);
117+
const size_t output_local_id = xyz_id;
118118
const double angle = 2.0 * kernel_pi * it * output_local_id / axis_length;
119119

120120
const double angle_cos = cl::sycl::cos(angle);
@@ -153,7 +153,6 @@ void dpnp_fft_fft_c(const void* array1_in,
153153
dpnp_memory_free_c(input_shape_offsets);
154154
dpnp_memory_free_c(output_shape_offsets);
155155
dpnp_memory_free_c(axis_iterator);
156-
dpnp_memory_free_c(xyz);
157156

158157
return;
159158
}

dpnp/backend/src/dpnp_iterator.hpp

Lines changed: 7 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -290,10 +290,6 @@ class DPNPC_id final
290290
get_shape_offsets_inkernel<size_type>(output_shape, output_shape_size, output_shape_strides);
291291

292292
iteration_size = 1;
293-
294-
// make thread private storage for each shape by multiplying memory
295-
sycl_output_xyz =
296-
reinterpret_cast<size_type*>(dpnp_memory_alloc_c(output_size * output_shape_size_in_bytes));
297293
}
298294
}
299295

@@ -400,10 +396,6 @@ class DPNPC_id final
400396
{
401397
axes_shape_strides[i] = input_shape_strides[axes[i]];
402398
}
403-
404-
// make thread private storage for each shape by multiplying memory
405-
sycl_output_xyz =
406-
reinterpret_cast<size_type*>(dpnp_memory_alloc_c(output_size * output_shape_size_in_bytes));
407399
}
408400
}
409401

@@ -485,35 +477,30 @@ class DPNPC_id final
485477
{
486478
assert(output_global_id < output_size);
487479

488-
// use thread private storage
489-
size_type* sycl_output_xyz_thread = sycl_output_xyz + (output_global_id * output_shape_size);
490-
491-
get_xyz_by_id_inkernel(output_global_id, output_shape_strides, output_shape_size, sycl_output_xyz_thread);
492-
493480
for (size_t iit = 0, oit = 0; iit < input_shape_size; ++iit)
494481
{
495482
if (std::find(axes.begin(), axes.end(), iit) == axes.end())
496483
{
497-
input_global_id += (sycl_output_xyz_thread[oit] * input_shape_strides[iit]);
484+
const size_type output_xyz_id = get_xyz_id_by_id_inkernel(output_global_id, output_shape_strides,
485+
output_shape_size, oit);
486+
input_global_id += (output_xyz_id * input_shape_strides[iit]);
498487
++oit;
499488
}
500489
}
501490
}
502491
else if (broadcast_use)
503492
{
504493
assert(output_global_id < output_size);
505-
506-
// use thread private storage
507-
size_type* sycl_output_xyz_thread = sycl_output_xyz + (output_global_id * output_shape_size);
508-
509-
get_xyz_by_id_inkernel(output_global_id, output_shape_strides, output_shape_size, sycl_output_xyz_thread);
494+
assert(input_shape_size <= output_shape_size);
510495

511496
for (int irit = input_shape_size - 1, orit = output_shape_size - 1; irit >= 0; --irit, --orit)
512497
{
513498
size_type* broadcast_axes_end = broadcast_axes + broadcast_axes_size;
514499
if (std::find(broadcast_axes, broadcast_axes_end, orit) == broadcast_axes_end)
515500
{
516-
input_global_id += (sycl_output_xyz_thread[orit] * input_shape_strides[irit]);
501+
const size_type output_xyz_id = get_xyz_id_by_id_inkernel(output_global_id, output_shape_strides,
502+
output_shape_size, orit);
503+
input_global_id += (output_xyz_id * input_shape_strides[irit]);
517504
}
518505
}
519506
}
@@ -565,10 +552,8 @@ class DPNPC_id final
565552
output_shape_size = size_type{};
566553
dpnp_memory_free_c(output_shape);
567554
dpnp_memory_free_c(output_shape_strides);
568-
dpnp_memory_free_c(sycl_output_xyz);
569555
output_shape = nullptr;
570556
output_shape_strides = nullptr;
571-
sycl_output_xyz = nullptr;
572557
}
573558

574559
void free_memory()
@@ -602,9 +587,6 @@ class DPNPC_id final
602587
size_type iteration_shape_size = size_type{};
603588
size_type* iteration_shape_strides = nullptr;
604589
size_type* axes_shape_strides = nullptr;
605-
606-
// data allocated to use inside SYCL kernels
607-
size_type* sycl_output_xyz = nullptr;
608590
};
609591

610592
#endif // DPNP_ITERATOR_H

dpnp/backend/src/dpnp_utils.hpp

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#ifndef BACKEND_UTILS_H // Cython compatibility
2828
#define BACKEND_UTILS_H
2929

30+
#include <cassert>
3031
#include <algorithm>
3132
#include <iostream>
3233
#include <iterator>
@@ -69,33 +70,38 @@ void get_shape_offsets_inkernel(const _DataType* shape, size_t shape_size, _Data
6970

7071
/**
7172
* @ingroup BACKEND_UTILS
72-
* @brief Calculate ids for all given axes from linear index
73+
* @brief Calculate xyz id for given axis from linear index
7374
*
74-
* Calculates ids of the array with given shape. This is reverse operation of @ref get_id_by_xyz_inkernel
75+
* Calculates xyz id of the array with given shape.
7576
* for example:
7677
* input_array_shape_offsets[20, 5, 1]
7778
* global_id == 5
78-
* xyz array ids should be [0, 1, 0]
79+
* axis == 1
80+
* xyz_id should be 1
7981
*
80-
* @param [in] global_id linear index id of the element in multy-D array.
82+
* @param [in] global_id linear index of the element in multy-D array.
8183
* @param [in] offsets array with input offsets.
8284
* @param [in] offsets_size array size for @ref offsets parameter.
83-
* @param [out] xyz Result array with @ref offsets_size size.
85+
* @param [in] axis axis.
8486
*/
8587
template <typename _DataType>
86-
void get_xyz_by_id_inkernel(size_t global_id, const _DataType* offsets, size_t offsets_size, _DataType* xyz)
88+
_DataType get_xyz_id_by_id_inkernel(size_t global_id, const _DataType* offsets, size_t offsets_size, size_t axis)
8789
{
90+
/* avoid warning unused variable*/
91+
(void)offsets_size;
92+
93+
assert(axis < offsets_size);
94+
95+
_DataType xyz_id = 0;
8896
long reminder = global_id;
89-
for (size_t axis = 0; axis < offsets_size; ++axis)
97+
for (size_t i = 0; i < axis + 1; ++i)
9098
{
91-
/* reconstruct [x][y][z] from given linear idx */
92-
const _DataType axis_val = offsets[axis];
93-
_DataType xyz_id = reminder / axis_val;
99+
const _DataType axis_val = offsets[i];
100+
xyz_id = reminder / axis_val;
94101
reminder = reminder % axis_val;
95-
xyz[axis] = xyz_id;
96102
}
97103

98-
return;
104+
return xyz_id;
99105
}
100106

101107
/**

0 commit comments

Comments
 (0)