@@ -946,17 +946,20 @@ template <typename _DataType>
946
946
void dpnp_rng_shuffle_c (
947
947
void * result, const size_t itemsize, const size_t ndim, const size_t high_dim_size, const size_t size)
948
948
{
949
- if (!(size) || !(high_dim_size > 1 ) )
949
+ if (!result )
950
950
{
951
951
return ;
952
952
}
953
953
954
- char * result1 = reinterpret_cast <char *>(result);
954
+ if (!size || !ndim || !(high_dim_size > 1 ))
955
+ {
956
+ return ;
957
+ }
955
958
956
- double * Uvec = nullptr ;
959
+ char * result1 = reinterpret_cast < char *>(result) ;
957
960
958
961
size_t uvec_size = high_dim_size - 1 ;
959
- Uvec = reinterpret_cast <double *>(dpnp_memory_alloc_c (uvec_size * sizeof (double )));
962
+ double * Uvec = reinterpret_cast <double *>(dpnp_memory_alloc_c (uvec_size * sizeof (double )));
960
963
mkl_rng::uniform<double > uniform_distribution (0.0 , 1.0 );
961
964
auto uniform_event = mkl_rng::generate (uniform_distribution, DPNP_RNG_ENGINE, uvec_size, Uvec);
962
965
uniform_event.wait ();
@@ -966,42 +969,52 @@ void dpnp_rng_shuffle_c(
966
969
// Fast, statically typed path: shuffle the underlying buffer.
967
970
// Only for non-empty, 1d objects of class ndarray (subclasses such
968
971
// as MaskedArrays may not support this approach).
969
- // TODO
970
- // kernel
971
- char * buf = nullptr ;
972
- buf = reinterpret_cast <char *>(dpnp_memory_alloc_c (itemsize * sizeof (char )));
972
+ char * buf = reinterpret_cast <char *>(dpnp_memory_alloc_c (itemsize * sizeof (char )));
973
973
for (size_t i = uvec_size; i > 0 ; i--)
974
974
{
975
975
size_t j = (size_t )(floor ((i + 1 ) * Uvec[i - 1 ]));
976
- memcpy (buf, result1 + j * itemsize, itemsize);
977
- memcpy (result1 + j * itemsize, result1 + i * itemsize, itemsize);
978
- memcpy (result1 + i * itemsize, buf, itemsize);
976
+ if (i != j)
977
+ {
978
+ auto memcpy1 =
979
+ DPNP_QUEUE.submit ([&](cl::sycl::handler& h) { h.memcpy (buf, result1 + j * itemsize, itemsize); });
980
+ auto memcpy2 = DPNP_QUEUE.submit ([&](cl::sycl::handler& h) {
981
+ h.depends_on ({memcpy1});
982
+ h.memcpy (result1 + j * itemsize, result1 + i * itemsize, itemsize);
983
+ });
984
+ auto memcpy3 = DPNP_QUEUE.submit ([&](cl::sycl::handler& h) {
985
+ h.depends_on ({memcpy2});
986
+ h.memcpy (result1 + i * itemsize, buf, itemsize);
987
+ });
988
+ memcpy3.wait ();
989
+ }
979
990
}
980
-
981
991
dpnp_memory_free_c (buf);
982
992
}
983
993
else
984
994
{
985
995
// Multidimensional ndarrays require a bounce buffer.
986
- // TODO
987
- // kernel
988
- char * buf = nullptr ;
989
996
size_t step_size = (size / high_dim_size) * itemsize; // size in bytes for x[i] element
990
- buf = reinterpret_cast <char *>(dpnp_memory_alloc_c (step_size * sizeof (char )));
997
+ char * buf = reinterpret_cast <char *>(dpnp_memory_alloc_c (step_size * sizeof (char )));
991
998
for (size_t i = uvec_size; i > 0 ; i--)
992
999
{
993
1000
size_t j = (size_t )(floor ((i + 1 ) * Uvec[i - 1 ]));
994
1001
if (j < i)
995
1002
{
996
- memcpy (buf, result1 + j * step_size, step_size);
997
- memcpy (result1 + j * step_size, result1 + i * step_size, step_size);
998
- memcpy (result1 + i * step_size, buf, step_size);
1003
+ auto memcpy1 =
1004
+ DPNP_QUEUE.submit ([&](cl::sycl::handler& h) { h.memcpy (buf, result1 + j * step_size, step_size); });
1005
+ auto memcpy2 = DPNP_QUEUE.submit ([&](cl::sycl::handler& h) {
1006
+ h.depends_on ({memcpy1});
1007
+ h.memcpy (result1 + j * step_size, result1 + i * step_size, step_size);
1008
+ });
1009
+ auto memcpy3 = DPNP_QUEUE.submit ([&](cl::sycl::handler& h) {
1010
+ h.depends_on ({memcpy2});
1011
+ h.memcpy (result1 + i * step_size, buf, step_size);
1012
+ });
1013
+ memcpy3.wait ();
999
1014
}
1000
1015
}
1001
-
1002
1016
dpnp_memory_free_c (buf);
1003
1017
}
1004
-
1005
1018
dpnp_memory_free_c (Uvec);
1006
1019
}
1007
1020
0 commit comments