Skip to content

Commit 96caa74

Browse files
authored
[SYCL] Optimize NDRDescT by removing sycl::range, sycl::id and padding (#18851)
sycl::range and sycl::id perform validity checks every time setting them. Use std::array instead as dimensions should already be valid. In addition, remove explicitly padding dimensions smaller than 3 and get number of dimensions from template argument instead of function argument.
1 parent 8cdb923 commit 96caa74

File tree

6 files changed

+182
-135
lines changed

6 files changed

+182
-135
lines changed

sycl/include/sycl/handler.hpp

Lines changed: 25 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -914,7 +914,7 @@ class __SYCL_EXPORT handler {
914914
.template get_property<
915915
syclex::cuda::cluster_size_key<ClusterDim>>()
916916
.get_cluster_size();
917-
setKernelClusterLaunch(padRange(ClusterSize), ClusterDim);
917+
setKernelClusterLaunch(ClusterSize);
918918
}
919919
}
920920

@@ -3716,7 +3716,9 @@ class __SYCL_EXPORT handler {
37163716
void setKernelIsCooperative(bool);
37173717

37183718
// Set using cuda thread block cluster launch flag and set the launch bounds.
3719-
void setKernelClusterLaunch(sycl::range<3> ClusterSize, int Dims);
3719+
void setKernelClusterLaunch(sycl::range<3> ClusterSize);
3720+
void setKernelClusterLaunch(sycl::range<2> ClusterSize);
3721+
void setKernelClusterLaunch(sycl::range<1> ClusterSize);
37203722

37213723
// Set the request work group memory size (work_group_static ext).
37223724
void setKernelWorkGroupMem(size_t Size);
@@ -3817,54 +3819,37 @@ class __SYCL_EXPORT handler {
38173819
bool HasAssociatedAccessor(detail::AccessorImplHost *Req,
38183820
access::target AccessTarget) const;
38193821

3820-
template <int Dims> static sycl::range<3> padRange(sycl::range<Dims> Range) {
3821-
if constexpr (Dims == 3) {
3822-
return Range;
3823-
} else {
3824-
sycl::range<3> Res{0, 0, 0};
3825-
for (int I = 0; I < Dims; ++I)
3826-
Res[I] = Range[I];
3827-
return Res;
3828-
}
3829-
}
3830-
3831-
template <int Dims> static sycl::id<3> padId(sycl::id<Dims> Id) {
3832-
if constexpr (Dims == 3) {
3833-
return Id;
3834-
} else {
3835-
sycl::id<3> Res{0, 0, 0};
3836-
for (int I = 0; I < Dims; ++I)
3837-
Res[I] = Id[I];
3838-
return Res;
3839-
}
3840-
}
3841-
38423822
template <int Dims>
38433823
void setNDRangeDescriptor(sycl::range<Dims> N,
38443824
bool SetNumWorkGroups = false) {
3845-
return setNDRangeDescriptorPadded(padRange(N), SetNumWorkGroups, Dims);
3825+
return setNDRangeDescriptor(N, SetNumWorkGroups);
38463826
}
38473827
template <int Dims>
38483828
void setNDRangeDescriptor(sycl::range<Dims> NumWorkItems,
38493829
sycl::id<Dims> Offset) {
3850-
return setNDRangeDescriptorPadded(padRange(NumWorkItems), padId(Offset),
3851-
Dims);
3830+
return setNDRangeDescriptor(NumWorkItems, Offset);
38523831
}
38533832
template <int Dims>
38543833
void setNDRangeDescriptor(sycl::nd_range<Dims> ExecutionRange) {
3855-
return setNDRangeDescriptorPadded(
3856-
padRange(ExecutionRange.get_global_range()),
3857-
padRange(ExecutionRange.get_local_range()),
3858-
padId(ExecutionRange.get_offset()), Dims);
3859-
}
3860-
3861-
void setNDRangeDescriptorPadded(sycl::range<3> N, bool SetNumWorkGroups,
3862-
int Dims);
3863-
void setNDRangeDescriptorPadded(sycl::range<3> NumWorkItems,
3864-
sycl::id<3> Offset, int Dims);
3865-
void setNDRangeDescriptorPadded(sycl::range<3> NumWorkItems,
3866-
sycl::range<3> LocalSize, sycl::id<3> Offset,
3867-
int Dims);
3834+
return setNDRangeDescriptor(ExecutionRange.get_global_range(),
3835+
ExecutionRange.get_local_range(),
3836+
ExecutionRange.get_offset());
3837+
}
3838+
3839+
void setNDRangeDescriptor(sycl::range<3> N, bool SetNumWorkGroups);
3840+
void setNDRangeDescriptor(sycl::range<3> NumWorkItems, sycl::id<3> Offset);
3841+
void setNDRangeDescriptor(sycl::range<3> NumWorkItems,
3842+
sycl::range<3> LocalSize, sycl::id<3> Offset);
3843+
3844+
void setNDRangeDescriptor(sycl::range<2> N, bool SetNumWorkGroups);
3845+
void setNDRangeDescriptor(sycl::range<2> NumWorkItems, sycl::id<2> Offset);
3846+
void setNDRangeDescriptor(sycl::range<2> NumWorkItems,
3847+
sycl::range<2> LocalSize, sycl::id<2> Offset);
3848+
3849+
void setNDRangeDescriptor(sycl::range<1> N, bool SetNumWorkGroups);
3850+
void setNDRangeDescriptor(sycl::range<1> NumWorkItems, sycl::id<1> Offset);
3851+
void setNDRangeDescriptor(sycl::range<1> NumWorkItems,
3852+
sycl::range<1> LocalSize, sycl::id<1> Offset);
38683853

38693854
void setKernelInfo(void *KernelFuncPtr, int KernelNumArgs,
38703855
detail::kernel_param_desc_t (*KernelParamDescGetter)(int),

sycl/source/detail/cg.hpp

Lines changed: 58 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -62,99 +62,96 @@ class ArgDesc {
6262

6363
// The structure represents NDRange - global, local sizes, global offset and
6464
// number of dimensions.
65-
class NDRDescT {
66-
// The method initializes all sizes for dimensions greater than the passed one
67-
// to the default values, so they will not affect execution.
68-
void setNDRangeLeftover() {
69-
for (int I = Dims; I < 3; ++I) {
70-
GlobalSize[I] = 1;
71-
LocalSize[I] = LocalSize[0] ? 1 : 0;
72-
GlobalOffset[I] = 0;
73-
NumWorkGroups[I] = 0;
74-
}
75-
}
7665

77-
template <int Dims> static sycl::range<3> padRange(sycl::range<Dims> Range) {
78-
if constexpr (Dims == 3) {
79-
return Range;
80-
} else {
81-
sycl::range<3> Res{0, 0, 0};
82-
for (int I = 0; I < Dims; ++I)
83-
Res[I] = Range[I];
84-
return Res;
85-
}
86-
}
87-
88-
template <int Dims> static sycl::id<3> padId(sycl::id<Dims> Id) {
89-
if constexpr (Dims == 3) {
90-
return Id;
91-
} else {
92-
sycl::id<3> Res{0, 0, 0};
93-
for (int I = 0; I < Dims; ++I)
94-
Res[I] = Id[I];
95-
return Res;
96-
}
97-
}
66+
// TODO: A lot of tests rely on particular values to be set for dimensions that
67+
// are not used. To clarify, for example, if a 2D kernel is invoked, in
68+
// NDRDescT, the value of index 2 in GlobalSize must be set to either 1 or 0
69+
// depending on which constructor is used for no clear reason.
70+
// Instead, only sensible defaults should be used and tests should be updated
71+
// to reflect this.
72+
class NDRDescT {
9873

9974
public:
10075
NDRDescT() = default;
10176
NDRDescT(const NDRDescT &Desc) = default;
10277
NDRDescT(NDRDescT &&Desc) = default;
10378

104-
NDRDescT(sycl::range<3> N, bool SetNumWorkGroups, int DimsArg)
105-
: GlobalSize{SetNumWorkGroups ? sycl::range<3>{0, 0, 0} : N},
106-
NumWorkGroups{SetNumWorkGroups ? N : sycl::range<3>{0, 0, 0}},
107-
Dims{size_t(DimsArg)} {
108-
setNDRangeLeftover();
109-
}
79+
template <int Dims_>
80+
NDRDescT(sycl::range<Dims_> N, bool SetNumWorkGroups) : Dims{size_t(Dims_)} {
81+
if (SetNumWorkGroups) {
82+
for (size_t I = 0; I < Dims_; ++I) {
83+
NumWorkGroups[I] = N[I];
84+
}
85+
} else {
86+
for (size_t I = 0; I < Dims_; ++I) {
87+
GlobalSize[I] = N[I];
88+
}
11089

111-
NDRDescT(sycl::range<3> NumWorkItems, sycl::range<3> LocalSize,
112-
sycl::id<3> Offset, int DimsArg)
113-
: GlobalSize{NumWorkItems}, LocalSize{LocalSize}, GlobalOffset{Offset},
114-
Dims{size_t(DimsArg)} {
115-
setNDRangeLeftover();
90+
for (int I = Dims_; I < 3; ++I) {
91+
GlobalSize[I] = 1;
92+
}
93+
}
11694
}
11795

118-
NDRDescT(sycl::range<3> NumWorkItems, sycl::id<3> Offset, int DimsArg)
119-
: GlobalSize{NumWorkItems}, GlobalOffset{Offset}, Dims{size_t(DimsArg)} {}
96+
template <int Dims_>
97+
NDRDescT(sycl::range<Dims_> NumWorkItems, sycl::range<Dims_> LocalSizes,
98+
sycl::id<Dims_> Offset)
99+
: Dims{size_t(Dims_)} {
100+
for (size_t I = 0; I < Dims_; ++I) {
101+
GlobalSize[I] = NumWorkItems[I];
102+
LocalSize[I] = LocalSizes[I];
103+
GlobalOffset[I] = Offset[I];
104+
}
105+
106+
for (int I = Dims_; I < 3; ++I) {
107+
LocalSize[I] = LocalSizes[0] ? 1 : 0;
108+
}
109+
110+
for (int I = Dims_; I < 3; ++I) {
111+
GlobalSize[I] = 1;
112+
}
113+
}
120114

121115
template <int Dims_>
122-
NDRDescT(sycl::nd_range<Dims_> ExecutionRange, int DimsArg)
123-
: NDRDescT(padRange(ExecutionRange.get_global_range()),
124-
padRange(ExecutionRange.get_local_range()),
125-
padId(ExecutionRange.get_offset()), size_t(DimsArg)) {
126-
setNDRangeLeftover();
116+
NDRDescT(sycl::range<Dims_> NumWorkItems, sycl::id<Dims_> Offset)
117+
: Dims{size_t(Dims_)} {
118+
for (size_t I = 0; I < Dims_; ++I) {
119+
GlobalSize[I] = NumWorkItems[I];
120+
GlobalOffset[I] = Offset[I];
121+
}
127122
}
128123

129124
template <int Dims_>
130125
NDRDescT(sycl::nd_range<Dims_> ExecutionRange)
131-
: NDRDescT(ExecutionRange, Dims_) {}
126+
: NDRDescT(ExecutionRange.get_global_range(),
127+
ExecutionRange.get_local_range(),
128+
ExecutionRange.get_offset()) {}
132129

133130
template <int Dims_>
134131
NDRDescT(sycl::range<Dims_> Range)
135-
: NDRDescT(padRange(Range), /*SetNumWorkGroups=*/false, Dims_) {}
132+
: NDRDescT(Range, /*SetNumWorkGroups=*/false) {}
136133

137-
void setClusterDimensions(sycl::range<3> N, int Dims) {
138-
if (this->Dims != size_t(Dims)) {
134+
template <int Dims_> void setClusterDimensions(sycl::range<Dims_> N) {
135+
if (this->Dims != size_t(Dims_)) {
139136
throw std::runtime_error(
140137
"Dimensionality of cluster, global and local ranges must be same");
141138
}
142139

143-
for (int I = 0; I < 3; ++I)
144-
ClusterDimensions[I] = (I < Dims) ? N[I] : 1;
140+
for (int I = 0; I < Dims_; ++I)
141+
ClusterDimensions[I] = N[I];
145142
}
146143

147144
NDRDescT &operator=(const NDRDescT &Desc) = default;
148145
NDRDescT &operator=(NDRDescT &&Desc) = default;
149146

150-
sycl::range<3> GlobalSize{0, 0, 0};
151-
sycl::range<3> LocalSize{0, 0, 0};
152-
sycl::id<3> GlobalOffset{0, 0, 0};
147+
std::array<size_t, 3> GlobalSize{0, 0, 0};
148+
std::array<size_t, 3> LocalSize{0, 0, 0};
149+
std::array<size_t, 3> GlobalOffset{0, 0, 0};
153150
/// Number of workgroups, used to record the number of workgroups from the
154151
/// simplest form of parallel_for_work_group. If set, all other fields must be
155152
/// zero
156-
sycl::range<3> NumWorkGroups{0, 0, 0};
157-
sycl::range<3> ClusterDimensions{1, 1, 1};
153+
std::array<size_t, 3> NumWorkGroups{0, 0, 0};
154+
std::array<size_t, 3> ClusterDimensions{1, 1, 1};
158155
size_t Dims = 0;
159156
};
160157

sycl/source/detail/scheduler/commands.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2276,8 +2276,11 @@ static void adjustNDRangePerKernel(NDRDescT &NDR, ur_kernel_handle_t Kernel,
22762276
if (WGSize[0] == 0) {
22772277
WGSize = {1, 1, 1};
22782278
}
2279-
NDR = sycl::detail::NDRDescT{nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize),
2280-
static_cast<int>(NDR.Dims)};
2279+
2280+
for (size_t I = 0; I < NDR.Dims; ++I) {
2281+
NDR.GlobalSize[I] = WGSize[I] * NDR.NumWorkGroups[I];
2282+
NDR.LocalSize[I] = WGSize[I];
2283+
}
22812284
}
22822285

22832286
// We have the following mapping between dimensions with SPIR-V builtins:

0 commit comments

Comments
 (0)