Skip to content

Commit 014d626

Browse files
authored
[SYCLomatic] In the migration of BlockLoad/Store, fix the issue that local memory size is not correct calculated without load/store algorithm template type (#2773)
Signed-off-by: intwanghao <hao3.wang@intel.com>
1 parent a85be23 commit 014d626

File tree

3 files changed

+80
-1
lines changed

3 files changed

+80
-1
lines changed

clang/lib/DPCT/RulesLangLib/CUBAPIMigration.cpp

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -258,8 +258,9 @@ void CubMemberCallRule::runRule(
258258
Name == "BlockedToWarpStriped";
259259
bool isBlockShuffle =
260260
Name == "Offset" || Name == "Rotate" || Name == "Up" || Name == "Down";
261+
bool isBlockLoadStore = Name == "Load" || Name == "Store";
261262
if (isBlockRadixSort || isBlockExchange || isBlockShuffle ||
262-
Name == "Load" || Name == "Store") {
263+
isBlockLoadStore) {
263264
std::string HelpFuncName;
264265
if (isBlockRadixSort)
265266
HelpFuncName = "group_radix_sort";
@@ -300,6 +301,32 @@ void CubMemberCallRule::runRule(
300301
const auto &ItemsPreThreadArg = ClassSpecDecl->getTemplateArgs()[2];
301302
OS << ", " << ItemsPreThreadArg.getAsIntegral();
302303
}
304+
if (isBlockLoadStore &&
305+
!ClassSpecDecl->getTemplateArgs()[3].getIsDefaulted()) {
306+
int AlgoType =
307+
ClassSpecDecl->getTemplateArgs()[3].getAsIntegral().getExtValue();
308+
if (Name == "Load") {
309+
if (AlgoType == 3) {
310+
OS << ", "
311+
<< MapNames::getDpctNamespace() +
312+
"group::group_load_algorithm::transpose";
313+
} else if (AlgoType == 4) {
314+
OS << ", "
315+
<< MapNames::getDpctNamespace() +
316+
"group::group_load_algorithm::sub_group_transpose";
317+
}
318+
} else {
319+
if (AlgoType == 3) {
320+
OS << ", "
321+
<< MapNames::getDpctNamespace() +
322+
"group::group_store_algorithm::transpose";
323+
} else if (AlgoType == 4) {
324+
OS << ", "
325+
<< MapNames::getDpctNamespace() +
326+
"group::group_store_algorithm::sub_group_transpose";
327+
}
328+
}
329+
}
303330
OS << ">::get_local_memory_size";
304331
if (auto FuncInfo = DeviceFunctionDecl::LinkRedecls(FD)) {
305332
auto LocInfo = DpctGlobalInfo::getLocInfo(TempStorage);

clang/test/dpct/cub/blocklevel/blockload.cu

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,5 +125,32 @@ int main() {
125125
// CHECK-NEXT: });
126126
StripedKernel<<<1, 128>>>(d_data, 128);
127127
cudaStreamSynchronize(0);
128+
129+
// CHECK: q_ct1.submit(
130+
// CHECK: [&](sycl::handler &cgh) {
131+
// CHECK: sycl::stream stream_ct1(64 * 1024, 80, cgh);
132+
// CHECK: sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::group_load<int, 4, dpct::group::group_load_algorithm::transpose>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
133+
// CHECK: cgh.parallel_for(
134+
// CHECK: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
135+
// CHECK: [=](sycl::nd_item<3> item_ct1) {
136+
// CHECK: TransposeKernel(d_data, 128, item_ct1, stream_ct1, &temp_storage_acc[0]);
137+
// CHECK: });
138+
// CHECK: });
139+
TransposeKernel<<<1, 128>>>(d_data, 128);
140+
cudaStreamSynchronize(0);
141+
142+
// CHECK: q_ct1.submit(
143+
// CHECK: [&](sycl::handler &cgh) {
144+
// CHECK: sycl::stream stream_ct1(64 * 1024, 80, cgh);
145+
// CHECK: sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::group_load<int, 4, dpct::group::group_load_algorithm::sub_group_transpose>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
146+
// CHECK: cgh.parallel_for(
147+
// CHECK: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
148+
// CHECK: [=](sycl::nd_item<3> item_ct1) {
149+
// CHECK: SubGroupTransposeKernel(d_data, 128, item_ct1, stream_ct1, &temp_storage_acc[0]);
150+
// CHECK: });
151+
// CHECK: });
152+
SubGroupTransposeKernel<<<1, 128>>>(d_data, 128);
153+
cudaStreamSynchronize(0);
154+
128155
return 0;
129156
}

clang/test/dpct/cub/blocklevel/blockstore.cu

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,31 @@ int main() {
135135
// CHECK-NEXT: });
136136
StripedKernel<<<1, 128>>>(d_data, 5);
137137
cudaStreamSynchronize(0);
138+
139+
// CHECK: q_ct1.submit(
140+
// CHECK: [&](sycl::handler &cgh) {
141+
// CHECK: sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::group_store<int, 4, dpct::group::group_store_algorithm::transpose>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
142+
// CHECK: cgh.parallel_for(
143+
// CHECK: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
144+
// CHECK: [=](sycl::nd_item<3> item_ct1) {
145+
// CHECK: TransposeKernel(d_data, 5, item_ct1, &temp_storage_acc[0]);
146+
// CHECK: });
147+
// CHECK: });
148+
TransposeKernel<<<1, 128>>>(d_data, 5);
149+
cudaStreamSynchronize(0);
150+
151+
// CHECK: q_ct1.submit(
152+
// CHECK: [&](sycl::handler &cgh) {
153+
// CHECK: sycl::local_accessor<uint8_t, 1> temp_storage_acc(dpct::group::group_store<int, 4, dpct::group::group_store_algorithm::sub_group_transpose>::get_local_memory_size(sycl::range<3>(1, 1, 128).size()), cgh);
154+
// CHECK: cgh.parallel_for(
155+
// CHECK: sycl::nd_range<3>(sycl::range<3>(1, 1, 128), sycl::range<3>(1, 1, 128)),
156+
// CHECK: [=](sycl::nd_item<3> item_ct1) {
157+
// CHECK: SubGroupTransposeKernel(d_data, 5, item_ct1, &temp_storage_acc[0]);
158+
// CHECK: });
159+
// CHECK: });
160+
SubGroupTransposeKernel<<<1, 128>>>(d_data, 5);
161+
cudaStreamSynchronize(0);
162+
138163
for (int i = 0; i < 512; ++i)
139164
printf("%d%c", d_data[i], (i == 511 ? '\n' : ' '));
140165
return 0;

0 commit comments

Comments
 (0)