Skip to content

Commit 2a58675

Browse files
authored
[SYCL][NVPTX] Optimize ID queries when they fit in int (#18999)
The NVPTX target was unable to properly optimize the global ID query, despite the user specifying the -fsycl-id-queries-fit-in-int flag. This is because, once linked, the compiler sees the global ID builtin as (i64 add (i64 mul (i64 zext i32 A), (i64 zext i32 B)), (i64 zext i32 C)). Despite knowing that each of A, B and C is a 32-bit value, and that the final result fits in a 32-bit value, it is not legal to replace this sequence with (i64 zext (add i32 (mul i32 A, B), C)), which is the ideal code here. The solution to this problem is a new opt-in 'reflection' in the NVPTX implementation of the global ID builtin, which selects a more optimal version. The driver enables this reflection only when the user passes -fsycl-id-queries-fit-in-int.
1 parent 1eb75c7 commit 2a58675

File tree

3 files changed

+35
-0
lines changed

3 files changed

+35
-0
lines changed

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -968,6 +968,11 @@ void CudaToolChain::addClangTargetOptions(
968968
"--nvptx-prec-sqrtf32=0"});
969969

970970
CC1Args.append({"-mllvm", "-enable-memcpyopt-without-libcalls"});
971+
972+
if (DriverArgs.hasFlag(options::OPT_fsycl_id_queries_fit_in_int,
973+
options::OPT_fno_sycl_id_queries_fit_in_int, false))
974+
CC1Args.append(
975+
{"-mllvm", "-nvvm-reflect-add=__CUDA_ID_QUERIES_FIT_IN_INT=1"});
971976
} else {
972977
CC1Args.append({"-fcuda-is-device", "-mllvm",
973978
"-enable-memcpyopt-without-libcalls",
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
// REQUIRES: nvptx-registered-target
2+
3+
// RUN: %clang -### -nocudalib \
4+
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda %s 2>&1 \
5+
// RUN: | FileCheck --check-prefix=CHECK-DEFAULT %s
6+
7+
// RUN: %clang -### -nocudalib \
8+
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -fno-sycl-id-queries-fit-in-int %s 2>&1 \
9+
// RUN: | FileCheck --check-prefix=CHECK-DEFAULT %s
10+
11+
// RUN: %clang -### -nocudalib \
12+
// RUN: -fsycl -fsycl-targets=nvptx64-nvidia-cuda -fsycl-id-queries-fit-in-int %s 2>&1 \
13+
// RUN: | FileCheck --check-prefix=CHECK-INT %s
14+
15+
// CHECK-INT: "-mllvm" "-nvvm-reflect-add=__CUDA_ID_QUERIES_FIT_IN_INT=1"
16+
// CHECK-DEFAULT-NOT: "-nvvm-reflect-add=__CUDA_ID_QUERIES_FIT_IN_INT=1"

libclc/libspirv/lib/ptx-nvidiacl/workitem/get_global_id.cl

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,31 @@
88

99
#include <libspirv/spirv.h>
1010

11+
extern int __nvvm_reflect_ocl(constant char *);
12+
1113
// Returns the global invocation ID along the x dimension:
//   workgroup ID * workgroup size + local invocation ID + global offset.
//
// When the "__CUDA_ID_QUERIES_FIT_IN_INT" reflection value is non-zero, each
// term is first narrowed to uint, so the multiply-add is carried out in uint
// arithmetic and only the final sum is converted to the size_t return type.
// Otherwise the terms are combined in the builtins' own types.
_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_x() {
  if (__nvvm_reflect_ocl("__CUDA_ID_QUERIES_FIT_IN_INT")) {
    const uint group_id = (uint)__spirv_WorkgroupId_x();
    const uint group_size = (uint)__spirv_WorkgroupSize_x();
    const uint local_id = (uint)__spirv_LocalInvocationId_x();
    const uint global_offset = (uint)__spirv_GlobalOffset_x();
    return group_id * group_size + local_id + global_offset;
  } else {
    return __spirv_WorkgroupId_x() * __spirv_WorkgroupSize_x() +
           __spirv_LocalInvocationId_x() + __spirv_GlobalOffset_x();
  }
}
1521

1622
// Returns the global invocation ID along the y dimension:
//   workgroup ID * workgroup size + local invocation ID + global offset.
//
// When the "__CUDA_ID_QUERIES_FIT_IN_INT" reflection value is non-zero, each
// term is first narrowed to uint, so the multiply-add is carried out in uint
// arithmetic and only the final sum is converted to the size_t return type.
// Otherwise the terms are combined in the builtins' own types.
_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_y() {
  if (__nvvm_reflect_ocl("__CUDA_ID_QUERIES_FIT_IN_INT")) {
    const uint group_id = (uint)__spirv_WorkgroupId_y();
    const uint group_size = (uint)__spirv_WorkgroupSize_y();
    const uint local_id = (uint)__spirv_LocalInvocationId_y();
    const uint global_offset = (uint)__spirv_GlobalOffset_y();
    return group_id * group_size + local_id + global_offset;
  } else {
    return __spirv_WorkgroupId_y() * __spirv_WorkgroupSize_y() +
           __spirv_LocalInvocationId_y() + __spirv_GlobalOffset_y();
  }
}
2030

2131
// Returns the global invocation ID along the z dimension:
//   workgroup ID * workgroup size + local invocation ID + global offset.
//
// When the "__CUDA_ID_QUERIES_FIT_IN_INT" reflection value is non-zero, each
// term is first narrowed to uint, so the multiply-add is carried out in uint
// arithmetic and only the final sum is converted to the size_t return type.
// Otherwise the terms are combined in the builtins' own types.
_CLC_DEF _CLC_OVERLOAD size_t __spirv_GlobalInvocationId_z() {
  if (__nvvm_reflect_ocl("__CUDA_ID_QUERIES_FIT_IN_INT")) {
    const uint group_id = (uint)__spirv_WorkgroupId_z();
    const uint group_size = (uint)__spirv_WorkgroupSize_z();
    const uint local_id = (uint)__spirv_LocalInvocationId_z();
    const uint global_offset = (uint)__spirv_GlobalOffset_z();
    return group_id * group_size + local_id + global_offset;
  } else {
    return __spirv_WorkgroupId_z() * __spirv_WorkgroupSize_z() +
           __spirv_LocalInvocationId_z() + __spirv_GlobalOffset_z();
  }
}

0 commit comments

Comments
 (0)