Skip to content

Commit 4013033

Browse files
authored
[SYCL][NATIVECPU][PERF] Reduce thread local usage (#17822)
This PR no longer generates thread_local pointers for kernels calling other kernels, which happens for example in the work_item loop. Instead of storing the state struct pointer in the thread local, it is passed directly to the called kernel function which was duplicated with an additional state struct pointer parameter if it didn't already have one. The state getter functions (native_cpu state and corresponding mux and nativecpu spirv functions) have been made __attribute((pure)) to enable more optimizations (including removal of unused calls to such builtins) before the NativeCPU passes. Pointer parameters of the native_cpu getter functions now point to constant data.
1 parent 7129c43 commit 4013033

File tree

3 files changed

+41
-35
lines changed

3 files changed

+41
-35
lines changed

libdevice/nativecpu_utils.cpp

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -318,9 +318,10 @@ DefShuffleINTEL_All(_Float16, f16, _Float16);
318318

319319
Define2ArgForward(uint64_t, __spirv_ocl_u_min, std::min);
320320

321+
#define GET_PROPS __attribute__((pure))
321322
#define GEN_u32(bname, muxname) \
322-
DEVICE_EXTERN_C uint32_t muxname(); \
323-
DEVICE_EXTERNAL uint32_t bname() { return muxname(); } \
323+
DEVICE_EXTERN_C GET_PROPS uint32_t muxname(); \
324+
DEVICE_EXTERNAL GET_PROPS uint32_t bname() { return muxname(); } \
324325
static_assert(true)
325326
// subgroup
326327
GEN_u32(__spirv_SubgroupLocalInvocationId, __mux_get_sub_group_local_id);
@@ -331,8 +332,8 @@ GEN_u32(__spirv_SubgroupSize, __mux_get_sub_group_size);
331332

332333
// I64_I32
333334
#define GEN_p(bname, muxname, arg) \
334-
DEVICE_EXTERN_C uint64_t muxname(uint32_t); \
335-
DEVICE_EXTERNAL uint64_t bname() { return muxname(arg); } \
335+
DEVICE_EXTERN_C GET_PROPS uint64_t muxname(uint32_t); \
336+
DEVICE_EXTERNAL GET_PROPS uint64_t bname() { return muxname(arg); } \
336337
static_assert(true)
337338

338339
#define GEN_xyz(bname, ncpu_name) \
@@ -365,8 +366,8 @@ DefStateSetWithType(set_sub_group_id, SubGroup_id, uint32_t);
365366
DefStateSetWithType(set_max_sub_group_size, SubGroup_size, uint32_t);
366367

367368
#define DefineStateGetWithType(name, field, type) \
368-
DEVICE_EXTERNAL_C type __dpcpp_nativecpu_##name( \
369-
MakeGlobalType<__nativecpu_state> *s) { \
369+
DEVICE_EXTERNAL_C GET_PROPS type __dpcpp_nativecpu_##name( \
370+
MakeGlobalType<const __nativecpu_state> *s) { \
370371
return s->field; \
371372
} \
372373
static_assert(true)
@@ -381,8 +382,8 @@ DefineStateGet_U32(get_max_sub_group_size, SubGroup_size);
381382
DefineStateGet_U32(get_num_sub_groups, NumSubGroups);
382383

383384
#define DefineStateGetWithType2(name, field, rtype, ptype) \
384-
DEVICE_EXTERNAL_C rtype __dpcpp_nativecpu_##name( \
385-
ptype dim, MakeGlobalType<__nativecpu_state> *s) { \
385+
DEVICE_EXTERNAL_C GET_PROPS rtype __dpcpp_nativecpu_##name( \
386+
ptype dim, MakeGlobalType<const __nativecpu_state> *s) { \
386387
return s->field[dim]; \
387388
} \
388389
static_assert(true)

llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -301,29 +301,20 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
301301

302302
llvm::Constant *CurrentStatePointerTLS = nullptr;
303303

304-
// check if any of the kernels is called by some other function.
305-
// This can happen e.g. with OCK, where wrapper functions are
306-
// created around the original kernel.
307-
bool KernelIsCalled = false;
308-
for (auto &K : OldKernels) {
309-
for (auto &U : K->uses()) {
310-
if (isa<CallBase>(U.getUser())) {
311-
KernelIsCalled = true;
312-
}
313-
}
314-
}
304+
// Contains the used builtins and kernels that need to be processed to
305+
// receive a pointer to the state struct.
306+
llvm::SmallVector<std::pair<llvm::Function *, StringRef>>
307+
UsedBuiltinsAndKernels;
315308

316309
// Then we iterate over all the supported builtins, find the used ones
317-
llvm::SmallVector<std::pair<llvm::Function *, StringRef>> UsedBuiltins;
318310
for (const auto &Entry : BuiltinNamesMap) {
319311
auto *Glob = M.getFunction(Entry.first);
320312
if (!Glob)
321313
continue;
322314
if (CurrentStatePointerTLS == nullptr) {
323315
for (const auto &Use : Glob->uses()) {
324316
auto *I = cast<CallBase>(Use.getUser());
325-
if (KernelIsCalled ||
326-
IsNonKernelCalledByNativeCPUKernel(I->getFunction())) {
317+
if (IsNonKernelCalledByNativeCPUKernel(I->getFunction())) {
327318
// only use the threadlocal if we have kernels calling builtins
328319
// indirectly, or if the kernel is called by some other func.
329320
CurrentStatePointerTLS = M.getOrInsertGlobal(
@@ -344,7 +335,7 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
344335
}
345336
}
346337
}
347-
UsedBuiltins.push_back({Glob, Entry.second});
338+
UsedBuiltinsAndKernels.push_back({Glob, Entry.second});
348339
}
349340

350341
#ifdef NATIVECPU_USE_OCK
@@ -407,6 +398,11 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
407398
OldF->replaceAllUsesWith(NewF);
408399
OldF->eraseFromParent();
409400
NewKernels.push_back(NewF);
401+
if (!CurrentStatePointerTLS && NewF->getNumUses() > 0)
402+
// If a thread_local is not used we need to keep track of the called
403+
// kernel so we can update its call sites with the pointer to the state
404+
// struct like we do for the called builtins.
405+
UsedBuiltinsAndKernels.push_back({NewF, ""});
410406
ModuleChanged = true;
411407
}
412408

@@ -419,13 +415,25 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
419415

420416
// Then we iterate over all used builtins and
421417
// replace them with calls to our Native CPU functions.
422-
for (const auto &Entry : UsedBuiltins) {
418+
// For the used kernels we need to replace calls to them
419+
// with calls receiving the state pointer argument.
420+
for (const auto &Entry : UsedBuiltinsAndKernels) {
423421
SmallVector<std::pair<Instruction *, Instruction *>> ToRemove;
424422
SmallVector<Function *> ToRemove2;
425423
Function *const Glob = Entry.first;
426424
Function *ReplaceFunc = nullptr;
427425
for (const auto &Use : Glob->uses()) {
428426
auto I = cast<CallBase>(Use.getUser());
427+
if (Entry.second == "") {
428+
if (const Function *CF = I->getCalledFunction()) {
429+
unsigned numParams = CF->getFunctionType()->getNumParams();
430+
auto numArgs = I->arg_size();
431+
if (numArgs == numParams)
432+
continue;
433+
assert(numArgs + 1 == numParams);
434+
}
435+
ReplaceFunc = Entry.first;
436+
}
429437
Function *const C = I->getFunction();
430438
if (IsUnusedBuiltinOrPrivateDef(*C)) {
431439
ToRemove2.push_back(C);
@@ -464,6 +472,10 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
464472
for (auto Temp : ToRemove2)
465473
Temp->eraseFromParent();
466474

475+
// Don't erase if it's not a builtin
476+
if (Entry.second == "")
477+
continue;
478+
467479
// Finally, we erase the builtin from the module
468480
Glob->eraseFromParent();
469481
}

sycl/test/check_device_code/native_cpu/native_cpu_builtins.cpp

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
// REQUIRES: native_cpu_ock
2-
// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -mllvm -sycl-native-cpu-no-vecz -mllvm -sycl-native-dump-device-ir %s | FileCheck %s
2+
// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -mllvm -sycl-native-dump-device-ir %s | FileCheck %s
33

4-
// RUN: %clangxx -fsycl-device-only -fsycl-targets=native_cpu -fno-inline -Xclang -sycl-std=2020 -mllvm -sycl-opt -S -emit-llvm -o %t_temp.ll %s
5-
// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL
4+
// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -fno-inline -mllvm -sycl-native-dump-device-ir %s | FileCheck %s --check-prefix=CHECK-TL
65

7-
// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -Xclang -fenable-sycl-dae -mllvm -sycl-opt -mllvm -inline-threshold=500 -mllvm -sycl-native-cpu-no-vecz -mllvm -sycl-native-dump-device-ir %s | FileCheck %s
6+
// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -fenable-sycl-dae -mllvm -sycl-opt -mllvm -sycl-native-dump-device-ir %s | FileCheck %s
87

9-
// RUN: %clangxx -fsycl-device-only -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -Xclang -fenable-sycl-dae -mllvm -sycl-opt -fno-inline -S -emit-llvm %s -o %t_temp.ll
10-
// RUN: %clangxx -mllvm -sycl-native-cpu-backend -S -emit-llvm -o - %t_temp.ll | FileCheck %s --check-prefix=CHECK-TL
8+
// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -fenable-sycl-dae -mllvm -sycl-opt -fno-inline -mllvm -sycl-native-dump-device-ir %s | FileCheck %s --check-prefix=CHECK-TL
119

1210
// check that we added the state struct as a function argument, and that we
1311
// inject the calls to our builtins.
@@ -46,7 +44,6 @@ int main() {
4644
h.parallel_for<Test2>(
4745
r2, [=](sycl::nd_item<2> ndi) { acc[ndi.get_global_id(1)] = 42; });
4846
// CHECK: @_ZTS5Test2.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr addrspace(1) %2)
49-
// CHECK: call{{.*}}__dpcpp_nativecpu_get_global_id(i32 1, ptr addrspace(1) %2)
5047
// CHECK: call{{.*}}__dpcpp_nativecpu_get_global_id(i32 0, ptr addrspace(1) %2)
5148
});
5249
sycl::nd_range<3> r3({1, 1, 1}, {1, 1, 1});
@@ -56,10 +53,6 @@ int main() {
5653
});
5754
// CHECK: @_ZTS5Test3.NativeCPUKernel(ptr {{.*}}%0, ptr {{.*}}%1, ptr addrspace(1) %2)
5855
// CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_global_range(i32 2, ptr addrspace(1) %2)
59-
// CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_global_range(i32 1, ptr addrspace(1) %2)
60-
// CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_global_range(i32 0, ptr addrspace(1) %2)
61-
// CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_global_id(i32 2, ptr addrspace(1) %2)
62-
// CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_global_id(i32 1, ptr addrspace(1) %2)
6356
// CHECK-DAG: call{{.*}}__dpcpp_nativecpu_get_global_id(i32 0, ptr addrspace(1) %2)
6457
});
6558

0 commit comments

Comments
 (0)