Skip to content

Commit ad494e9

Browse files
authored
[SYCL][NATIVECPU] Fix local scope module variables for native cpu (intel#15280)
Although local scope variables inside the kernel are less common in SYCL, they can happen with hierarchical. This fixes the problem by adding a pass to replace the local scope variables which start life as globals with a struct which is allocated on the stack. Additionally, this required updating of the code which renames and removes kernel based on wrappers and vecz success. To simplify this we run the OCK utility pass TransferKernelMetadata which adds metadata to store the original kernel name. This in turn simplifies this code significantly. Note this fixes fails in kernel/kernel_attributes_wg_hint.cpp SYCL CTS for native cpu, which is being tested locally.
1 parent 21365ca commit ad494e9

File tree

3 files changed

+54
-37
lines changed

3 files changed

+54
-37
lines changed

llvm/lib/SYCLNativeCPUUtils/PipelineSYCLNativeCPU.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@
2121
#include "compiler/utils/builtin_info.h"
2222
#include "compiler/utils/define_mux_builtins_pass.h"
2323
#include "compiler/utils/device_info.h"
24+
#include "compiler/utils/encode_kernel_metadata_pass.h"
2425
#include "compiler/utils/prepare_barriers_pass.h"
26+
#include "compiler/utils/replace_local_module_scope_variables_pass.h"
2527
#include "compiler/utils/sub_group_analysis.h"
2628
#include "compiler/utils/work_item_loops_pass.h"
2729
#include "vecz/pass.h"
@@ -60,6 +62,7 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
6062
OptimizationLevel OptLevel) {
6163
MPM.addPass(ConvertToMuxBuiltinsSYCLNativeCPUPass());
6264
#ifdef NATIVECPU_USE_OCK
65+
MPM.addPass(compiler::utils::TransferKernelMetadataPass());
6366
// Always enable vectorizer, unless explictly disabled or -O0 is set.
6467
if (OptLevel != OptimizationLevel::O0 && !SYCLNativeCPUNoVecz) {
6568
MAM.registerPass([] { return vecz::TargetInfoAnalysis(); });
@@ -87,6 +90,7 @@ void llvm::sycl::utils::addSYCLNativeCPUBackendPasses(
8790
MAM.registerPass([] { return compiler::utils::SubgroupAnalysis(); });
8891
MPM.addPass(compiler::utils::PrepareBarriersPass());
8992
MPM.addPass(compiler::utils::WorkItemLoopsPass(Opts));
93+
MPM.addPass(compiler::utils::ReplaceLocalModuleScopeVariablesPass());
9094
MPM.addPass(AlwaysInlinerPass());
9195
#endif
9296
MPM.addPass(PrepareSYCLNativeCPUPass());

llvm/lib/SYCLNativeCPUUtils/PrepareSYCLNativeCPU.cpp

Lines changed: 11 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -338,47 +338,21 @@ PreservedAnalyses PrepareSYCLNativeCPUPass::run(Module &M,
338338
SmallSet<Function *, 5> RemovableFuncs;
339339
SmallVector<Function *, 5> WrapperFuncs;
340340

341-
// Retrieve the wrapper functions created by the WorkItemLoop pass.
342341
for (auto &OldF : OldKernels) {
343-
std::optional<compiler::utils::LinkMetadataResult> VeczR =
344-
compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
345-
if (VeczR && VeczR.value().first) {
346-
WrapperFuncs.push_back(OldF);
347-
} else {
348-
auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
349-
if (Name != OldF->getName()) {
350-
WrapperFuncs.push_back(OldF);
351-
}
352-
}
353-
}
354-
355-
for (auto &OldF : WrapperFuncs) {
356342
// If vectorization occurred, at this point we have a wrapper function
357-
// that runs the vectorized kernel and peels using the scalar kernel. We
358-
// make it so this wrapper steals the original kernel name.
359-
std::optional<compiler::utils::LinkMetadataResult> VeczR =
360-
compiler::utils::parseVeczToOrigFnLinkMetadata(*OldF);
361-
if (VeczR && VeczR.value().first) {
362-
auto ScalarF = VeczR.value().first;
363-
OldF->takeName(ScalarF);
364-
if (ScalarF->use_empty())
365-
RemovableFuncs.insert(ScalarF);
366-
} else {
367-
// The WorkItemLoops pass created a wrapper function for the original
368-
// kernel. If we have a kernel named foo(), the wrapper will be called
369-
// foo-wrapper(), and will have the original kernel name retrieved by
370-
// getBaseFnNameOrFnName. We set the name of the wrapper function
371-
// to the original kernel name and add the original kernel to the
372-
// list of functions that can be removed from the module.
373-
auto Name = compiler::utils::getBaseFnNameOrFnName(*OldF);
374-
Function *OrigF = M.getFunction(Name);
343+
// that runs the vectorized kernel and peels using the scalar kernel.
344+
// There may also be a wrapper for local variables replacement. We make it
345+
// so this wrapper steals the original kernel name. Otherwise we will have
346+
// a wrapper function from the work item loops. In this case we also steal
347+
// the original kernel name.
348+
auto Name = compiler::utils::getOrigFnName(*OldF);
349+
Function *OrigF = M.getFunction(Name);
350+
if (Name != OldF->getName()) {
375351
if (OrigF != nullptr) {
376-
// The original kernel is inlined by the WorkItemLoops
377-
// pass if it contained barriers or group collectives, otherwise
378-
// we don't want to (and can't) remove it.
379-
if (OrigF->use_empty())
380-
RemovableFuncs.insert(OrigF);
381352
OldF->takeName(OrigF);
353+
if (OrigF->use_empty()) {
354+
RemovableFuncs.insert(OrigF);
355+
}
382356
} else {
383357
OldF->setName(Name);
384358
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// REQUIRES: native_cpu_ock
2+
3+
// RUN: %clangxx -fsycl -fsycl-targets=native_cpu -Xclang -sycl-std=2020 -mllvm -sycl-opt -mllvm -inline-threshold=500 -mllvm -sycl-native-cpu-no-vecz -mllvm -sycl-native-dump-device-ir %s | FileCheck %s
4+
5+
// Check that local types structure is created and placed on the stack
6+
// We also check that the attribute mux-orig-fn is created as this is needed to
7+
// find the original function after this pass is run
8+
9+
// CHECK: %localVarTypes = type { ptr addrspace(1) }
10+
// CHECK: define void @_ZTS4TestILi1ELi4EiE.NativeCPUKernel{{.*}} #[[ATTR:[0-9]*]]
11+
// CHECK: alloca %localVarTypes
12+
// CHECK: attributes #[[ATTR]] = {{.*}} "mux-orig-fn"="_ZTS4TestILi1ELi4EiE"
13+
14+
#include "sycl.hpp"
15+
16+
template <int dims, int size, typename T = int> struct Test;
17+
18+
int main() {
19+
sycl::queue queue;
20+
21+
constexpr int dims = 1;
22+
constexpr int size = 4;
23+
24+
std::array<int, size> data;
25+
26+
const auto range = sycl::range<dims>(size);
27+
const auto range_wg = sycl::range<dims>(1);
28+
{
29+
sycl::buffer<int, dims> buf(data.data(), range);
30+
31+
queue.submit([&](sycl::handler &cgh) {
32+
auto acc = sycl::accessor(buf, cgh, sycl::write_only);
33+
cgh.parallel_for_work_group<Test<dims, size>>(
34+
range, range_wg, [=](auto group) { acc[group.get_group_id()] = 42; });
35+
});
36+
queue.wait_and_throw();
37+
}
38+
return 0;
39+
}

0 commit comments

Comments
 (0)