Skip to content

Commit 43f711e

Browse files
[ESIMD] Process every function in LowerESIMD (#3137)
This patch is a preparation for moving ESIMD-specific passes into sycl-post-link. Right now, only functions marked with "sycl-explicit-simd" metadata are lowered in LowerESIMD pass. In sycl-post-link, once we split ESIMD kernels from SYCL kernels, there is no shared code between two types of kernels, and LowerESIMD pass can safely process a module with ESIMD kernels without worrying about another type of kernels (SYCL). This change is also safe to do now since, as of today, we do not allow to mix SYCL and ESIMD kernels in one source and in one program. Thus, LowerESIMD can operate in "exclusive" mode without worrying about SYCL kernels. The ESIMD CFG markup made in `Sema::MarkSyclSimd` will be removed in subsequent patches.
1 parent 2a751bd commit 43f711e

File tree

5 files changed

+51
-46
lines changed

5 files changed

+51
-46
lines changed

llvm/lib/SYCLLowerIR/LowerESIMD.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1234,10 +1234,6 @@ void SYCLLowerESIMDLegacyPass::collectGenXVolatileType(Module &M) {
12341234
PreservedAnalyses SYCLLowerESIMDPass::run(Function &F,
12351235
FunctionAnalysisManager &FAM,
12361236
SmallPtrSet<Type *, 4> &GVTS) {
1237-
// Only consider functions marked with !sycl_explicit_simd
1238-
if (F.getMetadata("sycl_explicit_simd") == nullptr)
1239-
return PreservedAnalyses::all();
1240-
12411237
SmallVector<CallInst *, 32> ESIMDIntrCalls;
12421238
SmallVector<Instruction *, 8> ESIMDToErases;
12431239

llvm/test/SYCLLowerIR/esimd_global.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ declare void @llvm.lifetime.start.p0i8(i64 immarg %0, i8* nocapture %1) #2
7979
declare void @llvm.lifetime.end.p0i8(i64 immarg %0, i8* nocapture %1) #2
8080

8181
; Function Attrs: noinline norecurse nounwind
82-
define dso_local spir_func void @_Z3fooPiN2cl4sycl5INTEL3gpu4simdIiLi16EEE(i32 addrspace(4)* %C, %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %v) local_unnamed_addr #3 !sycl_explicit_simd !12 {
82+
define dso_local spir_func void @_Z3fooPiN2cl4sycl5INTEL3gpu4simdIiLi16EEE(i32 addrspace(4)* %C, %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %v) local_unnamed_addr #3 {
8383
entry:
8484
%agg.tmp = alloca %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd", align 64
8585
%0 = addrspacecast %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd"* %v to %"class._ZTSN2cl4sycl5INTEL3gpu4simdIiLi16EEE.cl::sycl::INTEL::gpu::simd" addrspace(4)*

llvm/test/SYCLLowerIR/esimd_lower_intrins.ll

Lines changed: 35 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ target triple = "spir64-unknown-unknown-sycldevice"
1414
@vg = dso_local global %"cm::gen::simd<int, 16>" zeroinitializer, align 64 #0
1515
@vc = dso_local addrspace(1) global <32 x i32> zeroinitializer
1616

17-
define dso_local spir_func <32 x i32> @FUNC_1() !sycl_explicit_simd !1 {
17+
; LowerESIMD pass should process every function,
18+
; !sycl_explicit_simd metadata is not necessary.
19+
define dso_local spir_func <32 x i32> @FUNC_1() {
1820
%a_1 = alloca <32 x i64>
1921
%1 = load <32 x i64>, <32 x i64>* %a_1
2022
%a_2 = alloca <32 x i16>
@@ -24,7 +26,7 @@ define dso_local spir_func <32 x i32> @FUNC_1() !sycl_explicit_simd !1 {
2426
ret <32 x i32> %ret_val
2527
}
2628

27-
define dso_local spir_func <32 x i32> @FUNC_2() !sycl_explicit_simd !1 {
29+
define dso_local spir_func <32 x i32> @FUNC_2() {
2830
%a_1 = alloca <32 x i64>
2931
%1 = load <32 x i64>, <32 x i64>* %a_1
3032
%a_2 = alloca <32 x i32>
@@ -36,7 +38,7 @@ define dso_local spir_func <32 x i32> @FUNC_2() !sycl_explicit_simd !1 {
3638
ret <32 x i32> %ret_val
3739
}
3840

39-
define dso_local spir_func <32 x i32> @FUNC_3() !sycl_explicit_simd !1 {
41+
define dso_local spir_func <32 x i32> @FUNC_3() {
4042
%a_1 = alloca <32 x i64>
4143
%1 = load <32 x i64>, <32 x i64>* %a_1
4244
%a_2 = alloca <32 x i32>
@@ -50,21 +52,21 @@ define dso_local spir_func <32 x i32> @FUNC_3() !sycl_explicit_simd !1 {
5052
ret <32 x i32> %ret_val
5153
}
5254

53-
define dso_local spir_func <32 x i32> @FUNC_4() !sycl_explicit_simd !1 {
55+
define dso_local spir_func <32 x i32> @FUNC_4() {
5456
%ret_val = call spir_func <32 x i32> @_Z33__esimd_flat_block_read_unalignedIjLi32ELN2cm3gen9CacheHintE0ELS2_0EENS1_13__vector_typeIT_XT0_EE4typeEy(i64 0)
5557
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.block.ld.unaligned.v32i32.i64(i64 0)
5658
ret <32 x i32> %ret_val
5759
}
5860

59-
define dso_local spir_func void @FUNC_5() !sycl_explicit_simd !1 {
61+
define dso_local spir_func void @FUNC_5() {
6062
%a_1 = alloca <32 x i32>
6163
%1 = load <32 x i32>, <32 x i32>* %a_1
6264
call spir_func void @_Z24__esimd_flat_block_writeIjLi32ELN2cm3gen9CacheHintE0ELS2_0EEvyNS1_13__vector_typeIT_XT0_EE4typeE(i64 0, <32 x i32> %1)
6365
; CHECK: call void @llvm.genx.svm.block.st.i64.v32i32(i64 0, <32 x i32> %{{[0-9a-zA-Z_.]+}})
6466
ret void
6567
}
6668

67-
define dso_local spir_func <32 x i32> @FUNC_6() !sycl_explicit_simd !1 {
69+
define dso_local spir_func <32 x i32> @FUNC_6() {
6870
%a_1 = alloca <32 x i64>
6971
%1 = load <32 x i64>, <32 x i64>* %a_1
7072
%a_2 = alloca <32 x i16>
@@ -74,7 +76,7 @@ define dso_local spir_func <32 x i32> @FUNC_6() !sycl_explicit_simd !1 {
7476
ret <32 x i32> %ret_val
7577
}
7678

77-
define dso_local spir_func void @FUNC_7() !sycl_explicit_simd !1 {
79+
define dso_local spir_func void @FUNC_7() {
7880
%a_1 = alloca <32 x i64>
7981
%1 = load <32 x i64>, <32 x i64>* %a_1
8082
%a_2 = alloca <32 x i32>
@@ -86,7 +88,7 @@ define dso_local spir_func void @FUNC_7() !sycl_explicit_simd !1 {
8688
ret void
8789
}
8890

89-
define dso_local spir_func <16 x i16> @FUNC_8() !sycl_explicit_simd !1 {
91+
define dso_local spir_func <16 x i16> @FUNC_8() {
9092
%a_1 = alloca <16 x i16>
9193
%1 = load <16 x i16>, <16 x i16>* %a_1
9294
%a_2 = alloca <16 x i16>
@@ -96,7 +98,7 @@ define dso_local spir_func <16 x i16> @FUNC_8() !sycl_explicit_simd !1 {
9698
ret <16 x i16> %ret_val
9799
}
98100

99-
define dso_local spir_func <1 x float> @FUNC_9() !sycl_explicit_simd !1 {
101+
define dso_local spir_func <1 x float> @FUNC_9() {
100102
%a_1 = alloca <1 x float>
101103
%1 = load <1 x float>, <1 x float>* %a_1
102104
%a_2 = alloca <1 x float>
@@ -106,15 +108,15 @@ define dso_local spir_func <1 x float> @FUNC_9() !sycl_explicit_simd !1 {
106108
ret <1 x float> %ret_val
107109
}
108110

109-
define dso_local spir_func <8 x float> @FUNC_10() !sycl_explicit_simd !1 {
111+
define dso_local spir_func <8 x float> @FUNC_10() {
110112
%a_1 = alloca <16 x float>
111113
%1 = load <16 x float>, <16 x float>* %a_1
112114
%ret_val = call spir_func <8 x float> @_Z16__esimd_rdregionIfLi16ELi8ELi0ELi8ELi1ELi0EEN2cm3gen13__vector_typeIT_XT1_EE4typeENS2_IS3_XT0_EE4typeEt(<16 x float> %1, i16 zeroext 0)
113115
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x float> @llvm.genx.rdregionf.v8f32.v16f32.i16(<16 x float> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 1, i16 0, i32 0)
114116
ret <8 x float> %ret_val
115117
}
116118

117-
define dso_local spir_func <16 x float> @FUNC_11() !sycl_explicit_simd !1 {
119+
define dso_local spir_func <16 x float> @FUNC_11() {
118120
%a_1 = alloca <16 x float>
119121
%1 = load <16 x float>, <16 x float>* %a_1
120122
%a_2 = alloca <8 x float>
@@ -124,36 +126,36 @@ define dso_local spir_func <16 x float> @FUNC_11() !sycl_explicit_simd !1 {
124126
ret <16 x float> %ret_val
125127
}
126128

127-
define dso_local spir_func <32 x i32> @FUNC_21(%opencl.image2d_ro_t addrspace(1)* %0, i32 %1, i32 %2) !sycl_explicit_simd !1 {
129+
define dso_local spir_func <32 x i32> @FUNC_21(%opencl.image2d_ro_t addrspace(1)* %0, i32 %1, i32 %2) {
128130
%ret_val = call spir_func <32 x i32> @_Z24__esimd_media_block_loadIiLi4ELi8E14ocl_image2d_roEN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeEjT2_jjjj(i32 0, %opencl.image2d_ro_t addrspace(1)* %0, i32 0, i32 32, i32 %1, i32 %2)
129131
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.media.ld.v32i32(i32 0, i32 %{{[0-9a-zA-Z_.]+}}, i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}})
130132
ret <32 x i32> %ret_val
131133
}
132134

133-
define dso_local spir_func void @FUNC_22(%opencl.image2d_wo_t addrspace(1)* %0, i32 %1, i32 %2) !sycl_explicit_simd !1 {
135+
define dso_local spir_func void @FUNC_22(%opencl.image2d_wo_t addrspace(1)* %0, i32 %1, i32 %2) {
134136
%a_3 = alloca <32 x i32>
135137
%4 = load <32 x i32>, <32 x i32>* %a_3
136138
call spir_func void @_Z25__esimd_media_block_storeIiLi4ELi8E14ocl_image2d_woEvjT2_jjjjN2cm3gen13__vector_typeIT_XmlT0_T1_EE4typeE(i32 0, %opencl.image2d_wo_t addrspace(1)* %0, i32 0, i32 32, i32 %1, i32 %2, <32 x i32> %4)
137139
; CHECK: call void @llvm.genx.media.st.v32i32(i32 0, i32 %{{[0-9a-zA-Z_.]+}}, i32 0, i32 32, i32 %{{[0-9a-zA-Z_.]+}}, i32 %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}})
138140
ret void
139141
}
140142

141-
define dso_local spir_func <16 x i32> @FUNC_23() !sycl_explicit_simd !1 {
143+
define dso_local spir_func <16 x i32> @FUNC_23() {
142144
%ret_val = call spir_func <16 x i32> @_Z13__esimd_vloadIiLi16EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"cm::gen::simd<int, 16>", %"cm::gen::simd<int, 16>"* @vg, i32 0, i32 0) to <16 x i32> addrspace(4)*))
143145
; CHECK: %ret_val1 = load <16 x i32>, <16 x i32> addrspace(4)* addrspacecast (<16 x i32>* getelementptr inbounds (%"cm::gen::simd<int, 16>", %"cm::gen::simd<int, 16>"* @vg, i32 0, i32 0) to <16 x i32> addrspace(4)*), align 64
144146
; TODO: testcase to generate this:
145147
; CxHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.vload.v16i32.p4v16i32(<16 x i32> addrspace(4)* {{.*}})
146148
ret <16 x i32> %ret_val
147149
}
148150

149-
define dso_local spir_func void @FUNC_28(<32 x i32> %0) !sycl_explicit_simd !1 {
151+
define dso_local spir_func void @FUNC_28(<32 x i32> %0) {
150152
call spir_func void @_Z14__esimd_vstoreIiLi32EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<32 x i32> addrspace(4)* addrspacecast (<32 x i32> addrspace(1)* @vc to <32 x i32> addrspace(4)*), <32 x i32> %0)
151153
; CHECK: store <32 x i32> %0, <32 x i32> addrspace(4)* addrspacecast (<32 x i32> addrspace(1)* @vc to <32 x i32> addrspace(4)*), align 128
152154

153155
ret void
154156
}
155157

156-
define dso_local spir_func void @FUNC_29() !sycl_explicit_simd !1 {
158+
define dso_local spir_func void @FUNC_29() {
157159
%a_1 = alloca <32 x i32>
158160
%1 = addrspacecast <32 x i32>* %a_1 to <32 x i32> addrspace(4)*
159161
%a_2 = alloca <32 x i32>
@@ -163,15 +165,15 @@ define dso_local spir_func void @FUNC_29() !sycl_explicit_simd !1 {
163165
ret void
164166
}
165167

166-
define dso_local spir_kernel void @FUNC_30() !sycl_explicit_simd !1 {
167-
; CHECK: define dso_local spir_kernel void @FUNC_30() !sycl_explicit_simd !1
168+
define dso_local spir_kernel void @FUNC_30() {
169+
; CHECK: define dso_local spir_kernel void @FUNC_30()
168170
call spir_func void @_ZN2cl4sycl5INTEL3gpu8slm_initEj(i32 1023)
169171
ret void
170172
; CHECK-NEXT: ret void
171173
}
172174

173-
define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1 {
174-
; CHECK: define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1
175+
define dso_local spir_kernel void @FUNC_31() {
176+
; CHECK: define dso_local spir_kernel void @FUNC_31()
175177
%call = call spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
176178
; CHECK-NEXT: %call.esimd = call <3 x i32> @llvm.genx.local.id.v3i32()
177179
; CHECK-NEXT: %local_id.x = extractelement <3 x i32> %call.esimd, i32 0
@@ -180,7 +182,7 @@ define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1 {
180182
; CHECK-NEXT: ret void
181183
}
182184

183-
define dso_local spir_func <16 x i32> @FUNC_32() !sycl_explicit_simd !1 {
185+
define dso_local spir_func <16 x i32> @FUNC_32() {
184186
%a_1 = alloca <16 x i32>
185187
%1 = load <16 x i32>, <16 x i32>* %a_1
186188
%a_2 = alloca <16 x i32>
@@ -192,7 +194,7 @@ define dso_local spir_func <16 x i32> @FUNC_32() !sycl_explicit_simd !1 {
192194
ret <16 x i32> %ret_val
193195
}
194196

195-
define dso_local spir_func <16 x i32> @FUNC_33() !sycl_explicit_simd !1 {
197+
define dso_local spir_func <16 x i32> @FUNC_33() {
196198
%a_1 = alloca <16 x i32>
197199
%1 = load <16 x i32>, <16 x i32>* %a_1
198200
%a_2 = alloca <16 x i32>
@@ -204,7 +206,7 @@ define dso_local spir_func <16 x i32> @FUNC_33() !sycl_explicit_simd !1 {
204206
ret <16 x i32> %ret_val
205207
}
206208

207-
define dso_local spir_func <16 x i32> @FUNC_34() !sycl_explicit_simd !1 {
209+
define dso_local spir_func <16 x i32> @FUNC_34() {
208210
%a_1 = alloca <16 x i32>
209211
%1 = load <16 x i32>, <16 x i32>* %a_1
210212
%a_2 = alloca <16 x i32>
@@ -216,7 +218,7 @@ define dso_local spir_func <16 x i32> @FUNC_34() !sycl_explicit_simd !1 {
216218
ret <16 x i32> %ret_val
217219
}
218220

219-
define dso_local spir_func <16 x i32> @FUNC_35() !sycl_explicit_simd !1 {
221+
define dso_local spir_func <16 x i32> @FUNC_35() {
220222
%a_1 = alloca <16 x i32>
221223
%1 = load <16 x i32>, <16 x i32>* %a_1
222224
%a_2 = alloca <16 x i32>
@@ -228,7 +230,7 @@ define dso_local spir_func <16 x i32> @FUNC_35() !sycl_explicit_simd !1 {
228230
ret <16 x i32> %ret_val
229231
}
230232

231-
define dso_local spir_func <16 x i32> @FUNC_36() !sycl_explicit_simd !1 {
233+
define dso_local spir_func <16 x i32> @FUNC_36() {
232234
%a_1 = alloca <16 x i32>
233235
%1 = load <16 x i32>, <16 x i32>* %a_1
234236
%a_2 = alloca <16 x i32>
@@ -240,7 +242,7 @@ define dso_local spir_func <16 x i32> @FUNC_36() !sycl_explicit_simd !1 {
240242
ret <16 x i32> %ret_val
241243
}
242244

243-
define dso_local spir_func <16 x i32> @FUNC_37() !sycl_explicit_simd !1 {
245+
define dso_local spir_func <16 x i32> @FUNC_37() {
244246
%a_1 = alloca <16 x i32>
245247
%1 = load <16 x i32>, <16 x i32>* %a_1
246248
%a_2 = alloca <16 x i32>
@@ -252,7 +254,7 @@ define dso_local spir_func <16 x i32> @FUNC_37() !sycl_explicit_simd !1 {
252254
ret <16 x i32> %ret_val
253255
}
254256

255-
define dso_local spir_func <16 x i32> @FUNC_38() !sycl_explicit_simd !1 {
257+
define dso_local spir_func <16 x i32> @FUNC_38() {
256258
%a_1 = alloca <16 x i32>
257259
%1 = load <16 x i32>, <16 x i32>* %a_1
258260
%a_2 = alloca <16 x i32>
@@ -264,7 +266,7 @@ define dso_local spir_func <16 x i32> @FUNC_38() !sycl_explicit_simd !1 {
264266
ret <16 x i32> %ret_val
265267
}
266268

267-
define dso_local spir_func <16 x i32> @FUNC_39() !sycl_explicit_simd !1 {
269+
define dso_local spir_func <16 x i32> @FUNC_39() {
268270
%a_1 = alloca <16 x i32>
269271
%1 = load <16 x i32>, <16 x i32>* %a_1
270272
%a_2 = alloca <16 x i32>
@@ -276,25 +278,25 @@ define dso_local spir_func <16 x i32> @FUNC_39() !sycl_explicit_simd !1 {
276278
ret <16 x i32> %ret_val
277279
}
278280

279-
define dso_local spir_func <8 x i32> @FUNC_40() !sycl_explicit_simd !1 {
281+
define dso_local spir_func <8 x i32> @FUNC_40() {
280282
%ret_val = call spir_func <8 x i32> @_Z22__esimd_slm_block_readIiLi8EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeEj(i32 0)
281283
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.oword.ld.v8i32(i32 0, i32 254, i32 0)
282284
ret <8 x i32> %ret_val
283285
}
284286

285-
define dso_local spir_func void @FUNC_41() !sycl_explicit_simd !1 {
287+
define dso_local spir_func void @FUNC_41() {
286288
call spir_func void @_Z16__esimd_sbarrierN2cl4sycl5INTEL3gpu17EsimdSbarrierTypeE(i8 zeroext 1)
287289
; CHECK: call void @llvm.genx.sbarrier(i8 1)
288290
ret void
289291
}
290292

291-
define dso_local spir_func void @FUNC_42() !sycl_explicit_simd !1 {
293+
define dso_local spir_func void @FUNC_42() {
292294
call spir_func void @_Z16__esimd_sbarrierN2cl4sycl5INTEL3gpu17EsimdSbarrierTypeE(i8 zeroext 0)
293295
; CHECK: call void @llvm.genx.sbarrier(i8 0)
294296
ret void
295297
}
296298

297-
define dso_local spir_func <8 x i32> @FUNC_43() !sycl_explicit_simd !1 {
299+
define dso_local spir_func <8 x i32> @FUNC_43() {
298300
%a_1 = alloca <16 x i32>
299301
%1 = load <16 x i32>, <16 x i32>* %a_1
300302
%a_2 = alloca <8 x i16>
@@ -304,7 +306,7 @@ define dso_local spir_func <8 x i32> @FUNC_43() !sycl_explicit_simd !1 {
304306
ret <8 x i32> %ret_val
305307
}
306308

307-
define dso_local spir_func <16 x i32> @FUNC_44() !sycl_explicit_simd !1 {
309+
define dso_local spir_func <16 x i32> @FUNC_44() {
308310
%a_1 = alloca <16 x i32>
309311
%1 = load <16 x i32>, <16 x i32>* %a_1
310312
%a_2 = alloca <8 x i32>

llvm/test/SYCLLowerIR/scalar_fptoui.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:
77
target triple = "spir64-unknown-unknown-sycldevice"
88

99
; Function Attrs: convergent norecurse
10-
define dso_local spir_func i32 @foo(float %x) !sycl_explicit_simd !1 {
10+
define dso_local spir_func i32 @foo(float %x) {
1111
%y = fptoui float %x to i32
1212
; check that the scalar float to unsigned int conversion is left intact
1313
; CHECK: %y = fptoui float %x to i32

sycl/test/esimd/slm_load4.cpp

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,22 @@
99
using namespace sycl::INTEL::gpu;
1010
using namespace cl::sycl;
1111

12-
void kernel() __attribute__((sycl_device)) {
13-
simd<uint32_t, 32> offsets(0, 1);
14-
simd<int, 128> v1(0, 1);
12+
template <typename name, typename Func>
13+
__attribute__((sycl_kernel)) void kernel_call(Func kernelFunc) {
14+
kernelFunc();
15+
}
16+
17+
void caller() {
18+
kernel_call<class EsimdKernel>([=]() SYCL_ESIMD_KERNEL {
19+
simd<uint32_t, 32> offsets(0, 1);
20+
simd<int, 128> v1(0, 1);
1521

16-
slm_init(1024);
22+
slm_init(1024);
1723

18-
auto v0 = slm_load4<int, 32, ESIMD_ABGR_ENABLE>(offsets);
24+
auto v0 = slm_load4<int, 32, ESIMD_ABGR_ENABLE>(offsets);
1925

20-
v0 = v0 + v1;
26+
v0 = v0 + v1;
2127

22-
slm_store4<int, 32, ESIMD_ABGR_ENABLE>(v0, offsets);
28+
slm_store4<int, 32, ESIMD_ABGR_ENABLE>(v0, offsets);
29+
});
2330
}

0 commit comments

Comments
 (0)