Skip to content

Commit 7c9a129

Browse files
authored
Fixes for SWDEV-508901 (llvm#697)
2 parents 537e0b6 + 0e43f42 commit 7c9a129

File tree

1 file changed

+295
-0
lines changed

1 file changed

+295
-0
lines changed
Lines changed: 295 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,295 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940 %s
3+
4+
; Kernel under test: an amdgpu_kernel whose five arguments are all marked
; `inreg` and which carries debug info (!dbg !4 plus one #dbg_value record).
; The autogenerated checks below pin the complete gfx940 assembly, including
; the leading block (%bb.5) of three s_load instructions from s[0:1] that ends
; in `s_branch .LBB0_0`, the `.p2align 8` ahead of the %entry block, and the
; DEBUG_VALUE comments emitted for test:var.
; NOTE(review): presumably a reduced reproducer for a codegen/debug-info bug;
; confirm the intent against the originating ticket.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg noundef %dst.coerce, ptr addrspace(1) inreg noundef %src.coerce, i64 inreg noundef %nElts, i64 inreg noundef %redOpArg, i1 inreg noundef %redOpArgIsPtr) #0 !dbg !4 {
5+
; GFX940-LABEL: preload_block_count_x:
6+
; GFX940: .Lfunc_begin0:
7+
; GFX940-NEXT: .file 0 "/" "<stdin>"
8+
; GFX940-NEXT: .cfi_sections .debug_frame
9+
; GFX940-NEXT: .cfi_startproc
10+
; GFX940-NEXT: ; %bb.5:
11+
; GFX940-NEXT: .loc 0 1 0 prologue_end ; <stdin>:1:0
12+
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
13+
; GFX940-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x8
14+
; GFX940-NEXT: s_load_dword s12, s[0:1], 0x28
15+
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
16+
; GFX940-NEXT: s_branch .LBB0_0
17+
; GFX940-NEXT: .loc 0 0 0 is_stmt 0 ; :0:0
18+
; GFX940-NEXT: .Ltmp0:
19+
; GFX940-NEXT: .p2align 8
20+
; GFX940-NEXT: ; %bb.6:
21+
; GFX940-NEXT: .LBB0_0: ; %entry
22+
; GFX940-NEXT: .cfi_escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 ;
23+
; GFX940-NEXT: .cfi_undefined 16
24+
; GFX940-NEXT: s_mov_b32 s0, s13
25+
; GFX940-NEXT: .Ltmp1:
26+
; GFX940-NEXT: ;DEBUG_VALUE: test:var <- [DW_OP_LLVM_poisoned] $sgpr2_sgpr3
27+
; GFX940-NEXT: .loc 0 1 0 is_stmt 1 ; <stdin>:1:0
28+
; GFX940-NEXT: s_ashr_i32 s13, s12, 31
29+
; GFX940-NEXT: s_or_b64 s[8:9], s[6:7], s[12:13]
30+
; GFX940-NEXT: s_mov_b32 s8, 0
31+
; GFX940-NEXT: s_cmp_lg_u64 s[8:9], 0
32+
; GFX940-NEXT: s_cbranch_scc0 .LBB0_4
33+
; GFX940-NEXT: .Ltmp2:
34+
; GFX940-NEXT: ; %bb.1:
35+
; GFX940-NEXT: ;DEBUG_VALUE: test:var <- [DW_OP_LLVM_poisoned] $sgpr2_sgpr3
36+
; GFX940-NEXT: v_cvt_f32_u32_e32 v0, s12
37+
; GFX940-NEXT: v_cvt_f32_u32_e32 v1, s13
38+
; GFX940-NEXT: s_sub_u32 s1, 0, s12
39+
; GFX940-NEXT: s_subb_u32 s3, 0, s13
40+
; GFX940-NEXT: .Ltmp3:
41+
; GFX940-NEXT: v_fmamk_f32 v0, v1, 0x4f800000, v0
42+
; GFX940-NEXT: v_rcp_f32_e32 v0, v0
43+
; GFX940-NEXT: s_nop 0
44+
; GFX940-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
45+
; GFX940-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
46+
; GFX940-NEXT: v_trunc_f32_e32 v1, v1
47+
; GFX940-NEXT: v_fmamk_f32 v0, v1, 0xcf800000, v0
48+
; GFX940-NEXT: v_cvt_u32_f32_e32 v1, v1
49+
; GFX940-NEXT: v_cvt_u32_f32_e32 v0, v0
50+
; GFX940-NEXT: v_readfirstlane_b32 s5, v1
51+
; GFX940-NEXT: v_readfirstlane_b32 s8, v0
52+
; GFX940-NEXT: s_mul_i32 s9, s1, s5
53+
; GFX940-NEXT: s_mul_hi_u32 s15, s1, s8
54+
; GFX940-NEXT: s_mul_i32 s14, s3, s8
55+
; GFX940-NEXT: s_add_i32 s9, s15, s9
56+
; GFX940-NEXT: s_add_i32 s9, s9, s14
57+
; GFX940-NEXT: s_mul_i32 s16, s1, s8
58+
; GFX940-NEXT: s_mul_hi_u32 s14, s8, s9
59+
; GFX940-NEXT: s_mul_i32 s15, s8, s9
60+
; GFX940-NEXT: s_mul_hi_u32 s8, s8, s16
61+
; GFX940-NEXT: s_add_u32 s8, s8, s15
62+
; GFX940-NEXT: s_addc_u32 s14, 0, s14
63+
; GFX940-NEXT: s_mul_hi_u32 s17, s5, s16
64+
; GFX940-NEXT: s_mul_i32 s16, s5, s16
65+
; GFX940-NEXT: s_add_u32 s8, s8, s16
66+
; GFX940-NEXT: s_mul_hi_u32 s15, s5, s9
67+
; GFX940-NEXT: s_addc_u32 s8, s14, s17
68+
; GFX940-NEXT: s_addc_u32 s14, s15, 0
69+
; GFX940-NEXT: s_mul_i32 s9, s5, s9
70+
; GFX940-NEXT: s_add_u32 s8, s8, s9
71+
; GFX940-NEXT: s_addc_u32 s9, 0, s14
72+
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, s8, v0
73+
; GFX940-NEXT: s_cmp_lg_u64 vcc, 0
74+
; GFX940-NEXT: s_addc_u32 s5, s5, s9
75+
; GFX940-NEXT: v_readfirstlane_b32 s9, v0
76+
; GFX940-NEXT: s_mul_i32 s8, s1, s5
77+
; GFX940-NEXT: s_mul_hi_u32 s14, s1, s9
78+
; GFX940-NEXT: s_add_i32 s8, s14, s8
79+
; GFX940-NEXT: s_mul_i32 s3, s3, s9
80+
; GFX940-NEXT: s_add_i32 s8, s8, s3
81+
; GFX940-NEXT: s_mul_i32 s1, s1, s9
82+
; GFX940-NEXT: s_mul_hi_u32 s14, s5, s1
83+
; GFX940-NEXT: s_mul_i32 s15, s5, s1
84+
; GFX940-NEXT: s_mul_i32 s17, s9, s8
85+
; GFX940-NEXT: s_mul_hi_u32 s1, s9, s1
86+
; GFX940-NEXT: s_mul_hi_u32 s16, s9, s8
87+
; GFX940-NEXT: s_add_u32 s1, s1, s17
88+
; GFX940-NEXT: s_addc_u32 s9, 0, s16
89+
; GFX940-NEXT: s_add_u32 s1, s1, s15
90+
; GFX940-NEXT: s_mul_hi_u32 s3, s5, s8
91+
; GFX940-NEXT: s_addc_u32 s1, s9, s14
92+
; GFX940-NEXT: s_addc_u32 s3, s3, 0
93+
; GFX940-NEXT: s_mul_i32 s8, s5, s8
94+
; GFX940-NEXT: s_add_u32 s1, s1, s8
95+
; GFX940-NEXT: s_addc_u32 s3, 0, s3
96+
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0
97+
; GFX940-NEXT: s_cmp_lg_u64 vcc, 0
98+
; GFX940-NEXT: s_addc_u32 s1, s5, s3
99+
; GFX940-NEXT: v_readfirstlane_b32 s8, v0
100+
; GFX940-NEXT: s_mul_i32 s5, s6, s1
101+
; GFX940-NEXT: s_mul_hi_u32 s9, s6, s8
102+
; GFX940-NEXT: s_mul_hi_u32 s3, s6, s1
103+
; GFX940-NEXT: s_add_u32 s5, s9, s5
104+
; GFX940-NEXT: s_addc_u32 s3, 0, s3
105+
; GFX940-NEXT: s_mul_hi_u32 s14, s7, s8
106+
; GFX940-NEXT: s_mul_i32 s8, s7, s8
107+
; GFX940-NEXT: s_add_u32 s5, s5, s8
108+
; GFX940-NEXT: s_mul_hi_u32 s9, s7, s1
109+
; GFX940-NEXT: s_addc_u32 s3, s3, s14
110+
; GFX940-NEXT: s_addc_u32 s5, s9, 0
111+
; GFX940-NEXT: s_mul_i32 s1, s7, s1
112+
; GFX940-NEXT: s_add_u32 s1, s3, s1
113+
; GFX940-NEXT: s_addc_u32 s3, 0, s5
114+
; GFX940-NEXT: s_mul_i32 s5, s12, s3
115+
; GFX940-NEXT: s_mul_hi_u32 s8, s12, s1
116+
; GFX940-NEXT: s_add_i32 s5, s8, s5
117+
; GFX940-NEXT: s_mul_i32 s8, s13, s1
118+
; GFX940-NEXT: s_mul_i32 s9, s12, s1
119+
; GFX940-NEXT: s_add_i32 s5, s5, s8
120+
; GFX940-NEXT: v_mov_b32_e32 v0, s9
121+
; GFX940-NEXT: s_sub_i32 s8, s7, s5
122+
; GFX940-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
123+
; GFX940-NEXT: s_cmp_lg_u64 vcc, 0
124+
; GFX940-NEXT: s_subb_u32 s14, s8, s13
125+
; GFX940-NEXT: v_subrev_co_u32_e64 v1, s[8:9], s12, v0
126+
; GFX940-NEXT: s_cmp_lg_u64 s[8:9], 0
127+
; GFX940-NEXT: s_subb_u32 s8, s14, 0
128+
; GFX940-NEXT: s_cmp_ge_u32 s8, s13
129+
; GFX940-NEXT: v_readfirstlane_b32 s14, v1
130+
; GFX940-NEXT: s_cselect_b32 s9, -1, 0
131+
; GFX940-NEXT: s_cmp_ge_u32 s14, s12
132+
; GFX940-NEXT: s_cselect_b32 s14, -1, 0
133+
; GFX940-NEXT: s_cmp_eq_u32 s8, s13
134+
; GFX940-NEXT: s_cselect_b32 s8, s14, s9
135+
; GFX940-NEXT: s_add_u32 s9, s1, 1
136+
; GFX940-NEXT: s_addc_u32 s14, s3, 0
137+
; GFX940-NEXT: s_add_u32 s15, s1, 2
138+
; GFX940-NEXT: s_addc_u32 s16, s3, 0
139+
; GFX940-NEXT: s_cmp_lg_u32 s8, 0
140+
; GFX940-NEXT: s_cselect_b32 s8, s15, s9
141+
; GFX940-NEXT: s_cselect_b32 s9, s16, s14
142+
; GFX940-NEXT: s_cmp_lg_u64 vcc, 0
143+
; GFX940-NEXT: s_subb_u32 s5, s7, s5
144+
; GFX940-NEXT: s_cmp_ge_u32 s5, s13
145+
; GFX940-NEXT: v_readfirstlane_b32 s15, v0
146+
; GFX940-NEXT: s_cselect_b32 s14, -1, 0
147+
; GFX940-NEXT: s_cmp_ge_u32 s15, s12
148+
; GFX940-NEXT: s_cselect_b32 s15, -1, 0
149+
; GFX940-NEXT: s_cmp_eq_u32 s5, s13
150+
; GFX940-NEXT: s_cselect_b32 s5, s15, s14
151+
; GFX940-NEXT: s_cmp_lg_u32 s5, 0
152+
; GFX940-NEXT: s_cselect_b32 s9, s9, s3
153+
; GFX940-NEXT: s_cselect_b32 s8, s8, s1
154+
; GFX940-NEXT: s_cbranch_execnz .LBB0_3
155+
; GFX940-NEXT: .LBB0_2:
156+
; GFX940-NEXT: v_cvt_f32_u32_e32 v0, s12
157+
; GFX940-NEXT: s_sub_i32 s1, 0, s12
158+
; GFX940-NEXT: s_mov_b32 s9, 0
159+
; GFX940-NEXT: v_rcp_iflag_f32_e32 v0, v0
160+
; GFX940-NEXT: s_nop 0
161+
; GFX940-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
162+
; GFX940-NEXT: v_cvt_u32_f32_e32 v0, v0
163+
; GFX940-NEXT: s_nop 0
164+
; GFX940-NEXT: v_readfirstlane_b32 s3, v0
165+
; GFX940-NEXT: s_mul_i32 s1, s1, s3
166+
; GFX940-NEXT: s_mul_hi_u32 s1, s3, s1
167+
; GFX940-NEXT: s_add_i32 s3, s3, s1
168+
; GFX940-NEXT: s_mul_hi_u32 s1, s6, s3
169+
; GFX940-NEXT: s_mul_i32 s5, s1, s12
170+
; GFX940-NEXT: s_sub_i32 s5, s6, s5
171+
; GFX940-NEXT: s_add_i32 s3, s1, 1
172+
; GFX940-NEXT: s_sub_i32 s8, s5, s12
173+
; GFX940-NEXT: s_cmp_ge_u32 s5, s12
174+
; GFX940-NEXT: s_cselect_b32 s1, s3, s1
175+
; GFX940-NEXT: s_cselect_b32 s5, s8, s5
176+
; GFX940-NEXT: s_add_i32 s3, s1, 1
177+
; GFX940-NEXT: s_cmp_ge_u32 s5, s12
178+
; GFX940-NEXT: s_cselect_b32 s8, s3, s1
179+
; GFX940-NEXT: .LBB0_3:
180+
; GFX940-NEXT: s_ashr_i32 s1, s0, 31
181+
; GFX940-NEXT: s_add_u32 s3, s8, 15
182+
; GFX940-NEXT: s_addc_u32 s5, s9, 0
183+
; GFX940-NEXT: s_and_b32 s3, s3, -16
184+
; GFX940-NEXT: s_mul_i32 s1, s3, s1
185+
; GFX940-NEXT: s_mul_hi_u32 s8, s3, s0
186+
; GFX940-NEXT: s_add_i32 s1, s8, s1
187+
; GFX940-NEXT: s_mul_i32 s5, s5, s0
188+
; GFX940-NEXT: s_add_i32 s1, s1, s5
189+
; GFX940-NEXT: s_mul_i32 s3, s3, s0
190+
; GFX940-NEXT: v_cvt_f64_i32_e32 v[0:1], s1
191+
; GFX940-NEXT: v_ldexp_f64 v[0:1], v[0:1], 32
192+
; GFX940-NEXT: v_cvt_f64_u32_e32 v[2:3], s3
193+
; GFX940-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3]
194+
; GFX940-NEXT: v_cvt_f64_u32_e32 v[2:3], s7
195+
; GFX940-NEXT: v_ldexp_f64 v[2:3], v[2:3], 32
196+
; GFX940-NEXT: v_cvt_f64_u32_e32 v[4:5], s6
197+
; GFX940-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
198+
; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
199+
; GFX940-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
200+
; GFX940-NEXT: s_movk_i32 s0, 0xffe0
201+
; GFX940-NEXT: v_ldexp_f64 v[2:3], v[0:1], s0
202+
; GFX940-NEXT: v_floor_f64_e32 v[2:3], v[2:3]
203+
; GFX940-NEXT: v_fmac_f64_e32 v[0:1], 0xc1f00000, v[2:3]
204+
; GFX940-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
205+
; GFX940-NEXT: v_add_u32_e32 v1, s2, v0
206+
; GFX940-NEXT: v_add_u32_e32 v0, s4, v0
207+
; GFX940-NEXT: v_or_b32_e32 v0, v0, v1
208+
; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
209+
; GFX940-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
210+
; GFX940-NEXT: s_nop 1
211+
; GFX940-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
212+
; GFX940-NEXT: ;;#ASMSTART
213+
; GFX940-NEXT: ;;#ASMEND
214+
; GFX940-NEXT: s_endpgm
215+
; GFX940-NEXT: .LBB0_4:
216+
; GFX940-NEXT: .Ltmp4:
217+
; GFX940-NEXT: ;DEBUG_VALUE: test:var <- [DW_OP_LLVM_poisoned] $sgpr2_sgpr3
218+
; GFX940-NEXT: ; implicit-def: $sgpr8_sgpr9
219+
; GFX940-NEXT: s_branch .LBB0_2
220+
; GFX940-NEXT: .Ltmp5:
221+
entry:
222+
; Reinterpret the two addrspace(1) pointer arguments as i64; the %dst value is
; additionally converted to a plain `ptr` (%1), which the debug record and the
; later getelementptr both use.
%0 = ptrtoint ptr addrspace(1) %dst.coerce to i64
223+
%1 = inttoptr i64 %0 to ptr
224+
%2 = ptrtoint ptr addrspace(1) %src.coerce to i64
225+
; Debug record binding %1 to local variable !8 through a DIOpArg-based
; DIExpression. The expected assembly above reports its location as
; [DW_OP_LLVM_poisoned] $sgpr2_sgpr3.
#dbg_value(ptr %1, !8, !DIExpression(DIOpArg(0, ptr)), !10)
226+
; Work-item id, work-group id, and two loads through the implicit-argument
; pointer: an i32 at offset 0 (%6) and an i16 at offset 12 (%8).
; NOTE(review): going by the function name, %6 is presumably block_count_x --
; confirm against the implicit kernarg layout.
%3 = tail call noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x(), !dbg !10
227+
%4 = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr(), !dbg !10
228+
%5 = tail call i32 @llvm.amdgcn.workgroup.id.x(), !dbg !10
229+
%6 = load i32, ptr addrspace(4) %4, align 4, !dbg !10
230+
%7 = getelementptr inbounds nuw i8, ptr addrspace(4) %4, i64 12, !dbg !10
231+
%8 = load i16, ptr addrspace(4) %7, align 4, !dbg !10
232+
%conv.i.i = zext i16 %8 to i32, !dbg !10
233+
%conv = sext i32 %5 to i64, !dbg !10
234+
%conv6 = sext i32 %6 to i64, !dbg !10
235+
; Unsigned 64-bit divide of %nElts by sext(%6), rounded up to a multiple of 16
; (+15, then mask with -16), then multiplied by the work-group id (%mul) and by
; work-group id + 1 (%mul13).
%div = udiv i64 %nElts, %conv6, !dbg !10
236+
%sub.i = add i64 %div, 15, !dbg !10
237+
%and.i = and i64 %sub.i, -16, !dbg !10
238+
%mul = mul i64 %and.i, %conv, !dbg !10
239+
%add8 = add nsw i32 %5, 1, !dbg !10
240+
%conv9 = sext i32 %add8 to i64, !dbg !10
241+
%mul13 = mul i64 %and.i, %conv9, !dbg !10
242+
; Clamp each product against %nElts in double precision via llvm.minnum.f64,
; then convert back to i64 (%conv15 here, %conv17 further down).
%conv.i = sitofp i64 %mul to double, !dbg !10
243+
%conv1.i = uitofp i64 %nElts to double, !dbg !10
244+
%9 = tail call contract noundef double @llvm.minnum.f64(double %conv.i, double %conv1.i), !dbg !10
245+
%conv15 = fptosi double %9 to i64, !dbg !10
246+
%conv.i43 = sitofp i64 %mul13 to double, !dbg !10
247+
%10 = tail call contract noundef double @llvm.minnum.f64(double %conv.i43, double %conv1.i), !dbg !10
248+
%add.ptr18 = getelementptr inbounds i8, ptr %1, i64 %conv15, !dbg !10
249+
; NOTE(review): the three %redOpArg bit tests and the load through %11 have no
; users anywhere in this function.
%rem = and i64 %redOpArg, 1, !dbg !10
250+
%cmp.not = icmp eq i64 %rem, 0, !dbg !10
251+
%rem21 = and i64 %redOpArg, 2, !dbg !10
252+
%cmp22.not = icmp eq i64 %rem21, 0, !dbg !10
253+
%rem26 = and i64 %redOpArg, 4, !dbg !10
254+
%cmp27.not = icmp eq i64 %rem26, 0, !dbg !10
255+
%11 = inttoptr i64 %redOpArg to ptr, !dbg !10
256+
%12 = load i64, ptr %11, align 8, !dbg !10
257+
%conv17 = fptosi double %10 to i64, !dbg !10
258+
%sub = sub nsw i64 %conv17, %conv15, !dbg !10
259+
%rem.i.i5354 = and i32 %3, 63, !dbg !10
260+
%cmp.i.i.not = icmp eq i32 %rem.i.i5354, 0, !dbg !10
261+
; Alignment test: OR (src + %conv15) with the address of (dst + %conv15), keep
; the low four bits, and produce 1 when any of them is set.
%13 = add i64 %2, %conv15, !dbg !10
262+
%14 = ptrtoint ptr %add.ptr18 to i64, !dbg !10
263+
%15 = or i64 %13, %14, !dbg !10
264+
%16 = and i64 %15, 15, !dbg !10
265+
%and1583.i.i = icmp ne i64 %16, 0, !dbg !10
266+
%17 = zext i1 %and1583.i.i to i32, !dbg !10
267+
; Empty side-effecting inline asm with constraints "=v,0" (input tied to the
; output); it shows up in the expected assembly as the ASMSTART/ASMEND pair and
; is what keeps %17 live.
; NOTE(review): attribute group #9 is not defined in this file (only #0 is) --
; confirm that is intentional for the reproducer.
%18 = tail call i32 asm sideeffect "", "=v,0"(i32 %17) #9, !dbg !10
268+
%19 = icmp ne i32 %18, 0, !dbg !10
269+
%20 = tail call i64 @llvm.amdgcn.ballot.i64(i1 %19), !dbg !10
270+
%.not.i.i = icmp eq i64 %20, 0, !dbg !10
271+
; Remaining arithmetic on %sub and the work-item id. The final compare has no
; users and the kernel returns void.
%div1.i.i.i555659 = lshr i32 %3, 6, !dbg !10
272+
%div8.i.i.i = sdiv i64 %sub, 4096, !dbg !10
273+
%mul9.i.i.i = shl nsw i64 %div8.i.i.i, 12, !dbg !10
274+
%sub12.i.i.i = sub nsw i64 %sub, %mul9.i.i.i, !dbg !10
275+
%conv13.i.i.i = zext nneg i32 %div1.i.i.i555659 to i64, !dbg !10
276+
%sub14.i.i.i = sub nsw i64 %div8.i.i.i, %conv13.i.i.i, !dbg !10
277+
%cmp30399.i.i.i = icmp sgt i64 %sub14.i.i.i, 0, !dbg !10
278+
ret void
279+
}
280+
281+
; Attribute group applied to the kernel definition.
; NOTE(review): it sets "amdgpu-no-workgroup-id-x" and "amdgpu-no-workitem-id-x"
; although the kernel body calls @llvm.amdgcn.workgroup.id.x and
; @llvm.amdgcn.workitem.id.x -- confirm this mismatch is a deliberate part of
; the reproducer.
attributes #0 = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
282+
283+
; Minimal debug-info metadata: one compile unit for file "<stdin>" in directory
; "/", DWARF version 5, debug-info version 3.
!llvm.dbg.cu = !{!0}
284+
!llvm.module.flags = !{!2, !3}
285+
!0 = distinct !DICompileUnit(language: DW_LANG_OpenCL, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
286+
!1 = !DIFile(filename: "<stdin>", directory: "/")
287+
!2 = !{i32 7, !"Dwarf Version", i32 5}
288+
!3 = !{i32 2, !"Debug Info Version", i32 3}
289+
; The subprogram attached to the kernel is named "test" rather than the IR
; symbol name, which is why the expected DEBUG_VALUE comments read test:var.
!4 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !5, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0)
290+
!5 = !DISubroutineType(cc: DW_CC_LLVM_OpenCLKernel, types: !6)
291+
!6 = !{null}
292+
; NOTE(review): !7 is not referenced by any other line visible in this file.
!7 = !{i32 1024, i32 1, i32 1}
293+
; Local variable "var" (argument 1) described by the kernel's #dbg_value record;
; its type !9 is a 64-bit pointer with a null base type.
!8 = !DILocalVariable(name: "var", arg: 1, scope: !4, file: !1, line: 1, type: !9)
294+
!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64)
295+
; Single source location (line 1 of !4) used by every !dbg attachment and by
; the #dbg_value record.
!10 = !DILocation(line: 1, scope: !4)

0 commit comments

Comments
 (0)