Skip to content

Commit 79996cd

Browse files
committed
[AMDGPU] Pre-commit tests for llvm#101157. NFC
1 parent 5ef087b commit 79996cd

File tree

1 file changed

+150
-0
lines changed
  • llvm/test/CodeGen/AMDGPU

1 file changed

+150
-0
lines changed

llvm/test/CodeGen/AMDGPU/wqm.ll

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3538,6 +3538,155 @@ define amdgpu_gs void @wqm_init_exec_wwm() {
35383538
ret void
35393539
}
35403540

3541+
; Check that exact regions containing execz-affected instructions are as short as possible.
3542+
define amdgpu_ps float @short_exact_regions(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) {
3543+
; GFX9-W64-LABEL: short_exact_regions:
3544+
; GFX9-W64: ; %bb.0: ; %main_body
3545+
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
3546+
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3547+
; GFX9-W64-NEXT: image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf
3548+
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
3549+
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
3550+
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
3551+
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
3552+
; GFX9-W64-NEXT: s_cbranch_execz .LBB59_2
3553+
; GFX9-W64-NEXT: ; %bb.1: ; %if
3554+
; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
3555+
; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
3556+
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3557+
; GFX9-W64-NEXT: v_readfirstlane_b32 s18, v0
3558+
; GFX9-W64-NEXT: s_buffer_load_dword s18, s[8:11], s18 offset:0x0
3559+
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
3560+
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s18
3561+
; GFX9-W64-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
3562+
; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
3563+
; GFX9-W64-NEXT: .LBB59_2: ; %endif
3564+
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
3565+
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3566+
; GFX9-W64-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4
3567+
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3568+
; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v0
3569+
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
3570+
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
3571+
; GFX9-W64-NEXT: ; return to shader part epilog
3572+
;
3573+
; GFX10-W32-LABEL: short_exact_regions:
3574+
; GFX10-W32: ; %bb.0: ; %main_body
3575+
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
3576+
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3577+
; GFX10-W32-NEXT: image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
3578+
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
3579+
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
3580+
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
3581+
; GFX10-W32-NEXT: v_cmpx_gt_u32_e32 16, v0
3582+
; GFX10-W32-NEXT: s_cbranch_execz .LBB59_2
3583+
; GFX10-W32-NEXT: ; %bb.1: ; %if
3584+
; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
3585+
; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off
3586+
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3587+
; GFX10-W32-NEXT: v_readfirstlane_b32 s15, v0
3588+
; GFX10-W32-NEXT: s_buffer_load_dword s15, s[8:11], s15 offset:0x0
3589+
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
3590+
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s15
3591+
; GFX10-W32-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
3592+
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
3593+
; GFX10-W32-NEXT: .LBB59_2: ; %endif
3594+
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
3595+
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3596+
; GFX10-W32-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
3597+
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3598+
; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v0
3599+
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
3600+
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
3601+
; GFX10-W32-NEXT: ; return to shader part epilog
3602+
main_body:
3603+
; First sample: coordinate %c, all four channels requested (the i32 15 operand shows up as dmask:0xf in the checks).
%tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3604+
%idx0 = load <4 x i32>, ptr addrspace(4) %p, align 4
3605+
; mbcnt.lo/mbcnt.hi over an all-ones mask; presumably this yields the lane's index in the wave -- TODO confirm.
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
3606+
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
3607+
; Values >= 16 skip straight to %endif, so only values below 16 execute %if.
%cc = icmp uge i32 %hi, 16
3608+
br i1 %cc, label %endif, label %if
3609+
3610+
if:
3611+
; Element 0 of the loaded vector goes through readfirstlane and is then used as the
; s.buffer.load offset operand, with %sampler passed as the <4 x i32> operand.
%idx1 = extractelement <4 x i32> %idx0, i64 0
3612+
%idx2 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %idx1)
3613+
%idx3 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %sampler, i32 %idx2, i32 0)
3614+
3615+
; Stores all of %tex1; the <4 x i32> operand is undef and %idx3 ends up as the idxen index register in the checks.
; NOTE(review): in the checks the exec save/restore pair (s_and_saveexec ... s_mov exec) brackets everything
; from the global load to this store; presumably that is the exact region this test is about -- confirm.
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex1, <4 x i32> undef, i32 %idx3, i32 0, i32 0, i32 0)
3616+
br label %endif
3617+
3618+
endif:
3619+
; Second sample: channel 0 of %tex1 is the coordinate.
%d = extractelement <4 x float> %tex1, i64 0
3620+
%tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %d, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3621+
; Result = channel 1 of %tex1 + channel 2 of %tex2.
%r0 = extractelement <4 x float> %tex1, i64 1
3622+
%r1 = extractelement <4 x float> %tex2, i64 2
3623+
%r2 = fadd float %r0, %r1
3624+
; The sum is passed through llvm.amdgcn.wqm.f32 before being returned; in the checks exec is only
; and-ed with the saved mask after the v_add.
%out = call float @llvm.amdgcn.wqm.f32(float %r2)
3625+
3626+
ret float %out
3627+
}
3628+
3629+
; Check that shortening exact regions doesn't prevent an early WQM exit.
3630+
define amdgpu_ps float @short_exact_regions_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c, ptr addrspace(4) %p) {
3631+
; GFX9-W64-LABEL: short_exact_regions_2:
3632+
; GFX9-W64: ; %bb.0: ; %main_body
3633+
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
3634+
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3635+
; GFX9-W64-NEXT: image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3
3636+
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
3637+
; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
3638+
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3639+
; GFX9-W64-NEXT: image_sample v5, v3, s[0:7], s[8:11] dmask:0x4
3640+
; GFX9-W64-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6 killed $sgpr7
3641+
; GFX9-W64-NEXT: ; kill: killed $vgpr3
3642+
; GFX9-W64-NEXT: ; kill: killed $vgpr1 killed $vgpr2
3643+
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3644+
; GFX9-W64-NEXT: v_readfirstlane_b32 s0, v0
3645+
; GFX9-W64-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0
3646+
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3647+
; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v5
3648+
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
3649+
; GFX9-W64-NEXT: v_add_f32_e32 v0, s0, v0
3650+
; GFX9-W64-NEXT: ; return to shader part epilog
3651+
;
3652+
; GFX10-W32-LABEL: short_exact_regions_2:
3653+
; GFX10-W32: ; %bb.0: ; %main_body
3654+
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
3655+
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3656+
; GFX10-W32-NEXT: image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D
3657+
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
3658+
; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off
3659+
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3660+
; GFX10-W32-NEXT: image_sample v1, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
3661+
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3662+
; GFX10-W32-NEXT: v_readfirstlane_b32 s0, v0
3663+
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3664+
; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v1
3665+
; GFX10-W32-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0
3666+
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
3667+
; GFX10-W32-NEXT: v_add_f32_e32 v0, s0, v0
3668+
; GFX10-W32-NEXT: ; return to shader part epilog
3669+
main_body:
3670+
; Straight-line function: a single basic block with no branch and no store.
%tex1 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3671+
%idx0 = load <4 x i32>, ptr addrspace(4) %p, align 4
3672+
; NOTE(review): %lo and %hi are not used by any later instruction in this function, and no v_mbcnt
; appears in the checks.
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
3673+
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
3674+
%idx1 = extractelement <4 x i32> %idx0, i64 0
3675+
%d = extractelement <4 x float> %tex1, i64 0
3676+
3677+
; Second sample: channel 0 of %tex1 is the coordinate. In the checks exec is and-ed with the saved
; mask right after the first image_sample, i.e. before this sample is issued.
%tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %d, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
3678+
3679+
; Element 0 of the loaded vector goes through readfirstlane and becomes the s.buffer.load offset
; operand; the load returns a float in this function.
%idx2 = call i32 @llvm.amdgcn.readfirstlane.i32(i32 %idx1)
3680+
%idx3 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %sampler, i32 %idx2, i32 0)
3681+
3682+
%r0 = extractelement <4 x float> %tex1, i64 1
3683+
%r1 = extractelement <4 x float> %tex2, i64 2
3684+
%r2 = fadd float %r0, %r1
3685+
; The buffer-load result is added to the sample sum and returned directly (this function makes no
; llvm.amdgcn.wqm call).
%out = fadd float %r2, %idx3
3686+
3687+
ret float %out
3688+
}
3689+
35413690
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
35423691
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
35433692

@@ -3577,6 +3726,7 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
35773726
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
35783727
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
35793728
declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
3729+
declare i32 @llvm.amdgcn.readfirstlane.i32(i32)
35803730

35813731
attributes #1 = { nounwind }
35823732
attributes #2 = { nounwind readonly }

0 commit comments

Comments
 (0)