@@ -3538,6 +3538,155 @@ define amdgpu_gs void @wqm_init_exec_wwm() {
3538
3538
ret void
3539
3539
}
3540
3540
3541
+ ; Check that exact regions containing execz-affected instructions are kept as short as possible: in the expected output, exec is narrowed by s_and_saveexec at the top of %if and restored right after the buffer store.
3542
+ define amdgpu_ps float @short_exact_regions (<8 x i32 > inreg %rsrc , <4 x i32 > inreg %sampler , float %c , ptr addrspace (4 ) %p ) {
3543
+ ; GFX9-W64-LABEL: short_exact_regions:
3544
+ ; GFX9-W64: ; %bb.0: ; %main_body
3545
+ ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
3546
+ ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3547
+ ; GFX9-W64-NEXT: image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf
3548
+ ; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
3549
+ ; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
3550
+ ; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
3551
+ ; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
3552
+ ; GFX9-W64-NEXT: s_cbranch_execz .LBB59_2
3553
+ ; GFX9-W64-NEXT: ; %bb.1: ; %if
3554
+ ; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
3555
+ ; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
3556
+ ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3557
+ ; GFX9-W64-NEXT: v_readfirstlane_b32 s18, v0
3558
+ ; GFX9-W64-NEXT: s_buffer_load_dword s18, s[8:11], s18 offset:0x0
3559
+ ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
3560
+ ; GFX9-W64-NEXT: v_mov_b32_e32 v0, s18
3561
+ ; GFX9-W64-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
3562
+ ; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
3563
+ ; GFX9-W64-NEXT: .LBB59_2: ; %endif
3564
+ ; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
3565
+ ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3566
+ ; GFX9-W64-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4
3567
+ ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3568
+ ; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v0
3569
+ ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
3570
+ ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
3571
+ ; GFX9-W64-NEXT: ; return to shader part epilog
3572
+ ;
3573
+ ; GFX10-W32-LABEL: short_exact_regions:
3574
+ ; GFX10-W32: ; %bb.0: ; %main_body
3575
+ ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
3576
+ ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3577
+ ; GFX10-W32-NEXT: image_sample v[3:6], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
3578
+ ; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
3579
+ ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
3580
+ ; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
3581
+ ; GFX10-W32-NEXT: v_cmpx_gt_u32_e32 16, v0
3582
+ ; GFX10-W32-NEXT: s_cbranch_execz .LBB59_2
3583
+ ; GFX10-W32-NEXT: ; %bb.1: ; %if
3584
+ ; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
3585
+ ; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off
3586
+ ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3587
+ ; GFX10-W32-NEXT: v_readfirstlane_b32 s15, v0
3588
+ ; GFX10-W32-NEXT: s_buffer_load_dword s15, s[8:11], s15 offset:0x0
3589
+ ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
3590
+ ; GFX10-W32-NEXT: v_mov_b32_e32 v0, s15
3591
+ ; GFX10-W32-NEXT: buffer_store_dwordx4 v[3:6], v0, s[0:3], 0 idxen
3592
+ ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
3593
+ ; GFX10-W32-NEXT: .LBB59_2: ; %endif
3594
+ ; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
3595
+ ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3596
+ ; GFX10-W32-NEXT: image_sample v0, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
3597
+ ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3598
+ ; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v0
3599
+ ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
3600
+ ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
3601
+ ; GFX10-W32-NEXT: ; return to shader part epilog
3602
+ main_body:
3603
+ %tex1 = call <4 x float > @llvm.amdgcn.image.sample.1d.v4f32.f32 (i32 15 , float %c , <8 x i32 > %rsrc , <4 x i32 > %sampler , i1 false , i32 0 , i32 0 ) #0  ; first 1D sample at coordinate %c, dmask 15
3604
+ %idx0 = load <4 x i32 >, ptr addrspace (4 ) %p , align 4  ; index vector read through the non-inreg pointer %p
3605
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo (i32 -1 , i32 0 )
3606
+ %hi = call i32 @llvm.amdgcn.mbcnt.hi (i32 -1 , i32 %lo )  ; with mask -1 this is presumably the lane index within the wave -- confirm
3607
+ %cc = icmp uge i32 %hi , 16  ; true when %hi >= 16 (unsigned compare)
3608
+ br i1 %cc , label %endif , label %if  ; a true condition skips %if
3609
+ ; %if is entered only when %hi < 16.
3610
+ if:
3611
+ %idx1 = extractelement <4 x i32 > %idx0 , i64 0  ; element 0 of the loaded index vector
3612
+ %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32 (i32 %idx1 )  ; NOTE(review): presumably the execz-affected instruction this test targets -- confirm
3613
+ %idx3 = call i32 @llvm.amdgcn.s.buffer.load.i32 (<4 x i32 > %sampler , i32 %idx2 , i32 0 )  ; scalar buffer load; the resource operand is %sampler
3614
+ ; The store below writes %tex1 with %idx3 as its index operand; its resource operand is undef.
3615
+ call void @llvm.amdgcn.struct.buffer.store.v4f32 (<4 x float > %tex1 , <4 x i32 > undef , i32 %idx3 , i32 0 , i32 0 , i32 0 )
3616
+ br label %endif
3617
+ ; Join point: second sample and the returned sum.
3618
+ endif:
3619
+ %d = extractelement <4 x float > %tex1 , i64 0  ; element 0 of %tex1 is the coordinate of the second sample
3620
+ %tex2 = call <4 x float > @llvm.amdgcn.image.sample.1d.v4f32.f32 (i32 15 , float %d , <8 x i32 > %rsrc , <4 x i32 > %sampler , i1 false , i32 0 , i32 0 ) #0
3621
+ %r0 = extractelement <4 x float > %tex1 , i64 1  ; element 1 of the first sample
3622
+ %r1 = extractelement <4 x float > %tex2 , i64 2  ; element 2 of the second sample
3623
+ %r2 = fadd float %r0 , %r1
3624
+ %out = call float @llvm.amdgcn.wqm.f32 (float %r2 )  ; the returned value is passed through llvm.amdgcn.wqm
3625
+ ; In the expected output, exec is ANDed with the mask saved at entry only after the final v_add_f32.
3626
+ ret float %out
3627
+ }
3628
+
3629
+ ; Check that shortening exact regions does not prevent an early exit from WQM: in the expected output, exec is ANDed with the saved entry mask immediately after the first image_sample.
3630
+ define amdgpu_ps float @short_exact_regions_2 (<8 x i32 > inreg %rsrc , <4 x i32 > inreg %sampler , float %c , ptr addrspace (4 ) %p ) {
3631
+ ; GFX9-W64-LABEL: short_exact_regions_2:
3632
+ ; GFX9-W64: ; %bb.0: ; %main_body
3633
+ ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
3634
+ ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
3635
+ ; GFX9-W64-NEXT: image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3
3636
+ ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
3637
+ ; GFX9-W64-NEXT: global_load_dword v0, v[1:2], off
3638
+ ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3639
+ ; GFX9-W64-NEXT: image_sample v5, v3, s[0:7], s[8:11] dmask:0x4
3640
+ ; GFX9-W64-NEXT: ; kill: killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6 killed $sgpr7
3641
+ ; GFX9-W64-NEXT: ; kill: killed $vgpr3
3642
+ ; GFX9-W64-NEXT: ; kill: killed $vgpr1 killed $vgpr2
3643
+ ; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
3644
+ ; GFX9-W64-NEXT: v_readfirstlane_b32 s0, v0
3645
+ ; GFX9-W64-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0
3646
+ ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
3647
+ ; GFX9-W64-NEXT: v_add_f32_e32 v0, v4, v5
3648
+ ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
3649
+ ; GFX9-W64-NEXT: v_add_f32_e32 v0, s0, v0
3650
+ ; GFX9-W64-NEXT: ; return to shader part epilog
3651
+ ;
3652
+ ; GFX10-W32-LABEL: short_exact_regions_2:
3653
+ ; GFX10-W32: ; %bb.0: ; %main_body
3654
+ ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
3655
+ ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
3656
+ ; GFX10-W32-NEXT: image_sample v[3:4], v0, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D
3657
+ ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
3658
+ ; GFX10-W32-NEXT: global_load_dword v0, v[1:2], off
3659
+ ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3660
+ ; GFX10-W32-NEXT: image_sample v1, v3, s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_1D
3661
+ ; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
3662
+ ; GFX10-W32-NEXT: v_readfirstlane_b32 s0, v0
3663
+ ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
3664
+ ; GFX10-W32-NEXT: v_add_f32_e32 v0, v4, v1
3665
+ ; GFX10-W32-NEXT: s_buffer_load_dword s0, s[8:11], s0 offset:0x0
3666
+ ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
3667
+ ; GFX10-W32-NEXT: v_add_f32_e32 v0, s0, v0
3668
+ ; GFX10-W32-NEXT: ; return to shader part epilog
3669
+ main_body:
3670
+ %tex1 = call <4 x float > @llvm.amdgcn.image.sample.1d.v4f32.f32 (i32 15 , float %c , <8 x i32 > %rsrc , <4 x i32 > %sampler , i1 false , i32 0 , i32 0 ) #0  ; first 1D sample at coordinate %c
3671
+ %idx0 = load <4 x i32 >, ptr addrspace (4 ) %p , align 4  ; index vector read through the non-inreg pointer %p
3672
+ %lo = call i32 @llvm.amdgcn.mbcnt.lo (i32 -1 , i32 0 )  ; only feeds %hi
3673
+ %hi = call i32 @llvm.amdgcn.mbcnt.hi (i32 -1 , i32 %lo )  ; %hi is never used in this function; no mbcnt appears in the expected output
3674
+ %idx1 = extractelement <4 x i32 > %idx0 , i64 0  ; element 0 of the loaded index vector
3675
+ %d = extractelement <4 x float > %tex1 , i64 0  ; element 0 of %tex1 is the coordinate of the second sample
3676
+
3677
+ %tex2 = call <4 x float > @llvm.amdgcn.image.sample.1d.v4f32.f32 (i32 15 , float %d , <8 x i32 > %rsrc , <4 x i32 > %sampler , i1 false , i32 0 , i32 0 ) #0
3678
+ ; readfirstlane of the loaded index, then a scalar buffer load whose resource operand is %sampler.
3679
+ %idx2 = call i32 @llvm.amdgcn.readfirstlane.i32 (i32 %idx1 )
3680
+ %idx3 = call float @llvm.amdgcn.s.buffer.load.f32 (<4 x i32 > %sampler , i32 %idx2 , i32 0 )
3681
+ ; Result: element 1 of %tex1 plus element 2 of %tex2, plus the scalar-loaded float.
3682
+ %r0 = extractelement <4 x float > %tex1 , i64 1
3683
+ %r1 = extractelement <4 x float > %tex2 , i64 2
3684
+ %r2 = fadd float %r0 , %r1
3685
+ %out = fadd float %r2 , %idx3
3686
+
3687
+ ret float %out
3688
+ }
3689
+
3541
3690
declare void @llvm.amdgcn.exp.f32 (i32 , i32 , float , float , float , float , i1 , i1 ) #1
3542
3691
declare void @llvm.amdgcn.image.store.1d.v4f32.i32 (<4 x float >, i32 , i32 , <8 x i32 >, i32 , i32 ) #1
3543
3692
@@ -3577,6 +3726,7 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
3577
3726
declare float @llvm.amdgcn.interp.p2 (float , float , i32 , i32 , i32 ) #2
3578
3727
declare i32 @llvm.amdgcn.ds.swizzle (i32 , i32 )
3579
3728
declare float @llvm.amdgcn.s.buffer.load.f32 (<4 x i32 >, i32 , i32 immarg) #7
3729
+ declare i32 @llvm.amdgcn.readfirstlane.i32 (i32 )  ; called by short_exact_regions and short_exact_regions_2
3580
3730
3581
3731
attributes #1 = { nounwind }
3582
3732
attributes #2 = { nounwind readonly }
0 commit comments