@@ -2936,6 +2936,89 @@ ENDIF:
2936
2936
ret float %r
2937
2937
}
2938
2938
2939
+ ; WQM -> StrictWQM transition must be preserved because kill breaks WQM mask
2940
+ define amdgpu_ps float @test_strict_wqm_within_wqm_with_kill (<8 x i32 > inreg %rsrc , <4 x i32 > inreg %sampler , i32 %c , i32 %z , float %data , i32 %wqm_data ) {
2941
+ ; GFX9-W64-LABEL: test_strict_wqm_within_wqm_with_kill:
2942
+ ; GFX9-W64: ; %bb.0: ; %main_body
2943
+ ; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
2944
+ ; GFX9-W64-NEXT: s_mov_b64 s[14:15], exec
2945
+ ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2946
+ ; GFX9-W64-NEXT: v_mov_b32_e32 v3, v2
2947
+ ; GFX9-W64-NEXT: s_mov_b64 exec, s[14:15]
2948
+ ; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2949
+ ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2950
+ ; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2951
+ ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2952
+ ; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2953
+ ; GFX9-W64-NEXT: s_xor_b64 s[0:1], vcc, exec
2954
+ ; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], s[0:1]
2955
+ ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB51_2
2956
+ ; GFX9-W64-NEXT: ; %bb.1: ; %main_body
2957
+ ; GFX9-W64-NEXT: s_and_b64 exec, exec, vcc
2958
+ ; GFX9-W64-NEXT: ds_swizzle_b32 v3, v3 offset:swizzle(SWAP,2)
2959
+ ; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
2960
+ ; GFX9-W64-NEXT: v_mov_b32_e32 v1, v3
2961
+ ; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v1
2962
+ ; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
2963
+ ; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
2964
+ ; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
2965
+ ; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
2966
+ ; GFX9-W64-NEXT: s_branch .LBB51_3
2967
+ ; GFX9-W64-NEXT: .LBB51_2:
2968
+ ; GFX9-W64-NEXT: s_mov_b64 exec, 0
2969
+ ; GFX9-W64-NEXT: exp null off, off, off, off done vm
2970
+ ; GFX9-W64-NEXT: s_endpgm
2971
+ ; GFX9-W64-NEXT: .LBB51_3:
2972
+ ;
2973
+ ; GFX10-W32-LABEL: test_strict_wqm_within_wqm_with_kill:
2974
+ ; GFX10-W32: ; %bb.0: ; %main_body
2975
+ ; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
2976
+ ; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
2977
+ ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2978
+ ; GFX10-W32-NEXT: v_mov_b32_e32 v3, v2
2979
+ ; GFX10-W32-NEXT: s_mov_b32 exec_lo, s13
2980
+ ; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2981
+ ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2982
+ ; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
2983
+ ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2984
+ ; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2985
+ ; GFX10-W32-NEXT: s_xor_b32 s0, vcc_lo, exec_lo
2986
+ ; GFX10-W32-NEXT: s_andn2_b32 s12, s12, s0
2987
+ ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB51_2
2988
+ ; GFX10-W32-NEXT: ; %bb.1: ; %main_body
2989
+ ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, vcc_lo
2990
+ ; GFX10-W32-NEXT: ds_swizzle_b32 v3, v3 offset:swizzle(SWAP,2)
2991
+ ; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
2992
+ ; GFX10-W32-NEXT: v_mov_b32_e32 v1, v3
2993
+ ; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v1
2994
+ ; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
2995
+ ; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
2996
+ ; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
2997
+ ; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
2998
+ ; GFX10-W32-NEXT: s_branch .LBB51_3
2999
+ ; GFX10-W32-NEXT: .LBB51_2:
3000
+ ; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
3001
+ ; GFX10-W32-NEXT: exp null off, off, off, off done vm
3002
+ ; GFX10-W32-NEXT: s_endpgm
3003
+ ; GFX10-W32-NEXT: .LBB51_3:
3004
+ main_body:
3005
+ %c.bc = bitcast i32 %c to float ; reinterpret the i32 argument %c as the float coordinate of the first sample
3006
+ %tex = call <4 x float > @llvm.amdgcn.image.sample.1d.v4f32.f32 (i32 15 , float %c.bc , <8 x i32 > %rsrc , <4 x i32 > %sampler , i1 false , i32 0 , i32 0 ) #0 ; first 1D sample, at %c.bc
3007
+ %tex0 = extractelement <4 x float > %tex , i32 0
3008
+ %dtex = call <4 x float > @llvm.amdgcn.image.sample.1d.v4f32.f32 (i32 15 , float %tex0 , <8 x i32 > %rsrc , <4 x i32 > %sampler , i1 false , i32 0 , i32 0 ) #0 ; second sample is dependent: its coordinate is component 0 of the first sample
3009
+ %cmp = icmp eq i32 %z , 0 ; kill condition; corresponds to the v_cmp_eq_u32 against 0 in the checks above
3010
+ call void @llvm.amdgcn.kill (i1 %cmp ) ; in the expected code, lanes failing %cmp are cleared from the saved exec copy (s_xor + s_andn2) and exec is ANDed with vcc
3011
+ %dataf = extractelement <4 x float > %dtex , i32 0
3012
+ %data2 = call i32 @llvm.amdgcn.ds.swizzle (i32 %wqm_data , i32 2079 ) ; pattern 2079 is printed as swizzle(SWAP,2) in the checks above
3013
+ %data3 = call i32 @llvm.amdgcn.strict.wqm.i32 (i32 %data2 ) ; the StrictWQM use that follows the kill; presumably the WQM -> StrictWQM transition this test is meant to pin
3014
+ %data3f = sitofp i32 %data3 to float
3015
+ %result.f = fadd float %dataf , %data3f ; combine the dependent-sample result with the strict.wqm value
3016
+ %result.i = bitcast float %result.f to i32 ; llvm.amdgcn.wqm.i32 takes an i32, so round-trip the float through a bitcast
3017
+ %result.wqm = call i32 @llvm.amdgcn.wqm.i32 (i32 %result.i ) ; NOTE(review): presumably this is what makes the surrounding computation the "WQM" part named in the test -- confirm
3018
+ %result = bitcast i32 %result.wqm to float
3019
+ ret float %result
3020
+ }
3021
+
2939
3022
;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
2940
3023
define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm (i32 inreg %idx0 , i32 inreg %idx1 , ptr addrspace (8 ) inreg %res , ptr addrspace (8 ) inreg %res2 , float %inp , <8 x i32 > inreg %res3 ) {
2941
3024
; GFX9-W64-LABEL: test_strict_wqm_strict_wwm_wqm:
@@ -3281,9 +3364,9 @@ define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) in
3281
3364
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
3282
3365
; GFX9-W64-NEXT: v_cmp_le_f32_e64 vcc, s0, 0
3283
3366
; GFX9-W64-NEXT: s_andn2_b64 s[4:5], exec, vcc
3284
- ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB54_1
3367
+ ; GFX9-W64-NEXT: s_cbranch_scc0 .LBB55_1
3285
3368
; GFX9-W64-NEXT: s_endpgm
3286
- ; GFX9-W64-NEXT: .LBB54_1 :
3369
+ ; GFX9-W64-NEXT: .LBB55_1 :
3287
3370
; GFX9-W64-NEXT: s_mov_b64 exec, 0
3288
3371
; GFX9-W64-NEXT: exp null off, off, off, off done vm
3289
3372
; GFX9-W64-NEXT: s_endpgm
@@ -3297,9 +3380,9 @@ define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) in
3297
3380
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
3298
3381
; GFX10-W32-NEXT: v_cmp_le_f32_e64 vcc_lo, s0, 0
3299
3382
; GFX10-W32-NEXT: s_andn2_b32 s4, exec_lo, vcc_lo
3300
- ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB54_1
3383
+ ; GFX10-W32-NEXT: s_cbranch_scc0 .LBB55_1
3301
3384
; GFX10-W32-NEXT: s_endpgm
3302
- ; GFX10-W32-NEXT: .LBB54_1 :
3385
+ ; GFX10-W32-NEXT: .LBB55_1 :
3303
3386
; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
3304
3387
; GFX10-W32-NEXT: exp null off, off, off, off done vm
3305
3388
; GFX10-W32-NEXT: s_endpgm
0 commit comments