diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 760c7087f677a..b262c4b73d9bf 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1385,6 +1385,19 @@ bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt( ScoreBrackets.simplifyWaitcnt(OldWait, OptNone); Wait = Wait.combined(OldWait); + if (!WaitcntInstr && II.getOpcode() == AMDGPU::S_WAITCNT_soft) { + // Each direct load to LDS is also a store to LDS, but we do not have a + // separate counter for it. Instead these operations increment LOAD_CNT + // and need to be waited for at a release fence. So we treat a release + // fence as if it depends on any previous LDS DMA stores. + // + // Note that a user-specified S_WAITCNT instruction is not affected; we + // only check for S_WAITCNT_soft since that represents a fence. + // + // FIXME: Cannot yet tell if a soft wait is a release; assume it is. + ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait); + } + // Merge consecutive waitcnt of the same type by erasing multiples. 
if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && OpcodeIsSoft && !OptNone)) { diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll new file mode 100644 index 0000000000000..882c43b41bac8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-dma-workgroup-release.ll @@ -0,0 +1,482 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GFX900 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s --check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX90A-TGSPLIT +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942 +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck %s --check-prefixes=GFX942-TGSPLIT +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=GFX1010 + +; In each of these tests, an LDS DMA operation is followed by a release pattern +; at workgroup scope. The fence in such a release (implicit or explicit) should +; wait for the store component in the LDS DMA. The additional noalias metadata +; is just meant to ensure that the wait counts are not generated due to some +; unintended aliasing. 
+ +declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux) + +define amdgpu_kernel void @barrier_release(<4 x i32> inreg %rsrc, +; GFX900-LABEL: barrier_release: +; GFX900: ; %bb.0: ; %main_body +; GFX900-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX900-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX900-NEXT: v_mov_b32_e32 v1, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 m0, s12 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX900-NEXT: v_mov_b32_e32 v0, s13 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_barrier +; GFX900-NEXT: ds_read_b32 v0, v0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_store_dword v1, v0, s[14:15] +; GFX900-NEXT: s_endpgm +; +; GFX90A-LABEL: barrier_release: +; GFX90A: ; %bb.1: +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_branch .LBB0_0 +; GFX90A-NEXT: .p2align 8 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: .LBB0_0: ; %main_body +; GFX90A-NEXT: s_mov_b32 m0, s12 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX90A-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX90A-NEXT: v_mov_b32_e32 v0, s13 +; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_barrier +; GFX90A-NEXT: ds_read_b32 v0, v0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: barrier_release: +; GFX90A-TGSPLIT: ; %bb.1: +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_branch .LBB0_0 +; GFX90A-TGSPLIT-NEXT: .p2align 8 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: +; GFX90A-TGSPLIT-NEXT: 
.LBB0_0: ; %main_body +; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s13 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_barrier +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-LABEL: barrier_release: +; GFX942: ; %bb.1: +; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB0_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB0_0: ; %main_body +; GFX942-NEXT: s_mov_b32 m0, s12 +; GFX942-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX942-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX942-NEXT: v_mov_b32_e32 v0, s13 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_barrier +; GFX942-NEXT: ds_read_b32 v0, v0 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v1, v0, s[0:1] +; GFX942-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: barrier_release: +; GFX942-TGSPLIT: ; %bb.1: +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_branch .LBB0_0 +; GFX942-TGSPLIT-NEXT: .p2align 8 +; GFX942-TGSPLIT-NEXT: ; %bb.2: +; GFX942-TGSPLIT-NEXT: .LBB0_0: ; %main_body +; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX942-TGSPLIT-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; 
GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, s13 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x3c +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_barrier +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v0, v0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX1010-LABEL: barrier_release: +; GFX1010: ; %bb.0: ; %main_body +; GFX1010-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX1010-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX1010-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 m0, s12 +; GFX1010-NEXT: buffer_load_dword v0, s[8:11], 0 offen lds +; GFX1010-NEXT: v_mov_b32_e32 v0, s13 +; GFX1010-NEXT: s_waitcnt vmcnt(0) +; GFX1010-NEXT: s_barrier +; GFX1010-NEXT: buffer_gl0_inv +; GFX1010-NEXT: ds_read_b32 v0, v0 +; GFX1010-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-NEXT: global_store_dword v1, v0, s[14:15] +; GFX1010-NEXT: s_endpgm + ptr addrspace(3) inreg %lds1, + ptr addrspace(3) inreg %lds2, + ptr addrspace(1) %dummy2) { +main_body: + call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102 + fence syncscope("workgroup") release + tail call void @llvm.amdgcn.s.barrier() + fence syncscope("workgroup") acquire + %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105 + store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105 + ret void +} + +define amdgpu_kernel void @fence_fence(<4 x i32> inreg %rsrc, +; GFX900-LABEL: fence_fence: +; GFX900: ; %bb.0: ; %main_body +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX900-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX900-NEXT: s_mov_b32 m0, s6 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX900-NEXT: v_mov_b32_e32 v1, 1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_store_dword v0, v1, s[8:9] +; GFX900-NEXT: global_load_dword v1, v0, s[8:9] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, s7 +; GFX900-NEXT: ds_read_b32 v1, v1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_store_dword v0, v1, s[10:11] +; GFX900-NEXT: s_endpgm +; +; GFX90A-LABEL: fence_fence: +; GFX90A: ; %bb.1: +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_branch .LBB1_0 +; GFX90A-NEXT: .p2align 8 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: .LBB1_0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-NEXT: s_mov_b32 m0, s12 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-NEXT: ds_read_b32 v1, v1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: fence_fence: +; GFX90A-TGSPLIT: ; %bb.1: +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_branch .LBB1_0 +; GFX90A-TGSPLIT-NEXT: .p2align 8 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: +; GFX90A-TGSPLIT-NEXT: .LBB1_0: ; %main_body +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; 
GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-LABEL: fence_fence: +; GFX942: ; %bb.1: +; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB1_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB1_0: ; %main_body +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-NEXT: s_mov_b32 m0, s12 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-NEXT: ds_read_b32 v1, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: fence_fence: +; GFX942-TGSPLIT: ; %bb.1: +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_branch .LBB1_0 +; GFX942-TGSPLIT-NEXT: .p2align 8 +; GFX942-TGSPLIT-NEXT: ; %bb.2: +; GFX942-TGSPLIT-NEXT: .LBB1_0: ; %main_body +; GFX942-TGSPLIT-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX1010-LABEL: fence_fence: +; GFX1010: ; %bb.0: ; %main_body +; GFX1010-NEXT: s_clause 0x2 +; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX1010-NEXT: v_mov_b32_e32 v0, 0x800 +; GFX1010-NEXT: v_mov_b32_e32 v1, 0 +; GFX1010-NEXT: v_mov_b32_e32 v2, 1 +; GFX1010-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 m0, s6 +; GFX1010-NEXT: buffer_load_dword v0, s[0:3], 0 offen lds +; GFX1010-NEXT: s_waitcnt vmcnt(0) +; GFX1010-NEXT: global_store_dword v1, v2, s[8:9] +; GFX1010-NEXT: global_load_dword v0, v1, s[8:9] glc +; GFX1010-NEXT: s_waitcnt vmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, s7 +; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-NEXT: buffer_gl0_inv +; GFX1010-NEXT: ds_read_b32 v0, v0 +; GFX1010-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-NEXT: global_store_dword v1, v0, s[10:11] +; GFX1010-NEXT: s_endpgm + ptr addrspace(3) inreg %lds1, + ptr addrspace(3) inreg %lds2, + ptr addrspace(1) %flag, + ptr addrspace(1) %dummy2) { +main_body: + call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102 + fence 
syncscope("workgroup") release + store atomic i32 1, ptr addrspace(1) %flag syncscope("workgroup") monotonic, align 4, !noalias !105 + %unused_flag = load atomic i32, ptr addrspace(1) %flag syncscope("workgroup") monotonic, align 4, !noalias !105 + fence syncscope("workgroup") acquire + %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105 + store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105 + ret void +} + +define amdgpu_kernel void @release_acquire(<4 x i32> inreg %rsrc, +; GFX900-LABEL: release_acquire: +; GFX900: ; %bb.0: ; %main_body +; GFX900-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX900-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX900-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX900-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 m0, s6 +; GFX900-NEXT: s_nop 0 +; GFX900-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX900-NEXT: v_mov_b32_e32 v1, 1 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_store_dword v0, v1, s[8:9] +; GFX900-NEXT: global_load_dword v1, v0, s[8:9] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, s7 +; GFX900-NEXT: ds_read_b32 v1, v1 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: global_store_dword v0, v1, s[10:11] +; GFX900-NEXT: s_endpgm +; +; GFX90A-LABEL: release_acquire: +; GFX90A: ; %bb.1: +; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: s_branch .LBB2_0 +; GFX90A-NEXT: .p2align 8 +; GFX90A-NEXT: ; %bb.2: +; GFX90A-NEXT: .LBB2_0: ; %main_body +; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-NEXT: s_mov_b32 m0, s12 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: 
global_store_dword v0, v1, s[0:1] +; GFX90A-NEXT: global_load_dword v1, v0, s[0:1] +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-NEXT: ds_read_b32 v1, v1 +; GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-NEXT: s_endpgm +; +; GFX90A-TGSPLIT-LABEL: release_acquire: +; GFX90A-TGSPLIT: ; %bb.1: +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: s_branch .LBB2_0 +; GFX90A-TGSPLIT-NEXT: .p2align 8 +; GFX90A-TGSPLIT-NEXT: ; %bb.2: +; GFX90A-TGSPLIT-NEXT: .LBB2_0: ; %main_body +; GFX90A-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX90A-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] +; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol +; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX90A-TGSPLIT-NEXT: s_endpgm +; +; GFX942-LABEL: release_acquire: +; GFX942: ; %bb.1: +; GFX942-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB2_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB2_0: ; %main_body +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-NEXT: s_mov_b32 m0, s12 +; GFX942-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: 
buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-NEXT: ds_read_b32 v1, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-NEXT: s_endpgm +; +; GFX942-TGSPLIT-LABEL: release_acquire: +; GFX942-TGSPLIT: ; %bb.1: +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: s_branch .LBB2_0 +; GFX942-TGSPLIT-NEXT: .p2align 8 +; GFX942-TGSPLIT-NEXT: ; %bb.2: +; GFX942-TGSPLIT-NEXT: .LBB2_0: ; %main_body +; GFX942-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x3c +; GFX942-TGSPLIT-NEXT: s_mov_b32 m0, s12 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-TGSPLIT-NEXT: buffer_load_dword v1, s[8:11], 0 offen lds +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, 1 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[0:1] sc0 +; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) +; GFX942-TGSPLIT-NEXT: buffer_inv sc0 +; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s13 +; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1 +; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] +; GFX942-TGSPLIT-NEXT: s_endpgm +; +; GFX1010-LABEL: release_acquire: +; GFX1010: ; %bb.0: ; %main_body +; GFX1010-NEXT: s_clause 0x2 +; GFX1010-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX1010-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x3c +; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: v_mov_b32_e32 v1, 0x800 +; GFX1010-NEXT: 
v_mov_b32_e32 v2, 1 +; GFX1010-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 m0, s6 +; GFX1010-NEXT: buffer_load_dword v1, s[0:3], 0 offen lds +; GFX1010-NEXT: s_waitcnt vmcnt(0) +; GFX1010-NEXT: global_store_dword v0, v2, s[8:9] +; GFX1010-NEXT: global_load_dword v1, v0, s[8:9] glc +; GFX1010-NEXT: s_waitcnt vmcnt(0) +; GFX1010-NEXT: buffer_gl0_inv +; GFX1010-NEXT: v_mov_b32_e32 v1, s7 +; GFX1010-NEXT: ds_read_b32 v1, v1 +; GFX1010-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-NEXT: global_store_dword v0, v1, s[10:11] +; GFX1010-NEXT: s_endpgm + ptr addrspace(3) inreg %lds1, + ptr addrspace(3) inreg %lds2, + ptr addrspace(1) %flag, + ptr addrspace(1) %dummy2) { +main_body: + call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) %lds1, i32 4, i32 2048, i32 0, i32 0, i32 0), !alias.scope !102 + store atomic i32 1, ptr addrspace(1) %flag syncscope("workgroup") release, align 4, !noalias !105 + %unused_flag = load atomic i32, ptr addrspace(1) %flag syncscope("workgroup") acquire, align 4, !noalias !105 + %load = load i32, ptr addrspace(3) %lds2, align 4, !noalias !105 + store i32 %load, ptr addrspace(1) %dummy2, align 4, !noalias !105 + ret void +} + +!100 = !{!100} +!101 = !{!101, !100} +!102 = !{!101} +!103 = !{!103, !100} +!104 = !{!103} +!105 = !{!101, !103}