Skip to content

Commit 3bee9ba

Browse files
nhaehnle and arsenm authored
AMDGPU/GFX12: Fix s_barrier_signal_isfirst for single-wave workgroups (#143634)
Barrier instructions are no-ops in single-wave workgroups. This includes s_barrier_signal_isfirst, which will leave SCC unmodified. Model this correctly (via an implicit use of SCC) and ensure SCC==1 before the barrier instruction (if the wave is the only one of the workgroup, then it is the first).

Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
1 parent 74ec1c2 commit 3bee9ba

File tree

6 files changed

+66
-3
lines changed

6 files changed

+66
-3
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1402,6 +1402,10 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
14021402
performs subtraction only if the memory value is greater than or
14031403
equal to the data value.
14041404

1405+
llvm.amdgcn.s.barrier.signal.isfirst Provides access to the s_barrier_signal_isfirst instruction;
1406+
additionally ensures that the result value is valid even when the
1407+
intrinsic is used from a wave that is not running in a workgroup.
1408+
14051409
llvm.amdgcn.s.getpc Provides access to the s_getpc_b64 instruction, but with the return value
14061410
sign-extended from the width of the underlying PC hardware register even on
14071411
processors where the s_getpc_b64 instruction returns a zero-extended value.

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5918,6 +5918,9 @@ bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
59185918
const DebugLoc &DL = I.getDebugLoc();
59195919
Register CCReg = I.getOperand(0).getReg();
59205920

5921+
// Set SCC to true, in case the barrier instruction gets converted to a NOP.
5922+
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_CMP_EQ_U32)).addImm(0).addImm(0);
5923+
59215924
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
59225925
.addImm(I.getOperand(2).getImm());
59235926

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5423,6 +5423,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
54235423
MI.eraseFromParent();
54245424
return BB;
54255425
}
5426+
case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
5427+
// Set SCC to true, in case the barrier instruction gets converted to a NOP.
5428+
BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5429+
TII->get(AMDGPU::S_CMP_EQ_U32))
5430+
.addImm(0)
5431+
.addImm(0);
5432+
return BB;
5433+
}
54265434
case AMDGPU::GET_GROUPSTATICSIZE: {
54275435
assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
54285436
getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,7 @@ def S_BARRIER_SIGNAL_M0 : SOP1_Pseudo <"s_barrier_signal m0", (outs), (ins),
472472
def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (outs), (ins),
473473
"", []>{
474474
let Defs = [SCC];
475+
let Uses = [M0, SCC];
475476
let SchedRW = [WriteBarrier];
476477
let isConvergent = 1;
477478
}
@@ -487,6 +488,8 @@ def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs),
487488
def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (outs),
488489
(ins SplitBarrier:$src0), "$src0", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst timm:$src0))]>{
489490
let Defs = [SCC];
491+
let Uses = [SCC];
492+
let usesCustomInserter = 1;
490493
let SchedRW = [WriteBarrier];
491494
let isConvergent = 1;
492495
}

llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -374,7 +374,8 @@ body: |
374374
; CHECK-NEXT: successors: %bb.2(0x80000000)
375375
; CHECK-NEXT: {{ $}}
376376
; CHECK-NEXT: V_NOP_e32 implicit $exec
377-
; CHECK-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc
377+
; CHECK-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
378+
; CHECK-NEXT: S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit $scc
378379
; CHECK-NEXT: {{ $}}
379380
; CHECK-NEXT: bb.2:
380381
; CHECK-NEXT: S_ENDPGM 0
@@ -385,7 +386,8 @@ body: |
385386
bb.1:
386387
successors: %bb.2
387388
V_NOP_e32 implicit $exec
388-
S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc
389+
S_CMP_EQ_U32 0, 0, implicit-def $scc
390+
S_BARRIER_SIGNAL_ISFIRST_IMM -1, implicit-def $scc, implicit $scc
389391
390392
bb.2:
391393
S_ENDPGM 0
@@ -437,6 +439,7 @@ body: |
437439
; CHECK-NEXT: {{ $}}
438440
; CHECK-NEXT: V_NOP_e32 implicit $exec
439441
; CHECK-NEXT: $m0 = S_MOV_B32 -1
442+
; CHECK-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc
440443
; CHECK-NEXT: S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc
441444
; CHECK-NEXT: {{ $}}
442445
; CHECK-NEXT: bb.2:
@@ -449,7 +452,8 @@ body: |
449452
successors: %bb.2
450453
V_NOP_e32 implicit $exec
451454
$m0 = S_MOV_B32 -1
452-
S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc
455+
S_CMP_EQ_U32 0, 0, implicit-def $scc
456+
S_BARRIER_SIGNAL_ISFIRST_M0 implicit $m0, implicit-def $scc, implicit $scc
453457
454458
bb.2:
455459
S_ENDPGM 0
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
3+
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
4+
5+
define i1 @func1() {
6+
; GFX12-SDAG-LABEL: func1:
7+
; GFX12-SDAG: ; %bb.0:
8+
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
9+
; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
10+
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
11+
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
12+
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
13+
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
14+
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
15+
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
16+
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
17+
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
18+
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
19+
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
20+
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
21+
;
22+
; GFX12-GISEL-LABEL: func1:
23+
; GFX12-GISEL: ; %bb.0:
24+
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
25+
; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
26+
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
27+
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
28+
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
29+
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
30+
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
31+
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
32+
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
33+
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
34+
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
35+
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
36+
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
37+
%r = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
38+
ret i1 %r
39+
}
40+
41+
declare i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32)

0 commit comments

Comments
 (0)