Skip to content

Commit 5035d20

Browse files
changpeng and rampitec authored
AMDGPU: Implement ds_atomic_async_barrier_arrive_b64/ds_atomic_barrier_arrive_rtn_b64 (#146409)
These two instructions are supported by gfx1250. We define the instructions and implement the corresponding intrinsics and builtins. Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
1 parent 3edae0d commit 5035d20

17 files changed

+215
-5
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8i16, "V8sV8s*3", "nc", "gfx1
653653
TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8f16, "V8hV8h*3", "nc", "gfx1250-insts,wavefrontsize32")
654654
TARGET_BUILTIN(__builtin_amdgcn_ds_load_tr16_b128_v8bf16, "V8yV8y*3", "nc", "gfx1250-insts,wavefrontsize32")
655655

656+
// LDS barrier-arrive atomic builtins; both list "gfx1250-insts" as their
// required feature string.
// Async form: no return value, one pointer-to-long argument in address space 3.
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64, "vLi*3", "nc", "gfx1250-insts")
657+
// Returning form: yields a long; takes a pointer-to-long in address space 3
// plus a long data operand.
TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64, "LiLi*3Li", "nc", "gfx1250-insts")
658+
656659
TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
657660
TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")
658661

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
2+
// REQUIRES: amdgpu-registered-target
3+
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
4+
5+
// CHECK-GFX1250-LABEL: @test_amdgcn_ds_atomic_async_barrier_arrive_b64(
6+
// CHECK-GFX1250-NEXT: entry:
7+
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.ds.atomic.async.barrier.arrive.b64(ptr addrspace(3) [[ADDR:%.*]])
8+
// CHECK-GFX1250-NEXT: ret void
9+
//
10+
// Codegen test for the async barrier-arrive builtin: passes a `local long*`
// and uses no result. The CHECK lines preceding this function expect a void
// call to llvm.amdgcn.ds.atomic.async.barrier.arrive.b64 on a
// ptr addrspace(3) argument.
void test_amdgcn_ds_atomic_async_barrier_arrive_b64(local long* addr)
11+
{
12+
__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64(addr);
13+
}
14+
15+
// CHECK-GFX1250-LABEL: @test_amdgcn_ds_atomic_barrier_arrive_rtn_b64(
16+
// CHECK-GFX1250-NEXT: entry:
17+
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.amdgcn.ds.atomic.barrier.arrive.rtn.b64(ptr addrspace(3) [[ADDR:%.*]], i64 [[DATA:%.*]])
18+
// CHECK-GFX1250-NEXT: store i64 [[TMP0]], ptr [[OUT:%.*]], align 8, !tbaa [[TBAA4:![0-9]+]]
19+
// CHECK-GFX1250-NEXT: ret void
20+
//
21+
// Codegen test for the returning barrier-arrive builtin: passes a
// `local long*` and a long data value, then stores the 64-bit result through
// `out`. The CHECK lines preceding this function expect an i64 call to
// llvm.amdgcn.ds.atomic.barrier.arrive.rtn.b64 followed by the store.
void test_amdgcn_ds_atomic_barrier_arrive_rtn_b64(local long* addr, long data, long *out)
21+
{
22+
*out = __builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64(addr, data);
23+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3504,6 +3504,19 @@ def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
35043504
// gfx1250 intrinsics
35053505
// ===----------------------------------------------------------------------===//
35063506

3507+
// Async LDS barrier-arrive atomic: no result, one local (LDS) pointer operand.
// Bound to the Clang builtin named in ClangBuiltin<> below. Declared with a
// plain Intrinsic<> and an explicit attribute list (convergent, will-return,
// inaccessible-or-arg memory only) and carries a memory operand in SelectionDAG.
def int_amdgcn_ds_atomic_async_barrier_arrive_b64 :
3508+
ClangBuiltin<"__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64">,
3509+
Intrinsic<[], [local_ptr_ty],
3510+
// Atomically updates LDS and also ASYNC_CNT which is modeled as InaccessibleMem.
3511+
[IntrConvergent, IntrWillReturn, IntrInaccessibleMemOrArgMemOnly],
3512+
"", [SDNPMemOperand]>;
3513+
3514+
// Returning LDS barrier-arrive atomic: produces an i64 and takes a local (LDS)
// pointer plus an i64 data operand. Bound to the Clang builtin named in
// ClangBuiltin<> below. Unlike the async form it is IntrArgMemOnly (no
// inaccessible-memory effect is declared) and the pointer argument is marked
// NoCapture.
def int_amdgcn_ds_atomic_barrier_arrive_rtn_b64 :
3515+
ClangBuiltin<"__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64">,
3516+
Intrinsic<[llvm_i64_ty], [local_ptr_ty, llvm_i64_ty],
3517+
[IntrConvergent, IntrWillReturn, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
3518+
"", [SDNPMemOperand]>;
3519+
35073520
def int_amdgcn_s_monitor_sleep :
35083521
ClangBuiltin<"__builtin_amdgcn_s_monitor_sleep">,
35093522
DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1319,6 +1319,12 @@ def FeatureMemToLDSLoad : SubtargetFeature<"vmem-to-lds-load-insts",
13191319
"The platform has memory to lds instructions (global_load w/lds bit set, buffer_load w/lds bit set or global_load_lds. This does not include scratch_load_lds."
13201320
>;
13211321

1322+
// Subtarget feature "lds-barrier-arrive-atomic": when enabled it sets the
// HasLdsBarrierArriveAtomic subtarget field to "true". Used to gate the
// ds_atomic_*barrier_arrive* instructions.
def FeatureLdsBarrierArriveAtomic : SubtargetFeature< "lds-barrier-arrive-atomic",
1323+
"HasLdsBarrierArriveAtomic",
1324+
"true",
1325+
"Has LDS barrier-arrive atomic instructions"
1326+
>;
1327+
13221328
// Dummy feature used to disable assembler instructions.
13231329
def FeatureDisable : SubtargetFeature<"",
13241330
"FeatureDisable","true",
@@ -1955,6 +1961,7 @@ def FeatureISAVersion12_50 : FeatureSet<
19551961
FeatureMemoryAtomicFAddF32DenormalSupport,
19561962
FeatureKernargPreload,
19571963
FeatureLshlAddU64Inst,
1964+
FeatureLdsBarrierArriveAtomic,
19581965
FeatureSetPrioIncWgInst,
19591966
]>;
19601967

@@ -2687,6 +2694,9 @@ def HasAshrPkInsts : Predicate<"Subtarget->hasAshrPkInsts()">,
26872694
def HasLshlAddU64Inst : Predicate<"Subtarget->hasLshlAddU64Inst()">,
26882695
AssemblerPredicate<(all_of FeatureLshlAddU64Inst)>;
26892696

2697+
// Predicate for the LDS barrier-arrive atomics: code generation checks
// Subtarget->hasLdsBarrierArriveAtomic(), and the assembler requires
// FeatureLdsBarrierArriveAtomic.
def HasLdsBarrierArriveAtomic : Predicate<"Subtarget->hasLdsBarrierArriveAtomic()">,
2698+
AssemblerPredicate<(all_of FeatureLdsBarrierArriveAtomic)>;
2699+
26902700
def HasSetPrioIncWgInst : Predicate<"Subtarget->hasSetPrioIncWgInst()">,
26912701
AssemblerPredicate<(all_of FeatureSetPrioIncWgInst)>;
26922702

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5117,6 +5117,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
51175117
case Intrinsic::amdgcn_ds_read_tr6_b96:
51185118
case Intrinsic::amdgcn_ds_read_tr8_b64:
51195119
case Intrinsic::amdgcn_ds_read_tr16_b64:
5120+
case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
5121+
case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
51205122
return getDefaultMappingAllVGPR(MI);
51215123
case Intrinsic::amdgcn_ds_ordered_add:
51225124
case Intrinsic::amdgcn_ds_ordered_swap: {

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,20 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
354354
let gdsValue = 1;
355355
}
356356

357+
// DS pseudo taking only an address: a 32-bit VGPR address, an offset operand
// and the gds bit. It has no outputs and no data operands
// (has_vdst/has_data0/has_data1 are cleared), clears has_m0_read, and is
// flagged as a no-return atomic via IsAtomicNoRet.
class DS_1A_Off16_NORET <string opName>
358+
: DS_Pseudo<opName,
359+
(outs),
360+
(ins VGPR_32:$addr, Offset:$offset, gds:$gds),
361+
" $addr$offset$gds"> {
362+
363+
let has_vdst = 0;
364+
let has_offset = 1;
365+
let has_data0 = 0;
366+
let has_data1 = 0;
367+
let has_m0_read = 0;
368+
let IsAtomicNoRet = 1;
369+
}
370+
357371
class DS_0A_RET <string opName> : DS_Pseudo<opName,
358372
(outs getLdStRegisterOperand<VGPR_32>.ret:$vdst),
359373
(ins Offset:$offset, gds:$gds),
@@ -794,6 +808,24 @@ defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VReg_64>;
794808
defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VReg_128>;
795809
} // End WaveSizePredicate = isWave32, mayStore = 0
796810

811+
// Pseudo instructions and selection patterns for the LDS barrier-arrive
// atomics. Everything in this block is additionally predicated on
// HasLdsBarrierArriveAtomic.
let OtherPredicates = [HasLdsBarrierArriveAtomic] in {
812+
// The async form is tagged with the ASYNC_CNT flag instead of LGKM_CNT, and
// both reads and writes the ASYNCcnt register (in addition to reading EXEC).
let ASYNC_CNT = 1, LGKM_CNT = 0, Uses = [EXEC, ASYNCcnt], Defs = [ASYNCcnt] in {
813+
def DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_1A_Off16_NORET<"ds_atomic_async_barrier_arrive_b64">;
814+
}
815+
816+
// Select the void intrinsic: DS1Addr1Offset splits the address into a base
// pointer and an offset; the gds operand is fixed to 0.
def : GCNPat <
817+
(int_amdgcn_ds_atomic_async_barrier_arrive_b64 (DS1Addr1Offset i32:$ptr, i32:$offset)),
818+
(DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 VGPR_32:$ptr, Offset:$offset, (i1 0))
819+
>;
820+
821+
// Returning form, instantiated from the DS_1A1D_RET_mc_gfx9 multiclass with a
// 64-bit VGPR register class.
defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_1A1D_RET_mc_gfx9<"ds_atomic_barrier_arrive_rtn_b64", VReg_64>;
822+
823+
// Select the i64-returning intrinsic: same address matching as above, with
// the i64 data operand forwarded and the gds operand fixed to 0.
def : GCNPat<
824+
(i64 (int_amdgcn_ds_atomic_barrier_arrive_rtn_b64 (DS1Addr1Offset i32:$ptr, i32:$offset), i64:$data)),
825+
(DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 $ptr, $data, Offset:$offset, (i1 0))
826+
>;
827+
} // End OtherPredicates = [HasLdsBarrierArriveAtomic]
828+
797829
} // End SubtargetPredicate = isGFX1250Plus
798830

799831
let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
@@ -1366,6 +1398,11 @@ defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0,
13661398
defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>;
13671399
defm DS_BVH_STACK_PUSH8_POP2_RTN_B64 : DS_Real_gfx12<0x0e2>;
13681400

1401+
// Real (encoded) GFX12 definitions for the two barrier-arrive pseudos, made
// available to the assembler only when HasLdsBarrierArriveAtomic holds.
// The hex values are the arguments handed to DS_Real_gfx12 (presumably the DS
// opcode numbers; confirm against the ISA documentation).
let AssemblerPredicate = HasLdsBarrierArriveAtomic in {
1402+
defm DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 : DS_Real_gfx12<0x056>;
1403+
defm DS_ATOMIC_BARRIER_ARRIVE_RTN_B64 : DS_Real_gfx12<0x075>;
1404+
}
1405+
13691406
// New aliases added in GFX12 without renaming the instructions.
13701407
let AssemblerPredicate = isGFX12Plus in {
13711408
def : AMDGPUMnemonicAlias<"ds_subrev_u32", "ds_rsub_u32">;

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
263263
bool HasMinimum3Maximum3PKF16 = false;
264264
bool HasLshlAddU64Inst = false;
265265
bool HasPointSampleAccel = false;
266+
bool HasLdsBarrierArriveAtomic = false;
266267
bool HasSetPrioIncWgInst = false;
267268

268269
bool RequiresCOV6 = false;
@@ -1381,6 +1382,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13811382

13821383
bool hasPointSampleAccel() const { return HasPointSampleAccel; }
13831384

1385+
/// \returns true if the subtarget has the LDS barrier-arrive atomic
/// instructions.
bool hasLdsBarrierArriveAtomic() const {
return HasLdsBarrierArriveAtomic;
}
1386+
13841387
/// \returns The maximum number of instructions that can be enclosed in an
13851388
/// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
13861389
/// instruction.

llvm/lib/Target/AMDGPU/SIDefines.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,7 @@ enum : uint64_t {
113113
SCALAR_STORE = UINT64_C(1) << 39,
114114
FIXED_SIZE = UINT64_C(1) << 40,
115115

116-
// Reserved, must be 0.
117-
Reserved1 = UINT64_C(1) << 41,
116+
ASYNC_CNT = UINT64_C(1) << 41,
118117

119118
VOP3_OPSEL = UINT64_C(1) << 42,
120119
maybeAtomic = UINT64_C(1) << 43,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1404,6 +1404,19 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
14041404

14051405
return true;
14061406
}
1407+
// LDS barrier-arrive atomics: describe the memory access for both the void
// (async) and the i64-returning intrinsic.
case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1408+
case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1409+
// Only the *_rtn_* form produces a value; the async form is a void node.
Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1410+
? ISD::INTRINSIC_W_CHAIN
1411+
: ISD::INTRINSIC_VOID;
1413+
// Operand 0 is the LDS pointer for both intrinsics.
Info.ptrVal = CI.getOperand(0);
1414+
// The access is always 64 bits wide, independent of the call's result type
// (which is void for the async form), so memVT is set to i64 directly.
Info.memVT = MVT::i64;
1415+
Info.size = 8;
1416+
Info.align.reset();
1417+
// Read-modify-write: the operation is both a load and a store.
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1418+
return true;
1419+
}
14071420
case Intrinsic::amdgcn_global_atomic_csub: {
14081421
Info.opc = ISD::INTRINSIC_W_CHAIN;
14091422
Info.memVT = MVT::getVT(CI.getType());
@@ -1564,6 +1577,8 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
15641577
case Intrinsic::amdgcn_ds_read_tr16_b64:
15651578
case Intrinsic::amdgcn_ds_ordered_add:
15661579
case Intrinsic::amdgcn_ds_ordered_swap:
1580+
case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1581+
case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
15671582
case Intrinsic::amdgcn_flat_atomic_fmax_num:
15681583
case Intrinsic::amdgcn_flat_atomic_fmin_num:
15691584
case Intrinsic::amdgcn_global_atomic_csub:

llvm/lib/Target/AMDGPU/SIInstrFormats.td

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,9 @@ class InstSI <dag outs, dag ins, string asm = "",
7777
// instruction size.
7878
field bit FixedSize = 0;
7979

80+
// wait count to manage asynchronous loads/stores.
81+
field bit ASYNC_CNT = 0;
82+
8083
// This bit indicates that this is a VOP3 opcode which supports op_sel
8184
// modifier.
8285
field bit VOP3_OPSEL = 0;
@@ -204,8 +207,7 @@ class InstSI <dag outs, dag ins, string asm = "",
204207
let TSFlags{39} = ScalarStore;
205208
let TSFlags{40} = FixedSize;
206209

207-
// Reserved, must be 0.
208-
let TSFlags{41} = 0;
210+
let TSFlags{41} = ASYNC_CNT;
209211

210212
let TSFlags{42} = VOP3_OPSEL;
211213

0 commit comments

Comments
 (0)