Skip to content

Commit 8c1b516

Browse files
changpeng, rampitec, vangthao95
authored
AMDGPU: Implement s_wait_asynccnt and s_wait_tensorcnt for gfx1250 (#148292)
Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Co-authored-by: Vang Thao <Vang.Thao@amd.com>
1 parent f9d3278 commit 8c1b516

File tree

8 files changed

+123
-0
lines changed

8 files changed

+123
-0
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64, "LiLi*3Li", "n
665665
TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
666666
TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")
667667

668+
TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts")
669+
TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts")
670+
668671
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
669672
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
670673
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,24 @@ void test_s_monitor_sleep() {
2424
__builtin_amdgcn_s_monitor_sleep(10);
2525
}
2626

27+
// CHECK-LABEL: @test_s_wait_asynccnt(
28+
// CHECK-NEXT: entry:
29+
// CHECK-NEXT: call void @llvm.amdgcn.s.wait.asynccnt(i16 0)
30+
// CHECK-NEXT: ret void
31+
//
32+
void test_s_wait_asynccnt() {
33+
__builtin_amdgcn_s_wait_asynccnt(0);
34+
}
35+
36+
// CHECK-LABEL: @test_s_wait_tensorcnt(
37+
// CHECK-NEXT: entry:
38+
// CHECK-NEXT: call void @llvm.amdgcn.s.wait.tensorcnt(i16 0)
39+
// CHECK-NEXT: ret void
40+
//
41+
void test_s_wait_tensorcnt() {
42+
__builtin_amdgcn_s_wait_tensorcnt(0);
43+
}
44+
2745
// CHECK-LABEL: @test_cvt_f16_fp8(
2846
// CHECK-NEXT: entry:
2947
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,14 @@ void test_s_monitor_sleep(short a) {
1212
__builtin_amdgcn_s_monitor_sleep(a); // expected-error {{'__builtin_amdgcn_s_monitor_sleep' must be a constant integer}}
1313
}
1414

15+
void test_s_wait_asynccnt(short a) {
16+
__builtin_amdgcn_s_wait_asynccnt(a); // expected-error {{'__builtin_amdgcn_s_wait_asynccnt' must be a constant integer}}
17+
}
18+
19+
void test_s_wait_tensorcnt(short a) {
20+
__builtin_amdgcn_s_wait_tensorcnt(a); // expected-error {{'__builtin_amdgcn_s_wait_tensorcnt' must be a constant integer}}
21+
}
22+
1523
void test__builtin_amdgcn_cvt_f16_fp8(int a, int b) {
1624
__builtin_amdgcn_cvt_f16_fp8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_fp8' must be a constant integer}}
1725
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3510,6 +3510,18 @@ def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
35103510
// gfx1250 intrinsics
35113511
// ===----------------------------------------------------------------------===//
35123512

3513+
// Async waits decrement ASYNCcnt and tensor waits decrement TENSORcnt, which are
3514+
// modeled as InaccessibleMem.
3515+
class AMDGPUWaitAsyncIntrinsic :
3516+
Intrinsic<[], [llvm_i16_ty],
3517+
[IntrInaccessibleMemOnly, ImmArg<ArgIndex<0>>, IntrWillReturn, IntrNoCallback,
3518+
IntrNoFree]>;
3519+
3520+
def int_amdgcn_s_wait_asynccnt :
3521+
ClangBuiltin<"__builtin_amdgcn_s_wait_asynccnt">, AMDGPUWaitAsyncIntrinsic;
3522+
def int_amdgcn_s_wait_tensorcnt :
3523+
ClangBuiltin<"__builtin_amdgcn_s_wait_tensorcnt">, AMDGPUWaitAsyncIntrinsic;
3524+
35133525
def int_amdgcn_ds_atomic_async_barrier_arrive_b64 :
35143526
ClangBuiltin<"__builtin_amdgcn_ds_atomic_async_barrier_arrive_b64">,
35153527
Intrinsic<[], [local_ptr_ty],

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1764,6 +1764,27 @@ let OtherPredicates = [HasExportInsts] in
17641764
[(int_amdgcn_s_wait_kmcnt timm:$simm16)]>;
17651765
} // End SubtargetPredicate = isGFX12Plus, hasSideEffects = 1
17661766

1767+
let SubtargetPredicate = isGFX1250Plus, hasSideEffects = 1 in {
1768+
def S_WAIT_ASYNCCNT :
1769+
SOPP_Pseudo<"s_wait_asynccnt", (ins s16imm:$simm16), "$simm16",
1770+
[(int_amdgcn_s_wait_asynccnt timm:$simm16)]> {
1771+
let mayLoad = 1;
1772+
let mayStore = 1;
1773+
let maybeAtomic = 0;
1774+
let Uses = [ASYNCcnt];
1775+
let Defs = [ASYNCcnt];
1776+
}
1777+
def S_WAIT_TENSORCNT :
1778+
SOPP_Pseudo<"s_wait_tensorcnt", (ins s16imm:$simm16), "$simm16",
1779+
[(int_amdgcn_s_wait_tensorcnt timm:$simm16)]> {
1780+
let mayLoad = 1;
1781+
let mayStore = 1;
1782+
let maybeAtomic = 0;
1783+
let Uses = [TENSORcnt];
1784+
let Defs = [TENSORcnt];
1785+
}
1786+
} // End SubtargetPredicate = isGFX1250Plus, hasSideEffects = 1
1787+
17671788
let SubtargetPredicate = HasWaitXcnt, hasSideEffects = 1 in {
17681789
def S_WAIT_XCNT :
17691790
SOPP_Pseudo<"s_wait_xcnt", (ins s16imm:$simm16), "$simm16">;
@@ -2609,6 +2630,8 @@ defm S_WAIT_STORECNT_DSCNT : SOPP_Real_32_gfx12<0x049>;
26092630
//===----------------------------------------------------------------------===//
26102631
defm S_SETPRIO_INC_WG : SOPP_Real_32_gfx12<0x03e>;
26112632
defm S_WAIT_XCNT : SOPP_Real_32_gfx12<0x045>;
2633+
defm S_WAIT_ASYNCCNT : SOPP_Real_32_gfx12<0x04a>;
2634+
defm S_WAIT_TENSORCNT : SOPP_Real_32_gfx12<0x04b>;
26122635

26132636
//===----------------------------------------------------------------------===//
26142637
// SOPP - GFX11, GFX12.
(file path not captured in this extract — a newly added llc CodeGen test for gfx1250)

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefix=GFX12
3+
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefix=GFX12
4+
5+
define amdgpu_ps void @test_asynccnt() {
6+
; GFX12-LABEL: test_asynccnt:
7+
; GFX12: ; %bb.0:
8+
; GFX12-NEXT: s_wait_asynccnt 0x0
9+
; GFX12-NEXT: s_endpgm
10+
call void @llvm.amdgcn.s.wait.asynccnt(i16 0)
11+
ret void
12+
}
13+
14+
define amdgpu_ps void @test_tensorcnt() {
15+
; GFX12-LABEL: test_tensorcnt:
16+
; GFX12: ; %bb.0:
17+
; GFX12-NEXT: s_wait_tensorcnt 0x0
18+
; GFX12-NEXT: s_endpgm
19+
call void @llvm.amdgcn.s.wait.tensorcnt(i16 0)
20+
ret void
21+
}
22+
23+
declare void @llvm.amdgcn.s.wait.asynccnt(i16)
24+
declare void @llvm.amdgcn.s.wait.tensorcnt(i16)

llvm/test/MC/AMDGPU/gfx1250_asm_sopp.s

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,26 @@
11
// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1250 %s | FileCheck --check-prefix=GFX1250 %s
22
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefixes=GFX12-ERR --implicit-check-not=error: -strict-whitespace %s
33

4+
s_wait_asynccnt 0x1234
5+
// GFX1250: [0x34,0x12,0xca,0xbf]
6+
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
7+
8+
s_wait_asynccnt 0xc1d1
9+
// GFX1250: [0xd1,0xc1,0xca,0xbf]
10+
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
11+
12+
s_wait_tensorcnt 0x0
13+
// GFX1250: [0x00,0x00,0xcb,0xbf]
14+
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
15+
16+
s_wait_tensorcnt 0x1
17+
// GFX1250: [0x01,0x00,0xcb,0xbf]
18+
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
19+
20+
s_wait_tensorcnt 0x3
21+
// GFX1250: [0x03,0x00,0xcb,0xbf]
22+
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
23+
424
s_wait_xcnt 0x0
525
// GFX1250: [0x00,0x00,0xc5,0xbf]
626
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU

llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_sopp.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,20 @@
11
# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s
22

3+
# GFX1250: s_wait_asynccnt 0x1234 ; encoding: [0x34,0x12,0xca,0xbf]
4+
0x34,0x12,0xca,0xbf
5+
6+
# GFX1250: s_wait_asynccnt 0xc1d1 ; encoding: [0xd1,0xc1,0xca,0xbf]
7+
0xd1,0xc1,0xca,0xbf
8+
9+
# GFX1250: s_wait_tensorcnt 0x0 ; encoding: [0x00,0x00,0xcb,0xbf]
10+
0x00,0x00,0xcb,0xbf
11+
12+
# GFX1250: s_wait_tensorcnt 0x1 ; encoding: [0x01,0x00,0xcb,0xbf]
13+
0x01,0x00,0xcb,0xbf
14+
15+
# GFX1250: s_wait_tensorcnt 0x3 ; encoding: [0x03,0x00,0xcb,0xbf]
16+
0x03,0x00,0xcb,0xbf
17+
318
# GFX1250: s_wait_xcnt 0x0 ; encoding: [0x00,0x00,0xc5,0xbf]
419
0x00,0x00,0xc5,0xbf
520

0 commit comments

Comments
 (0)