Skip to content

Commit e48731b

Browse files
authored
[AMDGPU][True16][CodeGen] v_s_xxx_f16 t16 mode handling in movetoVALU process (#141152)
Add op_sel for the v_s_xxx_f16 instructions when moving them to VALU, and update a few related codegen tests for gfx12 in true16 mode.
1 parent 8345d62 commit e48731b

8 files changed

+1425
-129
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7734,6 +7734,29 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
77347734
Inst.eraseFromParent();
77357735
return;
77367736
}
7737+
case AMDGPU::V_S_EXP_F16_e64:
7738+
case AMDGPU::V_S_LOG_F16_e64:
7739+
case AMDGPU::V_S_RCP_F16_e64:
7740+
case AMDGPU::V_S_RSQ_F16_e64:
7741+
case AMDGPU::V_S_SQRT_F16_e64: {
7742+
const DebugLoc &DL = Inst.getDebugLoc();
7743+
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
7744+
? &AMDGPU::VGPR_16RegClass
7745+
: &AMDGPU::VGPR_32RegClass);
7746+
auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7747+
.addImm(0) // src0_modifiers
7748+
.add(Inst.getOperand(2))
7749+
.addImm(0) // clamp
7750+
.addImm(0); // omod
7751+
if (ST.useRealTrue16Insts())
7752+
NewInstr.addImm(0); // opsel0
7753+
MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7754+
legalizeOperandsVALUt16(*NewInstr, MRI);
7755+
legalizeOperands(*NewInstr, MDT);
7756+
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7757+
Inst.eraseFromParent();
7758+
return;
7759+
}
77377760
}
77387761

77397762
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {

llvm/test/CodeGen/AMDGPU/frem.ll

Lines changed: 893 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 94 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,102 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
12
; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
23
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
34
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
5+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-TRUE16 %s
6+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-FAKE16 %s
47

58
declare half @llvm.amdgcn.rcp.f16(half %a)
69

7-
; GCN-LABEL: {{^}}rcp_f16
8-
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
9-
; VI: v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
10-
; GFX11-TRUE16: v_rcp_f16_e32 v[[A_F16:[0-9]+]].l, v[[A_F16]].l
11-
; GFX11-FAKE16: v_rcp_f16_e32 v[[A_F16:[0-9]+]], v[[A_F16]]
12-
; GCN: buffer_store_short v[[R_F16]]
13-
; GCN: s_endpgm
1410
define amdgpu_kernel void @rcp_f16(
11+
; GCN-LABEL: rcp_f16:
12+
; GCN: ; %bb.0: ; %entry
13+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
14+
; GCN-NEXT: s_mov_b32 s7, 0xf000
15+
; GCN-NEXT: s_mov_b32 s6, -1
16+
; GCN-NEXT: s_mov_b32 s10, s6
17+
; GCN-NEXT: s_mov_b32 s11, s7
18+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
19+
; GCN-NEXT: s_mov_b32 s8, s2
20+
; GCN-NEXT: s_mov_b32 s9, s3
21+
; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0
22+
; GCN-NEXT: s_mov_b32 s4, s0
23+
; GCN-NEXT: s_mov_b32 s5, s1
24+
; GCN-NEXT: s_waitcnt vmcnt(0)
25+
; GCN-NEXT: v_rcp_f16_e32 v0, v0
26+
; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0
27+
; GCN-NEXT: s_endpgm
28+
;
29+
; GFX11-TRUE16-LABEL: rcp_f16:
30+
; GFX11-TRUE16: ; %bb.0: ; %entry
31+
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
32+
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
33+
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
34+
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
35+
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
36+
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
37+
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
38+
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
39+
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
40+
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
41+
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
42+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
43+
; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
44+
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
45+
; GFX11-TRUE16-NEXT: s_endpgm
46+
;
47+
; GFX11-FAKE16-LABEL: rcp_f16:
48+
; GFX11-FAKE16: ; %bb.0: ; %entry
49+
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
50+
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
51+
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
52+
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
53+
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
54+
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
55+
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
56+
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
57+
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
58+
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
59+
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
60+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
61+
; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v0, v0
62+
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
63+
; GFX11-FAKE16-NEXT: s_endpgm
64+
;
65+
; GFX12-TRUE16-LABEL: rcp_f16:
66+
; GFX12-TRUE16: ; %bb.0: ; %entry
67+
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
68+
; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1
69+
; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
70+
; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6
71+
; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7
72+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
73+
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
74+
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
75+
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
76+
; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
77+
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
78+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
79+
; GFX12-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
80+
; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
81+
; GFX12-TRUE16-NEXT: s_endpgm
82+
;
83+
; GFX12-FAKE16-LABEL: rcp_f16:
84+
; GFX12-FAKE16: ; %bb.0: ; %entry
85+
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
86+
; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1
87+
; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
88+
; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6
89+
; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7
90+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
91+
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2
92+
; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3
93+
; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0
94+
; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
95+
; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
96+
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
97+
; GFX12-FAKE16-NEXT: v_rcp_f16_e32 v0, v0
98+
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
99+
; GFX12-FAKE16-NEXT: s_endpgm
15100
ptr addrspace(1) %r,
16101
ptr addrspace(1) %a) {
17102
entry:
@@ -20,3 +105,5 @@ entry:
20105
store half %r.val, ptr addrspace(1) %r
21106
ret void
22107
}
108+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
109+
; VI: {{.*}}
Lines changed: 94 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,102 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
12
; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
23
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
34
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
5+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-TRUE16 %s
6+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-FAKE16 %s
47

58
declare half @llvm.amdgcn.rsq.f16(half %a)
69

7-
; GCN-LABEL: {{^}}rsq_f16
8-
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
9-
; VI: v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
10-
; GFX11-TRUE16: v_rsq_f16_e32 v[[A_F16:[0-9]+]].l, v[[A_F16]].l
11-
; GFX11-FAKE16: v_rsq_f16_e32 v[[A_F16:[0-9]+]], v[[A_F16]]
12-
; GCN: buffer_store_short v[[R_F16]]
13-
; GCN: s_endpgm
1410
define amdgpu_kernel void @rsq_f16(
11+
; GCN-LABEL: rsq_f16:
12+
; GCN: ; %bb.0: ; %entry
13+
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
14+
; GCN-NEXT: s_mov_b32 s7, 0xf000
15+
; GCN-NEXT: s_mov_b32 s6, -1
16+
; GCN-NEXT: s_mov_b32 s10, s6
17+
; GCN-NEXT: s_mov_b32 s11, s7
18+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
19+
; GCN-NEXT: s_mov_b32 s8, s2
20+
; GCN-NEXT: s_mov_b32 s9, s3
21+
; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0
22+
; GCN-NEXT: s_mov_b32 s4, s0
23+
; GCN-NEXT: s_mov_b32 s5, s1
24+
; GCN-NEXT: s_waitcnt vmcnt(0)
25+
; GCN-NEXT: v_rsq_f16_e32 v0, v0
26+
; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0
27+
; GCN-NEXT: s_endpgm
28+
;
29+
; GFX11-TRUE16-LABEL: rsq_f16:
30+
; GFX11-TRUE16: ; %bb.0: ; %entry
31+
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
32+
; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
33+
; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
34+
; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
35+
; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
36+
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
37+
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
38+
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
39+
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
40+
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
41+
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
42+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
43+
; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
44+
; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
45+
; GFX11-TRUE16-NEXT: s_endpgm
46+
;
47+
; GFX11-FAKE16-LABEL: rsq_f16:
48+
; GFX11-FAKE16: ; %bb.0: ; %entry
49+
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
50+
; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
51+
; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
52+
; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
53+
; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
54+
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
55+
; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
56+
; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
57+
; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
58+
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
59+
; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
60+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
61+
; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v0, v0
62+
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
63+
; GFX11-FAKE16-NEXT: s_endpgm
64+
;
65+
; GFX12-TRUE16-LABEL: rsq_f16:
66+
; GFX12-TRUE16: ; %bb.0: ; %entry
67+
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
68+
; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1
69+
; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
70+
; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6
71+
; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7
72+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
73+
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
74+
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
75+
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
76+
; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
77+
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
78+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
79+
; GFX12-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
80+
; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
81+
; GFX12-TRUE16-NEXT: s_endpgm
82+
;
83+
; GFX12-FAKE16-LABEL: rsq_f16:
84+
; GFX12-FAKE16: ; %bb.0: ; %entry
85+
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
86+
; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1
87+
; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
88+
; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6
89+
; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7
90+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
91+
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2
92+
; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3
93+
; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0
94+
; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
95+
; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
96+
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
97+
; GFX12-FAKE16-NEXT: v_rsq_f16_e32 v0, v0
98+
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
99+
; GFX12-FAKE16-NEXT: s_endpgm
15100
ptr addrspace(1) %r,
16101
ptr addrspace(1) %a) {
17102
entry:
@@ -20,3 +105,5 @@ entry:
20105
store half %r.val, ptr addrspace(1) %r
21106
ret void
22107
}
108+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
109+
; VI: {{.*}}

llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll

Lines changed: 84 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
22
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
33
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4-
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
5-
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
4+
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
5+
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
6+
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s
7+
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s
68

79
declare half @llvm.sqrt.f16(half %a)
810
declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -81,6 +83,42 @@ define amdgpu_kernel void @sqrt_f16(
8183
; GFX11-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0
8284
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
8385
; GFX11-FAKE16-NEXT: s_endpgm
86+
;
87+
; GFX12-TRUE16-LABEL: sqrt_f16:
88+
; GFX12-TRUE16: ; %bb.0: ; %entry
89+
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
90+
; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1
91+
; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
92+
; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6
93+
; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7
94+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
95+
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
96+
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
97+
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
98+
; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
99+
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
100+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
101+
; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
102+
; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
103+
; GFX12-TRUE16-NEXT: s_endpgm
104+
;
105+
; GFX12-FAKE16-LABEL: sqrt_f16:
106+
; GFX12-FAKE16: ; %bb.0: ; %entry
107+
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
108+
; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1
109+
; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
110+
; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6
111+
; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7
112+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
113+
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2
114+
; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3
115+
; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0
116+
; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
117+
; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
118+
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
119+
; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0
120+
; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
121+
; GFX12-FAKE16-NEXT: s_endpgm
84122
ptr addrspace(1) %r,
85123
ptr addrspace(1) %a) {
86124
entry:
@@ -189,6 +227,50 @@ define amdgpu_kernel void @sqrt_v2f16(
189227
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
190228
; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
191229
; GFX11-FAKE16-NEXT: s_endpgm
230+
;
231+
; GFX12-TRUE16-LABEL: sqrt_v2f16:
232+
; GFX12-TRUE16: ; %bb.0: ; %entry
233+
; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
234+
; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1
235+
; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
236+
; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6
237+
; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7
238+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
239+
; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
240+
; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
241+
; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
242+
; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
243+
; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
244+
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
245+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
246+
; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
247+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
248+
; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.h, v1.l
249+
; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
250+
; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
251+
; GFX12-TRUE16-NEXT: s_endpgm
252+
;
253+
; GFX12-FAKE16-LABEL: sqrt_v2f16:
254+
; GFX12-FAKE16: ; %bb.0: ; %entry
255+
; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
256+
; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1
257+
; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
258+
; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6
259+
; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7
260+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
261+
; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2
262+
; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3
263+
; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0
264+
; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
265+
; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
266+
; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
267+
; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
268+
; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0
269+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
270+
; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v1, v1
271+
; GFX12-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
272+
; GFX12-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
273+
; GFX12-FAKE16-NEXT: s_endpgm
192274
ptr addrspace(1) %r,
193275
ptr addrspace(1) %a) {
194276
entry:
@@ -197,5 +279,3 @@ entry:
197279
store <2 x half> %r.val, ptr addrspace(1) %r
198280
ret void
199281
}
200-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
201-
; GFX11: {{.*}}

0 commit comments

Comments
 (0)