Skip to content

Commit a65b9dd

Browse files
committed
[AMDGPU] Divergence-driven instruction selection for bfm patterns
Differential Revision: https://reviews.llvm.org/D119706
1 parent f1efac7 commit a65b9dd

File tree

2 files changed

+111
-15
lines changed

2 files changed

+111
-15
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -2784,20 +2784,21 @@ def : GCNPat<
27842784
(S_MOV_B32 SReg_32:$src)
27852785
>;
27862786

2787-
multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
2787+
multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> {
27882788
def : GCNPat <
2789-
(vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
2789+
(vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
27902790
(BFM $a, $b)
27912791
>;
27922792

27932793
def : GCNPat <
2794-
(vt (add (vt (shl 1, vt:$a)), -1)),
2795-
(BFM $a, (MOV (i32 0)))
2794+
(vt (ADD (vt (shl 1, vt:$a)), -1)),
2795+
(BFM $a, (i32 0))
27962796
>;
27972797
}
27982798

2799-
defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
2800-
// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
2799+
defm : BFMPatterns <i32, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B32>;
2800+
// FIXME: defm : BFMPatterns <i64, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B64>;
2801+
defm : BFMPatterns <i32, DivergentBinFrag<shl>, DivergentBinFrag<add>, V_BFM_B32_e64>;
28012802

28022803
// Bitfield extract patterns
28032804

llvm/test/CodeGen/AMDGPU/bfm.ll

Lines changed: 104 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,115 @@
1-
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,FUNC %s
2-
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=SI,FUNC %s
3-
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
3+
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
44

5-
; FUNC-LABEL: {{^}}bfm_pattern:
6-
; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
7-
define amdgpu_kernel void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
5+
; Uniform two-operand case: computes ((1 << %x) - 1) << %y and stores the
; result to %out. In the expected output below the kernel arguments are loaded
; with s_load_* and the whole expression is a single s_bfm_b32 on both run
; lines.
; NOTE(review): the assertion lines are autogenerated (see the NOTE at the top
; of this test) - regenerate them instead of editing them by hand.
define amdgpu_kernel void @s_bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
6+
; SI-LABEL: s_bfm_pattern:
7+
; SI: ; %bb.0:
8+
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
9+
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
10+
; SI-NEXT: s_mov_b32 s3, 0xf000
11+
; SI-NEXT: s_waitcnt lgkmcnt(0)
12+
; SI-NEXT: s_bfm_b32 s4, s4, s5
13+
; SI-NEXT: s_mov_b32 s2, -1
14+
; SI-NEXT: v_mov_b32_e32 v0, s4
15+
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
16+
; SI-NEXT: s_endpgm
17+
;
18+
; VI-LABEL: s_bfm_pattern:
19+
; VI: ; %bb.0:
20+
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
21+
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
22+
; VI-NEXT: s_waitcnt lgkmcnt(0)
23+
; VI-NEXT: s_bfm_b32 s2, s2, s3
24+
; VI-NEXT: v_mov_b32_e32 v0, s0
25+
; VI-NEXT: v_mov_b32_e32 v1, s1
26+
; VI-NEXT: v_mov_b32_e32 v2, s2
27+
; VI-NEXT: flat_store_dword v[0:1], v2
28+
; VI-NEXT: s_endpgm
829
%a = shl i32 1, %x
930
%b = sub i32 %a, 1
1031
%c = shl i32 %b, %y
1132
store i32 %c, i32 addrspace(1)* %out
1233
ret void
1334
}
1435

15-
; FUNC-LABEL: {{^}}bfm_pattern_simple:
16-
; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0
17-
define amdgpu_kernel void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
36+
; Uniform single-operand case: computes (1 << %x) - 1 with no trailing shift
; and stores the result to %out. The expected output below is one s_bfm_b32
; whose second source operand is the literal 0, on both run lines.
; NOTE(review): the assertion lines are autogenerated (see the NOTE at the top
; of this test) - regenerate them instead of editing them by hand.
define amdgpu_kernel void @s_bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
37+
; SI-LABEL: s_bfm_pattern_simple:
38+
; SI: ; %bb.0:
39+
; SI-NEXT: s_load_dword s2, s[0:1], 0xb
40+
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
41+
; SI-NEXT: s_mov_b32 s3, 0xf000
42+
; SI-NEXT: s_waitcnt lgkmcnt(0)
43+
; SI-NEXT: s_bfm_b32 s4, s2, 0
44+
; SI-NEXT: s_mov_b32 s2, -1
45+
; SI-NEXT: v_mov_b32_e32 v0, s4
46+
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
47+
; SI-NEXT: s_endpgm
48+
;
49+
; VI-LABEL: s_bfm_pattern_simple:
50+
; VI: ; %bb.0:
51+
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
52+
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
53+
; VI-NEXT: s_waitcnt lgkmcnt(0)
54+
; VI-NEXT: s_bfm_b32 s2, s2, 0
55+
; VI-NEXT: v_mov_b32_e32 v0, s0
56+
; VI-NEXT: v_mov_b32_e32 v1, s1
57+
; VI-NEXT: v_mov_b32_e32 v2, s2
58+
; VI-NEXT: flat_store_dword v[0:1], v2
59+
; VI-NEXT: s_endpgm
60+
%a = shl i32 1, %x
61+
%b = sub i32 %a, 1
62+
store i32 %b, i32 addrspace(1)* %out
63+
ret void
64+
}
65+
66+
; Divergent two-operand case: the same ((1 << %x) - 1) << %y computation as
; the kernel variant, but in a plain (non-kernel) function. The expected output
; below takes both operands from v2/v3 and emits a single v_bfm_b32 (printed
; with the _e32 suffix on the first run line) instead of the scalar form.
; NOTE(review): the assertion lines are autogenerated (see the NOTE at the top
; of this test) - regenerate them instead of editing them by hand.
define void @v_bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
67+
; SI-LABEL: v_bfm_pattern:
68+
; SI: ; %bb.0:
69+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
70+
; SI-NEXT: s_mov_b32 s7, 0xf000
71+
; SI-NEXT: s_mov_b32 s6, 0
72+
; SI-NEXT: v_bfm_b32_e32 v2, v2, v3
73+
; SI-NEXT: s_mov_b32 s4, s6
74+
; SI-NEXT: s_mov_b32 s5, s6
75+
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
76+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
77+
; SI-NEXT: s_setpc_b64 s[30:31]
78+
;
79+
; VI-LABEL: v_bfm_pattern:
80+
; VI: ; %bb.0:
81+
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82+
; VI-NEXT: v_bfm_b32 v2, v2, v3
83+
; VI-NEXT: flat_store_dword v[0:1], v2
84+
; VI-NEXT: s_waitcnt vmcnt(0)
85+
; VI-NEXT: s_setpc_b64 s[30:31]
86+
%a = shl i32 1, %x
87+
%b = sub i32 %a, 1
88+
%c = shl i32 %b, %y
89+
store i32 %c, i32 addrspace(1)* %out
90+
ret void
91+
}
92+
93+
define void @v_bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
94+
; SI-LABEL: v_bfm_pattern_simple:
95+
; SI: ; %bb.0:
96+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
97+
; SI-NEXT: s_mov_b32 s7, 0xf000
98+
; SI-NEXT: s_mov_b32 s6, 0
99+
; SI-NEXT: v_bfm_b32_e64 v2, v2, 0
100+
; SI-NEXT: s_mov_b32 s4, s6
101+
; SI-NEXT: s_mov_b32 s5, s6
102+
; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
103+
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
104+
; SI-NEXT: s_setpc_b64 s[30:31]
105+
;
106+
; VI-LABEL: v_bfm_pattern_simple:
107+
; VI: ; %bb.0:
108+
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
109+
; VI-NEXT: v_bfm_b32 v2, v2, 0
110+
; VI-NEXT: flat_store_dword v[0:1], v2
111+
; VI-NEXT: s_waitcnt vmcnt(0)
112+
; VI-NEXT: s_setpc_b64 s[30:31]
18113
%a = shl i32 1, %x
19114
%b = sub i32 %a, 1
20115
store i32 %b, i32 addrspace(1)* %out

0 commit comments

Comments
 (0)