Skip to content

Commit 11bbd06

Browse files
committed
[AMDGPU] Allocate AVRegClass last
Change-Id: Iace3462f27ea276b22716793ebfa13b5026b0e58
1 parent ff4faaa commit 11bbd06

37 files changed

+2286
-2323
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
109109
let TSFlags{2} = HasVGPR;
110110
let TSFlags{3} = HasAGPR;
111111
let TSFlags{4} = HasSGPR;
112+
113+
// Allocation priority of the register class kind (e.g. AGPR / VGPR); it is scaled by 16 and added to the size-based priority to form AllocationPriority.
114+
field int RegClassPriority = 1;
115+
112116
}
113117

114118
multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
@@ -936,14 +940,15 @@ class VRegClassBase<int numRegs, list<ValueType> regTypes, dag regList> :
936940

937941
// Requires n v_mov_b32 to copy
938942
let CopyCost = numRegs;
939-
let AllocationPriority = !sub(numRegs, 1);
943+
defvar SizePriority = !if(!le(numRegs, 14), !sub(numRegs, 1), !if(!le(numRegs, 16), 14, 15));
944+
let AllocationPriority = !add(SizePriority, !mul(RegClassPriority, 16));
940945
let Weight = numRegs;
941946
}
942947

943948
// Define a register tuple class, along with one requiring an even
944949
// aligned base register.
945950
multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
946-
let HasVGPR = 1 in {
951+
let HasVGPR = 1, RegClassPriority = 1 in {
947952
// Define the regular class.
948953
def "" : VRegClassBase<numRegs, regTypes, regList> {
949954
let BaseClassOrder = !mul(numRegs, 32);
@@ -977,7 +982,7 @@ defm VReg_1024 : VRegClass<32, Reg1024Types.types, (add VGPR_1024)>;
977982
}
978983

979984
multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
980-
let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1 in {
985+
let CopyCost = !add(numRegs, numRegs, 1), HasAGPR = 1, RegClassPriority = 1 in {
981986
// Define the regular class.
982987
def "" : VRegClassBase<numRegs, regTypes, regList> {
983988
let BaseClassOrder = !mul(numRegs, 32);
@@ -1070,7 +1075,7 @@ def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_3
10701075
// aligned base register.
10711076
multiclass AVRegClass<int numRegs, list<ValueType> regTypes,
10721077
dag vregList, dag aregList> {
1073-
let HasVGPR = 1, HasAGPR = 1 in {
1078+
let HasVGPR = 1, HasAGPR = 1, RegClassPriority = 0 in {
10741079
// Define the regular class.
10751080
def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>;
10761081

llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll

Lines changed: 38 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -4295,15 +4295,15 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
42954295
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
42964296
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
42974297
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
4298-
; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
4298+
; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
42994299
; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo
43004300
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4301-
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6
4302-
; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6
4303-
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
4301+
; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
4302+
; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4
4303+
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6
43044304
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4305-
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
4306-
; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v7
4305+
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff
4306+
; GFX12-TRUE16-NEXT: v_not_b32_e32 v10, v6
43074307
; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
43084308
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
43094309
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -4317,29 +4317,30 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
43174317
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
43184318
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
43194319
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
4320-
; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], null offen
4320+
; GFX12-TRUE16-NEXT: buffer_load_b32 v4, v9, s[4:7], null offen
43214321
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
43224322
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
43234323
; GFX12-TRUE16-NEXT: ; %bb.2:
43244324
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1
4325+
; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
43254326
; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0
43264327
; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
43274328
; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1
43284329
; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
43294330
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
4330-
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v8, v6
4331+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4
43314332
; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo
43324333
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0
43334334
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4334-
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8
4335-
; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l
4335+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6
4336+
; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v7.l
43364337
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4337-
; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
4338-
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
4338+
; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
4339+
; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4
43394340
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4340-
; GFX12-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6
4341-
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v7
4342-
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8
4341+
; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4
4342+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5
4343+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6
43434344
; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
43444345
; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
43454346
; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -4354,21 +4355,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
43544355
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
43554356
; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
43564357
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
4357-
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
4358+
; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
43584359
; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
43594360
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
43604361
; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
43614362
; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2
43624363
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
4363-
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
4364+
; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
43644365
; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV
43654366
; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
43664367
; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe
43674368
; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
43684369
; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
43694370
; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
43704371
; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
4371-
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6
4372+
; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4
43724373
; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
43734374
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
43744375
;
@@ -4526,16 +4527,16 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
45264527
; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
45274528
; GFX11-TRUE16: ; %bb.0:
45284529
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4529-
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
4530+
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
45304531
; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0
45314532
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
45324533
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4533-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6
4534-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v6
4535-
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
4534+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4
4535+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, -4, v4
4536+
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 3, v6
45364537
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4537-
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
4538-
; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v7
4538+
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v8, 0xffff
4539+
; GFX11-TRUE16-NEXT: v_not_b32_e32 v10, v6
45394540
; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
45404541
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
45414542
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1
@@ -4547,29 +4548,30 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
45474548
; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0
45484549
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
45494550
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
4550-
; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v9, s[4:7], 0 offen
4551+
; GFX11-TRUE16-NEXT: buffer_load_b32 v4, v9, s[4:7], 0 offen
45514552
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
45524553
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1
45534554
; GFX11-TRUE16-NEXT: ; %bb.2:
45544555
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
4556+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
45554557
; GFX11-TRUE16-NEXT: .p2align 6
45564558
; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start
45574559
; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1
45584560
; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2
45594561
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
4560-
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, v6
4562+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4
45614563
; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo
45624564
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
45634565
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4564-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v8
4565-
; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l
4566+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v8, v6
4567+
; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v7.l
45664568
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4567-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6
4568-
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6
4569+
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
4570+
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v8, v4
45694571
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4570-
; GFX11-TRUE16-NEXT: v_and_or_b32 v7, v8, v10, v6
4571-
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v7
4572-
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8
4572+
; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v10, v4
4573+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5
4574+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6
45734575
; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
45744576
; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2
45754577
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0
@@ -4583,13 +4585,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
45834585
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
45844586
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0
45854587
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
4586-
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
4588+
; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v9, s[4:7], 0 offen glc
45874589
; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0
45884590
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4
45894591
; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
45904592
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2
45914593
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
4592-
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
4594+
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
45934595
; GFX11-TRUE16-NEXT: buffer_gl1_inv
45944596
; GFX11-TRUE16-NEXT: buffer_gl0_inv
45954597
; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1
@@ -4598,7 +4600,7 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
45984600
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_3
45994601
; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end
46004602
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1
4601-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v6
4603+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v8, v4
46024604
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
46034605
;
46044606
; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:

0 commit comments

Comments
 (0)