diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
index 4deb2a9485e4d..dc693f148796e 100644
--- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -43,6 +43,11 @@ using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
 
+static cl::opt<bool>
+    InflateToAVGPR("amdgpu-avgpr-inflation", cl::Hidden, cl::init(false),
+                   cl::desc("Enable register inflation to avgpr register class "
+                            "(which can be assigned to either AGPR or VGPR)."));
+
 namespace {
 
 class GCNPreRAOptimizationsImpl {
@@ -253,6 +258,13 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
     if (!LIS->hasInterval(Reg))
       continue;
     const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
+    if (InflateToAVGPR && ST.hasGFX90AInsts() &&
+        (TRI->isAGPRClass(RC) || TRI->isVGPRClass(RC))) {
+      MRI->recomputeRegClass(Reg);
+      continue;
+    }
+
     if ((RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) &&
         (ST.hasGFX90AInsts() || !TRI->isAGPRClass(RC)))
       continue;
diff --git a/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll b/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll
new file mode 100644
index 0000000000000..33a4f24de8cd3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.ll
@@ -0,0 +1,128 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs %s 2>&1 | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 --amdgpu-avgpr-inflation=1 -verify-machineinstrs %s 2>&1 | FileCheck -check-prefix=INFLATE %s
+
+
+define amdgpu_kernel void @attn_fwd(ptr addrspace(3) %in0, ptr addrspace(3) %in1, ptr addrspace(3) %in2, ptr addrspace(3) %in3, ptr addrspace(3) %in4, ptr addrspace(3) %in5, ptr addrspace(3) %in6, ptr addrspace(3) %in7, ptr addrspace(0) %out) #0 {
+; CHECK-LABEL: attn_fwd:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s8
+; CHECK-NEXT: v_mov_b32_e32 v4, s9
+; CHECK-NEXT: v_mov_b32_e32 v5, s10
+; CHECK-NEXT: ds_read_b128 v[0:3], v0
+; CHECK-NEXT: ds_read_b128 v[8:11], v4
+; CHECK-NEXT: ds_read_b128 v[4:7], v5
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_accvgpr_write_b32 a16, v7 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a17, v6 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a18, v5 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_write_b32 a19, v4 ; Reload Reuse
+; CHECK-NEXT: v_mov_b32_e32 v4, s11
+; CHECK-NEXT: ds_read_b128 v[12:15], v4
+; CHECK-NEXT: v_mov_b32_e32 v4, s12
+; CHECK-NEXT: ds_read_b128 v[16:19], v4
+; CHECK-NEXT: v_mov_b32_e32 v4, s13
+; CHECK-NEXT: v_mov_b32_e32 v5, s14
+; CHECK-NEXT: v_mov_b32_e32 v6, s15
+; CHECK-NEXT: ds_read_b128 v[20:23], v4
+; CHECK-NEXT: ds_read_b128 v[24:27], v5
+; CHECK-NEXT: ds_read_b128 v[4:7], v6
+; CHECK-NEXT: ; sched_barrier mask(0x00000000)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[0:3], 0
+; CHECK-NEXT: v_accvgpr_read_b32 v3, a16 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v2, a17 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v1, a18 ; Reload Reuse
+; CHECK-NEXT: v_accvgpr_read_b32 v0, a19 ; Reload Reuse
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[8:11], v[8:11], a[0:15]
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[0:3], a[0:15]
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; CHECK-NEXT: s_waitcnt lgkmcnt(4)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[12:15], v[12:15], a[0:15]
+; CHECK-NEXT: s_waitcnt lgkmcnt(3)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[16:19], v[16:19], a[0:15]
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[20:23], v[20:23], a[0:15]
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[24:27], v[24:27], a[0:15]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[4:7], v[4:7], a[0:15]
+; CHECK-NEXT: s_nop 7
+; CHECK-NEXT: s_nop 3
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], a[12:15] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], a[8:11] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], a[4:7] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], a[0:3]
+; CHECK-NEXT: s_endpgm
+;
+; INFLATE-LABEL: attn_fwd:
+; INFLATE: ; %bb.0:
+; INFLATE-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
+; INFLATE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; INFLATE-NEXT: s_waitcnt lgkmcnt(0)
+; INFLATE-NEXT: v_mov_b32_e32 v0, s8
+; INFLATE-NEXT: v_mov_b32_e32 v4, s9
+; INFLATE-NEXT: v_mov_b32_e32 v8, s10
+; INFLATE-NEXT: v_mov_b32_e32 v12, s11
+; INFLATE-NEXT: v_mov_b32_e32 v16, s12
+; INFLATE-NEXT: v_mov_b32_e32 v20, s13
+; INFLATE-NEXT: v_mov_b32_e32 v24, s14
+; INFLATE-NEXT: ds_read_b128 a[0:3], v0
+; INFLATE-NEXT: ds_read_b128 v[4:7], v4
+; INFLATE-NEXT: ds_read_b128 v[8:11], v8
+; INFLATE-NEXT: ds_read_b128 v[12:15], v12
+; INFLATE-NEXT: ds_read_b128 v[16:19], v16
+; INFLATE-NEXT: v_mov_b32_e32 v0, s15
+; INFLATE-NEXT: ds_read_b128 v[20:23], v20
+; INFLATE-NEXT: ds_read_b128 v[24:27], v24
+; INFLATE-NEXT: ds_read_b128 a[16:19], v0
+; INFLATE-NEXT: ; sched_barrier mask(0x00000000)
+; INFLATE-NEXT: s_waitcnt lgkmcnt(7)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], a[0:3], a[0:3], 0
+; INFLATE-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(6)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[4:7], v[4:7], a[0:15]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(5)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[8:11], v[8:11], a[0:15]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(4)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[12:15], v[12:15], a[0:15]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(3)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[16:19], v[16:19], a[0:15]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(2)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[20:23], v[20:23], a[0:15]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(1)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[24:27], v[24:27], a[0:15]
+; INFLATE-NEXT: s_waitcnt lgkmcnt(0)
+; INFLATE-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], a[16:19], a[16:19], a[0:15]
+; INFLATE-NEXT: s_nop 7
+; INFLATE-NEXT: s_nop 3
+; INFLATE-NEXT: flat_store_dwordx4 v[0:1], a[12:15] offset:48
+; INFLATE-NEXT: flat_store_dwordx4 v[0:1], a[8:11] offset:32
+; INFLATE-NEXT: flat_store_dwordx4 v[0:1], a[4:7] offset:16
+; INFLATE-NEXT: flat_store_dwordx4 v[0:1], a[0:3]
+; INFLATE-NEXT: s_endpgm
+  %load0 = load <8 x half>, ptr addrspace(3) %in0, align 16
+  %load1 = load <8 x half>, ptr addrspace(3) %in1, align 16
+  %load2 = load <8 x half>, ptr addrspace(3) %in2, align 16
+  %load3 = load <8 x half>, ptr addrspace(3) %in3, align 16
+  %load4 = load <8 x half>, ptr addrspace(3) %in4, align 16
+  %load5 = load <8 x half>, ptr addrspace(3) %in5, align 16
+  %load6 = load <8 x half>, ptr addrspace(3) %in6, align 16
+  %load7 = load <8 x half>, ptr addrspace(3) %in7, align 16
+  tail call void @llvm.amdgcn.sched.barrier(i32 0)
+  %mfma0 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load0, <8 x half> %load0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+  %mfma1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load1, <8 x half> %load1, <16 x float> %mfma0, i32 0, i32 0, i32 0)
+  %mfma2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load2, <8 x half> %load2, <16 x float> %mfma1, i32 0, i32 0, i32 0)
+  %mfma3 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load3, <8 x half> %load3, <16 x float> %mfma2, i32 0, i32 0, i32 0)
+  %mfma4 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load4, <8 x half> %load4, <16 x float> %mfma3, i32 0, i32 0, i32 0)
+  %mfma5 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load5, <8 x half> %load5, <16 x float> %mfma4, i32 0, i32 0, i32 0)
+  %mfma6 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load6, <8 x half> %load6, <16 x float> %mfma5, i32 0, i32 0, i32 0)
+  %mfma7 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %load7, <8 x half> %load7, <16 x float> %mfma6, i32 0, i32 0, i32 0)
+  store <16 x float> %mfma7, ptr addrspace(0) %out
+  ret void
+}
+
+attributes #0 = { "amdgpu-num-vgpr"="24" "amdgpu-agpr-alloc"="20,256"}
diff --git a/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.mir b/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.mir
new file mode 100644
index 0000000000000..deb51444e73ce
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/preinflate-avgpr.mir
@@ -0,0 +1,66 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs --run-pass=amdgpu-pre-ra-optimizations %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -verify-machineinstrs --amdgpu-avgpr-inflation=1 --run-pass=amdgpu-pre-ra-optimizations %s -o - | FileCheck %s -check-prefix=INFLATE
+
+---
+name: agpr_constraint
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: agpr_constraint
+    ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def dead [[DS_READ_B128_gfx9_]], 2147483657 /* reguse tiedto:$0 */, [[DS_READ_B128_gfx9_]](tied-def 3)
+    ; CHECK-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+    ;
+    ; INFLATE-LABEL: name: agpr_constraint
+    ; INFLATE: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; INFLATE-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:areg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
+    ; INFLATE-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def dead [[DS_READ_B128_gfx9_]], 2147483657 /* reguse tiedto:$0 */, [[DS_READ_B128_gfx9_]](tied-def 3)
+    ; INFLATE-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:areg_128_align2 = DS_READ_B128_gfx9 %0, 0, 0, implicit $exec
+    INLINEASM &"", 1 /* sideeffect attdialect */, 6291466 /* regdef:AReg_128_Align2 */, def %1, 2147483657 /* reguse tiedto:$0 */, %1(tied-def 3)
+    S_ENDPGM 0, amdgpu_allvgprs
+...
+
+---
+name: vgpr_constraint
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: vgpr_constraint
+    ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
+    ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def dead [[DS_READ_B128_gfx9_]], 2147483657 /* reguse tiedto:$0 */, [[DS_READ_B128_gfx9_]](tied-def 3)
+    ; CHECK-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+    ;
+    ; INFLATE-LABEL: name: vgpr_constraint
+    ; INFLATE: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; INFLATE-NEXT: [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
+    ; INFLATE-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def dead [[DS_READ_B128_gfx9_]], 2147483657 /* reguse tiedto:$0 */, [[DS_READ_B128_gfx9_]](tied-def 3)
+    ; INFLATE-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vreg_128_align2 = DS_READ_B128_gfx9 %0, 0, 0, implicit $exec
+    INLINEASM &"", 1 /* sideeffect attdialect */, 6422538 /* regdef:VReg_128_Align2 */, def %1, 2147483657 /* reguse tiedto:$0 */, %1(tied-def 3)
+    S_ENDPGM 0, amdgpu_allvgprs
+...
+
+---
+name: no_constraint
+tracksRegLiveness: true
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: no_constraint
+    ; CHECK: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: dead [[DS_READ_B128_gfx9_:%[0-9]+]]:vreg_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
+    ; CHECK-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+    ;
+    ; INFLATE-LABEL: name: no_constraint
+    ; INFLATE: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; INFLATE-NEXT: dead [[DS_READ_B128_gfx9_:%[0-9]+]]:av_128_align2 = DS_READ_B128_gfx9 [[DEF]], 0, 0, implicit $exec
+    ; INFLATE-NEXT: S_ENDPGM 0, amdgpu_allvgprs
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:vreg_128_align2 = DS_READ_B128_gfx9 %0, 0, 0, implicit $exec
+    S_ENDPGM 0, amdgpu_allvgprs
+...