[AMDGPU] Adjust wave priority based on VMEM instructions to avoid duty-cycling.

kosarev · kosarev · commit 6ddf2a824da9 · 2022-04-27T14:37:18.000+01:00
As older waves execute long sequences of VALU instructions, this may prevent younger waves from address calculation and then issuing their VMEM loads, which in turn leads the VALU unit to idle. This patch tries to prevent this by temporarily raising the wave's priority. Reviewed By: foad Differential Revision: https://reviews.llvm.org/D124246
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -331,6 +331,9 @@ extern char &GCNNSAReassignID;
 void initializeGCNPreRAOptimizationsPass(PassRegistry &);
 extern char &GCNPreRAOptimizationsID;
 
+FunctionPass *createAMDGPUSetWavePriorityPass();
+void initializeAMDGPUSetWavePriorityPass(PassRegistry &);
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp
@@ -0,0 +1,166 @@
+//===- AMDGPUSetWavePriority.cpp - Set wave priority ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Pass to temporarily raise the wave priority beginning the start of
+/// the shader function until its last VMEM instructions to allow younger
+/// waves to issue their VMEM instructions as well.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Allocator.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-set-wave-priority"
+
+namespace {
+
+struct MBBInfo {
+  MBBInfo() = default;
+  bool MayReachVMEMLoad = false;
+};
+
+using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>;
+
+class AMDGPUSetWavePriority : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUSetWavePriority() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "Set wave priority"; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  MachineInstr *BuildSetprioMI(MachineFunction &MF, unsigned priority) const;
+
+  const SIInstrInfo *TII;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(AMDGPUSetWavePriority, DEBUG_TYPE, "Set wave priority", false,
+                false)
+
+char AMDGPUSetWavePriority::ID = 0;
+
+FunctionPass *llvm::createAMDGPUSetWavePriorityPass() {
+  return new AMDGPUSetWavePriority();
+}
+
+MachineInstr *AMDGPUSetWavePriority::BuildSetprioMI(MachineFunction &MF,
+                                                    unsigned priority) const {
+  return BuildMI(MF, DebugLoc(), TII->get(AMDGPU::S_SETPRIO)).addImm(priority);
+}
+
+// Checks that for every predecessor Pred that can reach a VMEM load,
+// none of Pred's successors can reach a VMEM load.
+static bool CanLowerPriorityDirectlyInPredecessors(const MachineBasicBlock &MBB,
+                                                   MBBInfoSet &MBBInfos) {
+  for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+    if (!MBBInfos[Pred].MayReachVMEMLoad)
+      continue;
+    for (const MachineBasicBlock *Succ : Pred->successors()) {
+      if (MBBInfos[Succ].MayReachVMEMLoad)
+        return false;
+    }
+  }
+  return true;
+}
+
+static bool isVMEMLoad(const MachineInstr &MI) {
+  return SIInstrInfo::isVMEM(MI) && MI.mayLoad();
+}
+
+bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) {
+  const unsigned HighPriority = 3;
+  const unsigned LowPriority = 0;
+
+  Function &F = MF.getFunction();
+  if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+    return false;
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  TII = ST.getInstrInfo();
+
+  MBBInfoSet MBBInfos;
+  SmallVector<const MachineBasicBlock *, 16> Worklist;
+  for (MachineBasicBlock &MBB : MF) {
+    if (any_of(MBB, isVMEMLoad))
+      Worklist.push_back(&MBB);
+  }
+
+  // Mark blocks from which control may reach VMEM loads.
+  while (!Worklist.empty()) {
+    const MachineBasicBlock *MBB = Worklist.pop_back_val();
+    MBBInfo &Info = MBBInfos[MBB];
+    if (!Info.MayReachVMEMLoad) {
+      Info.MayReachVMEMLoad = true;
+      Worklist.append(MBB->pred_begin(), MBB->pred_end());
+    }
+  }
+
+  MachineBasicBlock &Entry = MF.front();
+  if (!MBBInfos[&Entry].MayReachVMEMLoad)
+    return false;
+
+  // Raise the priority at the beginning of the shader.
+  MachineBasicBlock::iterator I = Entry.begin(), E = Entry.end();
+  while (I != E && !SIInstrInfo::isVALU(*I) && !I->isTerminator())
+    ++I;
+  Entry.insert(I, BuildSetprioMI(MF, HighPriority));
+
+  // Lower the priority on edges where control leaves blocks from which
+  // VMEM loads are reachable.
+  SmallSet<MachineBasicBlock *, 16> PriorityLoweringBlocks;
+  for (MachineBasicBlock &MBB : MF) {
+    if (MBBInfos[&MBB].MayReachVMEMLoad) {
+      if (MBB.succ_empty())
+        PriorityLoweringBlocks.insert(&MBB);
+      continue;
+    }
+
+    if (CanLowerPriorityDirectlyInPredecessors(MBB, MBBInfos)) {
+      for (MachineBasicBlock *Pred : MBB.predecessors()) {
+        if (MBBInfos[Pred].MayReachVMEMLoad)
+          PriorityLoweringBlocks.insert(Pred);
+      }
+      continue;
+    }
+
+    // Where lowering the priority in predecessors is not possible, the
+    // block receiving control either was not part of a loop in the first
+    // place or the loop simplification/canonicalization pass should have
+    // already tried to split the edge and insert a preheader, and if for
+    // whatever reason it failed to do so, then this leaves us with the
+    // only option of lowering the priority within the loop.
+    PriorityLoweringBlocks.insert(&MBB);
+  }
+
+  for (MachineBasicBlock *MBB : PriorityLoweringBlocks) {
+    MachineBasicBlock::iterator I = MBB->end(), B = MBB->begin();
+    while (I != B) {
+      if (isVMEMLoad(*--I)) {
+        ++I;
+        break;
+      }
+    }
+    MBB->insert(I, BuildSetprioMI(MF, LowPriority));
+  }
+
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -277,6 +277,10 @@ EnableDCEInRA("amdgpu-dce-in-ra",
     cl::init(true), cl::Hidden,
     cl::desc("Enable machine DCE inside regalloc"));
 
+static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
+                                           cl::desc("Adjust wave priority"),
+                                           cl::init(false), cl::Hidden);
+
 static cl::opt<bool> EnableScalarIRPasses(
   "amdgpu-scalar-ir-passes",
   cl::desc("Enable scalar IR passes"),
@@ -1360,6 +1364,8 @@ void GCNPassConfig::addPreEmitPass() {
     addPass(&SIInsertHardClausesID);
 
   addPass(&SILateBranchLoweringPassID);
+  if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
+    addPass(createAMDGPUSetWavePriorityPass());
   if (getOptLevel() > CodeGenOpt::None)
     addPass(&SIPreEmitPeepholeID);
   // The hazard recognizer that runs as part of the post-ra scheduler does not
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -89,6 +89,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUReplaceLDSUseWithPointer.cpp
   AMDGPUResourceUsageAnalysis.cpp
   AMDGPURewriteOutArguments.cpp
+  AMDGPUSetWavePriority.cpp
   AMDGPUSubtarget.cpp
   AMDGPUTargetMachine.cpp
   AMDGPUTargetObjectFile.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll
@@ -0,0 +1,153 @@
+; RUN: llc -mtriple=amdgcn -amdgpu-set-wave-priority=true -o - %s | \
+; RUN:   FileCheck %s
+
+; CHECK-LABEL: no_setprio:
+; CHECK-NOT:       s_setprio
+; CHECK:           ; return to shader part epilog
+define amdgpu_ps <2 x float> @no_setprio() {
+  ret <2 x float> <float 0.0, float 0.0>
+}
+
+; CHECK-LABEL: vmem_in_exit_block:
+; CHECK:           s_setprio 3
+; CHECK:           buffer_load_dwordx2
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           ; return to shader part epilog
+define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p) {
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+; CHECK-LABEL: branch:
+; CHECK:           s_setprio 3
+; CHECK:           s_cbranch_scc0 [[A:.*]]
+; CHECK:       {{.*}}:  ; %b
+; CHECK:           buffer_load_dwordx2
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_branch [[EXIT:.*]]
+; CHECK:       [[A]]:  ; %a
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_branch [[EXIT]]
+; CHECK-NEXT:  [[EXIT]]:
+define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i) {
+  %cond = icmp eq i32 %i, 0
+  br i1 %cond, label %a, label %b
+
+a:
+  ret <2 x float> <float 0.0, float 0.0>
+
+b:
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  ret <2 x float> %v
+}
+
+; CHECK-LABEL: setprio_follows_setprio:
+; CHECK:           s_setprio 3
+; CHECK:           buffer_load_dwordx2
+; CHECK:           s_cbranch_scc1 [[C:.*]]
+; CHECK:       {{.*}}:  ; %a
+; CHECK:           buffer_load_dwordx2
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_cbranch_scc1 [[C]]
+; CHECK:       {{.*}}:  ; %b
+; CHECK-NOT:       s_setprio
+; CHECK:           s_branch [[EXIT:.*]]
+; CHECK:       [[C]]:  ; %c
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_branch [[EXIT]]
+; CHECK:       [[EXIT]]:
+define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) {
+entry:
+  %v1 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  %cond1 = icmp ne i32 %i, 0
+  br i1 %cond1, label %a, label %c
+
+a:
+  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
+  %cond2 = icmp ne i32 %i, 1
+  br i1 %cond2, label %b, label %c
+
+b:
+  ret <2 x float> %v2
+
+c:
+  %v3 = phi <2 x float> [%v1, %entry], [%v2, %a]
+  %v4 = fadd <2 x float> %v1, %v3
+  ret <2 x float> %v4
+}
+
+; CHECK-LABEL: loop:
+; CHECK:       {{.*}}:  ; %entry
+; CHECK:           s_setprio 3
+; CHECK-NOT:       s_setprio
+; CHECK:       [[LOOP:.*]]:  ; %loop
+; CHECK-NOT:       s_setprio
+; CHECK:           buffer_load_dwordx2
+; CHECK-NOT:       s_setprio
+; CHECK:           s_cbranch_scc1 [[LOOP]]
+; CHECK-NEXT:  {{.*}}:  ; %exit
+; CHECK-NEXT:      s_setprio 0
+define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [0, %entry], [%i2, %loop]
+  %sum = phi <2 x float> [<float 0.0, float 0.0>, %entry], [%sum2, %loop]
+
+  %i2 = add i32 %i, 1
+
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 %i, i32 0, i32 0, i32 0)
+  %sum2 = fadd <2 x float> %sum, %v
+
+  %cond = icmp ult i32 %i2, 5
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret <2 x float> %sum2
+}
+
+; CHECK-LABEL: edge_split:
+; CHECK:           s_setprio 3
+; CHECK:           buffer_load_dwordx2
+; CHECK-NOT:       s_setprio
+; CHECK:           s_cbranch_scc1 [[ANOTHER_LOAD:.*]]
+; CHECK:       {{.*}}:  ; %loop.preheader
+; CHECK-NEXT:      s_setprio 0
+; CHECK:       [[LOOP:.*]]:  ; %loop
+; CHECK-NOT:       s_setprio
+; CHECK:           s_cbranch_scc1 [[LOOP]]
+; CHECK        {{.*}}:  ; %exit
+; CHECK-NOT:       s_setprio
+; CHECK:           s_branch [[RET:.*]]
+; CHECK:       [[ANOTHER_LOAD]]:  ; %another_load
+; CHECK:           buffer_load_dwordx2
+; CHECK-NEXT:      s_setprio 0
+; CHECK:           s_branch [[RET]]
+; CHECK:       [[RET]]:
+define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) {
+entry:
+  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
+  %cond = icmp ne i32 %x, 0
+  br i1 %cond, label %loop, label %another_load
+
+loop:
+  %i = phi i32 [0, %entry], [%i2, %loop]
+  %mul = phi <2 x float> [%v, %entry], [%mul2, %loop]
+
+  %i2 = add i32 %i, 1
+  %mul2 = fmul <2 x float> %mul, %v
+
+  %cond2 = icmp ult i32 %i2, 5
+  br i1 %cond2, label %loop, label %exit
+
+exit:
+  ret <2 x float> %mul2
+
+another_load:
+  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
+  %sum = fadd <2 x float> %v, %v2
+  ret <2 x float> %sum
+}
+
+declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind