[AArch64] Use mov imm pseudo instructions in madd combine. #147510
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

The usual path for lowering immediates on AArch64 is to generate a MOVi32imm or MOVi64imm pseudo instruction, which can be moved and rematerialized as required and is expanded into one or more instructions after register allocation. The MachineCombiner code was generating MOVN/ORR/MOVZ directly. This converts it to use the pseudos, allowing the generated immediates to be rematerialized if required. The code is hopefully simpler as a result, and the Sub and Add patterns have been combined to reduce duplication.

Full diff: https://github.com/llvm/llvm-project/pull/147510.diff

3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 8847c62690714..0c60bcb382e2e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -7868,62 +7868,47 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
case AArch64MachineCombinerPattern::MULADDWI_OP1:
- case AArch64MachineCombinerPattern::MULADDXI_OP1: {
+ case AArch64MachineCombinerPattern::MULADDXI_OP1:
+ case AArch64MachineCombinerPattern::MULSUBWI_OP1:
+ case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
// MUL I=A,B,0
- // ADD R,I,Imm
- // ==> MOV V, Imm
+ // ADD/SUB R,I,Imm
+ // ==> MOV V, Imm/-Imm
// ==> MADD R,A,B,V
// --- Create(MADD);
- const TargetRegisterClass *OrrRC;
- unsigned BitSize, OrrOpc, ZeroReg;
- if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1) {
- OrrOpc = AArch64::ORRWri;
- OrrRC = &AArch64::GPR32spRegClass;
+ const TargetRegisterClass *RC;
+ unsigned BitSize, MovImm;
+ if (Pattern == AArch64MachineCombinerPattern::MULADDWI_OP1 ||
+ Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
+ MovImm = AArch64::MOVi32imm;
+ RC = &AArch64::GPR32spRegClass;
BitSize = 32;
- ZeroReg = AArch64::WZR;
Opc = AArch64::MADDWrrr;
RC = &AArch64::GPR32RegClass;
} else {
- OrrOpc = AArch64::ORRXri;
- OrrRC = &AArch64::GPR64spRegClass;
+ MovImm = AArch64::MOVi64imm;
+ RC = &AArch64::GPR64spRegClass;
BitSize = 64;
- ZeroReg = AArch64::XZR;
Opc = AArch64::MADDXrrr;
RC = &AArch64::GPR64RegClass;
}
- Register NewVR = MRI.createVirtualRegister(OrrRC);
+ Register NewVR = MRI.createVirtualRegister(RC);
uint64_t Imm = Root.getOperand(2).getImm();
if (Root.getOperand(3).isImm()) {
unsigned Val = Root.getOperand(3).getImm();
Imm = Imm << Val;
}
- uint64_t UImm = SignExtend64(Imm, BitSize);
- // The immediate can be composed via a single instruction.
+ bool IsSub = Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1 ||
+ Pattern == AArch64MachineCombinerPattern::MULSUBXI_OP1;
+ uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
+ // Check that the immediate can be composed via a single instruction.
SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
if (Insn.size() != 1)
return;
- auto MovI = Insn.begin();
- MachineInstrBuilder MIB1;
- // MOV is an alias for one of three instructions: movz, movn, and orr.
- if (MovI->Opcode == OrrOpc)
- MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
- .addReg(ZeroReg)
- .addImm(MovI->Op2);
- else {
- if (BitSize == 32)
- assert((MovI->Opcode == AArch64::MOVNWi ||
- MovI->Opcode == AArch64::MOVZWi) &&
- "Expected opcode");
- else
- assert((MovI->Opcode == AArch64::MOVNXi ||
- MovI->Opcode == AArch64::MOVZXi) &&
- "Expected opcode");
- MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
- .addImm(MovI->Op1)
- .addImm(MovI->Op2);
- }
+ MachineInstrBuilder MIB1 =
+ BuildMI(MF, MIMetadata(Root), TII->get(MovImm), NewVR).addImm(IsSub ? -Imm : Imm);
InsInstrs.push_back(MIB1);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
@@ -7977,67 +7962,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
break;
- case AArch64MachineCombinerPattern::MULSUBWI_OP1:
- case AArch64MachineCombinerPattern::MULSUBXI_OP1: {
- // MUL I=A,B,0
- // SUB R,I, Imm
- // ==> MOV V, -Imm
- // ==> MADD R,A,B,V // = -Imm + A*B
- // --- Create(MADD);
- const TargetRegisterClass *OrrRC;
- unsigned BitSize, OrrOpc, ZeroReg;
- if (Pattern == AArch64MachineCombinerPattern::MULSUBWI_OP1) {
- OrrOpc = AArch64::ORRWri;
- OrrRC = &AArch64::GPR32spRegClass;
- BitSize = 32;
- ZeroReg = AArch64::WZR;
- Opc = AArch64::MADDWrrr;
- RC = &AArch64::GPR32RegClass;
- } else {
- OrrOpc = AArch64::ORRXri;
- OrrRC = &AArch64::GPR64spRegClass;
- BitSize = 64;
- ZeroReg = AArch64::XZR;
- Opc = AArch64::MADDXrrr;
- RC = &AArch64::GPR64RegClass;
- }
- Register NewVR = MRI.createVirtualRegister(OrrRC);
- uint64_t Imm = Root.getOperand(2).getImm();
- if (Root.getOperand(3).isImm()) {
- unsigned Val = Root.getOperand(3).getImm();
- Imm = Imm << Val;
- }
- uint64_t UImm = SignExtend64(-Imm, BitSize);
- // The immediate can be composed via a single instruction.
- SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
- AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
- if (Insn.size() != 1)
- return;
- auto MovI = Insn.begin();
- MachineInstrBuilder MIB1;
- // MOV is an alias for one of three instructions: movz, movn, and orr.
- if (MovI->Opcode == OrrOpc)
- MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(OrrOpc), NewVR)
- .addReg(ZeroReg)
- .addImm(MovI->Op2);
- else {
- if (BitSize == 32)
- assert((MovI->Opcode == AArch64::MOVNWi ||
- MovI->Opcode == AArch64::MOVZWi) &&
- "Expected opcode");
- else
- assert((MovI->Opcode == AArch64::MOVNXi ||
- MovI->Opcode == AArch64::MOVZXi) &&
- "Expected opcode");
- MIB1 = BuildMI(MF, MIMetadata(Root), TII->get(MovI->Opcode), NewVR)
- .addImm(MovI->Op1)
- .addImm(MovI->Op2);
- }
- InsInstrs.push_back(MIB1);
- InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
- MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR, RC);
- break;
- }
case AArch64MachineCombinerPattern::MULADDv8i8_OP1:
Opc = AArch64::MLAv8i8;
RC = &AArch64::FPR64RegClass;
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-maddimm.mir b/llvm/test/CodeGen/AArch64/machine-combiner-maddimm.mir
index dc75c8c61c53c..c944889ede695 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-maddimm.mir
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-maddimm.mir
@@ -14,8 +14,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
- ; CHECK-NEXT: [[MOVZWi:%[0-9]+]]:gpr32common = nsw MOVZWi 79, 0
- ; CHECK-NEXT: [[MADDWrrr:%[0-9]+]]:gpr32common = nsw MADDWrrr [[COPY1]], [[COPY]], [[MOVZWi]]
+ ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = nsw MOVi32imm 79
+ ; CHECK-NEXT: [[MADDWrrr:%[0-9]+]]:gpr32common = nsw MADDWrrr [[COPY1]], [[COPY]], [[MOVi32imm]]
; CHECK-NEXT: $w0 = COPY [[MADDWrrr]]
; CHECK-NEXT: RET_ReallyLR implicit $w0
%0:gpr32 = COPY $w0
@@ -38,8 +38,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
- ; CHECK-NEXT: [[MOVZXi:%[0-9]+]]:gpr64common = nsw MOVZXi 79, 0
- ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64common = nsw MADDXrrr [[COPY1]], [[COPY]], [[MOVZXi]]
+ ; CHECK-NEXT: [[MOVi64imm:%[0-9]+]]:gpr64 = nsw MOVi64imm 79
+ ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64common = nsw MADDXrrr [[COPY1]], [[COPY]], [[MOVi64imm]]
; CHECK-NEXT: $x0 = COPY [[MADDXrrr]]
; CHECK-NEXT: RET_ReallyLR implicit $x0
%0:gpr64 = COPY $x0
@@ -62,8 +62,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
- ; CHECK-NEXT: [[MOVNWi:%[0-9]+]]:gpr32common = nsw MOVNWi 0, 0
- ; CHECK-NEXT: [[MADDWrrr:%[0-9]+]]:gpr32 = nsw MADDWrrr [[COPY1]], [[COPY]], [[MOVNWi]]
+ ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = nsw MOVi32imm -1
+ ; CHECK-NEXT: [[MADDWrrr:%[0-9]+]]:gpr32 = nsw MADDWrrr [[COPY1]], [[COPY]], [[MOVi32imm]]
; CHECK-NEXT: $w0 = COPY [[MADDWrrr]]
; CHECK-NEXT: RET_ReallyLR implicit $w0
%0:gpr32 = COPY $w0
@@ -86,8 +86,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
- ; CHECK-NEXT: [[MOVNXi:%[0-9]+]]:gpr64common = nsw MOVNXi 0, 0
- ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64 = nsw MADDXrrr [[COPY1]], [[COPY]], [[MOVNXi]]
+ ; CHECK-NEXT: [[MOVi64imm:%[0-9]+]]:gpr64 = nsw MOVi64imm -1
+ ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64 = nsw MADDXrrr [[COPY1]], [[COPY]], [[MOVi64imm]]
; CHECK-NEXT: $x0 = COPY [[MADDXrrr]]
; CHECK-NEXT: RET_ReallyLR implicit $x0
%0:gpr64 = COPY $x0
@@ -110,8 +110,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1
- ; CHECK-NEXT: [[ORRWri:%[0-9]+]]:gpr32common = nsw ORRWri $wzr, 1291
- ; CHECK-NEXT: [[MADDWrrr:%[0-9]+]]:gpr32common = nsw MADDWrrr [[COPY1]], [[COPY]], [[ORRWri]]
+ ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = nsw MOVi32imm 16773120
+ ; CHECK-NEXT: [[MADDWrrr:%[0-9]+]]:gpr32common = nsw MADDWrrr [[COPY1]], [[COPY]], [[MOVi32imm]]
; CHECK-NEXT: $w0 = COPY [[MADDWrrr]]
; CHECK-NEXT: RET_ReallyLR implicit $w0
%0:gpr32 = COPY $w0
@@ -134,8 +134,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x1
- ; CHECK-NEXT: [[ORRXri:%[0-9]+]]:gpr64common = nsw ORRXri $xzr, 7435
- ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64common = nsw MADDXrrr [[COPY1]], [[COPY]], [[ORRXri]]
+ ; CHECK-NEXT: [[MOVi64imm:%[0-9]+]]:gpr64 = nsw MOVi64imm 16773120
+ ; CHECK-NEXT: [[MADDXrrr:%[0-9]+]]:gpr64common = nsw MADDXrrr [[COPY1]], [[COPY]], [[MOVi64imm]]
; CHECK-NEXT: $x0 = COPY [[MADDXrrr]]
; CHECK-NEXT: RET_ReallyLR implicit $x0
%0:gpr64 = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/madd-combiner.ll b/llvm/test/CodeGen/AArch64/madd-combiner.ll
index 6e510712fbd21..cc7fc8fc98629 100644
--- a/llvm/test/CodeGen/AArch64/madd-combiner.ll
+++ b/llvm/test/CodeGen/AArch64/madd-combiner.ll
@@ -39,9 +39,8 @@ define void @mul_add_imm2() {
; CHECK-FAST-LABEL: mul_add_imm2:
; CHECK-FAST: ; %bb.0: ; %entry
; CHECK-FAST-NEXT: mov x8, #-3 ; =0xfffffffffffffffd
-; CHECK-FAST-NEXT: mov x9, #-3 ; =0xfffffffffffffffd
-; CHECK-FAST-NEXT: madd x8, x8, x8, x9
; CHECK-FAST-NEXT: mov x9, #45968 ; =0xb390
+; CHECK-FAST-NEXT: madd x8, x8, x8, x8
; CHECK-FAST-NEXT: movk x9, #48484, lsl #16
; CHECK-FAST-NEXT: movk x9, #323, lsl #32
; CHECK-FAST-NEXT: LBB2_1: ; %for.body8
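The heart of the new combined ADD/SUB path in the diff above is the check that AArch64_IMM::expandMOVImm can materialize the (possibly negated) immediate with a single instruction; only then does the combiner emit a MOVi32imm/MOVi64imm pseudo to feed the MADD. Below is a minimal standalone sketch of that gating, using the existing AArch64ExpandImm.h and MathExtras.h helpers; the free function isSingleInstructionImm is illustrative and not part of the patch.

```cpp
// Sketch of the immediate check used by the madd combine: fold an
// ADD/SUB-with-immediate into a MADD only if the (negated, for SUB)
// immediate fits in a single MOVZ/MOVN/ORR. The MOVi32imm/MOVi64imm
// pseudo that the combine now emits is expanded into those instructions
// after register allocation.
#include "AArch64ExpandImm.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/MathExtras.h"

using namespace llvm;

static bool isSingleInstructionImm(uint64_t Imm, unsigned BitSize, bool IsSub) {
  // For the SUB form the MADD adds -Imm, so negate before sign-extending to
  // the operation's width, exactly as the combine does.
  uint64_t UImm = SignExtend64(IsSub ? -Imm : Imm, BitSize);
  // Ask the immediate expander how many instructions a MOV of this value
  // would take; the combine only fires when the answer is one.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(UImm, BitSize, Insn);
  return Insn.size() == 1;
}
```

For the SUB patterns the immediate is negated first, so a subtraction by 1, for example, becomes MOVi32imm -1 in the updated MIR test above, replacing the previous explicit MOVNWi 0, 0.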
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from ec945d9 to c15bf09.
LGTM. Thanks!