Skip to content

Commit f4ea105

Browse files
committed
[SystemZ] Implement i128 funnel shifts
These can be handled via the VECTOR SHIFT LEFT/RIGHT DOUBLE family of instructions, depending on architecture level. Fixes: llvm#129955
1 parent 4155cc0 commit f4ea105

File tree

8 files changed

+394
-59
lines changed

8 files changed

+394
-59
lines changed

llvm/lib/Target/SystemZ/SystemZISelLowering.cpp

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
254254
setOperationAction(ISD::ROTR, MVT::i128, Expand);
255255
setOperationAction(ISD::ROTL, MVT::i128, Expand);
256256

257+
// We may be able to use VSLDB/VSLD/VSRD for these.
258+
setOperationAction(ISD::FSHL, MVT::i128, Custom);
259+
setOperationAction(ISD::FSHR, MVT::i128, Custom);
260+
257261
// No special instructions for these before arch15.
258262
if (!Subtarget.hasVectorEnhancements3()) {
259263
setOperationAction(ISD::MUL, MVT::i128, Expand);
@@ -6644,6 +6648,66 @@ SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
66446648
return Op;
66456649
}
66466650

6651+
SDValue SystemZTargetLowering::lowerFSHL(SDValue Op, SelectionDAG &DAG) const {
6652+
SDLoc DL(Op);
6653+
6654+
// i128 FSHL with a constant amount that is a multiple of 8 can be
6655+
// implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
6656+
// facility, FSHL with a constant amount less than 8 can be implemented
6657+
// via SHL_DOUBLE_BIT, and FSHL with other constant amounts by a
6658+
// combination of the two.
6659+
if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
6660+
uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
6661+
if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
6662+
SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
6663+
SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
6664+
SmallVector<int, 16> Mask(16);
6665+
for (unsigned Elt = 0; Elt < 16; Elt++)
6666+
Mask[Elt] = (ShiftAmt >> 3) + Elt;
6667+
SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
6668+
if ((ShiftAmt & 7) == 0)
6669+
return DAG.getBitcast(MVT::i128, Shuf1);
6670+
SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op1, Op1, Mask);
6671+
SDValue Val =
6672+
DAG.getNode(SystemZISD::SHL_DOUBLE_BIT, DL, MVT::v16i8, Shuf1, Shuf2,
6673+
DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
6674+
return DAG.getBitcast(MVT::i128, Val);
6675+
}
6676+
}
6677+
6678+
return SDValue();
6679+
}
6680+
6681+
SDValue SystemZTargetLowering::lowerFSHR(SDValue Op, SelectionDAG &DAG) const {
6682+
SDLoc DL(Op);
6683+
6684+
// i128 FSHR with a constant amount that is a multiple of 8 can be
6685+
// implemented via VECTOR_SHUFFLE. If we have the vector-enhancements-2
6686+
// facility, FSHR with a constant amount less than 8 can be implemented
6687+
// via SHR_DOUBLE_BIT, and FSHR with other constant amounts by a
6688+
// combination of the two.
6689+
if (auto *ShiftAmtNode = dyn_cast<ConstantSDNode>(Op.getOperand(2))) {
6690+
uint64_t ShiftAmt = ShiftAmtNode->getZExtValue() & 127;
6691+
if ((ShiftAmt & 7) == 0 || Subtarget.hasVectorEnhancements2()) {
6692+
SDValue Op0 = DAG.getBitcast(MVT::v16i8, Op.getOperand(0));
6693+
SDValue Op1 = DAG.getBitcast(MVT::v16i8, Op.getOperand(1));
6694+
SmallVector<int, 16> Mask(16);
6695+
for (unsigned Elt = 0; Elt < 16; Elt++)
6696+
Mask[Elt] = 16 - (ShiftAmt >> 3) + Elt;
6697+
SDValue Shuf1 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op1, Mask);
6698+
if ((ShiftAmt & 7) == 0)
6699+
return DAG.getBitcast(MVT::i128, Shuf1);
6700+
SDValue Shuf2 = DAG.getVectorShuffle(MVT::v16i8, DL, Op0, Op0, Mask);
6701+
SDValue Val =
6702+
DAG.getNode(SystemZISD::SHR_DOUBLE_BIT, DL, MVT::v16i8, Shuf2, Shuf1,
6703+
DAG.getTargetConstant(ShiftAmt & 7, DL, MVT::i32));
6704+
return DAG.getBitcast(MVT::i128, Val);
6705+
}
6706+
}
6707+
6708+
return SDValue();
6709+
}
6710+
66476711
static SDValue lowerAddrSpaceCast(SDValue Op, SelectionDAG &DAG) {
66486712
SDLoc dl(Op);
66496713
SDValue Src = Op.getOperand(0);
@@ -6853,6 +6917,10 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
68536917
return lowerAddrSpaceCast(Op, DAG);
68546918
case ISD::ROTL:
68556919
return lowerShift(Op, DAG, SystemZISD::VROTL_BY_SCALAR);
6920+
case ISD::FSHL:
6921+
return lowerFSHL(Op, DAG);
6922+
case ISD::FSHR:
6923+
return lowerFSHR(Op, DAG);
68566924
case ISD::IS_FPCLASS:
68576925
return lowerIS_FPCLASS(Op, DAG);
68586926
case ISD::GET_ROUNDING:
@@ -7063,6 +7131,8 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
70637131
OPCODE(VSRL_BY_SCALAR);
70647132
OPCODE(VSRA_BY_SCALAR);
70657133
OPCODE(VROTL_BY_SCALAR);
7134+
OPCODE(SHL_DOUBLE_BIT);
7135+
OPCODE(SHR_DOUBLE_BIT);
70667136
OPCODE(VSUM);
70677137
OPCODE(VACC);
70687138
OPCODE(VSCBI);

llvm/lib/Target/SystemZ/SystemZISelLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,11 @@ enum NodeType : unsigned {
222222
VSRA_BY_SCALAR,
223223
VROTL_BY_SCALAR,
224224

225+
// Concatenate the vectors in the first two operands, shift them left/right
226+
// bitwise by the third operand, and take the first/last half of the result.
227+
SHL_DOUBLE_BIT,
228+
SHR_DOUBLE_BIT,
229+
225230
// For each element of the output type, sum across all sub-elements of
226231
// operand 0 belonging to the corresponding element, and add in the
227232
// rightmost sub-element of the corresponding element of operand 1.
@@ -736,6 +741,8 @@ class SystemZTargetLowering : public TargetLowering {
736741
SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
737742
SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
738743
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
744+
SDValue lowerFSHL(SDValue Op, SelectionDAG &DAG) const;
745+
SDValue lowerFSHR(SDValue Op, SelectionDAG &DAG) const;
739746
SDValue lowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const;
740747
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
741748
SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;

llvm/lib/Target/SystemZ/SystemZInstrVector.td

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -974,8 +974,11 @@ let Predicates = [FeatureVector] in {
974974
(VSLDB VR128:$x, VR128:$y, imm32zx8:$z)>;
975975

976976
// Shift left double by bit.
977-
let Predicates = [FeatureVectorEnhancements2] in
978-
def VSLD : TernaryVRId<"vsld", 0xE786, int_s390_vsld, v128b, v128b, 0>;
977+
let Predicates = [FeatureVectorEnhancements2] in {
978+
def VSLD : TernaryVRId<"vsld", 0xE786, z_shl_double_bit, v128b, v128b, 0>;
979+
def : Pat<(int_s390_vsld VR128:$x, VR128:$y, imm32zx8_timm:$z),
980+
(VSLD VR128:$x, VR128:$y, imm32zx8:$z)>;
981+
}
979982

980983
// Shift right arithmetic.
981984
def VSRA : BinaryVRRc<"vsra", 0xE77E, int_s390_vsra, v128b, v128b>;
@@ -990,8 +993,11 @@ let Predicates = [FeatureVector] in {
990993
def VSRLB : BinaryVRRc<"vsrlb", 0xE77D, int_s390_vsrlb, v128b, v128b>;
991994

992995
// Shift right double by bit.
993-
let Predicates = [FeatureVectorEnhancements2] in
994-
def VSRD : TernaryVRId<"vsrd", 0xE787, int_s390_vsrd, v128b, v128b, 0>;
996+
let Predicates = [FeatureVectorEnhancements2] in {
997+
def VSRD : TernaryVRId<"vsrd", 0xE787, z_shr_double_bit, v128b, v128b, 0>;
998+
def : Pat<(int_s390_vsrd VR128:$x, VR128:$y, imm32zx8_timm:$z),
999+
(VSRD VR128:$x, VR128:$y, imm32zx8:$z)>;
1000+
}
9951001

9961002
// Subtract.
9971003
def VS : BinaryVRRcGeneric<"vs", 0xE7F7>;

llvm/lib/Target/SystemZ/SystemZOperators.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,8 @@ def z_splat : SDNode<"SystemZISD::SPLAT", SDT_ZVecBinaryInt>;
354354
def z_merge_high : SDNode<"SystemZISD::MERGE_HIGH", SDT_ZVecBinary>;
355355
def z_merge_low : SDNode<"SystemZISD::MERGE_LOW", SDT_ZVecBinary>;
356356
def z_shl_double : SDNode<"SystemZISD::SHL_DOUBLE", SDT_ZVecTernaryInt>;
357+
def z_shl_double_bit : SDNode<"SystemZISD::SHL_DOUBLE_BIT", SDT_ZVecTernaryInt>;
358+
def z_shr_double_bit : SDNode<"SystemZISD::SHR_DOUBLE_BIT", SDT_ZVecTernaryInt>;
357359
def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS",
358360
SDT_ZVecTernaryInt>;
359361
def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>;

llvm/test/CodeGen/SystemZ/rot-03.ll

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,7 @@ define i128 @f2(i128 %val) {
3030
; CHECK-LABEL: f2:
3131
; CHECK: # %bb.0:
3232
; CHECK-NEXT: vl %v0, 0(%r3), 3
33-
; CHECK-NEXT: vrepib %v1, 96
34-
; CHECK-NEXT: vrepib %v2, 32
35-
; CHECK-NEXT: vsrlb %v1, %v0, %v1
36-
; CHECK-NEXT: vslb %v0, %v0, %v2
37-
; CHECK-NEXT: vo %v0, %v0, %v1
33+
; CHECK-NEXT: vsldb %v0, %v0, %v0, 4
3834
; CHECK-NEXT: vst %v0, 0(%r2), 3
3935
; CHECK-NEXT: br %r14
4036

@@ -55,10 +51,11 @@ define i128 @f3(i128 %val, i128 %amt) {
5551
; CHECK-NEXT: vl %v0, 0(%r3), 3
5652
; CHECK-NEXT: vrepb %v1, %v1, 15
5753
; CHECK-NEXT: vslb %v2, %v0, %v1
58-
; CHECK-NEXT: lhi %r1, 128
59-
; CHECK-NEXT: sr %r1, %r0
6054
; CHECK-NEXT: vsl %v1, %v2, %v1
61-
; CHECK-NEXT: vlvgp %v2, %r1, %r1
55+
; CHECK-NEXT: vrepib %v2, 1
56+
; CHECK-NEXT: xilf %r0, 4294967295
57+
; CHECK-NEXT: vsrl %v0, %v0, %v2
58+
; CHECK-NEXT: vlvgp %v2, %r0, %r0
6259
; CHECK-NEXT: vrepb %v2, %v2, 15
6360
; CHECK-NEXT: vsrlb %v0, %v0, %v2
6461
; CHECK-NEXT: vsrl %v0, %v0, %v2

llvm/test/CodeGen/SystemZ/shift-16.ll

Lines changed: 37 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -7,23 +7,25 @@
77
define i256 @f1(i256 %a, i256 %sh) {
88
; CHECK-LABEL: f1:
99
; CHECK: # %bb.0:
10-
; CHECK-NEXT: vl %v0, 0(%r3), 3
1110
; CHECK-NEXT: vl %v1, 16(%r3), 3
11+
; CHECK-NEXT: vl %v0, 0(%r3), 3
1212
; CHECK-NEXT: l %r0, 28(%r4)
1313
; CHECK-NEXT: clijhe %r0, 128, .LBB0_2
1414
; CHECK-NEXT: # %bb.1:
15-
; CHECK-NEXT: lhi %r1, 128
16-
; CHECK-NEXT: sr %r1, %r0
17-
; CHECK-NEXT: vlvgp %v2, %r1, %r1
18-
; CHECK-NEXT: vrepb %v2, %v2, 15
19-
; CHECK-NEXT: vsrlb %v3, %v1, %v2
20-
; CHECK-NEXT: vsrl %v2, %v3, %v2
21-
; CHECK-NEXT: vlvgp %v3, %r0, %r0
22-
; CHECK-NEXT: vrepb %v3, %v3, 15
23-
; CHECK-NEXT: vslb %v4, %v0, %v3
15+
; CHECK-NEXT: lr %r1, %r0
16+
; CHECK-NEXT: xilf %r1, 4294967295
17+
; CHECK-NEXT: vlvgp %v2, %r0, %r0
18+
; CHECK-NEXT: vlvgp %v5, %r1, %r1
19+
; CHECK-NEXT: vrepib %v4, 1
20+
; CHECK-NEXT: vrepb %v3, %v2, 15
21+
; CHECK-NEXT: vsrl %v4, %v1, %v4
22+
; CHECK-NEXT: vrepb %v5, %v5, 15
23+
; CHECK-NEXT: vslb %v2, %v0, %v3
24+
; CHECK-NEXT: vsrlb %v4, %v4, %v5
2425
; CHECK-NEXT: vslb %v1, %v1, %v3
25-
; CHECK-NEXT: vsl %v4, %v4, %v3
26-
; CHECK-NEXT: vo %v2, %v4, %v2
26+
; CHECK-NEXT: vsl %v2, %v2, %v3
27+
; CHECK-NEXT: vsrl %v4, %v4, %v5
28+
; CHECK-NEXT: vo %v2, %v2, %v4
2729
; CHECK-NEXT: vsl %v1, %v1, %v3
2830
; CHECK-NEXT: cijlh %r0, 0, .LBB0_3
2931
; CHECK-NEXT: j .LBB0_4
@@ -49,22 +51,24 @@ define i256 @f1(i256 %a, i256 %sh) {
4951
define i256 @f2(i256 %a, i256 %sh) {
5052
; CHECK-LABEL: f2:
5153
; CHECK: # %bb.0:
52-
; CHECK-NEXT: vl %v0, 16(%r3), 3
5354
; CHECK-NEXT: vl %v1, 0(%r3), 3
55+
; CHECK-NEXT: vl %v0, 16(%r3), 3
5456
; CHECK-NEXT: l %r0, 28(%r4)
5557
; CHECK-NEXT: clijhe %r0, 128, .LBB1_2
5658
; CHECK-NEXT: # %bb.1:
57-
; CHECK-NEXT: lhi %r1, 128
58-
; CHECK-NEXT: sr %r1, %r0
59-
; CHECK-NEXT: vlvgp %v2, %r1, %r1
60-
; CHECK-NEXT: vrepb %v2, %v2, 15
61-
; CHECK-NEXT: vslb %v3, %v1, %v2
62-
; CHECK-NEXT: vsl %v2, %v3, %v2
63-
; CHECK-NEXT: vlvgp %v3, %r0, %r0
64-
; CHECK-NEXT: vrepb %v3, %v3, 15
65-
; CHECK-NEXT: vsrlb %v4, %v0, %v3
59+
; CHECK-NEXT: lr %r1, %r0
60+
; CHECK-NEXT: xilf %r1, 4294967295
61+
; CHECK-NEXT: vlvgp %v2, %r0, %r0
62+
; CHECK-NEXT: vlvgp %v5, %r1, %r1
63+
; CHECK-NEXT: vrepib %v4, 1
64+
; CHECK-NEXT: vrepb %v3, %v2, 15
65+
; CHECK-NEXT: vsl %v4, %v1, %v4
66+
; CHECK-NEXT: vrepb %v5, %v5, 15
67+
; CHECK-NEXT: vsrlb %v2, %v0, %v3
68+
; CHECK-NEXT: vslb %v4, %v4, %v5
6669
; CHECK-NEXT: vsrlb %v1, %v1, %v3
67-
; CHECK-NEXT: vsrl %v4, %v4, %v3
70+
; CHECK-NEXT: vsrl %v2, %v2, %v3
71+
; CHECK-NEXT: vsl %v4, %v4, %v5
6872
; CHECK-NEXT: vo %v2, %v4, %v2
6973
; CHECK-NEXT: vsrl %v1, %v1, %v3
7074
; CHECK-NEXT: cijlh %r0, 0, .LBB1_3
@@ -92,23 +96,25 @@ define i256 @f3(i256 %a, i256 %sh) {
9296
; CHECK-LABEL: f3:
9397
; CHECK: # %bb.0:
9498
; CHECK-NEXT: vl %v0, 16(%r3), 3
95-
; CHECK-NEXT: vl %v2, 0(%r3), 3
9699
; CHECK-NEXT: l %r0, 28(%r4)
100+
; CHECK-NEXT: vl %v2, 0(%r3), 3
97101
; CHECK-NEXT: clijhe %r0, 128, .LBB2_2
98102
; CHECK-NEXT: # %bb.1:
99-
; CHECK-NEXT: lhi %r1, 128
100-
; CHECK-NEXT: sr %r1, %r0
101103
; CHECK-NEXT: vlvgp %v1, %r0, %r0
102-
; CHECK-NEXT: vlvgp %v4, %r1, %r1
103104
; CHECK-NEXT: vrepb %v3, %v1, 15
104-
; CHECK-NEXT: vrepb %v4, %v4, 15
105105
; CHECK-NEXT: vsrab %v1, %v2, %v3
106-
; CHECK-NEXT: vslb %v2, %v2, %v4
107-
; CHECK-NEXT: vsl %v2, %v2, %v4
108106
; CHECK-NEXT: vsrlb %v4, %v0, %v3
109107
; CHECK-NEXT: vsra %v1, %v1, %v3
108+
; CHECK-NEXT: lr %r1, %r0
110109
; CHECK-NEXT: vsrl %v3, %v4, %v3
111-
; CHECK-NEXT: vo %v2, %v3, %v2
110+
; CHECK-NEXT: vrepib %v4, 1
111+
; CHECK-NEXT: xilf %r1, 4294967295
112+
; CHECK-NEXT: vsl %v2, %v2, %v4
113+
; CHECK-NEXT: vlvgp %v4, %r1, %r1
114+
; CHECK-NEXT: vrepb %v4, %v4, 15
115+
; CHECK-NEXT: vslb %v2, %v2, %v4
116+
; CHECK-NEXT: vsl %v2, %v2, %v4
117+
; CHECK-NEXT: vo %v2, %v2, %v3
112118
; CHECK-NEXT: cijlh %r0, 0, .LBB2_3
113119
; CHECK-NEXT: j .LBB2_4
114120
; CHECK-NEXT: .LBB2_2:

0 commit comments

Comments
 (0)