Skip to content

Commit 5ceb0bc

Browse files
author
Simon Moll
committed
[VE] Packed 32/64bit broadcast isel and tests
Packed-mode broadcast of f32/i32 requires the subregister to be replicated to the full I64 register beforehand. Add repl_i32 and repl_f32 to facilitate this. Reviewed By: kaz7 Differential Revision: https://reviews.llvm.org/D117878
1 parent 0984aa7 commit 5ceb0bc

File tree

7 files changed

+136
-10
lines changed

7 files changed

+136
-10
lines changed

llvm/lib/Target/VE/VECustomDAG.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,14 @@
1919

2020
namespace llvm {
2121

22+
// Element-count threshold used by isPackedVectorType: vector types with more
// elements than this are classified as packed.
static const int StandardVectorWidth = 256;
23+
24+
/// \returns true if \p SomeVT is a vector type with more than
/// StandardVectorWidth elements; non-vector types always yield false.
bool isPackedVectorType(EVT SomeVT) {
25+
if (!SomeVT.isVector())
26+
return false;
27+
return SomeVT.getVectorNumElements() > StandardVectorWidth;
28+
}
29+
2230
/// \returns the VVP_* SDNode opcode corresponding to \p Opcode.
2331
Optional<unsigned> getVVPOpcode(unsigned Opcode) {
2432
switch (Opcode) {
@@ -51,6 +59,22 @@ SDValue VECustomDAG::getConstant(uint64_t Val, EVT VT, bool IsTarget,
5159

5260
/// Build a VEISD::VEC_BROADCAST node of type \p ResultVT whose operands are
/// \p Scalar and \p AVL.
/// Asserts that \p ResultVT is a vector type and that \p Scalar is not i1
/// (mask broadcasts are still a TODO). For packed result types, an f32 or i32
/// scalar is first wrapped in a REPL_F32 / REPL_I32 node producing an i64;
/// scalars of any other type are passed through unchanged.
SDValue VECustomDAG::getBroadcast(EVT ResultVT, SDValue Scalar,
5361
SDValue AVL) const {
62+
assert(ResultVT.isVector());
63+
auto ScaVT = Scalar.getValueType();
64+
assert(ScaVT != MVT::i1 && "TODO: Mask broadcasts");
65+
66+
if (isPackedVectorType(ResultVT)) {
67+
// v512x packed mode broadcast
68+
// Replicate the scalar reg (f32 or i32) onto the opposing half of the full
69+
// scalar register. If it's an I64 type, assume that this has already
70+
// happened.
71+
if (ScaVT == MVT::f32) {
72+
Scalar = getNode(VEISD::REPL_F32, MVT::i64, Scalar);
73+
} else if (ScaVT == MVT::i32) {
74+
Scalar = getNode(VEISD::REPL_I32, MVT::i64, Scalar);
75+
}
76+
}
77+
5478
// The (possibly replicated) scalar and the AVL are the two broadcast operands.
return getNode(VEISD::VEC_BROADCAST, ResultVT, {Scalar, AVL});
5579
}
5680

llvm/lib/Target/VE/VECustomDAG.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ Optional<unsigned> getVVPOpcode(unsigned Opcode);
2525

2626
bool isVVPBinaryOp(unsigned Opcode);
2727

28+
bool isPackedVectorType(EVT SomeVT);
29+
2830
class VECustomDAG {
2931
SelectionDAG &DAG;
3032
SDLoc DL;

llvm/lib/Target/VE/VEISelLowering.cpp

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
//
1212
//===----------------------------------------------------------------------===//
1313

14-
#include "VECustomDAG.h"
1514
#include "VEISelLowering.h"
1615
#include "MCTargetDesc/VEMCExpr.h"
16+
#include "VECustomDAG.h"
1717
#include "VEInstrBuilder.h"
1818
#include "VEMachineFunctionInfo.h"
1919
#include "VERegisterInfo.h"
@@ -899,6 +899,8 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
899899
TARGET_NODE_CASE(RET_FLAG)
900900
TARGET_NODE_CASE(TS1AM)
901901
TARGET_NODE_CASE(VEC_BROADCAST)
902+
TARGET_NODE_CASE(REPL_I32)
903+
TARGET_NODE_CASE(REPL_F32)
902904

903905
// Register the VVP_* SDNodes.
904906
#define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME)
@@ -1642,26 +1644,25 @@ static SDValue getSplatValue(SDNode *N) {
16421644
SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
16431645
SelectionDAG &DAG) const {
16441646
VECustomDAG CDAG(DAG, Op);
1645-
unsigned NumEls = Op.getValueType().getVectorNumElements();
1646-
MVT ElemVT = Op.getSimpleValueType().getVectorElementType();
1647+
MVT ResultVT = Op.getSimpleValueType();
16471648

16481649
// If there is just one element, expand to INSERT_VECTOR_ELT.
16491650
unsigned UniqueIdx;
16501651
if (getUniqueInsertion(Op.getNode(), UniqueIdx)) {
16511652
SDValue AccuV = CDAG.getUNDEF(Op.getValueType());
16521653
auto ElemV = Op->getOperand(UniqueIdx);
16531654
SDValue IdxV = CDAG.getConstant(UniqueIdx, MVT::i64);
1654-
return CDAG.getNode(ISD::INSERT_VECTOR_ELT, Op.getValueType(),
1655-
{AccuV, ElemV, IdxV});
1655+
return CDAG.getNode(ISD::INSERT_VECTOR_ELT, ResultVT, {AccuV, ElemV, IdxV});
16561656
}
16571657

16581658
// Else emit a broadcast.
16591659
if (SDValue ScalarV = getSplatValue(Op.getNode())) {
1660-
// lower to VEC_BROADCAST
1661-
MVT LegalResVT = MVT::getVectorVT(ElemVT, 256);
1662-
1663-
auto AVL = CDAG.getConstant(NumEls, MVT::i32);
1664-
return CDAG.getBroadcast(LegalResVT, Op.getOperand(0), AVL);
1660+
unsigned NumEls = ResultVT.getVectorNumElements();
1661+
// TODO: Legalize packed-mode AVL.
1662+
// For now, cap the AVL at 256.
1663+
auto CappedLength = std::min<unsigned>(256, NumEls);
1664+
auto AVL = CDAG.getConstant(CappedLength, MVT::i32);
1665+
return CDAG.getBroadcast(ResultVT, Op.getOperand(0), AVL);
16651666
}
16661667

16671668
// Expand

llvm/lib/Target/VE/VEISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ enum NodeType : unsigned {
4040
TS1AM, // A TS1AM instruction used for 1/2 bytes swap.
4141
VEC_BROADCAST, // A vector broadcast instruction.
4242
// 0: scalar value, 1: VL
43+
REPL_I32,
44+
REPL_F32, // Replicate subregister to other half.
4345

4446
// VVP_* nodes.
4547
#define ADD_VVP_OP(VVP_NAME, ...) VVP_NAME,

llvm/lib/Target/VE/VEInstrInfo.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1576,6 +1576,12 @@ def f2l : OutPatFrag<(ops node:$exp),
15761576
def l2f : OutPatFrag<(ops node:$exp),
15771577
(EXTRACT_SUBREG $exp, sub_f32)>;
15781578

1579+
// Zero out subregisters.
1580+
def zero_i32 : OutPatFrag<(ops node:$expr),
1581+
(ANDrm $expr, 32)>;
1582+
def zero_f32 : OutPatFrag<(ops node:$expr),
1583+
(ANDrm $expr, !add(32, 64))>;
1584+
15791585
// Small immediates.
15801586
def : Pat<(i32 simm7:$val), (EXTRACT_SUBREG (ORim (LO7 $val), 0), sub_i32)>;
15811587
def : Pat<(i64 simm7:$val), (ORim (LO7 $val), 0)>;
@@ -2287,6 +2293,16 @@ class IsVLVT<int OpIdx> : SDTCisVT<OpIdx,i32>;
22872293
def vec_broadcast : SDNode<"VEISD::VEC_BROADCAST", SDTypeProfile<1, 2,
22882294
[SDTCisVec<0>, IsVLVT<2>]>>;
22892295

2296+
// replicate upper 32bit to lower 32bit (f32 scalar replication).
2297+
def repl_f32 : SDNode<"VEISD::REPL_F32",
2298+
SDTypeProfile<1, 1,
2299+
[SDTCisInt<0>, SDTCisFP<1>]>>;
2300+
// replicate lower 32bit to upper 32bit (i32 scalar replication).
2301+
def repl_i32 : SDNode<"VEISD::REPL_I32",
2302+
SDTypeProfile<1, 1,
2303+
[SDTCisInt<0>, SDTCisInt<1>]>>;
2304+
2305+
22902306
// Whether this is an all-true mask (assuming undef-bits above VL are all-true).
22912307
def true_mask : PatLeaf<
22922308
(vec_broadcast (i32 nonzero), (i32 srcvalue))>;

llvm/lib/Target/VE/VEInstrPatternsVec.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,17 @@
1515
// Instruction format superclass
1616
//===----------------------------------------------------------------------===//
1717

18+
// Sub-register replication for packed broadcast.
19+
def: Pat<(i64 (repl_f32 f32:$val)),
20+
(ORrr
21+
(SRLri (f2l $val), 32),
22+
(zero_i32 (f2l $val)))>;
23+
def: Pat<(i64 (repl_i32 i32:$val)),
24+
(ORrr
25+
(zero_f32 (i2l $val)),
26+
(SLLri (i2l $val), 32))>;
27+
28+
1829
multiclass vbrd_elem32<ValueType v32, ValueType s32, SDPatternOperator ImmOp,
1930
SDNodeXForm ImmCast, OutPatFrag SuperRegCast> {
2031
// VBRDil
@@ -89,3 +100,8 @@ defm : patterns_elem32<v256f32, f32, simm7fp, LO7FP, l2f, f2l>;
89100

90101
defm : patterns_elem64<v256i64, i64, simm7, LO7>;
91102
defm : patterns_elem64<v256f64, f64, simm7fp, LO7FP>;
103+
104+
defm : vbrd_elem64<v512i32, i64, simm7, LO7>;
105+
defm : vbrd_elem64<v512f32, i64, simm7, LO7>;
106+
defm : vbrd_elem64<v512i32, f64, simm7fp, LO7FP>;
107+
defm : vbrd_elem64<v512f32, f64, simm7fp, LO7FP>;
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
3+
4+
define fastcc <512 x i32> @brd_v512i32(i32 %s) {
5+
; CHECK-LABEL: brd_v512i32:
6+
; CHECK: # %bb.0:
7+
; CHECK-NEXT: and %s0, %s0, (32)0
8+
; CHECK-NEXT: sll %s1, %s0, 32
9+
; CHECK-NEXT: and %s0, %s0, (32)0
10+
; CHECK-NEXT: or %s0, %s0, %s1
11+
; CHECK-NEXT: lea %s1, 256
12+
; CHECK-NEXT: lvl %s1
13+
; CHECK-NEXT: vbrd %v0, %s0
14+
; CHECK-NEXT: b.l.t (, %s10)
15+
%val = insertelement <512 x i32> undef, i32 %s, i32 0
16+
%ret = shufflevector <512 x i32> %val, <512 x i32> undef, <512 x i32> zeroinitializer
17+
ret <512 x i32> %ret
18+
}
19+
20+
define fastcc <512 x i32> @brdi_v512i32() {
21+
; CHECK-LABEL: brdi_v512i32:
22+
; CHECK: # %bb.0:
23+
; CHECK-NEXT: or %s0, 17, (0)1
24+
; CHECK-NEXT: sll %s1, %s0, 32
25+
; CHECK-NEXT: and %s0, %s0, (32)0
26+
; CHECK-NEXT: or %s0, %s0, %s1
27+
; CHECK-NEXT: lea %s1, 256
28+
; CHECK-NEXT: lvl %s1
29+
; CHECK-NEXT: vbrd %v0, %s0
30+
; CHECK-NEXT: b.l.t (, %s10)
31+
%val = insertelement <512 x i32> undef, i32 17, i32 0
32+
%ret = shufflevector <512 x i32> %val, <512 x i32> undef, <512 x i32> zeroinitializer
33+
ret <512 x i32> %ret
34+
}
35+
36+
define fastcc <512 x float> @brd_v512f32(float %s) {
37+
; CHECK-LABEL: brd_v512f32:
38+
; CHECK: # %bb.0:
39+
; CHECK-NEXT: and %s1, %s0, (32)1
40+
; CHECK-NEXT: srl %s0, %s0, 32
41+
; CHECK-NEXT: or %s0, %s0, %s1
42+
; CHECK-NEXT: lea %s1, 256
43+
; CHECK-NEXT: lvl %s1
44+
; CHECK-NEXT: vbrd %v0, %s0
45+
; CHECK-NEXT: b.l.t (, %s10)
46+
%val = insertelement <512 x float> undef, float %s, i32 0
47+
%ret = shufflevector <512 x float> %val, <512 x float> undef, <512 x i32> zeroinitializer
48+
ret <512 x float> %ret
49+
}
50+
51+
define fastcc <512 x float> @brdi_v512f32() {
52+
; CHECK-LABEL: brdi_v512f32:
53+
; CHECK: # %bb.0:
54+
; CHECK-NEXT: lea.sl %s0, 0
55+
; CHECK-NEXT: and %s1, %s0, (32)1
56+
; CHECK-NEXT: srl %s0, %s0, 32
57+
; CHECK-NEXT: or %s0, %s0, %s1
58+
; CHECK-NEXT: lea %s1, 256
59+
; CHECK-NEXT: lvl %s1
60+
; CHECK-NEXT: vbrd %v0, %s0
61+
; CHECK-NEXT: b.l.t (, %s10)
62+
%val = insertelement <512 x float> undef, float 0.e+00, i32 0
63+
%ret = shufflevector <512 x float> %val, <512 x float> undef, <512 x i32> zeroinitializer
64+
ret <512 x float> %ret
65+
}

0 commit comments

Comments
 (0)