
Commit 364b0cb

Merge pull request #133 from sx-aurora-dev/merge/ve-fpbinops-vvp
Merge/ve fpbinops vvp
2 parents b93ad3e + bfe9fc9 commit 364b0cb

13 files changed: +558 -60 lines changed

llvm/lib/Analysis/ConstantFolding.cpp

Lines changed: 2 additions & 1 deletion
@@ -708,7 +708,8 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty,
   // is all undef or zero, we know what it loads.
   if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(C))) {
     if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
-      if (GV->getInitializer()->isNullValue())
+      if (GV->getInitializer()->isNullValue() && !Ty->isX86_MMXTy() &&
+          !Ty->isX86_AMXTy())
         return Constant::getNullValue(Ty);
       if (isa<UndefValue>(GV->getInitializer()))
         return UndefValue::get(Ty);
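As a rough illustration (not taken from this commit), the new type checks exclude cases like the following hypothetical IR, where a zero-initialized constant global is loaded back as an x86_mmx value; such a load is now left alone rather than folded to Constant::getNullValue(Ty):

; Hypothetical example; typed-pointer IR, not part of this diff.
@g = internal constant [8 x i8] zeroinitializer

define x86_mmx @load_mmx_from_zero_global() {
  %p = bitcast [8 x i8]* @g to x86_mmx*
  %v = load x86_mmx, x86_mmx* %p   ; no longer constant-folded to a null x86_mmx
  ret x86_mmx %v
}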

llvm/lib/Target/VE/VVPInstrInfo.td

Lines changed: 13 additions & 5 deletions
@@ -21,8 +21,14 @@
 ///// V(E) - VP internal nodes
 // fp node types

-def SDTFPBinOpVVP : SDTypeProfile<1, 4, [ // vvp_fadd, etc.
-  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>, SDTCisSameNumEltsAs<0, 3>, IsVLVT<4>
+// BinaryFPOp(x,y,mask,vl)
+def SDTFPBinOpVVP : SDTypeProfile<1, 4, [ // vvp_fadd, etc.
+    SDTCisSameAs<0, 1>,
+    SDTCisSameAs<0, 2>,
+    SDTCisFP<0>,
+    SDTCisInt<3>,
+    SDTCisSameNumEltsAs<0, 3>,
+    IsVLVT<4>
 ]>;

 def SDTFPTernaryOpVVP : SDTypeProfile<1, 5, [ // vvp_ffma
@@ -207,10 +213,12 @@ def vvp_reduce_umax : SDNode<"VEISD::VVP_REDUCE_UMAX", SDTReduceVVP>;
 // math funcs
 def vvp_fsqrt : SDNode<"VEISD::VVP_FSQRT", SDTFPUnaryOpVVP>;

+// Binary operator commutative pattern.
 class vvp_commutative<SDNode RootOp> :
-  PatFrags<(ops node:$lhs, node:$rhs, node:$mask, node:$vlen),
-           [(RootOp node:$lhs, node:$rhs, node:$mask, node:$vlen),
-            (RootOp node:$rhs, node:$lhs, node:$mask, node:$vlen)]>;
+  PatFrags<
+      (ops node:$lhs, node:$rhs, node:$mask, node:$vlen),
+      [(RootOp node:$lhs, node:$rhs, node:$mask, node:$vlen),
+       (RootOp node:$rhs, node:$lhs, node:$mask, node:$vlen)]>;

 class vvp_fma_commutative<SDNode RootOp> :
   PatFrags<(ops node:$X, node:$Y, node:$Z, node:$mask, node:$vlen),
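For context, the BinaryFPOp(x, y, mask, vl) layout documented above mirrors the VP intrinsic calls these VVP nodes are typically built from. A minimal sketch, reusing a declaration that appears in the tests of this commit (the function name here is made up):

; Result and both FP operands share a type (SDTCisSameAs, SDTCisFP), the mask
; is an integer vector with the same element count (SDTCisInt,
; SDTCisSameNumEltsAs), and the trailing i32 is the vector length (IsVLVT).
declare <256 x double> @llvm.vp.fadd.v256f64(<256 x double>, <256 x double>, <256 x i1>, i32)

define <256 x double> @fadd_operand_shape(<256 x double> %x, <256 x double> %y, <256 x i1> %mask, i32 %vl) {
  %r = call <256 x double> @llvm.vp.fadd.v256f64(<256 x double> %x, <256 x double> %y, <256 x i1> %mask, i32 %vl)
  ret <256 x double> %r
}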

llvm/lib/Target/VE/VVPInstrPatternsVec.td

Lines changed: 56 additions & 33 deletions
@@ -229,6 +229,14 @@ multiclass Binary_rv_vv<
   defm : Binary_vv<OpNode, DataVT, MaskVT, OpBaseName>;
 }

+multiclass Binary_rv_vr_vv<
+    SDPatternOperator OpNode,
+    ValueType ScalarVT, ValueType DataVT, ValueType MaskVT,
+    string OpBaseName> {
+  defm : Binary_rv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
+  defm : Binary_vr_vv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
+}
+
 // Expand both 64bit and 32 bit variant (256 elements)
 multiclass Binary_rv_vv_ShortLong<
     SDPatternOperator OpNode,
@@ -254,23 +262,6 @@ multiclass Binary_vr_vv_ShortLong<
                       ShortOpBaseName>;
 }

-// Binary operators that support broadcasts on LHS and RHS.
-multiclass Binary_all<
-    SDPatternOperator OpNode,
-    ValueType ScalarVT, ValueType DataVT,
-    ValueType MaskVT, string OpBaseName> {
-  defm : Binary_rv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
-  defm : Binary_vr_vv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
-}
-
-multiclass Binary_ShortLong<
-    SDPatternOperator OpNode,
-    ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName,
-    ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> {
-  defm : Binary_all<OpNode, LongScalarVT, LongDataVT, v256i1, LongOpBaseName>;
-  defm : Binary_all<OpNode, ShortScalarVT, ShortDataVT, v256i1, ShortOpBaseName>;
-}
-
 multiclass Ternary<
     SDPatternOperator OpNode,
     ValueType ScalarVT, ValueType DataVT,
@@ -332,6 +323,18 @@ multiclass Ternary_ShortLong<
 // Integer arithmetic (256 elements)
 defm : Unary_ShortLong<vvp_ctpop, i64, v256i64, "VPCNT", i32, v256i32, "PVPCNTLO">;

+multiclass Binary_rv_vr_vv_ShortLong<
+    SDPatternOperator OpNode,
+    ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName,
+    ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> {
+  defm : Binary_rv_vr_vv<OpNode,
+                         LongScalarVT, LongDataVT, v256i1,
+                         LongOpBaseName>;
+  defm : Binary_rv_vr_vv<OpNode,
+                         ShortScalarVT, ShortDataVT, v256i1,
+                         ShortOpBaseName>;
+}
+
 defm : Binary_rv_vv_ShortLong<c_vvp_add,
                               i64, v256i64, "VADDSL",
                               i32, v256i32, "VADDSWSX">;
@@ -341,6 +344,12 @@ defm : Binary_rv_vv_ShortLong<vvp_sub,
 defm : Binary_rv_vv_ShortLong<c_vvp_mul,
                               i64, v256i64, "VMULSL",
                               i32, v256i32, "VMULSWSX">;
+defm : Binary_rv_vr_vv_ShortLong<vvp_sdiv,
+                                 i64, v256i64, "VDIVSL",
+                                 i32, v256i32, "VDIVSWSX">;
+defm : Binary_rv_vr_vv_ShortLong<vvp_udiv,
+                                 i64, v256i64, "VDIVUL",
+                                 i32, v256i32, "VDIVUW">;
 defm : Binary_rv_vv_ShortLong<c_vvp_and,
                               i64, v256i64, "VAND",
                               i32, v256i32, "PVANDLO">;
@@ -351,9 +360,6 @@ defm : Binary_rv_vv_ShortLong<c_vvp_xor,
                               i64, v256i64, "VXOR",
                               i32, v256i32, "PVXORLO">;

-defm : Binary_ShortLong<vvp_sdiv, i64, v256i64, "VDIVSL", i32, v256i32, "VDIVSWSX">;
-defm : Binary_ShortLong<vvp_udiv, i64, v256i64, "VDIVUL", i32, v256i32, "VDIVUW">;
-
 defm : Binary_vr_vv_ShortLong<vvp_shl,
                               i64, v256i64, "VSLL",
                               i32, v256i32, "PVSLLLO">;
@@ -365,19 +371,36 @@ defm : Binary_vr_vv_ShortLong<vvp_srl,
                               i32, v256i32, "PVSRLLO">;

 // Floating-point arithmetic (256 elements)
-defm : Unary_ShortLong<vvp_frcp, f64, v256f64, "VRCPD", f32, v256f32, "VRCPS">;
-defm : Unary_ShortLong<vvp_fsqrt, f64, v256f64, "VFSQRTD", f32, v256f32, "VFSQRTS">;
-defm : Binary_rv_vv_ShortLong<c_vvp_fadd, f64, v256f64, "VFADDD", f32, v256f32, "PVFADDUP">;
-defm : Binary_rv_vv_ShortLong<vvp_fsub, f64, v256f64, "VFSUBD", f32, v256f32, "PVFSUBUP">;
-defm : Binary_rv_vv_ShortLong<c_vvp_fmul, f64, v256f64, "VFMULD", f32, v256f32, "PVFMULUP">;
-defm : Binary_ShortLong<vvp_fdiv, f64, v256f64, "VFDIVD", f32, v256f32, "VFDIVS">;
-
-defm : Binary_rv_vv_ShortLong<c_vvp_fminnum, f64, v256f64, "VFMIND", f32, v256f32, "VFMINS">;
-defm : Binary_rv_vv_ShortLong<c_vvp_fmaxnum, f64, v256f64, "VFMAXD", f32, v256f32, "VFMAXS">;
-
-defm : Ternary_ShortLong<c_vvp_ffma, f64, v256f64, "VFMADD", f32, v256f32, "VFMADS">;
-defm : Ternary_ShortLong<c_vvp_ffms, f64, v256f64, "VFMSBD", f32, v256f32, "VFMSBS">;
-defm : Ternary_ShortLong<c_vvp_ffmsn, f64, v256f64, "VFNMSBD", f32, v256f32, "VFNMSBS">;
+defm : Unary_ShortLong<vvp_frcp,
+                       f64, v256f64, "VRCPD", f32, v256f32, "VRCPS">;
+defm : Unary_ShortLong<vvp_fsqrt,
+                       f64, v256f64, "VFSQRTD", f32, v256f32, "VFSQRTS">;
+defm : Binary_rv_vv_ShortLong<c_vvp_fadd,
+                              f64, v256f64, "VFADDD",
+                              f32, v256f32, "PVFADDUP">;
+defm : Binary_rv_vv_ShortLong<c_vvp_fmul,
+                              f64, v256f64, "VFMULD",
+                              f32, v256f32, "PVFMULUP">;
+defm : Binary_rv_vv_ShortLong<vvp_fsub,
+                              f64, v256f64, "VFSUBD",
+                              f32, v256f32, "PVFSUBUP">;
+defm : Binary_rv_vr_vv_ShortLong<vvp_fdiv,
+                                 f64, v256f64, "VFDIVD",
+                                 f32, v256f32, "VFDIVS">;
+
+defm : Binary_rv_vv_ShortLong<c_vvp_fminnum,
+                              f64, v256f64, "VFMIND",
+                              f32, v256f32, "VFMINS">;
+defm : Binary_rv_vv_ShortLong<c_vvp_fmaxnum,
+                              f64, v256f64, "VFMAXD",
+                              f32, v256f32, "VFMAXS">;
+
+defm : Ternary_ShortLong<c_vvp_ffma,
+                         f64, v256f64, "VFMADD", f32, v256f32, "VFMADS">;
+defm : Ternary_ShortLong<c_vvp_ffms,
+                         f64, v256f64, "VFMSBD", f32, v256f32, "VFMSBS">;
+defm : Ternary_ShortLong<c_vvp_ffmsn,
+                         f64, v256f64, "VFNMSBD", f32, v256f32, "VFNMSBS">;
 // TODO: vvp_ffman

 ///// Selection /////
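The Binary_rv_vr_vv(_ShortLong) multiclasses introduced above combine the scalar-on-LHS ("rv"), scalar-on-RHS ("vr"), and vector-vector ("vv") pattern sets, which matters for the non-commutative operations they now cover (sdiv, udiv, fdiv). A hedged sketch of the "rv" shape for a signed division (function name hypothetical; the fadd and fdiv cases are exercised by the tests below):

declare <256 x i64> @llvm.vp.sdiv.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)

define fastcc <256 x i64> @sdiv_scalar_lhs(i64 %s, <256 x i64> %v, <256 x i1> %m, i32 %n) {
  ; Broadcast the scalar and divide; the "rv" pattern is intended to fold the
  ; splatted left operand into the scalar-register form of the VE divide.
  %ins = insertelement <256 x i64> undef, i64 %s, i32 0
  %splat = shufflevector <256 x i64> %ins, <256 x i64> undef, <256 x i32> zeroinitializer
  %r = call <256 x i64> @llvm.vp.sdiv.v256i64(<256 x i64> %splat, <256 x i64> %v, <256 x i1> %m, i32 %n)
  ret <256 x i64> %r
}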

llvm/lib/Transforms/IPO/GlobalOpt.cpp

Lines changed: 4 additions & 4 deletions
@@ -305,8 +305,9 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
     else if (auto *LI = dyn_cast<LoadInst>(U)) {
       // A load from zeroinitializer is always zeroinitializer, regardless of
       // any applied offset.
-      if (Init->isNullValue()) {
-        LI->replaceAllUsesWith(Constant::getNullValue(LI->getType()));
+      Type *Ty = LI->getType();
+      if (Init->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy()) {
+        LI->replaceAllUsesWith(Constant::getNullValue(Ty));
         EraseFromParent(LI);
         continue;
       }
@@ -316,8 +317,7 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
       PtrOp = PtrOp->stripAndAccumulateConstantOffsets(
           DL, Offset, /* AllowNonInbounds */ true);
       if (PtrOp == GV) {
-        if (auto *Value = ConstantFoldLoadFromConst(Init, LI->getType(),
-                                                    Offset, DL)) {
+        if (auto *Value = ConstantFoldLoadFromConst(Init, Ty, Offset, DL)) {
          LI->replaceAllUsesWith(Value);
          EraseFromParent(LI);
        }
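The "regardless of any applied offset" comment covers loads through constant GEPs as well; a hypothetical example of a case that still folds to zero (while x86_mmx/x86_amx-typed loads are now skipped, matching the ConstantFolding change above):

@arr = internal constant [4 x i32] zeroinitializer

define i32 @load_elt_2() {
  %p = getelementptr inbounds [4 x i32], [4 x i32]* @arr, i64 0, i64 2
  %v = load i32, i32* %p   ; can be replaced by i32 0 during this cleanup
  ret i32 %v
}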

llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp

Lines changed: 2 additions & 5 deletions
@@ -664,10 +664,7 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
     return nullptr;

   // When processing loads, we need to propagate two bits of information to the
-  // sunk load: whether it is volatile, and what its alignment is. We currently
-  // don't sink loads when some have their alignment specified and some don't.
-  // visitLoadInst will propagate an alignment onto the load when TD is around,
-  // and if TD isn't around, we can't handle the mixed case.
+  // sunk load: whether it is volatile, and what its alignment is.
   bool isVolatile = FirstLI->isVolatile();
   Align LoadAlignment = FirstLI->getAlign();
   unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace();
@@ -699,7 +696,7 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
         !isSafeAndProfitableToSinkLoad(LI))
       return nullptr;

-    LoadAlignment = std::min(LoadAlignment, Align(LI->getAlign()));
+    LoadAlignment = std::min(LoadAlignment, LI->getAlign());

     // If the PHI is of volatile loads and the load block has multiple
     // successors, sinking it would remove a load of the volatile value from
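For reference, foldPHIArgLoadIntoPHI targets shapes like the hypothetical one below, where every incoming value of the phi is a load; when the transform applies, the sunk load keeps the volatility and takes the minimum of the incoming alignments, per the std::min above:

define i32 @phi_of_loads(i1 %c, i32* %a, i32* %b) {
entry:
  br i1 %c, label %then, label %else
then:
  %la = load i32, i32* %a, align 8
  br label %merge
else:
  %lb = load i32, i32* %b, align 4
  br label %merge
merge:
  ; InstCombine may turn this into a phi of the two pointers followed by a
  ; single load with align 4, the smaller of the two alignments.
  %r = phi i32 [ %la, %then ], [ %lb, %else ]
  ret i32 %r
}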
Lines changed: 69 additions & 4 deletions
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s

-define fastcc <256 x float> @test_vp_fadd_256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
-; CHECK-LABEL: test_vp_fadd_256f32:
+declare <256 x float> @llvm.vp.fadd.v256f32(<256 x float>, <256 x float>, <256 x i1>, i32)
+
+define fastcc <256 x float> @test_vp_fadd_v256f32_vv(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f32_vv:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    and %s0, %s0, (32)0
 ; CHECK-NEXT:    lvl %s0
@@ -12,5 +14,68 @@ define fastcc <256 x float> @test_vp_fadd_256f32(<256 x float> %i0, <256 x float
   ret <256 x float> %r0
 }

-; integer arith
-declare <256 x float> @llvm.vp.fadd.v256f32(<256 x float>, <256 x float>, <256 x i1>, i32)
+define fastcc <256 x float> @test_vp_fadd_v256f32_rv(float %s0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f32_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    pvfadd.up %v0, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fadd.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fadd_v256f32_vr(<256 x float> %i0, float %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    pvfadd.up %v0, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fadd.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+
+declare <256 x double> @llvm.vp.fadd.v256f64(<256 x double>, <256 x double>, <256 x i1>, i32)
+
+define fastcc <256 x double> @test_vp_fadd_v256f64_vv(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f64_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfadd.d %v0, %v0, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x double> @llvm.vp.fadd.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fadd_v256f64_rv(double %s0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f64_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfadd.d %v0, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s0, i32 0
+  %i0 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fadd.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fadd_v256f64_vr(<256 x double> %i0, double %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f64_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfadd.d %v0, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %yins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fadd.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.vp.fdiv.v256f32(<256 x float>, <256 x float>, <256 x i1>, i32)
+
+define fastcc <256 x float> @test_vp_fdiv_v256f32_vv(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfdiv.s %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x float> @llvm.vp.fdiv.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fdiv_v256f32_rv(float %s0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f32_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfdiv.s %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fdiv.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fdiv_v256f32_vr(<256 x float> %i0, float %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfdiv.s %v0, %v0, %s0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fdiv.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+
+declare <256 x double> @llvm.vp.fdiv.v256f64(<256 x double>, <256 x double>, <256 x i1>, i32)
+
+define fastcc <256 x double> @test_vp_fdiv_v256f64_vv(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f64_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfdiv.d %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x double> @llvm.vp.fdiv.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fdiv_v256f64_rv(double %s0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f64_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfdiv.d %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s0, i32 0
+  %i0 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fdiv.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fdiv_v256f64_vr(<256 x double> %i0, double %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f64_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfdiv.d %v0, %v0, %s0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %yins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fdiv.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
