Skip to content

Commit 151ee0f

Browse files
committed
[X86] SimplifyDemandedVectorEltsForTargetNode - ensure X86ISD::VPERMILPV nodes use v2f64/v4f32 types
When reducing v4f64/v8f32 non-lane-crossing X86ISD::VPERMV nodes to 128 bits, we use X86ISD::VPERMILPV nodes, but these are only available for fp types, so the source is now bitcast to the equivalent v2f64/v4f32 type and the result bitcast back. Fixes #145046
1 parent 749e4a5 commit 151ee0f

File tree

2 files changed

+18
-2
lines changed

2 files changed

+18
-2
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44110,8 +44110,13 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
4411044110
// For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
4411144111
if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
4411244112
Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
44113-
else
44114-
Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, HalfVT, V, M);
44113+
else {
44114+
MVT ShufSVT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
44115+
MVT ShufVT = HalfVT.changeVectorElementType(ShufSVT);
44116+
Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, ShufVT,
44117+
TLO.DAG.getBitcast(ShufVT, V), M);
44118+
Ext = TLO.DAG.getBitcast(HalfVT, Ext);
44119+
}
4411544120
SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
4411644121
Subtarget, TLO.DAG, DL, SizeInBits);
4411744122
return TLO.CombineTo(Op, Insert);

llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -775,6 +775,17 @@ define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) {
775775
ret <32 x i8> %4
776776
}
777777

778+
define <4 x i32> @extract_vpermd(<8 x i32> %a0) {
779+
; CHECK-LABEL: extract_vpermd:
780+
; CHECK: # %bb.0:
781+
; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,0]
782+
; CHECK-NEXT: vzeroupper
783+
; CHECK-NEXT: ret{{[l|q]}}
784+
%1 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 0, i32 1, i32 0, i32 7, i32 6>)
785+
%2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
786+
ret <4 x i32> %2
787+
}
788+
778789
; Not beneficial to concatenate both inputs just to create a 256-bit vpaddb
779790
define <32 x i8> @concat_add_unnecessary(<16 x i8> %a0, <16 x i8> noundef %a1, <16 x i8> %a2) nounwind {
780791
; CHECK-LABEL: concat_add_unnecessary:

0 commit comments

Comments
 (0)