From 1d24f8ab291662e1506a16fcd14e4a411579fd20 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 3 Jul 2025 12:07:42 +0100
Subject: [PATCH 1/3] [InstCombine] Move extends across identity shuffles.

Add a new fold to InstCombine that moves SExt/ZExt across identity
shuffles, applying the cast after the shuffle. This sinks extends and
can enable further folding of both the shuffles (and related
instructions) and the extends. If a backend prefers performing the
casts first, the extends can be hoisted again, for example in
VectorCombine.

A larger example is included in the load_i32_zext_to_v4i32 test. The
wider extend is easier to cost accurately, and targets (like AArch64)
can lower a single wider extend more efficiently than multiple
separate extends.

This is a generalization of a VectorCombine version
(https://github.com/llvm/llvm-project/pull/141109/) as suggested by
@preames.
---
 .../InstCombine/InstCombineVectorOps.cpp      | 24 +++++++++++++++----
 .../Transforms/InstCombine/X86/blend_x86.ll   |  2 +-
 .../InstCombine/fold-shuffle-ext.ll           | 16 +++++--------
 .../PhaseOrdering/X86/blendv-select.ll        |  5 ++--
 4 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 1a8d4215b5b71..ae5bc2cdbfdd7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -2571,17 +2571,16 @@ static Instruction *foldShuffleOfUnaryOps(ShuffleVectorInst &Shuf,
 /// Canonicalize casts after shuffle.
 static Instruction *foldCastShuffle(ShuffleVectorInst &Shuf,
                                     InstCombiner::BuilderTy &Builder) {
-  // Do we have 2 matching cast operands?
   auto *Cast0 = dyn_cast<CastInst>(Shuf.getOperand(0));
-  auto *Cast1 = dyn_cast<CastInst>(Shuf.getOperand(1));
-  if (!Cast0 || !Cast1 || Cast0->getOpcode() != Cast1->getOpcode() ||
-      Cast0->getSrcTy() != Cast1->getSrcTy())
+  if (!Cast0)
     return nullptr;
 
   // TODO: Allow other opcodes? That would require easing the type restrictions
   // below here.
   CastInst::CastOps CastOpcode = Cast0->getOpcode();
   switch (CastOpcode) {
+  case Instruction::SExt:
+  case Instruction::ZExt:
   case Instruction::FPToSI:
   case Instruction::FPToUI:
   case Instruction::SIToFP:
@@ -2591,15 +2590,30 @@ static Instruction *foldCastShuffle(ShuffleVectorInst &Shuf,
     return nullptr;
   }
 
+  VectorType *CastSrcTy = cast<VectorType>(Cast0->getSrcTy());
   VectorType *ShufTy = Shuf.getType();
   VectorType *ShufOpTy = cast<VectorType>(Shuf.getOperand(0)->getType());
-  VectorType *CastSrcTy = cast<VectorType>(Cast0->getSrcTy());
 
   // TODO: Allow length-increasing shuffles?
   if (ShufTy->getElementCount().getKnownMinValue() >
       ShufOpTy->getElementCount().getKnownMinValue())
     return nullptr;
 
+  // shuffle (cast X), Y, identity-with-extract-mask -->
+  // cast (shuffle X, Y, identity-with-extract-mask).
+  if (Cast0->hasOneUse() && Shuf.isIdentityWithExtract()) {
+    auto *NewIns = Builder.CreateShuffleVector(Cast0->getOperand(0),
+                                               PoisonValue::get(CastSrcTy),
+                                               Shuf.getShuffleMask());
+    return CastInst::Create(Cast0->getOpcode(), NewIns, Shuf.getType());
+  }
+
+  auto *Cast1 = dyn_cast<CastInst>(Shuf.getOperand(1));
+  // Do we have 2 matching cast operands?
+  if (!Cast1 || Cast0->getOpcode() != Cast1->getOpcode() ||
+      Cast0->getSrcTy() != Cast1->getSrcTy())
+    return nullptr;
+
   // TODO: Allow element-size-decreasing casts (ex: fptosi float to i8)?
   assert(isa<FixedVectorType>(CastSrcTy) && isa<FixedVectorType>(ShufOpTy) &&
          "Expected fixed vector operands for casts and binary shuffle");
diff --git a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
index aa49f493c9fa1..5c3b0beefbb66 100644
--- a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
+++ b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll
@@ -287,9 +287,9 @@ define <4 x float> @sel_v16i8_bitcast_shuffle_bitcast_cmp(<8 x float> %a, <8 x f
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt <8 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[A_BC:%.*]] = bitcast <8 x float> [[A]] to <8 x i32>
 ; CHECK-NEXT:    [[B_BC:%.*]] = bitcast <8 x float> [[B]] to <8 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[A_LO:%.*]] = shufflevector <8 x i32> [[A_BC]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[B_LO:%.*]] = shufflevector <8 x i32> [[B_BC]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[B_LO]], <4 x i32> [[A_LO]]
 ; CHECK-NEXT:    [[RES:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float>
 ; CHECK-NEXT:    ret <4 x float> [[RES]]
diff --git a/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll b/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll
index dddd63d101e61..fc19b11fa8694 100644
--- a/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll
+++ b/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll
@@ -5,8 +5,8 @@ define <4 x i16> @ext_identity_mask_first_vector_first_half_4xi16(<8 x i8> %x) {
 ; CHECK-LABEL: define <4 x i16> @ext_identity_mask_first_vector_first_half_4xi16(
 ; CHECK-SAME: <8 x i8> [[X:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[E_1:%.*]] = zext <8 x i8> [[X]] to <8 x i16>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[X]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16>
 ; CHECK-NEXT:    ret <4 x i16> [[SHUFFLE]]
 ;
 entry:
@@ -19,8 +19,8 @@ define <3 x i32> @ext_identity_mask_first_vector_first_half_3xi32(<4 x i16> %x)
 ; CHECK-LABEL: define <3 x i32> @ext_identity_mask_first_vector_first_half_3xi32(
 ; CHECK-SAME: <4 x i16> [[X:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[E_1:%.*]] = zext <4 x i16> [[X]] to <4 x i32>
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[E_1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i16> [[X]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = zext <3 x i16> [[TMP0]] to <3 x i32>
 ; CHECK-NEXT:    ret <3 x i32> [[SHUFFLE]]
 ;
 entry:
@@ -102,12 +102,8 @@ define <4 x i32> @load_i32_zext_to_v4i32(ptr %di) {
 ; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32(
 ; CHECK-SAME: ptr [[DI:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[DI]], align 4
-; CHECK-NEXT:    [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0
-; CHECK-NEXT:    [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
-; CHECK-NEXT:    [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
-; CHECK-NEXT:    [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
+; CHECK-NEXT:    [[L1:%.*]] = load <4 x i8>, ptr [[DI]], align 4
+; CHECK-NEXT:    [[EXT_2:%.*]] = zext <4 x i8> [[L1]] to <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[EXT_2]]
 ;
 entry:
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll b/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
index bbf893f6127b0..dbce77698eb07 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll
@@ -496,9 +496,8 @@ define <2 x i64> @x86_pblendvb_v32i8_v16i8_undefs(<4 x i64> %a, <4 x i64> %b, <4
 ; CHECK-NEXT:    [[A_LO:%.*]] = shufflevector <32 x i8> [[A_BC]], <32 x i8> poison, <16 x i32>
 ; CHECK-NEXT:    [[B_LO:%.*]] = shufflevector <32 x i8> [[B_BC]], <32 x i8> poison, <16 x i32>
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]]
-; CHECK-NEXT:    [[SEXT:%.*]] = sext <32 x i1> [[CMP]] to <32 x i8>
-; CHECK-NEXT:    [[SEXT_LO:%.*]] = shufflevector <32 x i8> [[SEXT]], <32 x i8> poison, <16 x i32>
-; CHECK-NEXT:    [[SEL_LO:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[A_LO]], <16 x i8> [[B_LO]], <16 x i8> [[SEXT_LO]])
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <32 x i1> [[CMP]], <32 x i1> poison, <16 x i32>
+; CHECK-NEXT:    [[SEL_LO:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[B_LO]], <16 x i8> [[A_LO]]
 ; CHECK-NEXT:    [[RES:%.*]] = bitcast <16 x i8> [[SEL_LO]] to <2 x i64>
 ; CHECK-NEXT:    ret <2 x i64> [[RES]]
 ;

From 1309fc716b1ceb0674061abb724fa6953996d528 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Sat, 5 Jul 2025 20:46:01 +0100
Subject: [PATCH 2/3] !fixup require second op poison

---
 llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index ae5bc2cdbfdd7..2915764699541 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -2599,9 +2599,10 @@ static Instruction *foldCastShuffle(ShuffleVectorInst &Shuf,
       ShufOpTy->getElementCount().getKnownMinValue())
     return nullptr;
 
-  // shuffle (cast X), Y, identity-with-extract-mask -->
-  // cast (shuffle X, Y, identity-with-extract-mask).
-  if (Cast0->hasOneUse() && Shuf.isIdentityWithExtract()) {
+  // shuffle (cast X), Poison, identity-with-extract-mask -->
+  // cast (shuffle X, Poison, identity-with-extract-mask).
+  if (Cast0->hasOneUse() && Shuf.isIdentityWithExtract() &&
+      isa<PoisonValue>(Shuf.getOperand(1))) {
     auto *NewIns = Builder.CreateShuffleVector(Cast0->getOperand(0),
                                                PoisonValue::get(CastSrcTy),
                                                Shuf.getShuffleMask());

From 721dd820aa56d6971d2a64a6c3674ea57993726c Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 8 Jul 2025 12:49:41 +0100
Subject: [PATCH 3/3] !fixup adjust check

---
 llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 2915764699541..00b877b8a07ef 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -2601,8 +2601,8 @@ static Instruction *foldCastShuffle(ShuffleVectorInst &Shuf,
 
   // shuffle (cast X), Poison, identity-with-extract-mask -->
   // cast (shuffle X, Poison, identity-with-extract-mask).
-  if (Cast0->hasOneUse() && Shuf.isIdentityWithExtract() &&
-      isa<PoisonValue>(Shuf.getOperand(1))) {
+  if (isa<PoisonValue>(Shuf.getOperand(1)) && Cast0->hasOneUse() &&
+      Shuf.isIdentityWithExtract()) {
     auto *NewIns = Builder.CreateShuffleVector(Cast0->getOperand(0),
                                                PoisonValue::get(CastSrcTy),
                                                Shuf.getShuffleMask());
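
Illustrative note (not part of the patch series): the fold added in PATCH 1/3 turns "extend, then identity-with-extract shuffle" into "shuffle, then extend of the narrower vector", exactly as checked by the ext_identity_mask_first_vector_first_half_4xi16 test above. A minimal before/after sketch in LLVM IR, with hypothetical value names:

  ; before the fold: widen all 8 lanes, then extract the low half
  %e.1 = zext <8 x i8> %x to <8 x i16>
  %shuffle = shufflevector <8 x i16> %e.1, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

  ; after the fold: extract the low half first, then widen only 4 lanes
  %narrow = shufflevector <8 x i8> %x, <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle = zext <4 x i8> %narrow to <4 x i16>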