diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 1a8d4215b5b71..00b877b8a07ef 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -2571,17 +2571,16 @@ static Instruction *foldShuffleOfUnaryOps(ShuffleVectorInst &Shuf, /// Canonicalize casts after shuffle. static Instruction *foldCastShuffle(ShuffleVectorInst &Shuf, InstCombiner::BuilderTy &Builder) { - // Do we have 2 matching cast operands? auto *Cast0 = dyn_cast<CastInst>(Shuf.getOperand(0)); - auto *Cast1 = dyn_cast<CastInst>(Shuf.getOperand(1)); - if (!Cast0 || !Cast1 || Cast0->getOpcode() != Cast1->getOpcode() || - Cast0->getSrcTy() != Cast1->getSrcTy()) + if (!Cast0) return nullptr; // TODO: Allow other opcodes? That would require easing the type restrictions // below here. CastInst::CastOps CastOpcode = Cast0->getOpcode(); switch (CastOpcode) { + case Instruction::SExt: + case Instruction::ZExt: case Instruction::FPToSI: case Instruction::FPToUI: case Instruction::SIToFP: @@ -2591,15 +2590,31 @@ static Instruction *foldCastShuffle(ShuffleVectorInst &Shuf, return nullptr; } + VectorType *CastSrcTy = cast<VectorType>(Cast0->getSrcTy()); VectorType *ShufTy = Shuf.getType(); VectorType *ShufOpTy = cast<VectorType>(Shuf.getOperand(0)->getType()); - VectorType *CastSrcTy = cast<VectorType>(Cast0->getSrcTy()); // TODO: Allow length-increasing shuffles? if (ShufTy->getElementCount().getKnownMinValue() > ShufOpTy->getElementCount().getKnownMinValue()) return nullptr; + // shuffle (cast X), Poison, identity-with-extract-mask --> + // cast (shuffle X, Poison, identity-with-extract-mask).
+ if (isa<PoisonValue>(Shuf.getOperand(1)) && Cast0->hasOneUse() && + Shuf.isIdentityWithExtract()) { + auto *NewIns = Builder.CreateShuffleVector(Cast0->getOperand(0), + PoisonValue::get(CastSrcTy), + Shuf.getShuffleMask()); + return CastInst::Create(Cast0->getOpcode(), NewIns, Shuf.getType()); + } + + auto *Cast1 = dyn_cast<CastInst>(Shuf.getOperand(1)); + // Do we have 2 matching cast operands? + if (!Cast1 || Cast0->getOpcode() != Cast1->getOpcode() || + Cast0->getSrcTy() != Cast1->getSrcTy()) + return nullptr; + // TODO: Allow element-size-decreasing casts (ex: fptosi float to i8)? assert(isa<FixedVectorType>(CastSrcTy) && isa<FixedVectorType>(ShufOpTy) && "Expected fixed vector operands for casts and binary shuffle"); diff --git a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll index aa49f493c9fa1..5c3b0beefbb66 100644 --- a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll +++ b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll @@ -287,9 +287,9 @@ define <4 x float> @sel_v16i8_bitcast_shuffle_bitcast_cmp(<8 x float> %a, <8 x f ; CHECK-NEXT: [[CMP:%.*]] = fcmp olt <8 x float> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[A_BC:%.*]] = bitcast <8 x float> [[A]] to <8 x i32> ; CHECK-NEXT: [[B_BC:%.*]] = bitcast <8 x float> [[B]] to <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; CHECK-NEXT: [[A_LO:%.*]] = shufflevector <8 x i32> [[A_BC]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; CHECK-NEXT: [[B_LO:%.*]] = shufflevector <8 x i32> [[B_BC]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i1> [[CMP]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[B_LO]], <4 x i32> [[A_LO]] ; CHECK-NEXT: [[RES:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> ; CHECK-NEXT: ret <4 x float> [[RES]] diff --git a/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll b/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll index dddd63d101e61..fc19b11fa8694 --- 
a/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll +++ b/llvm/test/Transforms/InstCombine/fold-shuffle-ext.ll @@ -5,8 +5,8 @@ define <4 x i16> @ext_identity_mask_first_vector_first_half_4xi16(<8 x i8> %x) { ; CHECK-LABEL: define <4 x i16> @ext_identity_mask_first_vector_first_half_4xi16( ; CHECK-SAME: <8 x i8> [[X:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[X]] to <8 x i16> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[X]], <8 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[SHUFFLE:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i16> ; CHECK-NEXT: ret <4 x i16> [[SHUFFLE]] ; entry: @@ -19,8 +19,8 @@ define <3 x i32> @ext_identity_mask_first_vector_first_half_3xi32(<4 x i16> %x) ; CHECK-LABEL: define <3 x i32> @ext_identity_mask_first_vector_first_half_3xi32( ; CHECK-SAME: <4 x i16> [[X:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[E_1:%.*]] = zext <4 x i16> [[X]] to <4 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[E_1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[X]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2> +; CHECK-NEXT: [[SHUFFLE:%.*]] = zext <3 x i16> [[TMP0]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[SHUFFLE]] ; entry: @@ -102,12 +102,8 @@ define <4 x i32> @load_i32_zext_to_v4i32(ptr %di) { ; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32( ; CHECK-SAME: ptr [[DI:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4 -; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> , i32 [[L]], i64 0 -; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8> -; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16> -; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32> +; CHECK-NEXT: [[L1:%.*]] = 
load <4 x i8>, ptr [[DI]], align 4 +; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i8> [[L1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[EXT_2]] ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll b/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll index bbf893f6127b0..dbce77698eb07 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/blendv-select.ll @@ -496,9 +496,8 @@ define <2 x i64> @x86_pblendvb_v32i8_v16i8_undefs(<4 x i64> %a, <4 x i64> %b, <4 ; CHECK-NEXT: [[A_LO:%.*]] = shufflevector <32 x i8> [[A_BC]], <32 x i8> poison, <16 x i32> ; CHECK-NEXT: [[B_LO:%.*]] = shufflevector <32 x i8> [[B_BC]], <32 x i8> poison, <16 x i32> ; CHECK-NEXT: [[CMP:%.*]] = icmp slt <32 x i8> [[C_BC]], [[D_BC]] -; CHECK-NEXT: [[SEXT:%.*]] = sext <32 x i1> [[CMP]] to <32 x i8> -; CHECK-NEXT: [[SEXT_LO:%.*]] = shufflevector <32 x i8> [[SEXT]], <32 x i8> poison, <16 x i32> -; CHECK-NEXT: [[SEL_LO:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[A_LO]], <16 x i8> [[B_LO]], <16 x i8> [[SEXT_LO]]) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i1> [[CMP]], <32 x i1> poison, <16 x i32> +; CHECK-NEXT: [[SEL_LO:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[B_LO]], <16 x i8> [[A_LO]] ; CHECK-NEXT: [[RES:%.*]] = bitcast <16 x i8> [[SEL_LO]] to <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[RES]] ;