From 3f572c03105bb6d47e100594fd4e1349e8a1c497 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng
Date: Mon, 20 Jan 2025 20:56:18 +0800
Subject: [PATCH 01/11] [LV][EVL] Generate negative strided load/store for
 reversed load/store

This can reduce the operations needed to reverse the mask, the loaded
result and the stored value.
---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 66 +++++++++++--------
 ...-force-tail-with-evl-reverse-load-store.ll | 23 ++-----
 ...orize-force-tail-with-evl-uniform-store.ll |  3 +-
 3 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index aa5f92b235555..587c7e9b4417f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2603,17 +2603,6 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-/// Use all-true mask for reverse rather than actual mask, as it avoids a
-/// dependence w/o affecting the result.
-static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
-                                     Value *EVL, const Twine &Name) {
-  VectorType *ValTy = cast<VectorType>(Operand->getType());
-  Value *AllTrueMask =
-      Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
-  return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
-                                 {Operand, AllTrueMask, EVL}, nullptr, Name);
-}
-
 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   auto *LI = cast<LoadInst>(&Ingredient);
 
@@ -2630,8 +2619,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   Value *Mask = nullptr;
   if (VPValue *VPMask = getMask()) {
     Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
   } else {
     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
   }
@@ -2641,17 +2628,29 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                                 nullptr, "wide.masked.gather");
   } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Load, DataTy, Addr, "vp.op.load"));
+    if (isReverse()) {
+      auto *EltTy = DataTy->getElementType();
+      auto *PtrTy = Addr->getType();
+      Value *Operands[] = {
+          Addr,
+          ConstantInt::getSigned(
+              Builder.getInt32Ty(),
+              -static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8),
+          Mask, EVL};
+      NewLI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
+                                      {DataTy, PtrTy, Builder.getInt32Ty()},
+                                      Operands, nullptr, "vp.neg.strided.load");
+    } else {
+      VectorBuilder VBuilder(Builder);
+      VBuilder.setEVL(EVL).setMask(Mask);
+      NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
+          Instruction::Load, DataTy, Addr, "vp.op.load"));
+    }
   }
   NewLI->addParamAttr(
       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
   State.addMetadata(NewLI, LI);
   Instruction *Res = NewLI;
-  if (isReverse())
-    Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
   State.set(this, Res);
 }
 
@@ -2749,13 +2748,9 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   CallInst *NewSI = nullptr;
   Value *StoredVal = State.get(StoredValue);
   Value *EVL = State.get(getEVL(), VPLane(0));
-  if (isReverse())
-    StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
   Value *Mask = nullptr;
   if (VPValue *VPMask = getMask()) {
     Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
   } else {
     Mask = Builder.CreateVectorSplat(State.VF,
Builder.getTrue()); } @@ -2765,11 +2760,26 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { Intrinsic::vp_scatter, {StoredVal, Addr, Mask, EVL}); } else { - VectorBuilder VBuilder(Builder); - VBuilder.setEVL(EVL).setMask(Mask); - NewSI = cast(VBuilder.createVectorInstruction( - Instruction::Store, Type::getVoidTy(EVL->getContext()), - {StoredVal, Addr})); + if (isReverse()) { + Type *StoredValTy = StoredVal->getType(); + auto *EltTy = cast(StoredValTy)->getElementType(); + auto *PtrTy = Addr->getType(); + Value *Operands[] = { + StoredVal, Addr, + ConstantInt::getSigned( + Builder.getInt32Ty(), + -static_cast(EltTy->getScalarSizeInBits()) / 8), + Mask, EVL}; + NewSI = Builder.CreateIntrinsic( + Intrinsic::experimental_vp_strided_store, + {StoredValTy, PtrTy, Builder.getInt32Ty()}, Operands); + } else { + VectorBuilder VBuilder(Builder); + VBuilder.setEVL(EVL).setMask(Mask); + NewSI = cast(VBuilder.createVectorInstruction( + Instruction::Store, Type::getVoidTy(EVL->getContext()), + {StoredVal, Addr})); + } } NewSI->addParamAttr( 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index 5b579b0749c67..ba65137e94935 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -39,16 +39,14 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]] ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]] ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]] -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: [[VP_NEG_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP12]], i32 -4, splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]] ; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]] ; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]] ; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]] -; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE3]], ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32( [[VP_NEG_STRIDED_LOAD]], ptr align 4 [[TMP17]], i32 -4, splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] @@ -153,18 +151,14 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]] ; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]] ; 
IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]] -; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP15]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], [[VP_REVERSE_MASK]], i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_OP_LOAD4]], splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: [[VP_NEG_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP20]], i32 -4, [[TMP15]], i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]] ; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]] ; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP27]] ; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]] -; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call @llvm.experimental.vp.reverse.nxv4i32( [[VP_REVERSE]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call @llvm.experimental.vp.reverse.nxv4i1( [[TMP15]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[VP_REVERSE5]], ptr align 4 [[TMP25]], [[VP_REVERSE_MASK6]], i32 [[TMP5]]) +; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32( [[VP_NEG_STRIDED_LOAD]], ptr align 4 [[TMP25]], i32 -4, [[TMP15]], i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] @@ -280,8 +274,7 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP9]] ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]] ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]] -; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], splat (i1 true), i32 [[TMP6]]) -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv16i8( [[VP_OP_LOAD]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.strided.load.nxv16i8.p0.i32(ptr align 1 [[TMP13]], i32 -1, splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], [[VP_REVERSE]] ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv16i8.nxv16p0( align 1 [[TMP14]], splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP7]] @@ -290,16 +283,14 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP16]] ; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]] ; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]] -; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call @llvm.experimental.vp.reverse.nxv16i8( [[WIDE_MASKED_GATHER]], splat (i1 true), i32 [[TMP6]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[VP_REVERSE1]], ptr align 1 [[TMP20]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i32( [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP20]], i32 -1, splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], 
i64 [[TMP7]] ; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64 ; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP22]] ; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 1, [[TMP22]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]] ; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]] -; IF-EVL-NEXT: [[VP_REVERSE2:%.*]] = call @llvm.experimental.vp.reverse.nxv16i8( [[WIDE_MASKED_GATHER]], splat (i1 true), i32 [[TMP6]]) -; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[VP_REVERSE2]], ptr align 1 [[TMP26]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i32( [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP26]], i32 -1, splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP6]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll index a2f85b9ed4ffe..69ba0bad45de6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll @@ -43,8 +43,7 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) { ; CHECK-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP15]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP14]], i64 [[TMP17]] ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP19]], i64 [[TMP18]] -; CHECK-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.reverse.nxv2i64( zeroinitializer, splat (i1 true), i32 [[TMP11]]) -; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_REVERSE]], ptr align 8 [[TMP20]], splat (i1 true), i32 [[TMP11]]) +; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i32( zeroinitializer, ptr align 8 [[TMP20]], i32 -8, splat (i1 true), i32 [[TMP11]]) ; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP11]] to i64 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] From 627fbaa37c386d5ce171fbe66a48be7283f75e71 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Tue, 21 Jan 2025 11:24:19 +0800 Subject: [PATCH 02/11] Use DL.getTypeAllocSize() and else if --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 70 +++++++++---------- 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 587c7e9b4417f..907843dbb7a7b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2627,25 +2627,22 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, nullptr, "wide.masked.gather"); + } else if (isReverse()) { + auto *EltTy = DataTy->getElementType(); + auto *PtrTy = Addr->getType(); + Value *Operands[] = { + Addr, + ConstantInt::getSigned(Builder.getInt32Ty(), + -LI->getDataLayout().getTypeAllocSize(EltTy)), + Mask, EVL}; + NewLI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, + {DataTy, PtrTy, Builder.getInt32Ty()}, + Operands, nullptr, "vp.neg.strided.load"); } else { - if (isReverse()) { - auto *EltTy = DataTy->getElementType(); - auto *PtrTy = Addr->getType(); - Value 
*Operands[] = { - Addr, - ConstantInt::getSigned( - Builder.getInt32Ty(), - -static_cast(EltTy->getScalarSizeInBits()) / 8), - Mask, EVL}; - NewLI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, - {DataTy, PtrTy, Builder.getInt32Ty()}, - Operands, nullptr, "vp.neg.strided.load"); - } else { - VectorBuilder VBuilder(Builder); - VBuilder.setEVL(EVL).setMask(Mask); - NewLI = cast(VBuilder.createVectorInstruction( - Instruction::Load, DataTy, Addr, "vp.op.load")); - } + VectorBuilder VBuilder(Builder); + VBuilder.setEVL(EVL).setMask(Mask); + NewLI = cast(VBuilder.createVectorInstruction( + Instruction::Load, DataTy, Addr, "vp.op.load")); } NewLI->addParamAttr( 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); @@ -2759,27 +2756,24 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), Intrinsic::vp_scatter, {StoredVal, Addr, Mask, EVL}); + } else if (isReverse()) { + Type *StoredValTy = StoredVal->getType(); + auto *EltTy = cast(StoredValTy)->getElementType(); + auto *PtrTy = Addr->getType(); + Value *Operands[] = { + StoredVal, Addr, + ConstantInt::getSigned(Builder.getInt32Ty(), + -SI->getDataLayout().getTypeAllocSize(EltTy)), + Mask, EVL}; + NewSI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store, + {StoredValTy, PtrTy, Builder.getInt32Ty()}, + Operands); } else { - if (isReverse()) { - Type *StoredValTy = StoredVal->getType(); - auto *EltTy = cast(StoredValTy)->getElementType(); - auto *PtrTy = Addr->getType(); - Value *Operands[] = { - StoredVal, Addr, - ConstantInt::getSigned( - Builder.getInt32Ty(), - -static_cast(EltTy->getScalarSizeInBits()) / 8), - Mask, EVL}; - NewSI = Builder.CreateIntrinsic( - Intrinsic::experimental_vp_strided_store, - {StoredValTy, PtrTy, Builder.getInt32Ty()}, Operands); - } else { - VectorBuilder VBuilder(Builder); - VBuilder.setEVL(EVL).setMask(Mask); - NewSI = cast(VBuilder.createVectorInstruction( - Instruction::Store, Type::getVoidTy(EVL->getContext()), - {StoredVal, Addr})); - } + VectorBuilder VBuilder(Builder); + VBuilder.setEVL(EVL).setMask(Mask); + NewSI = cast(VBuilder.createVectorInstruction( + Instruction::Store, Type::getVoidTy(EVL->getContext()), + {StoredVal, Addr})); } NewSI->addParamAttr( 1, Attribute::getWithAlignment(NewSI->getContext(), Alignment)); From f4f50e69e71a60c5940bbc124c2979c073600d00 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Tue, 21 Jan 2025 11:49:19 +0800 Subject: [PATCH 03/11] Add TII::preferStridedLoadStore to control the behavior --- .../llvm/Analysis/TargetTransformInfo.h | 9 +++++++ .../llvm/Analysis/TargetTransformInfoImpl.h | 2 ++ llvm/lib/Analysis/TargetTransformInfo.cpp | 4 +++ .../Target/RISCV/RISCVTargetTransformInfo.h | 2 ++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 25 +++++++++++++++++-- 5 files changed, 40 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 71b204f9c3fec..695acff6f1181 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1776,6 +1776,10 @@ class TargetTransformInfo { /// otherwise scalar epilogue loop. bool preferEpilogueVectorization() const; + /// Return true if the loop vectorizer prefer strided load/store when + /// vectorizing reversed load/store. 
+ bool preferStridedLoadStore() const; + /// \returns True if the target wants to expand the given reduction intrinsic /// into a shuffle sequence. bool shouldExpandReduction(const IntrinsicInst *II) const; @@ -2301,6 +2305,7 @@ class TargetTransformInfo::Concept { virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; virtual bool preferEpilogueVectorization() const = 0; + virtual bool preferStridedLoadStore() const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; virtual ReductionShuffle @@ -3105,6 +3110,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.preferEpilogueVectorization(); } + bool preferStridedLoadStore() const override { + return Impl.preferStridedLoadStore(); + } + bool shouldExpandReduction(const IntrinsicInst *II) const override { return Impl.shouldExpandReduction(II); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index dcef4a1abcfa3..1601005519847 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1003,6 +1003,8 @@ class TargetTransformInfoImplBase { return true; } + bool preferStridedLoadStore() const { return false; } + bool shouldExpandReduction(const IntrinsicInst *II) const { return true; } TTI::ReductionShuffle diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 8b9722d047edc..bef0233f3f8a8 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1374,6 +1374,10 @@ bool TargetTransformInfo::preferEpilogueVectorization() const { return TTIImpl->preferEpilogueVectorization(); } +bool TargetTransformInfo::preferStridedLoadStore() const { + return TTIImpl->preferStridedLoadStore(); +} + TargetTransformInfo::VPLegalization TargetTransformInfo::getVPLegalizationStrategy(const VPIntrinsic &VPI) const { return TTIImpl->getVPLegalizationStrategy(VPI); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 9b364391f0fa4..fe60521abe210 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -118,6 +118,8 @@ class RISCVTTIImpl : public BasicTTIImplBase { return false; } + bool preferStridedLoadStore() const { return true; } + InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 907843dbb7a7b..6e98ccf29d8e4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2603,6 +2603,17 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +/// Use all-true mask for reverse rather than actual mask, as it avoids a +/// dependence w/o affecting the result. 
+static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, + Value *EVL, const Twine &Name) { + VectorType *ValTy = cast(Operand->getType()); + Value *AllTrueMask = + Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); + return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, + {Operand, AllTrueMask, EVL}, nullptr, Name); +} + void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { auto *LI = cast(&Ingredient); @@ -2610,6 +2621,7 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { auto *DataTy = VectorType::get(ScalarDataTy, State.VF); const Align Alignment = getLoadStoreAlignment(&Ingredient); bool CreateGather = !isConsecutive(); + bool UseStridedLoadStore = State.TTI->preferStridedLoadStore(); auto &Builder = State.Builder; State.setDebugLocFrom(getDebugLoc()); @@ -2619,6 +2631,8 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { Value *Mask = nullptr; if (VPValue *VPMask = getMask()) { Mask = State.get(VPMask); + if (isReverse() && !UseStridedLoadStore) + Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); } else { Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); } @@ -2627,7 +2641,7 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, nullptr, "wide.masked.gather"); - } else if (isReverse()) { + } else if (isReverse() && UseStridedLoadStore) { auto *EltTy = DataTy->getElementType(); auto *PtrTy = Addr->getType(); Value *Operands[] = { @@ -2648,6 +2662,8 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); State.addMetadata(NewLI, LI); Instruction *Res = NewLI; + if (isReverse() && !UseStridedLoadStore) + Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); State.set(this, Res); } @@ -2738,6 +2754,7 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { VPValue *StoredValue = getStoredValue(); bool CreateScatter = !isConsecutive(); const Align Alignment = getLoadStoreAlignment(&Ingredient); + bool UseStridedLoadStore = State.TTI->preferStridedLoadStore(); auto &Builder = State.Builder; State.setDebugLocFrom(getDebugLoc()); @@ -2745,9 +2762,13 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { CallInst *NewSI = nullptr; Value *StoredVal = State.get(StoredValue); Value *EVL = State.get(getEVL(), VPLane(0)); + if (isReverse() && !UseStridedLoadStore) + StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); Value *Mask = nullptr; if (VPValue *VPMask = getMask()) { Mask = State.get(VPMask); + if (isReverse() && !UseStridedLoadStore) + Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); } else { Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); } @@ -2756,7 +2777,7 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), Intrinsic::vp_scatter, {StoredVal, Addr, Mask, EVL}); - } else if (isReverse()) { + } else if (isReverse() && UseStridedLoadStore) { Type *StoredValTy = StoredVal->getType(); auto *EltTy = cast(StoredValTy)->getElementType(); auto *PtrTy = Addr->getType(); From 65cd8a05b98f890be8497e6439afba8b824625cd Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Tue, 21 Jan 2025 12:01:57 +0800 Subject: [PATCH 04/11] Add a CLI option --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++++ llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 7 +++++-- 2 files 
changed, 9 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 29f3940ed6fa7..2e34430bbcdb6 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -340,6 +340,10 @@ static cl::opt cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference.")); +cl::opt PreferStridedLoadStore("prefer-strided-load-store", + cl::init(false), cl::Hidden, + cl::desc("Prefer strided load/store.")); + static cl::opt ForceOrderedReductions( "force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 6e98ccf29d8e4..4912f6baba448 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -44,6 +44,7 @@ namespace llvm { extern cl::opt EnableVPlanNativePath; } extern cl::opt ForceTargetInstructionCost; +extern cl::opt PreferStridedLoadStore; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME @@ -2621,7 +2622,8 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { auto *DataTy = VectorType::get(ScalarDataTy, State.VF); const Align Alignment = getLoadStoreAlignment(&Ingredient); bool CreateGather = !isConsecutive(); - bool UseStridedLoadStore = State.TTI->preferStridedLoadStore(); + bool UseStridedLoadStore = + PreferStridedLoadStore || State.TTI->preferStridedLoadStore(); auto &Builder = State.Builder; State.setDebugLocFrom(getDebugLoc()); @@ -2754,7 +2756,8 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { VPValue *StoredValue = getStoredValue(); bool CreateScatter = !isConsecutive(); const Align Alignment = getLoadStoreAlignment(&Ingredient); - bool UseStridedLoadStore = State.TTI->preferStridedLoadStore(); + bool UseStridedLoadStore = + PreferStridedLoadStore || State.TTI->preferStridedLoadStore(); auto &Builder = State.Builder; State.setDebugLocFrom(getDebugLoc()); From 830272f9629b9b05427cce764f12361d544d1cbc Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Tue, 21 Jan 2025 16:06:43 +0800 Subject: [PATCH 05/11] Port new cost model --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 16 ++++++++-------- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 14 ++++++++++++++ ...ize-force-tail-with-evl-reverse-load-store.ll | 16 ++++++++-------- 3 files changed, 30 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2e34430bbcdb6..788b751888d48 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7530,14 +7530,14 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, CM.CostKind); precomputeCosts(BestPlan, BestFactor.Width, CostCtx); - assert((BestFactor.Width == LegacyVF.Width || - planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), - CostCtx, OrigLoop) || - planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width), - CostCtx, OrigLoop)) && - " VPlan cost model and legacy cost model disagreed"); - assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) && - "when vectorizing, the scalar cost must be computed."); + // assert((BestFactor.Width == LegacyVF.Width || + // 
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), + // CostCtx, OrigLoop) || + // planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width), + // CostCtx, OrigLoop)) && + // " VPlan cost model and legacy cost model disagreed"); + // assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) && + // "when vectorizing, the scalar cost must be computed."); #endif LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 4912f6baba448..324ce03bde910 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2689,6 +2689,13 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, if (!Reverse) return Cost; + bool UseStridedLoadStore = + PreferStridedLoadStore || Ctx.TTI.preferStridedLoadStore(); + if (UseStridedLoadStore) + return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, + getAddr()->getUnderlyingValue(), + false, Alignment, Ctx.CostKind); + return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, cast(Ty), {}, Ctx.CostKind, 0); @@ -2824,6 +2831,13 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, if (!Reverse) return Cost; + bool UseStridedLoadStore = + PreferStridedLoadStore || Ctx.TTI.preferStridedLoadStore(); + if (UseStridedLoadStore) + return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, + getAddr()->getUnderlyingValue(), + false, Alignment, Ctx.CostKind); + return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, cast(Ty), {}, Ctx.CostKind, 0); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index ba65137e94935..5d8d37920058a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -252,20 +252,20 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; IF-EVL: vector.ph: ; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16 +; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 ; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 ; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]] ; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16 +; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8 ; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 1024, [[N_VEC]] ; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]] ; IF-EVL: vector.body: ; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]] -; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true) +; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true) ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1024, 
[[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 ; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP7]] @@ -274,23 +274,23 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP9]] ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]] ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]] -; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call @llvm.experimental.vp.strided.load.nxv16i8.p0.i32(ptr align 1 [[TMP13]], i32 -1, splat (i1 true), i32 [[TMP6]]) -; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], [[VP_REVERSE]] -; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv16i8.nxv16p0( align 1 [[TMP14]], splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[VP_NEG_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv8i8.p0.i32(ptr align 1 [[TMP13]], i32 -1, splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], [[VP_NEG_STRIDED_LOAD]] +; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv8i8.nxv8p0( align 1 [[TMP14]], splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP7]] ; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP6]] to i64 ; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP16]] ; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP16]] ; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]] ; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]] -; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i32( [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP20]], i32 -1, splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv8i8.p0.i32( [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP20]], i32 -1, splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP7]] ; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64 ; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP22]] ; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 1, [[TMP22]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]] ; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]] -; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i32( [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP26]], i32 -1, splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv8i8.p0.i32( [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP26]], i32 -1, splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP6]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] From 498d8e7464901b468716f4af24325f1a05b356e9 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Tue, 21 Jan 2025 16:59:03 +0800 Subject: [PATCH 06/11] Skip failures because of VP-based/legacy costs mismatch --- .../Transforms/Vectorize/LoopVectorize.cpp | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 788b751888d48..11c581d4e4db1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7437,6 +7437,12 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, // comparing against the legacy 
cost isn't desirable.
       if (isa<VPPartialReductionRecipe>(&R))
         return true;
+
+      // The VPlan-based cost model may calculate the cost of strided load/store
+      // which can't be modeled in the legacy cost model.
+      if (isa<VPWidenLoadEVLRecipe>(&R) || isa<VPWidenStoreEVLRecipe>(&R))
+        return true;
+
       if (Instruction *UI = GetInstructionForCost(&R))
         SeenInstrs.insert(UI);
     }
@@ -7530,14 +7536,14 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
   VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
                         CM.CostKind);
   precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
-  // assert((BestFactor.Width == LegacyVF.Width ||
-  //         planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
-  //                                               CostCtx, OrigLoop) ||
-  //         planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
-  //                                               CostCtx, OrigLoop)) &&
-  //        " VPlan cost model and legacy cost model disagreed");
-  // assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
-  //        "when vectorizing, the scalar cost must be computed.");
+  assert((BestFactor.Width == LegacyVF.Width ||
+          planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
+                                                CostCtx, OrigLoop) ||
+          planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
+                                                CostCtx, OrigLoop)) &&
+         " VPlan cost model and legacy cost model disagreed");
+  assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
+         "when vectorizing, the scalar cost must be computed.");
 #endif
 
   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");

From e21dcdb1eb0098248027ffbb93bc3b8b52464289 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng
Date: Tue, 21 Jan 2025 17:42:54 +0800
Subject: [PATCH 07/11] Only generate strided load/store

---
 .../llvm/Analysis/TargetTransformInfo.h       |  9 ----
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  2 -
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  4 --
 .../Target/RISCV/RISCVTargetTransformInfo.h   |  2 -
 .../Transforms/Vectorize/LoopVectorize.cpp    |  4 --
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 54 +++++--------------
 6 files changed, 13 insertions(+), 62 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 695acff6f1181..71b204f9c3fec 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1776,10 +1776,6 @@ class TargetTransformInfo {
   /// otherwise scalar epilogue loop.
   bool preferEpilogueVectorization() const;
 
-  /// Return true if the loop vectorizer prefer strided load/store when
-  /// vectorizing reversed load/store.
-  bool preferStridedLoadStore() const;
-
   /// \returns True if the target wants to expand the given reduction intrinsic
   /// into a shuffle sequence.
bool shouldExpandReduction(const IntrinsicInst *II) const; @@ -2305,7 +2301,6 @@ class TargetTransformInfo::Concept { virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; virtual bool preferEpilogueVectorization() const = 0; - virtual bool preferStridedLoadStore() const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; virtual ReductionShuffle @@ -3110,10 +3105,6 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.preferEpilogueVectorization(); } - bool preferStridedLoadStore() const override { - return Impl.preferStridedLoadStore(); - } - bool shouldExpandReduction(const IntrinsicInst *II) const override { return Impl.shouldExpandReduction(II); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 1601005519847..dcef4a1abcfa3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1003,8 +1003,6 @@ class TargetTransformInfoImplBase { return true; } - bool preferStridedLoadStore() const { return false; } - bool shouldExpandReduction(const IntrinsicInst *II) const { return true; } TTI::ReductionShuffle diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index bef0233f3f8a8..8b9722d047edc 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1374,10 +1374,6 @@ bool TargetTransformInfo::preferEpilogueVectorization() const { return TTIImpl->preferEpilogueVectorization(); } -bool TargetTransformInfo::preferStridedLoadStore() const { - return TTIImpl->preferStridedLoadStore(); -} - TargetTransformInfo::VPLegalization TargetTransformInfo::getVPLegalizationStrategy(const VPIntrinsic &VPI) const { return TTIImpl->getVPLegalizationStrategy(VPI); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index fe60521abe210..9b364391f0fa4 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -118,8 +118,6 @@ class RISCVTTIImpl : public BasicTTIImplBase { return false; } - bool preferStridedLoadStore() const { return true; } - InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 11c581d4e4db1..71ec9e2ebbd3c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -340,10 +340,6 @@ static cl::opt cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference.")); -cl::opt PreferStridedLoadStore("prefer-strided-load-store", - cl::init(false), cl::Hidden, - cl::desc("Prefer strided load/store.")); - static cl::opt ForceOrderedReductions( "force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 324ce03bde910..5c89f0f6dbf15 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -44,7 +44,6 @@ namespace llvm { extern cl::opt EnableVPlanNativePath; } extern cl::opt ForceTargetInstructionCost; -extern cl::opt 
PreferStridedLoadStore; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME @@ -2622,8 +2621,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { auto *DataTy = VectorType::get(ScalarDataTy, State.VF); const Align Alignment = getLoadStoreAlignment(&Ingredient); bool CreateGather = !isConsecutive(); - bool UseStridedLoadStore = - PreferStridedLoadStore || State.TTI->preferStridedLoadStore(); auto &Builder = State.Builder; State.setDebugLocFrom(getDebugLoc()); @@ -2631,19 +2628,16 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { Value *EVL = State.get(getEVL(), VPLane(0)); Value *Addr = State.get(getAddr(), !CreateGather); Value *Mask = nullptr; - if (VPValue *VPMask = getMask()) { + if (VPValue *VPMask = getMask()) Mask = State.get(VPMask); - if (isReverse() && !UseStridedLoadStore) - Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); - } else { + else Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); - } if (CreateGather) { NewLI = Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, nullptr, "wide.masked.gather"); - } else if (isReverse() && UseStridedLoadStore) { + } else if (isReverse()) { auto *EltTy = DataTy->getElementType(); auto *PtrTy = Addr->getType(); Value *Operands[] = { @@ -2664,8 +2658,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); State.addMetadata(NewLI, LI); Instruction *Res = NewLI; - if (isReverse() && !UseStridedLoadStore) - Res = createReverseEVL(Builder, Res, EVL, "vp.reverse"); State.set(this, Res); } @@ -2689,16 +2681,9 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, if (!Reverse) return Cost; - bool UseStridedLoadStore = - PreferStridedLoadStore || Ctx.TTI.preferStridedLoadStore(); - if (UseStridedLoadStore) - return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, - getAddr()->getUnderlyingValue(), - false, Alignment, Ctx.CostKind); - - return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, - cast(Ty), {}, Ctx.CostKind, - 0); + return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, + getAddr()->getUnderlyingValue(), false, + Alignment, Ctx.CostKind); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2763,8 +2748,6 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { VPValue *StoredValue = getStoredValue(); bool CreateScatter = !isConsecutive(); const Align Alignment = getLoadStoreAlignment(&Ingredient); - bool UseStridedLoadStore = - PreferStridedLoadStore || State.TTI->preferStridedLoadStore(); auto &Builder = State.Builder; State.setDebugLocFrom(getDebugLoc()); @@ -2772,22 +2755,18 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) { CallInst *NewSI = nullptr; Value *StoredVal = State.get(StoredValue); Value *EVL = State.get(getEVL(), VPLane(0)); - if (isReverse() && !UseStridedLoadStore) - StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse"); Value *Mask = nullptr; - if (VPValue *VPMask = getMask()) { + if (VPValue *VPMask = getMask()) Mask = State.get(VPMask); - if (isReverse() && !UseStridedLoadStore) - Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask"); - } else { + else Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); - } + Value *Addr = State.get(getAddr(), !CreateScatter); if (CreateScatter) { NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), Intrinsic::vp_scatter, {StoredVal, Addr, Mask, EVL}); - } else if (isReverse() && 
UseStridedLoadStore) { + } else if (isReverse()) { Type *StoredValTy = StoredVal->getType(); auto *EltTy = cast(StoredValTy)->getElementType(); auto *PtrTy = Addr->getType(); @@ -2831,16 +2810,9 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, if (!Reverse) return Cost; - bool UseStridedLoadStore = - PreferStridedLoadStore || Ctx.TTI.preferStridedLoadStore(); - if (UseStridedLoadStore) - return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, - getAddr()->getUnderlyingValue(), - false, Alignment, Ctx.CostKind); - - return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, - cast(Ty), {}, Ctx.CostKind, - 0); + return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, + getAddr()->getUnderlyingValue(), false, + Alignment, Ctx.CostKind); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) From 9562a3f7cfaf0279dd531ea41ea1552f648236de Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Tue, 21 Jan 2025 20:09:36 +0800 Subject: [PATCH 08/11] Fix copy-paste mistake and check isReverse() --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 71ec9e2ebbd3c..14cbfa3bec6f4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7436,8 +7436,9 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, // The VPlan-based cost model may calculate the cost of strided load/store // which can't be modeled in the legacy cost model. - if (isa(&R) || isa(&R)) - return true; + if (isa(&R) || isa(&R)) + if (cast(&R)->isReverse()) + return true; if (Instruction *UI = GetInstructionForCost(&R)) SeenInstrs.insert(UI); From 4e4e4f2d57cb0bff6fa0e423a312e6ea864749e0 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Tue, 21 Jan 2025 20:49:32 +0800 Subject: [PATCH 09/11] Remove createReverseEVL and sink non-reversed Cost --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 21 ++++--------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 5c89f0f6dbf15..b1c7e3c003a5c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2603,17 +2603,6 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, } #endif -/// Use all-true mask for reverse rather than actual mask, as it avoids a -/// dependence w/o affecting the result. 
-static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand, - Value *EVL, const Twine &Name) { - VectorType *ValTy = cast(Operand->getType()); - Value *AllTrueMask = - Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue()); - return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse, - {Operand, AllTrueMask, EVL}, nullptr, Name); -} - void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { auto *LI = cast(&Ingredient); @@ -2676,10 +2665,9 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, getLoadStoreAlignment(const_cast(&Ingredient)); unsigned AS = getLoadStoreAddressSpace(const_cast(&Ingredient)); - InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( - Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind); if (!Reverse) - return Cost; + return Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, + AS, Ctx.CostKind); return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, getAddr()->getUnderlyingValue(), false, @@ -2805,10 +2793,9 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, getLoadStoreAlignment(const_cast(&Ingredient)); unsigned AS = getLoadStoreAddressSpace(const_cast(&Ingredient)); - InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( - Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind); if (!Reverse) - return Cost; + return Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment, + AS, Ctx.CostKind); return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty, getAddr()->getUnderlyingValue(), false, From bd2f19594a6149ee9c830eaebce3871c2f87630c Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Wed, 22 Jan 2025 11:42:55 +0800 Subject: [PATCH 10/11] Change the name of strided load and use multiparameter isa --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +- ...ctorize-force-tail-with-evl-reverse-load-store.ll | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 14cbfa3bec6f4..99cb0684f3332 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7436,7 +7436,7 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan, // The VPlan-based cost model may calculate the cost of strided load/store // which can't be modeled in the legacy cost model. 
- if (isa(&R) || isa(&R)) + if (isa(&R)) if (cast(&R)->isReverse()) return true; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index b1c7e3c003a5c..4583854e8b875 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2636,7 +2636,7 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) { Mask, EVL}; NewLI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, Builder.getInt32Ty()}, - Operands, nullptr, "vp.neg.strided.load"); + Operands, nullptr, "vp.reverse.load"); } else { VectorBuilder VBuilder(Builder); VBuilder.setEVL(EVL).setMask(Mask); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index 5d8d37920058a..af1d216e7384d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -39,14 +39,14 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]] ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]] ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]] -; IF-EVL-NEXT: [[VP_NEG_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP12]], i32 -4, splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: [[VP_REVERSE_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP12]], i32 -4, splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]] ; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]] ; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]] ; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]] ; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]] -; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32( [[VP_NEG_STRIDED_LOAD]], ptr align 4 [[TMP17]], i32 -4, splat (i1 true), i32 [[TMP5]]) +; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32( [[VP_REVERSE_LOAD]], ptr align 4 [[TMP17]], i32 -4, splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] @@ -151,14 +151,14 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]] ; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]] ; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]] -; IF-EVL-NEXT: [[VP_NEG_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP20]], i32 -4, [[TMP15]], i32 [[TMP5]]) +; IF-EVL-NEXT: [[VP_REVERSE_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP20]], i32 -4, [[TMP15]], i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]] ; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]] ; IF-EVL-NEXT: [[TMP23:%.*]] 
= sub i64 1, [[TMP27]] ; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]] ; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]] -; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32( [[VP_NEG_STRIDED_LOAD]], ptr align 4 [[TMP25]], i32 -4, [[TMP15]], i32 [[TMP5]]) +; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32( [[VP_REVERSE_LOAD]], ptr align 4 [[TMP25]], i32 -4, [[TMP15]], i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] @@ -274,8 +274,8 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP9]] ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]] ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]] -; IF-EVL-NEXT: [[VP_NEG_STRIDED_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv8i8.p0.i32(ptr align 1 [[TMP13]], i32 -1, splat (i1 true), i32 [[TMP6]]) -; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], [[VP_NEG_STRIDED_LOAD]] +; IF-EVL-NEXT: [[VP_REVERSE_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv8i8.p0.i32(ptr align 1 [[TMP13]], i32 -1, splat (i1 true), i32 [[TMP6]]) +; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], [[VP_REVERSE_LOAD]] ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv8i8.nxv8p0( align 1 [[TMP14]], splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP7]] ; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP6]] to i64 From 807de0d46198edd6ca95b90ac6e5d39492772558 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Tue, 11 Feb 2025 15:34:07 +0800 Subject: [PATCH 11/11] Use right pointer --- .../Transforms/Vectorize/LoopVectorize.cpp | 2 +- ...-force-tail-with-evl-reverse-load-store.ll | 42 ++++--------------- ...orize-force-tail-with-evl-uniform-store.ll | 6 +-- 3 files changed, 9 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 99cb0684f3332..e7eb5006c694d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8288,7 +8288,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, auto *GEP = dyn_cast( Ptr->getUnderlyingValue()->stripPointerCasts()); VPSingleDefRecipe *VectorPtr; - if (Reverse) { + if (Reverse && !CM.foldTailWithEVL()) { // When folding the tail, we may compute an address that we don't in the // original scalar loop and it may not be inbounds. Drop Inbounds in that // case. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll index af1d216e7384d..1e56dc6feccf6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll @@ -34,18 +34,10 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt ; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0 ; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], -1 ; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]] -; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]] -; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]] -; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]] -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]] +; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 ; IF-EVL-NEXT: [[VP_REVERSE_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP12]], i32 -4, splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]] -; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]] -; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]] -; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]] -; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]] +; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 ; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32( [[VP_REVERSE_LOAD]], ptr align 4 [[TMP17]], i32 -4, splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]] @@ -146,18 +138,10 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal ; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt [[VP_OP_LOAD]], splat (i32 100) ; IF-EVL-NEXT: [[TMP15:%.*]] = select [[TMP10]], [[TMP14]], zeroinitializer ; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]] -; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]] -; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]] -; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]] +; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP16]], i32 0 ; IF-EVL-NEXT: [[VP_REVERSE_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP20]], i32 -4, [[TMP15]], i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]] -; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64 -; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]] -; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP27]] -; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]] -; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]] +; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0 ; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32( [[VP_REVERSE_LOAD]], ptr align 4 
[[TMP25]], i32 -4, [[TMP15]], i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]] @@ -269,27 +253,15 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr ; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1024, [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 ; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP7]] -; IF-EVL-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 -; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP9]] -; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP9]] -; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]] -; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]] +; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0 ; IF-EVL-NEXT: [[VP_REVERSE_LOAD:%.*]] = call @llvm.experimental.vp.strided.load.nxv8i8.p0.i32(ptr align 1 [[TMP13]], i32 -1, splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], [[VP_REVERSE_LOAD]] ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv8i8.nxv8p0( align 1 [[TMP14]], splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP7]] -; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP6]] to i64 -; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP16]] -; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP16]] -; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]] -; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]] +; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0 ; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv8i8.p0.i32( [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP20]], i32 -1, splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP7]] -; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64 -; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP22]] -; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 1, [[TMP22]] -; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]] -; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]] +; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP21]], i32 0 ; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv8i8.p0.i32( [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP26]], i32 -1, splat (i1 true), i32 [[TMP6]]) ; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP6]] to i64 ; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll index 69ba0bad45de6..ab1fe4f38d3b1 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll @@ -38,11 +38,7 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) { ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP13:%.*]] = sub nuw nsw i64 1, [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr 
[[TMP14]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP19]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP14]], i32 0 ; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i32( zeroinitializer, ptr align 8 [[TMP20]], i32 -8, splat (i1 true), i32 [[TMP11]]) ; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP11]] to i64 ; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
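
For reference, the effect of this series on a reversed, EVL-predicated load can be sketched in IR roughly as follows. This is an illustrative sketch, not output taken from the patch; the value names (%lo.ptr, %hi.ptr, %evl), the <vscale x 4 x i32> element type and the i32 stride type are assumed for the example.

; Before: load the chunk forward from its lowest-addressed element %lo.ptr,
; then reverse the result (a non-all-true mask needs its own vp.reverse too).
  %wide.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 %lo.ptr, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %reversed  = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)

; After: a single strided load that starts at the chunk's last element %hi.ptr
; and walks backwards with a -4 byte stride (i32 elements), so neither the mask
; nor the loaded result needs a separate reverse.
  %reversed  = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 %hi.ptr, i32 -4, <vscale x 4 x i1> splat (i1 true), i32 %evl)

Reversed stores are handled symmetrically via @llvm.experimental.vp.strided.store, as the updated test expectations above show.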