Skip to content

Commit 8202c94

Browse files
authored
[CostModel][X86] getMaskedMemoryOpCost - widening masks must compute the cost of the full width insert_subvector across multiple legal vectors (#145693)
The memory value and mask value types might legalise differently - e.g. a v64i32 might split into 4 x v16i32 / 8 x v8i32 but the mask might legalize as 1 x v64i8 / 2 x v32i8 etc. If the legalised value type has been split, then we must ensure we compute the cost for the entire mask value type and let getShuffleCost handle any legalisation, not assume that only a single trailing split mask will require widening.
1 parent 81e0799 commit 8202c94

File tree

4 files changed

+24
-41
lines changed

4 files changed

+24
-41
lines changed

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5441,7 +5441,8 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
54415441

54425442
else if (LT.first * Ty.getVectorNumElements() > NumElem) {
54435443
auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
5444-
Ty.getVectorNumElements());
5444+
(unsigned)LT.first.getValue() *
5445+
Ty.getVectorNumElements());
54455446
// Expanding requires fill mask with zeroes
54465447
Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, NewMaskTy, {},
54475448
CostKind, 0, MaskTy);

llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -274,14 +274,11 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
274274
; AVX-NEXT: ret void
275275
;
276276
; AVX2-LABEL: @gather_load_3(
277-
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24
278-
; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
279-
; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> <i32 0, i32 11, i32 4, i32 15>
280-
; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
281-
; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> <i32 12, i32 3, i32 0, i32 15>
282-
; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
283-
; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
284-
; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
277+
; AVX2-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[TBAA0]]
278+
; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21>
279+
; AVX2-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4>
280+
; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
281+
; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
285282
; AVX2-NEXT: ret void
286283
;
287284
; AVX512F-LABEL: @gather_load_3(
@@ -412,14 +409,11 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
412409
; AVX-NEXT: ret void
413410
;
414411
; AVX2-LABEL: @gather_load_4(
415-
; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24
416-
; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
417-
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> <i32 0, i32 11, i32 4, i32 15>
418-
; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
419-
; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> <i32 12, i32 3, i32 0, i32 15>
420-
; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
421-
; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
422-
; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
412+
; AVX2-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[TBAA0]]
413+
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21>
414+
; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4>
415+
; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
416+
; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
423417
; AVX2-NEXT: ret void
424418
;
425419
; AVX512F-LABEL: @gather_load_4(

llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -274,14 +274,11 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
274274
; AVX-NEXT: ret void
275275
;
276276
; AVX2-LABEL: @gather_load_3(
277-
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1:%.*]], i64 24
278-
; AVX2-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP1]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
279-
; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <4 x i32> <i32 0, i32 11, i32 4, i32 15>
280-
; AVX2-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[TMP14]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
281-
; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> poison, <4 x i32> <i32 12, i32 3, i32 0, i32 15>
282-
; AVX2-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
283-
; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
284-
; AVX2-NEXT: store <8 x i32> [[TMP26]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
277+
; AVX2-NEXT: [[TMP3:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[TMP1:%.*]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[TBAA0]]
278+
; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <22 x i32> [[TMP3]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21>
279+
; AVX2-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP4]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4>
280+
; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
281+
; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]]
285282
; AVX2-NEXT: ret void
286283
;
287284
; AVX512F-LABEL: @gather_load_3(
@@ -412,14 +409,11 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
412409
; AVX-NEXT: ret void
413410
;
414411
; AVX2-LABEL: @gather_load_4(
415-
; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds nuw i8, ptr [[T1:%.*]], i64 24
416-
; AVX2-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[T1]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
417-
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <4 x i32> <i32 0, i32 11, i32 4, i32 15>
418-
; AVX2-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr nonnull [[T26]], i32 4, <16 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <16 x i32> poison), !tbaa [[TBAA0]]
419-
; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[TMP3]], <16 x i32> poison, <4 x i32> <i32 12, i32 3, i32 0, i32 15>
420-
; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
421-
; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
422-
; AVX2-NEXT: store <8 x i32> [[TMP9]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
412+
; AVX2-NEXT: [[TMP1:%.*]] = call <22 x i32> @llvm.masked.load.v22i32.p0(ptr [[T1:%.*]], i32 4, <22 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true>, <22 x i32> poison), !tbaa [[TBAA0]]
413+
; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <22 x i32> [[TMP1]], <22 x i32> poison, <8 x i32> <i32 0, i32 4, i32 6, i32 9, i32 11, i32 15, i32 18, i32 21>
414+
; AVX2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 1, i32 3, i32 3, i32 2, i32 2, i32 4, i32 1, i32 4>
415+
; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 6, i32 3, i32 2, i32 7>
416+
; AVX2-NEXT: store <8 x i32> [[TMP4]], ptr [[T0:%.*]], align 4, !tbaa [[TBAA0]]
423417
; AVX2-NEXT: ret void
424418
;
425419
; AVX512F-LABEL: @gather_load_4(

llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,9 @@
1515
define void @test(ptr noalias %p, ptr noalias %p1) {
1616
; CHECK-LABEL: @test(
1717
; CHECK-NEXT: entry:
18-
; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[P:%.*]], align 4
19-
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr i32, ptr [[P]], i64 32
20-
; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
21-
; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P]], i64 33
22-
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX11]], align 4
18+
; CHECK-NEXT: [[TMP0:%.*]] = call <35 x i32> @llvm.masked.load.v35i32.p0(ptr [[P:%.*]], i32 4, <35 x i1> <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true>, <35 x i32> poison)
19+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <35 x i32> [[TMP0]], <35 x i32> poison, <4 x i32> <i32 0, i32 32, i32 33, i32 34>
2320
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P]], align 4
24-
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[I]], i32 0
25-
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[I2]], i32 1
26-
; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP3]], <2 x i32> [[TMP0]], i64 2)
2721
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP1]]
2822
; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[P1:%.*]], align 4
2923
; CHECK-NEXT: ret void

0 commit comments

Comments
 (0)