Skip to content

Commit da7fbed

Browse files
committed
[VectorCombine] Fold chain of (scalar load)->ext->ext to load->ext.
Add a new combine that folds a chain of (scalar load)->ext->ext (with shuffles/casts/inserts in between) into a single vector load and one wide extend. This makes the IR simpler to analyze and to process, while the backend can still decide to split the wide extend up again. Such chains typically come from code written with vector intrinsics. Some examples of real-world use are in https://github.com/ARM-software/astc-encoder/.
1 parent 11d8454 commit da7fbed

File tree

2 files changed

+67
-48
lines changed

2 files changed

+67
-48
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ class VectorCombine {
128128
bool foldShuffleOfShuffles(Instruction &I);
129129
bool foldShuffleOfIntrinsics(Instruction &I);
130130
bool foldShuffleToIdentity(Instruction &I);
131+
bool foldShuffleExtExtracts(Instruction &I);
131132
bool foldShuffleFromReductions(Instruction &I);
132133
bool foldCastFromReductions(Instruction &I);
133134
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
@@ -2791,6 +2792,55 @@ bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
27912792
return true;
27922793
}
27932794

2795+
/// Try to fold vector zero- and sign-extends split across multiple operations
/// into a single extend, removing redundant inserts and shuffles.
///
/// With \p I being the shuffle, the matched chain is:
///   L        = load <scalar>
///   Ins      = insertelement <any>, L, 0
///   BC       = bitcast Ins
///   InnerExt = zext/sext BC
///   I        = shufflevector InnerExt, <any>   ; identity-with-extract mask
///   OuterExt = zext/sext I
/// Every value in the chain must have exactly one use. On success the chain
/// is replaced by a vector load from the same pointer feeding one extend to
/// OuterExt's type.
///
/// \returns true if the IR was changed.
bool VectorCombine::foldShuffleExtExtracts(Instruction &I) {
  // Check if we have an extended shuffle that selects the first vector, which
  // itself is another extend fed by a scalar inserted at lane 0.
  Instruction *L;
  if (!match(
          &I,
          m_OneUse(m_Shuffle(
              m_OneUse(m_ZExtOrSExt(m_OneUse(m_BitCast(m_OneUse(m_InsertElt(
                  m_Value(), m_OneUse(m_Instruction(L)), m_SpecificInt(0))))))),
              m_Value()))) ||
      !cast<ShuffleVectorInst>(&I)->isIdentityWithExtract())
    return false;

  // The inserted scalar must come from a plain load. Volatile and atomic loads
  // are rejected: the fold emits a second, ordinary load of the same memory.
  auto *Load = dyn_cast<LoadInst>(L);
  if (!Load || !Load->isSimple())
    return false;

  auto *InnerExt = cast<Instruction>(I.getOperand(0));
  // m_OneUse above guarantees the shuffle has a user. Check the dyn_cast
  // result explicitly, because isa<> asserts on a null pointer.
  auto *OuterExt = dyn_cast<Instruction>(*I.user_begin());
  if (!OuterExt || !isa<SExtInst, ZExtInst>(OuterExt))
    return false;

  // If the inner extend is a sign extend and the outer one isn't (i.e. a
  // zero-extend), don't fold. If the first one is zero-extend, it doesn't
  // matter if the second one is a sign- or zero-extend.
  if (isa<SExtInst>(InnerExt) && !isa<SExtInst>(OuterExt))
    return false;

  // Don't try to convert the load if it has an odd size.
  if (!DL->typeSizeEqualsStoreSize(Load->getType()))
    return false;

  // The replacement load has the inner extend's source element type and as
  // many lanes as the final result; it must cover exactly the bytes the
  // scalar load did.
  auto *DstTy = cast<FixedVectorType>(OuterExt->getType());
  auto *SrcTy =
      FixedVectorType::get(InnerExt->getOperand(0)->getType()->getScalarType(),
                           DstTy->getNumElements());
  if (DL->getTypeStoreSize(SrcTy) != DL->getTypeStoreSize(Load->getType()))
    return false;

  // Convert to a vector load feeding a single wide extend. Reuse the scalar
  // load's alignment: a default-aligned load would take the ABI alignment of
  // SrcTy, which may be stronger than what the original load guaranteed.
  Builder.SetInsertPoint(*Load->getInsertionPointAfterDef());
  LoadInst *NewLoad =
      Builder.CreateAlignedLoad(SrcTy, Load->getPointerOperand(),
                                Load->getAlign(), Load->getName() + ".vec");
  auto *NewExt = isa<ZExtInst>(InnerExt) ? Builder.CreateZExt(NewLoad, DstTy)
                                         : Builder.CreateSExt(NewLoad, DstTy);
  // No separate replaceAllUsesWith call is made before replaceValue().
  replaceValue(*OuterExt, *NewExt);
  Worklist.pushValue(NewLoad);
  return true;
}
2843+
27942844
/// Given a commutative reduction, the order of the input lanes does not alter
27952845
/// the results. We can use this to remove certain shuffles feeding the
27962846
/// reduction, removing the need to shuffle at all.
@@ -3565,6 +3615,7 @@ bool VectorCombine::run() {
35653615
break;
35663616
case Instruction::ShuffleVector:
35673617
MadeChange |= widenSubvectorLoad(I);
3618+
MadeChange |= foldShuffleExtExtracts(I);
35683619
break;
35693620
default:
35703621
break;

llvm/test/Transforms/VectorCombine/AArch64/combine-shuffle-ext.ll

Lines changed: 16 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,8 @@ define <4 x i32> @load_i32_zext_to_v4i32(ptr %di) {
1111
; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32(
1212
; CHECK-SAME: ptr [[DI:%.*]]) {
1313
; CHECK-NEXT: [[ENTRY:.*:]]
14-
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
15-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
16-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
17-
; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
18-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
19-
; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
14+
; CHECK-NEXT: [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
15+
; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i8> [[L_VEC]] to <4 x i32>
2016
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
2117
;
2218
entry:
@@ -33,12 +29,8 @@ define <4 x i32> @load_i32_zext_to_v4i32_both_nneg(ptr %di) {
3329
; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32_both_nneg(
3430
; CHECK-SAME: ptr [[DI:%.*]]) {
3531
; CHECK-NEXT: [[ENTRY:.*:]]
36-
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
37-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
38-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
39-
; CHECK-NEXT: [[E_1:%.*]] = zext nneg <8 x i8> [[VEC_BC]] to <8 x i16>
40-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
41-
; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
32+
; CHECK-NEXT: [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
33+
; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i8> [[L_VEC]] to <4 x i32>
4234
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
4335
;
4436
entry:
@@ -121,13 +113,9 @@ define <4 x i32> @load_i32_zext_to_v4i32_clobber_after_load(ptr %di) {
121113
; CHECK-LABEL: define <4 x i32> @load_i32_zext_to_v4i32_clobber_after_load(
122114
; CHECK-SAME: ptr [[DI:%.*]]) {
123115
; CHECK-NEXT: [[ENTRY:.*:]]
124-
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
116+
; CHECK-NEXT: [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
117+
; CHECK-NEXT: [[EXT_2:%.*]] = zext <4 x i8> [[L_VEC]] to <4 x i32>
125118
; CHECK-NEXT: call void @use.i32(i32 0)
126-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
127-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
128-
; CHECK-NEXT: [[E_1:%.*]] = zext <8 x i8> [[VEC_BC]] to <8 x i16>
129-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
130-
; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
131119
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
132120
;
133121
entry:
@@ -287,12 +275,8 @@ define <8 x i32> @load_i64_zext_to_v8i32(ptr %di) {
287275
; CHECK-LABEL: define <8 x i32> @load_i64_zext_to_v8i32(
288276
; CHECK-SAME: ptr [[DI:%.*]]) {
289277
; CHECK-NEXT: [[ENTRY:.*:]]
290-
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[DI]], align 8
291-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[L]], i64 0
292-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8>
293-
; CHECK-NEXT: [[EXT_1:%.*]] = zext <16 x i8> [[VEC_BC]] to <16 x i16>
294-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i16> [[EXT_1]], <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
295-
; CHECK-NEXT: [[OUTER_EXT:%.*]] = zext nneg <8 x i16> [[VEC_SHUFFLE]] to <8 x i32>
278+
; CHECK-NEXT: [[L_VEC:%.*]] = load <8 x i8>, ptr [[DI]], align 8
279+
; CHECK-NEXT: [[OUTER_EXT:%.*]] = zext <8 x i8> [[L_VEC]] to <8 x i32>
296280
; CHECK-NEXT: ret <8 x i32> [[OUTER_EXT]]
297281
;
298282
entry:
@@ -309,12 +293,8 @@ define <3 x i32> @load_i24_zext_to_v3i32(ptr %di) {
309293
; CHECK-LABEL: define <3 x i32> @load_i24_zext_to_v3i32(
310294
; CHECK-SAME: ptr [[DI:%.*]]) {
311295
; CHECK-NEXT: [[ENTRY:.*:]]
312-
; CHECK-NEXT: [[L:%.*]] = load i24, ptr [[DI]], align 4
313-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i24> <i24 poison, i24 0>, i24 [[L]], i64 0
314-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8>
315-
; CHECK-NEXT: [[EXT_1:%.*]] = zext <6 x i8> [[VEC_BC]] to <6 x i16>
316-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i16> [[EXT_1]], <6 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
317-
; CHECK-NEXT: [[EXT_2:%.*]] = zext nneg <3 x i16> [[VEC_SHUFFLE]] to <3 x i32>
296+
; CHECK-NEXT: [[L_VEC:%.*]] = load <3 x i8>, ptr [[DI]], align 4
297+
; CHECK-NEXT: [[EXT_2:%.*]] = zext <3 x i8> [[L_VEC]] to <3 x i32>
318298
; CHECK-NEXT: ret <3 x i32> [[EXT_2]]
319299
;
320300
entry:
@@ -419,12 +399,8 @@ define <4 x i32> @load_i32_sext_to_v4i32(ptr %di) {
419399
; CHECK-LABEL: define <4 x i32> @load_i32_sext_to_v4i32(
420400
; CHECK-SAME: ptr [[DI:%.*]]) {
421401
; CHECK-NEXT: [[ENTRY:.*:]]
422-
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[DI]], align 4
423-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[L]], i64 0
424-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i32> [[VEC_INS]] to <8 x i8>
425-
; CHECK-NEXT: [[E_1:%.*]] = sext <8 x i8> [[VEC_BC]] to <8 x i16>
426-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <8 x i16> [[E_1]], <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
427-
; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i16> [[VEC_SHUFFLE]] to <4 x i32>
402+
; CHECK-NEXT: [[L_VEC:%.*]] = load <4 x i8>, ptr [[DI]], align 4
403+
; CHECK-NEXT: [[EXT_2:%.*]] = sext <4 x i8> [[L_VEC]] to <4 x i32>
428404
; CHECK-NEXT: ret <4 x i32> [[EXT_2]]
429405
;
430406
entry:
@@ -441,12 +417,8 @@ define <8 x i32> @load_i64_sext_to_v8i32(ptr %di) {
441417
; CHECK-LABEL: define <8 x i32> @load_i64_sext_to_v8i32(
442418
; CHECK-SAME: ptr [[DI:%.*]]) {
443419
; CHECK-NEXT: [[ENTRY:.*:]]
444-
; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[DI]], align 8
445-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i64> <i64 poison, i64 0>, i64 [[L]], i64 0
446-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i64> [[VEC_INS]] to <16 x i8>
447-
; CHECK-NEXT: [[EXT_1:%.*]] = sext <16 x i8> [[VEC_BC]] to <16 x i16>
448-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <16 x i16> [[EXT_1]], <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
449-
; CHECK-NEXT: [[OUTER_EXT:%.*]] = sext <8 x i16> [[VEC_SHUFFLE]] to <8 x i32>
420+
; CHECK-NEXT: [[L_VEC:%.*]] = load <8 x i8>, ptr [[DI]], align 8
421+
; CHECK-NEXT: [[OUTER_EXT:%.*]] = sext <8 x i8> [[L_VEC]] to <8 x i32>
450422
; CHECK-NEXT: ret <8 x i32> [[OUTER_EXT]]
451423
;
452424
entry:
@@ -463,12 +435,8 @@ define <3 x i32> @load_i24_sext_to_v3i32(ptr %di) {
463435
; CHECK-LABEL: define <3 x i32> @load_i24_sext_to_v3i32(
464436
; CHECK-SAME: ptr [[DI:%.*]]) {
465437
; CHECK-NEXT: [[ENTRY:.*:]]
466-
; CHECK-NEXT: [[L:%.*]] = load i24, ptr [[DI]], align 4
467-
; CHECK-NEXT: [[VEC_INS:%.*]] = insertelement <2 x i24> <i24 poison, i24 0>, i24 [[L]], i64 0
468-
; CHECK-NEXT: [[VEC_BC:%.*]] = bitcast <2 x i24> [[VEC_INS]] to <6 x i8>
469-
; CHECK-NEXT: [[EXT_1:%.*]] = sext <6 x i8> [[VEC_BC]] to <6 x i16>
470-
; CHECK-NEXT: [[VEC_SHUFFLE:%.*]] = shufflevector <6 x i16> [[EXT_1]], <6 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
471-
; CHECK-NEXT: [[EXT_2:%.*]] = sext <3 x i16> [[VEC_SHUFFLE]] to <3 x i32>
438+
; CHECK-NEXT: [[L_VEC:%.*]] = load <3 x i8>, ptr [[DI]], align 4
439+
; CHECK-NEXT: [[EXT_2:%.*]] = sext <3 x i8> [[L_VEC]] to <3 x i32>
472440
; CHECK-NEXT: ret <3 x i32> [[EXT_2]]
473441
;
474442
entry:

0 commit comments

Comments
 (0)