
Commit dfc1aee

Revert "[SLP] avoid reduction transform on patterns that the backend can load-combine"
This reverts SVN r373833, as it caused a failed assert "Non-zero loop cost expected" on building numerous projects, see PR43582 for details and reproduction samples. llvm-svn: 373882
1 parent 0c56f42 commit dfc1aee
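
For context (an editor's illustration, not part of the commit): the "load combine" pattern that the reverted cost model tried to recognize is the classic byte-by-byte assembly of a wide integer, as exercised by the load64le test below. A minimal C++ sketch of the source-level idiom, with hypothetical names:

#include <cstdint>

// Hypothetical illustration: eight adjacent byte loads, zero-extended,
// shifted, and OR'ed into one 64-bit value. Backends can typically fold
// this whole chain into a single wide load, which is why the reverted
// change tried to keep SLP from turning it into a vector reduction.
uint64_t load64le_example(const uint8_t *p) {
  uint64_t v = 0;
  for (int i = 0; i < 8; ++i)
    v |= static_cast<uint64_t>(p[i]) << (8 * i);
  return v;
}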

File tree

llvm/include/llvm/Analysis/TargetTransformInfo.h
llvm/lib/Analysis/TargetTransformInfo.cpp
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll

4 files changed, +55 -179 lines changed


llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 0 additions & 10 deletions
@@ -1129,16 +1129,6 @@ class TargetTransformInfo {
   /// Returns -1 if the cost is unknown.
   int getInstructionThroughput(const Instruction *I) const;
 
-  /// Given an input value that is an element of an 'or' reduction, check if the
-  /// reduction is composed of narrower loaded values. Assuming that a
-  /// legal-sized reduction of shifted/zexted loaded values can be load combined
-  /// in the backend, create a relative cost that accounts for the removal of
-  /// the intermediate ops and replacement by a single wide load.
-  /// TODO: If load combining is allowed in the IR optimizer, this analysis
-  /// may not be necessary.
-  Optional<int> getLoadCombineCost(unsigned Opcode,
-                                   ArrayRef<const Value *> Args) const;
-
   /// The abstract base class used to type erase specific TTI
   /// implementations.
   class Concept;

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 0 additions & 53 deletions
@@ -571,64 +571,11 @@ TargetTransformInfo::getOperandInfo(Value *V, OperandValueProperties &OpProps) {
   return OpInfo;
 }
 
-Optional<int>
-TargetTransformInfo::getLoadCombineCost(unsigned Opcode,
-                                        ArrayRef<const Value *> Args) const {
-  if (Opcode != Instruction::Or)
-    return llvm::None;
-  if (Args.empty())
-    return llvm::None;
-
-  // Look past the reduction to find a source value. Arbitrarily follow the
-  // path through operand 0 of any 'or'. Also, peek through optional
-  // shift-left-by-constant.
-  const Value *ZextLoad = Args.front();
-  while (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
-         match(ZextLoad, m_Shl(m_Value(), m_Constant())))
-    ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
-
-  // Check if the input to the reduction is an extended load.
-  Value *LoadPtr;
-  if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
-    return llvm::None;
-
-  // Require that the total load bit width is a legal integer type.
-  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
-  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
-  Type *WideType = ZextLoad->getType();
-  Type *EltType = LoadPtr->getType()->getPointerElementType();
-  unsigned WideWidth = WideType->getIntegerBitWidth();
-  unsigned EltWidth = EltType->getIntegerBitWidth();
-  if (!isTypeLegal(WideType) || WideWidth % EltWidth != 0)
-    return llvm::None;
-
-  // Calculate relative cost: {narrow load+zext+shl+or} are assumed to be
-  // removed and replaced by a single wide load.
-  // FIXME: This is not accurate for the larger pattern where we replace
-  // multiple narrow load sequences with just 1 wide load. We could
-  // remove the addition of the wide load cost here and expect the caller
-  // to make an adjustment for that.
-  int Cost = 0;
-  Cost -= getMemoryOpCost(Instruction::Load, EltType, 0, 0);
-  Cost -= getCastInstrCost(Instruction::ZExt, WideType, EltType);
-  Cost -= getArithmeticInstrCost(Instruction::Shl, WideType);
-  Cost -= getArithmeticInstrCost(Instruction::Or, WideType);
-  Cost += getMemoryOpCost(Instruction::Load, WideType, 0, 0);
-  return Cost;
-}
-
-
 int TargetTransformInfo::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
     OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
     OperandValueProperties Opd2PropInfo,
     ArrayRef<const Value *> Args) const {
-  // Check if we can match this instruction as part of a larger pattern.
-  Optional<int> LoadCombineCost = getLoadCombineCost(Opcode, Args);
-  if (LoadCombineCost)
-    return LoadCombineCost.getValue();
-
-  // Fallback to implementation-specific overrides or base class.
   int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                              Opd1PropInfo, Opd2PropInfo, Args);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
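
To make the removed relative-cost calculation concrete (an editor's sketch using hypothetical unit costs, not values from any real target): if the narrow load, zext, shl, or, and wide load each cost 1, the deleted formula returns a negative cost, signalling that the scalar load-combine pattern is already cheaper than a vectorized reduction.

// Hypothetical unit costs (target-dependent in practice):
//   Cost = -(NarrowLoad + ZExt + Shl + Or) + WideLoad
//        = -(1 + 1 + 1 + 1) + 1
//        = -3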

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 3 additions & 12 deletions
@@ -6499,19 +6499,10 @@ class HorizontalReduction {
 
     int ScalarReduxCost = 0;
     switch (ReductionData.getKind()) {
-    case RK_Arithmetic: {
-      // Note: Passing in the reduction operands allows the cost model to match
-      // load combining patterns for this reduction.
-      auto *ReduxInst = cast<Instruction>(ReductionRoot);
-      SmallVector<const Value *, 2> OperandList;
-      for (Value *Operand : ReduxInst->operands())
-        OperandList.push_back(Operand);
-      ScalarReduxCost = TTI->getArithmeticInstrCost(ReductionData.getOpcode(),
-          ScalarTy, TargetTransformInfo::OK_AnyValue,
-          TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
-          TargetTransformInfo::OP_None, OperandList);
+    case RK_Arithmetic:
+      ScalarReduxCost =
+          TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
       break;
-    }
     case RK_Min:
     case RK_Max:
     case RK_UMin:

llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll

Lines changed: 52 additions & 104 deletions
@@ -15,37 +15,31 @@ define i64 @load_bswap(%v8i8* %p) {
 ; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
 ; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
 ; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]]
-; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]]
-; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]]
-; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G0]] to <4 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]]
 ; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]]
 ; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]]
 ; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]]
-; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64
-; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64
-; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64
-; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[T3]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
 ; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[T4]] to i64
 ; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[T5]] to i64
 ; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[T6]] to i64
 ; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[T7]] to i64
-; CHECK-NEXT: [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
-; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
-; CHECK-NEXT: [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
-; CHECK-NEXT: [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <4 x i64> [[TMP3]], <i64 56, i64 48, i64 40, i64 32>
 ; CHECK-NEXT: [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
 ; CHECK-NEXT: [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
 ; CHECK-NEXT: [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
-; CHECK-NEXT: [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
-; CHECK-NEXT: [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
-; CHECK-NEXT: [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
-; CHECK-NEXT: [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
-; CHECK-NEXT: [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
-; CHECK-NEXT: [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
-; CHECK-NEXT: [[OR01234567:%.*]] = or i64 [[OR0123456]], [[Z7]]
-; CHECK-NEXT: ret i64 [[OR01234567]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP5]], [[SH4]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], [[SH5]]
+; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], [[SH6]]
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z7]]
+; CHECK-NEXT: ret i64 [[OP_EXTRA]]
 ;
 %g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
 %g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -103,38 +97,18 @@ define i64 @load_bswap_nop_shift(%v8i8* %p) {
 ; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
 ; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
 ; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
-; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]]
-; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]]
-; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]]
-; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]]
-; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]]
-; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]]
-; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]]
-; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]]
-; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64
-; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64
-; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64
-; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[T3]] to i64
-; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[T4]] to i64
-; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[T5]] to i64
-; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[T6]] to i64
-; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[T7]] to i64
-; CHECK-NEXT: [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
-; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
-; CHECK-NEXT: [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
-; CHECK-NEXT: [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
-; CHECK-NEXT: [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
-; CHECK-NEXT: [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
-; CHECK-NEXT: [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
-; CHECK-NEXT: [[SH7:%.*]] = shl nuw nsw i64 [[Z7]], 0
-; CHECK-NEXT: [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
-; CHECK-NEXT: [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
-; CHECK-NEXT: [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
-; CHECK-NEXT: [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
-; CHECK-NEXT: [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
-; CHECK-NEXT: [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
-; CHECK-NEXT: [[OR01234567:%.*]] = or i64 [[OR0123456]], [[SH7]]
-; CHECK-NEXT: ret i64 [[OR01234567]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G0]] to <8 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: ret i64 [[TMP5]]
 ;
 %g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
 %g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
@@ -194,36 +168,30 @@ define i64 @load64le(i8* %arg) {
 ; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
 ; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
 ; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
-; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[G1]], align 1
-; CHECK-NEXT: [[LD2:%.*]] = load i8, i8* [[G2]], align 1
-; CHECK-NEXT: [[LD3:%.*]] = load i8, i8* [[G3]], align 1
-; CHECK-NEXT: [[LD4:%.*]] = load i8, i8* [[G4]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[G1]] to <4 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
 ; CHECK-NEXT: [[LD5:%.*]] = load i8, i8* [[G5]], align 1
 ; CHECK-NEXT: [[LD6:%.*]] = load i8, i8* [[G6]], align 1
 ; CHECK-NEXT: [[LD7:%.*]] = load i8, i8* [[G7]], align 1
 ; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[LD1]] to i64
-; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[LD2]] to i64
-; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[LD3]] to i64
-; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[LD4]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64>
 ; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[LD5]] to i64
 ; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[LD6]] to i64
 ; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[LD7]] to i64
-; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
-; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
-; CHECK-NEXT: [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
-; CHECK-NEXT: [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw <4 x i64> [[TMP3]], <i64 8, i64 16, i64 24, i64 32>
 ; CHECK-NEXT: [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
 ; CHECK-NEXT: [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
 ; CHECK-NEXT: [[S7:%.*]] = shl nuw i64 [[Z7]], 56
-; CHECK-NEXT: [[O1:%.*]] = or i64 [[S1]], [[Z0]]
-; CHECK-NEXT: [[O2:%.*]] = or i64 [[O1]], [[S2]]
-; CHECK-NEXT: [[O3:%.*]] = or i64 [[O2]], [[S3]]
-; CHECK-NEXT: [[O4:%.*]] = or i64 [[O3]], [[S4]]
-; CHECK-NEXT: [[O5:%.*]] = or i64 [[O4]], [[S5]]
-; CHECK-NEXT: [[O6:%.*]] = or i64 [[O5]], [[S6]]
-; CHECK-NEXT: [[O7:%.*]] = or i64 [[O6]], [[S7]]
-; CHECK-NEXT: ret i64 [[O7]]
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <4 x i64> [[TMP4]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[BIN_RDX]], <4 x i64> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <4 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP5]], [[S5]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], [[S6]]
+; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP7]], [[S7]]
+; CHECK-NEXT: [[OP_EXTRA:%.*]] = or i64 [[TMP8]], [[Z0]]
+; CHECK-NEXT: ret i64 [[OP_EXTRA]]
 ;
 %g1 = getelementptr inbounds i8, i8* %arg, i64 1
 %g2 = getelementptr inbounds i8, i8* %arg, i64 2
@@ -279,38 +247,18 @@ define i64 @load64le_nop_shift(i8* %arg) {
 ; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 5
 ; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
 ; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
-; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
-; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[G1]], align 1
-; CHECK-NEXT: [[LD2:%.*]] = load i8, i8* [[G2]], align 1
-; CHECK-NEXT: [[LD3:%.*]] = load i8, i8* [[G3]], align 1
-; CHECK-NEXT: [[LD4:%.*]] = load i8, i8* [[G4]], align 1
-; CHECK-NEXT: [[LD5:%.*]] = load i8, i8* [[G5]], align 1
-; CHECK-NEXT: [[LD6:%.*]] = load i8, i8* [[G6]], align 1
-; CHECK-NEXT: [[LD7:%.*]] = load i8, i8* [[G7]], align 1
-; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[LD0]] to i64
-; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[LD1]] to i64
-; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[LD2]] to i64
-; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[LD3]] to i64
-; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[LD4]] to i64
-; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[LD5]] to i64
-; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[LD6]] to i64
-; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[LD7]] to i64
-; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 0
-; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
-; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
-; CHECK-NEXT: [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
-; CHECK-NEXT: [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
-; CHECK-NEXT: [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
-; CHECK-NEXT: [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
-; CHECK-NEXT: [[S7:%.*]] = shl nuw i64 [[Z7]], 56
-; CHECK-NEXT: [[O1:%.*]] = or i64 [[S1]], [[S0]]
-; CHECK-NEXT: [[O2:%.*]] = or i64 [[O1]], [[S2]]
-; CHECK-NEXT: [[O3:%.*]] = or i64 [[O2]], [[S3]]
-; CHECK-NEXT: [[O4:%.*]] = or i64 [[O3]], [[S4]]
-; CHECK-NEXT: [[O5:%.*]] = or i64 [[O4]], [[S5]]
-; CHECK-NEXT: [[O6:%.*]] = or i64 [[O5]], [[S6]]
-; CHECK-NEXT: [[O7:%.*]] = or i64 [[O6]], [[S7]]
-; CHECK-NEXT: ret i64 [[O7]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[ARG]] to <8 x i8>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <8 x i64> [[TMP3]], <i64 0, i64 8, i64 16, i64 24, i64 32, i64 40, i64 48, i64 56>
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <8 x i64> [[TMP4]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i64> [[BIN_RDX]], <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = or <8 x i64> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i64> [[BIN_RDX2]], <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX4:%.*]] = or <8 x i64> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i64> [[BIN_RDX4]], i32 0
+; CHECK-NEXT: ret i64 [[TMP5]]
 ;
 %g1 = getelementptr inbounds i8, i8* %arg, i64 1
 %g2 = getelementptr inbounds i8, i8* %arg, i64 2
