Skip to content

Commit 041d189

Browse files
authored
[RISCV][TTI] Adjust costing in getPartialReductionCost for zvqdotq (llvm#141430)
Two changes: 1) Handle fixed vector cases now that 77a3f8 has landed. 2) Fix a mistake in the original costing - the VF passed in is the input VF, not the output VF. Given that we should be costing the accumulator type with VF/4. Note that (2) does not cause any visible test differences as the vectorizer (outside of maximize-bandwidth mode) does not consider wide enough VF for the costing difference to matter.
1 parent b1017a4 commit 041d189

File tree

2 files changed

+226
-2
lines changed

2 files changed

+226
-2
lines changed

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,10 +306,10 @@ InstructionCost RISCVTTIImpl::getPartialReductionCost(
306306
Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
307307
InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
308308
OpAExtend != OpBExtend || !AccumType->isIntegerTy(32) ||
309-
!VF.isKnownMultipleOf(4) || !VF.isScalable())
309+
!VF.isKnownMultipleOf(4))
310310
return InstructionCost::getInvalid();
311311

312-
Type *Tp = VectorType::get(AccumType, VF);
312+
Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
313313
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
314314
// Note: Asuming all vqdot* variants are equal cost
315315
// TODO: Thread CostKind through this API

llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll

Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4
22
; RUN: opt -passes=loop-vectorize -mattr=+v -S < %s | FileCheck %s --check-prefixes=CHECK,V
33
; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -S < %s | FileCheck %s --check-prefixes=CHECK,ZVQDOTQ
4+
; RUN: opt -passes=loop-vectorize -mattr=+v -scalable-vectorization=off -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-V
5+
; RUN: opt -passes=loop-vectorize -mattr=+v,+experimental-zvqdotq -scalable-vectorization=off -S < %s | FileCheck %s --check-prefixes=FIXED,FIXED-ZVQDOTQ
46

57
target triple = "riscv64-none-unknown-elf"
68

@@ -79,6 +81,80 @@ define i32 @vqdot(ptr %a, ptr %b) #0 {
7981
; ZVQDOTQ-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
8082
; ZVQDOTQ: scalar.ph:
8183
;
84+
; FIXED-V-LABEL: define i32 @vqdot(
85+
; FIXED-V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
86+
; FIXED-V-NEXT: entry:
87+
; FIXED-V-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
88+
; FIXED-V: vector.ph:
89+
; FIXED-V-NEXT: br label [[VECTOR_BODY:%.*]]
90+
; FIXED-V: vector.body:
91+
; FIXED-V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
92+
; FIXED-V-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
93+
; FIXED-V-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
94+
; FIXED-V-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
95+
; FIXED-V-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
96+
; FIXED-V-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
97+
; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
98+
; FIXED-V-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
99+
; FIXED-V-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
100+
; FIXED-V-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
101+
; FIXED-V-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
102+
; FIXED-V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
103+
; FIXED-V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
104+
; FIXED-V-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
105+
; FIXED-V-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
106+
; FIXED-V-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
107+
; FIXED-V-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
108+
; FIXED-V-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
109+
; FIXED-V-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
110+
; FIXED-V-NEXT: [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]]
111+
; FIXED-V-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]]
112+
; FIXED-V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
113+
; FIXED-V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
114+
; FIXED-V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
115+
; FIXED-V: middle.block:
116+
; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
117+
; FIXED-V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
118+
; FIXED-V-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
119+
; FIXED-V: scalar.ph:
120+
;
121+
; FIXED-ZVQDOTQ-LABEL: define i32 @vqdot(
122+
; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
123+
; FIXED-ZVQDOTQ-NEXT: entry:
124+
; FIXED-ZVQDOTQ-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
125+
; FIXED-ZVQDOTQ: vector.ph:
126+
; FIXED-ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]]
127+
; FIXED-ZVQDOTQ: vector.body:
128+
; FIXED-ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
129+
; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
130+
; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
131+
; FIXED-ZVQDOTQ-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
132+
; FIXED-ZVQDOTQ-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
133+
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
134+
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
135+
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
136+
; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
137+
; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
138+
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
139+
; FIXED-ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
140+
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
141+
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
142+
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
143+
; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
144+
; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
145+
; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
146+
; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
147+
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
148+
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
149+
; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
150+
; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
151+
; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
152+
; FIXED-ZVQDOTQ: middle.block:
153+
; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
154+
; FIXED-ZVQDOTQ-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
155+
; FIXED-ZVQDOTQ-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
156+
; FIXED-ZVQDOTQ: scalar.ph:
157+
;
82158
entry:
83159
br label %for.body
84160

@@ -177,6 +253,80 @@ define i32 @vqdotu(ptr %a, ptr %b) #0 {
177253
; ZVQDOTQ-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
178254
; ZVQDOTQ: scalar.ph:
179255
;
256+
; FIXED-V-LABEL: define i32 @vqdotu(
257+
; FIXED-V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
258+
; FIXED-V-NEXT: entry:
259+
; FIXED-V-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
260+
; FIXED-V: vector.ph:
261+
; FIXED-V-NEXT: br label [[VECTOR_BODY:%.*]]
262+
; FIXED-V: vector.body:
263+
; FIXED-V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
264+
; FIXED-V-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
265+
; FIXED-V-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
266+
; FIXED-V-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
267+
; FIXED-V-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
268+
; FIXED-V-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
269+
; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
270+
; FIXED-V-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
271+
; FIXED-V-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
272+
; FIXED-V-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
273+
; FIXED-V-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
274+
; FIXED-V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
275+
; FIXED-V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
276+
; FIXED-V-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
277+
; FIXED-V-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
278+
; FIXED-V-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
279+
; FIXED-V-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
280+
; FIXED-V-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
281+
; FIXED-V-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
282+
; FIXED-V-NEXT: [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]]
283+
; FIXED-V-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]]
284+
; FIXED-V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
285+
; FIXED-V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
286+
; FIXED-V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
287+
; FIXED-V: middle.block:
288+
; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
289+
; FIXED-V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
290+
; FIXED-V-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
291+
; FIXED-V: scalar.ph:
292+
;
293+
; FIXED-ZVQDOTQ-LABEL: define i32 @vqdotu(
294+
; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
295+
; FIXED-ZVQDOTQ-NEXT: entry:
296+
; FIXED-ZVQDOTQ-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
297+
; FIXED-ZVQDOTQ: vector.ph:
298+
; FIXED-ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]]
299+
; FIXED-ZVQDOTQ: vector.body:
300+
; FIXED-ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
301+
; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
302+
; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
303+
; FIXED-ZVQDOTQ-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
304+
; FIXED-ZVQDOTQ-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
305+
; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
306+
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
307+
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
308+
; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
309+
; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
310+
; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
311+
; FIXED-ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
312+
; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
313+
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
314+
; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
315+
; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
316+
; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
317+
; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
318+
; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
319+
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
320+
; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
321+
; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
322+
; FIXED-ZVQDOTQ-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
323+
; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
324+
; FIXED-ZVQDOTQ: middle.block:
325+
; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
326+
; FIXED-ZVQDOTQ-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
327+
; FIXED-ZVQDOTQ-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
328+
; FIXED-ZVQDOTQ: scalar.ph:
329+
;
180330
entry:
181331
br label %for.body
182332

@@ -238,6 +388,43 @@ define i32 @vqdotsu(ptr %a, ptr %b) #0 {
238388
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
239389
; CHECK: scalar.ph:
240390
;
391+
; FIXED-LABEL: define i32 @vqdotsu(
392+
; FIXED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
393+
; FIXED-NEXT: entry:
394+
; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
395+
; FIXED: vector.ph:
396+
; FIXED-NEXT: br label [[VECTOR_BODY:%.*]]
397+
; FIXED: vector.body:
398+
; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
399+
; FIXED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
400+
; FIXED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
401+
; FIXED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
402+
; FIXED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
403+
; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
404+
; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
405+
; FIXED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
406+
; FIXED-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
407+
; FIXED-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
408+
; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
409+
; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
410+
; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
411+
; FIXED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
412+
; FIXED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
413+
; FIXED-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
414+
; FIXED-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
415+
; FIXED-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
416+
; FIXED-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
417+
; FIXED-NEXT: [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]]
418+
; FIXED-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]]
419+
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
420+
; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
421+
; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
422+
; FIXED: middle.block:
423+
; FIXED-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
424+
; FIXED-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
425+
; FIXED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
426+
; FIXED: scalar.ph:
427+
;
241428
entry:
242429
br label %for.body
243430

@@ -298,6 +485,43 @@ define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
298485
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
299486
; CHECK: scalar.ph:
300487
;
488+
; FIXED-LABEL: define i32 @vqdotsu2(
489+
; FIXED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
490+
; FIXED-NEXT: entry:
491+
; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
492+
; FIXED: vector.ph:
493+
; FIXED-NEXT: br label [[VECTOR_BODY:%.*]]
494+
; FIXED: vector.body:
495+
; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
496+
; FIXED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
497+
; FIXED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
498+
; FIXED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
499+
; FIXED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
500+
; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
501+
; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
502+
; FIXED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
503+
; FIXED-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
504+
; FIXED-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
505+
; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
506+
; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
507+
; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
508+
; FIXED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
509+
; FIXED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
510+
; FIXED-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
511+
; FIXED-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
512+
; FIXED-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
513+
; FIXED-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
514+
; FIXED-NEXT: [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]]
515+
; FIXED-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]]
516+
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
517+
; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
518+
; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
519+
; FIXED: middle.block:
520+
; FIXED-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
521+
; FIXED-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
522+
; FIXED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
523+
; FIXED: scalar.ph:
524+
;
301525
entry:
302526
br label %for.body
303527

0 commit comments

Comments
 (0)