Skip to content

Commit 42f43d5

Browse files
committed
[RISCV] Add optimization for memset inline
1 parent c72f8ba commit 42f43d5

File tree

5 files changed

+313
-176
lines changed

5 files changed

+313
-176
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1664,7 +1664,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
16641664
PredictableSelectIsExpensive = Subtarget.predictableSelectIsExpensive();
16651665

16661666
MaxStoresPerMemsetOptSize = Subtarget.getMaxStoresPerMemset(/*OptSize=*/true);
1667-
MaxStoresPerMemset = Subtarget.getMaxStoresPerMemset(/*OptSize=*/false);
1667+
MaxStoresPerMemset = Subtarget.hasVInstructions()
1668+
? (Subtarget.getRealMinVLen() / 8 *
1669+
Subtarget.getMaxLMULForFixedLengthVectors() /
1670+
(Subtarget.is64Bit() ? 8 : 4))
1671+
: Subtarget.getMaxStoresPerMemset(/*OptSize=*/false);
16681672

16691673
MaxGluedStoresPerMemcpy = Subtarget.getMaxGluedStoresPerMemcpy();
16701674
MaxStoresPerMemcpyOptSize = Subtarget.getMaxStoresPerMemcpy(/*OptSize=*/true);
@@ -23804,6 +23808,20 @@ EVT RISCVTargetLowering::getOptimalMemOpType(
2380423808
if (MinVLenInBytes <= RISCV::RVVBytesPerBlock)
2380523809
return MVT::Other;
2380623810

23811+
// If Op size is greater than the largest fixed-length vector memory operation
23812+
// (maximum fixed-length LMUL), we don't support inline of memset. Otherwise,
23813+
// return an EVT based on Op size to avoid redundant splitting and merging.
23814+
if (Op.isMemset()) {
23815+
if (Op.size() >
23816+
Subtarget.getMaxLMULForFixedLengthVectors() * MinVLenInBytes)
23817+
return MVT::Other;
23818+
if (Subtarget.hasVInstructionsI64() && Op.size() % 8 == 0)
23819+
return EVT::getVectorVT(Context, MVT::i64, Op.size() / 8);
23820+
if (Op.size() % 4 == 0)
23821+
return EVT::getVectorVT(Context, MVT::i32, Op.size() / 4);
23822+
return EVT::getVectorVT(Context, MVT::i8, Op.size());
23823+
}
23824+
2380723825
// Prefer i8 for non-zero memset as it allows us to avoid materializing
2380823826
// a large scalar constant and instead use vmv.v.x/i to do the
2380923827
// broadcast. For everything else, prefer ELenVT to minimize VL and thus

llvm/test/CodeGen/RISCV/pr135206.ll

Lines changed: 17 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -12,9 +12,6 @@ define i1 @foo() nounwind "probe-stack"="inline-asm" "target-features"="+v" {
1212
; CHECK-NEXT: addi sp, sp, -2032
1313
; CHECK-NEXT: sd ra, 2024(sp) # 8-byte Folded Spill
1414
; CHECK-NEXT: sd s0, 2016(sp) # 8-byte Folded Spill
15-
; CHECK-NEXT: sd s1, 2008(sp) # 8-byte Folded Spill
16-
; CHECK-NEXT: sd s2, 2000(sp) # 8-byte Folded Spill
17-
; CHECK-NEXT: sd s3, 1992(sp) # 8-byte Folded Spill
1815
; CHECK-NEXT: lui a0, 7
1916
; CHECK-NEXT: sub t1, sp, a0
2017
; CHECK-NEXT: lui t2, 1
@@ -24,8 +21,9 @@ define i1 @foo() nounwind "probe-stack"="inline-asm" "target-features"="+v" {
2421
; CHECK-NEXT: bne sp, t1, .LBB0_1
2522
; CHECK-NEXT: # %bb.2:
2623
; CHECK-NEXT: addi sp, sp, -2048
27-
; CHECK-NEXT: addi sp, sp, -96
24+
; CHECK-NEXT: addi sp, sp, -48
2825
; CHECK-NEXT: csrr t1, vlenb
26+
; CHECK-NEXT: slli t1, t1, 2
2927
; CHECK-NEXT: lui t2, 1
3028
; CHECK-NEXT: .LBB0_3: # =>This Inner Loop Header: Depth=1
3129
; CHECK-NEXT: sub sp, sp, t2
@@ -34,45 +32,34 @@ define i1 @foo() nounwind "probe-stack"="inline-asm" "target-features"="+v" {
3432
; CHECK-NEXT: bge t1, t2, .LBB0_3
3533
; CHECK-NEXT: # %bb.4:
3634
; CHECK-NEXT: sub sp, sp, t1
37-
; CHECK-NEXT: li a0, 86
38-
; CHECK-NEXT: addi s0, sp, 48
39-
; CHECK-NEXT: addi s1, sp, 32
40-
; CHECK-NEXT: addi s2, sp, 16
41-
; CHECK-NEXT: lui a1, 353637
42-
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
35+
; CHECK-NEXT: lui a0, 353637
36+
; CHECK-NEXT: addi a0, a0, 1622
37+
; CHECK-NEXT: slli a1, a0, 32
38+
; CHECK-NEXT: add a0, a0, a1
39+
; CHECK-NEXT: vsetivli zero, 7, e64, m4, ta, ma
4340
; CHECK-NEXT: vmv.v.x v8, a0
4441
; CHECK-NEXT: lui a0, 8
45-
; CHECK-NEXT: addi a0, a0, 32
42+
; CHECK-NEXT: addi a0, a0, 16
4643
; CHECK-NEXT: add a0, sp, a0
47-
; CHECK-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
48-
; CHECK-NEXT: addi a0, a1, 1622
49-
; CHECK-NEXT: vse8.v v8, (s0)
50-
; CHECK-NEXT: vse8.v v8, (s1)
51-
; CHECK-NEXT: vse8.v v8, (s2)
52-
; CHECK-NEXT: slli a1, a0, 32
53-
; CHECK-NEXT: add s3, a0, a1
54-
; CHECK-NEXT: sd s3, 64(sp)
44+
; CHECK-NEXT: vs4r.v v8, (a0) # vscale x 32-byte Folded Spill
45+
; CHECK-NEXT: addi s0, sp, 16
46+
; CHECK-NEXT: vse64.v v8, (s0)
5547
; CHECK-NEXT: call bar
5648
; CHECK-NEXT: lui a0, 8
57-
; CHECK-NEXT: addi a0, a0, 32
49+
; CHECK-NEXT: addi a0, a0, 16
5850
; CHECK-NEXT: add a0, sp, a0
59-
; CHECK-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
60-
; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
61-
; CHECK-NEXT: vse8.v v8, (s0)
62-
; CHECK-NEXT: vse8.v v8, (s1)
63-
; CHECK-NEXT: vse8.v v8, (s2)
64-
; CHECK-NEXT: sd s3, 64(sp)
51+
; CHECK-NEXT: vl4r.v v8, (a0) # vscale x 32-byte Folded Reload
52+
; CHECK-NEXT: vsetivli zero, 7, e64, m4, ta, ma
53+
; CHECK-NEXT: vse64.v v8, (s0)
6554
; CHECK-NEXT: li a0, 0
6655
; CHECK-NEXT: csrr a1, vlenb
56+
; CHECK-NEXT: slli a1, a1, 2
6757
; CHECK-NEXT: add sp, sp, a1
6858
; CHECK-NEXT: lui a1, 8
69-
; CHECK-NEXT: addi a1, a1, -1952
59+
; CHECK-NEXT: addi a1, a1, -2000
7060
; CHECK-NEXT: add sp, sp, a1
7161
; CHECK-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload
7262
; CHECK-NEXT: ld s0, 2016(sp) # 8-byte Folded Reload
73-
; CHECK-NEXT: ld s1, 2008(sp) # 8-byte Folded Reload
74-
; CHECK-NEXT: ld s2, 2000(sp) # 8-byte Folded Reload
75-
; CHECK-NEXT: ld s3, 1992(sp) # 8-byte Folded Reload
7663
; CHECK-NEXT: addi sp, sp, 2032
7764
; CHECK-NEXT: ret
7865
%1 = alloca %"buff", align 8

0 commit comments

Comments
 (0)