[ISel/RISCV] Custom-lower vector [l]lround #147713
base: main
Conversation
Lower it just like vector [l]lrint: via vfcvt with the right rounding mode. Updating the cost model to account for this custom lowering is left to a companion patch.
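For context, a minimal sketch of what this enables, mirroring the test updates in the patch (no new test cases are introduced here): a vector llround is now selected as widening converts (vfwcvt.f.f.v followed by vfwcvt.x.f.v) under the RMM rounding mode, set dynamically via fsrmi 4/fsrm, instead of being scalarized by extracting each element and converting it with scalar fcvt.l.s (RV64) or libcalls to llroundf (RV32).

; Sketch taken from the updated fixed-vectors-llround.ll test below; with this patch,
; llc -mtriple=riscv64 -mattr=+v,+f,+d,+zvfhmin,+zvfbfmin -target-abi=lp64d
; lowers this to vfwcvt.f.f.v + fsrmi a0, 4 + vfwcvt.x.f.v + fsrm a0
; instead of element-wise scalar conversions.
define <2 x i64> @llround_v2f16(<2 x half> %x) nounwind {
  %a = call <2 x i64> @llvm.llround.v2i64.v2f16(<2 x half> %x)
  ret <2 x i64> %a
}
declare <2 x i64> @llvm.llround.v2i64.v2f16(<2 x half>)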
@llvm/pr-subscribers-backend-risc-v

Author: Ramkumar Ramachandra (artagnon)

Changes: Lower it just like the vector [l]lrint, using vfcvt, with the right rounding mode. Updating costs to account for this custom-lowering is left to a companion patch.

Patch is 184.25 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147713.diff

5 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index dcb4f690ba35c..db709063b1977 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1070,6 +1070,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// vXf32.
setOperationAction({ISD::FP_ROUND, ISD::FP_EXTEND}, VT, Custom);
setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom);
+ setOperationAction({ISD::LROUND, ISD::LLROUND}, VT, Custom);
// Custom-lower insert/extract operations to simplify patterns.
setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT}, VT,
Custom);
@@ -1151,6 +1152,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
Custom);
setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom);
+ setOperationAction({ISD::LROUND, ISD::LLROUND}, VT, Custom);
setOperationAction({ISD::VP_MERGE, ISD::VP_SELECT, ISD::SELECT}, VT,
Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
@@ -1453,6 +1455,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::VP_SINT_TO_FP, ISD::VP_UINT_TO_FP}, VT,
Custom);
setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom);
+ setOperationAction({ISD::LROUND, ISD::LLROUND}, VT, Custom);
if (Subtarget.hasStdExtZfhmin()) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
} else {
@@ -1478,6 +1481,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, VT, Custom);
setOperationAction({ISD::VP_FP_ROUND, ISD::VP_FP_EXTEND}, VT, Custom);
setOperationAction({ISD::LRINT, ISD::LLRINT}, VT, Custom);
+ setOperationAction({ISD::LROUND, ISD::LLROUND}, VT, Custom);
if (Subtarget.hasStdExtZfbfmin()) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
} else {
@@ -1511,7 +1515,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FFLOOR, ISD::FROUND,
ISD::FROUNDEVEN, ISD::FRINT, ISD::LRINT,
- ISD::LLRINT, ISD::FNEARBYINT},
+ ISD::LLRINT, ISD::LROUND, ISD::LLROUND,
+ ISD::FNEARBYINT},
VT, Custom);
setCondCodeAction(VFPCCToExpand, VT, Expand);
@@ -3211,7 +3216,11 @@ static RISCVFPRndMode::RoundingMode matchRoundingOp(unsigned Opc) {
case ISD::VP_FCEIL:
return RISCVFPRndMode::RUP;
case ISD::FROUND:
+ case ISD::LROUND:
+ case ISD::LLROUND:
case ISD::STRICT_FROUND:
+ case ISD::STRICT_LROUND:
+ case ISD::STRICT_LLROUND:
case ISD::VP_FROUND:
return RISCVFPRndMode::RMM;
case ISD::FRINT:
@@ -3469,9 +3478,9 @@ lowerFTRUNC_FCEIL_FFLOOR_FROUND(SDValue Op, SelectionDAG &DAG,
DAG.getTargetConstant(FRM, DL, Subtarget.getXLenVT()));
}
-// Expand vector LRINT and LLRINT by converting to the integer domain.
-static SDValue lowerVectorXRINT(SDValue Op, SelectionDAG &DAG,
- const RISCVSubtarget &Subtarget) {
+// Expand vector [L]LRINT and [L]LROUND by converting to the integer domain.
+static SDValue lowerVectorXRINT_XROUND(SDValue Op, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
SDLoc DL(Op);
MVT DstVT = Op.getSimpleValueType();
SDValue Src = Op.getOperand(0);
@@ -7711,11 +7720,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerFTRUNC_FCEIL_FFLOOR_FROUND(Op, DAG, Subtarget);
case ISD::LRINT:
case ISD::LLRINT:
- if (Op.getValueType().isVector())
- return lowerVectorXRINT(Op, DAG, Subtarget);
- [[fallthrough]];
case ISD::LROUND:
case ISD::LLROUND: {
+ if (Op.getValueType().isVector())
+ return lowerVectorXRINT_XROUND(Op, DAG, Subtarget);
assert(Op.getOperand(0).getValueType() == MVT::f16 &&
"Unexpected custom legalisation");
SDLoc DL(Op);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llround.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llround.ll
index b8ca7fd71cb93..5751759ddd9cb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llround.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-llround.ll
@@ -1,35 +1,28 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+f,+d,+zvfh -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+f,+d,+zvfhmin,+zvfbfmin -target-abi=ilp32d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32
-; RUN: llc -mtriple=riscv64 -mattr=+v,+f,+d,+zvfh -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+f,+d,+zvfhmin,+zvfbfmin -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64
define <1 x i64> @llround_v1f16(<1 x half> %x) nounwind {
; RV32-LABEL: llround_v1f16:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vfmv.f.s fa5, v8
-; RV32-NEXT: fcvt.s.h fa0, fa5
-; RV32-NEXT: call llroundf
-; RV32-NEXT: sw a0, 0(sp)
-; RV32-NEXT: sw a1, 4(sp)
-; RV32-NEXT: mv a0, sp
-; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; RV32-NEXT: vlse64.v v8, (a0), zero
-; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV32-NEXT: vfwcvt.f.f.v v9, v8
+; RV32-NEXT: fsrmi a0, 4
+; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; RV32-NEXT: vfwcvt.x.f.v v8, v9
+; RV32-NEXT: fsrm a0
; RV32-NEXT: ret
;
; RV64-LABEL: llround_v1f16:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64-NEXT: vfmv.f.s fa5, v8
-; RV64-NEXT: fcvt.s.h fa5, fa5
-; RV64-NEXT: fcvt.l.s a0, fa5, rmm
-; RV64-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; RV64-NEXT: vmv.s.x v8, a0
+; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; RV64-NEXT: vfwcvt.f.f.v v9, v8
+; RV64-NEXT: fsrmi a0, 4
+; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; RV64-NEXT: vfwcvt.x.f.v v8, v9
+; RV64-NEXT: fsrm a0
; RV64-NEXT: ret
%a = call <1 x i64> @llvm.llround.v1i64.v1f16(<1 x half> %x)
ret <1 x i64> %a
@@ -39,58 +32,22 @@ declare <1 x i64> @llvm.llround.v1i64.v1f16(<1 x half>)
define <2 x i64> @llround_v2f16(<2 x half> %x) nounwind {
; RV32-LABEL: llround_v2f16:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vfmv.f.s fa5, v8
-; RV32-NEXT: fcvt.s.h fa0, fa5
-; RV32-NEXT: call llroundf
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vmv.v.x v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 1
-; RV32-NEXT: vfmv.f.s fa5, v8
-; RV32-NEXT: fcvt.s.h fa0, fa5
-; RV32-NEXT: call llroundf
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v8, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV32-NEXT: vfwcvt.f.f.v v9, v8
+; RV32-NEXT: fsrmi a0, 4
+; RV32-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; RV32-NEXT: vfwcvt.x.f.v v8, v9
+; RV32-NEXT: fsrm a0
; RV32-NEXT: ret
;
; RV64-LABEL: llround_v2f16:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 1
-; RV64-NEXT: vfmv.f.s fa5, v8
-; RV64-NEXT: fcvt.s.h fa5, fa5
-; RV64-NEXT: fcvt.l.s a0, fa5, rmm
-; RV64-NEXT: vfmv.f.s fa5, v9
-; RV64-NEXT: fcvt.s.h fa5, fa5
-; RV64-NEXT: fcvt.l.s a1, fa5, rmm
-; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: vslide1down.vx v8, v8, a1
+; RV64-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; RV64-NEXT: vfwcvt.f.f.v v9, v8
+; RV64-NEXT: fsrmi a0, 4
+; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; RV64-NEXT: vfwcvt.x.f.v v8, v9
+; RV64-NEXT: fsrm a0
; RV64-NEXT: ret
%a = call <2 x i64> @llvm.llround.v2i64.v2f16(<2 x half> %x)
ret <2 x i64> %a
@@ -100,108 +57,22 @@ declare <2 x i64> @llvm.llround.v2i64.v2f16(<2 x half>)
define <3 x i64> @llround_v3f16(<3 x half> %x) nounwind {
; RV32-LABEL: llround_v3f16:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a1, a0, 1
-; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vfmv.f.s fa5, v8
-; RV32-NEXT: fcvt.s.h fa0, fa5
-; RV32-NEXT: call llroundf
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 1
-; RV32-NEXT: vfmv.f.s fa5, v8
-; RV32-NEXT: fcvt.s.h fa0, fa5
-; RV32-NEXT: call llroundf
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl2r.v v8, (a2) # vscale x 16-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 2
-; RV32-NEXT: vfmv.f.s fa5, v8
-; RV32-NEXT: fcvt.s.h fa0, fa5
-; RV32-NEXT: call llroundf
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl2r.v v8, (a2) # vscale x 16-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 3
-; RV32-NEXT: vfmv.f.s fa5, v8
-; RV32-NEXT: fcvt.s.h fa0, fa5
-; RV32-NEXT: call llroundf
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl2r.v v8, (a2) # vscale x 16-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a1, a0, 1
-; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT: vfwcvt.f.f.v v10, v8
+; RV32-NEXT: fsrmi a0, 4
+; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV32-NEXT: vfwcvt.x.f.v v8, v10
+; RV32-NEXT: fsrm a0
; RV32-NEXT: ret
;
; RV64-LABEL: llround_v3f16:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 1
-; RV64-NEXT: vfmv.f.s fa5, v8
-; RV64-NEXT: vslidedown.vi v10, v8, 2
-; RV64-NEXT: vslidedown.vi v11, v8, 3
-; RV64-NEXT: fcvt.s.h fa5, fa5
-; RV64-NEXT: fcvt.l.s a0, fa5, rmm
-; RV64-NEXT: vfmv.f.s fa5, v9
-; RV64-NEXT: fcvt.s.h fa5, fa5
-; RV64-NEXT: fcvt.l.s a1, fa5, rmm
-; RV64-NEXT: vfmv.f.s fa5, v10
-; RV64-NEXT: fcvt.s.h fa5, fa5
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: fcvt.l.s a0, fa5, rmm
-; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; RV64-NEXT: vfmv.f.s fa5, v11
-; RV64-NEXT: fcvt.s.h fa5, fa5
-; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: fcvt.l.s a0, fa5, rmm
-; RV64-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-NEXT: vfwcvt.f.f.v v10, v8
+; RV64-NEXT: fsrmi a0, 4
+; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64-NEXT: vfwcvt.x.f.v v8, v10
+; RV64-NEXT: fsrm a0
; RV64-NEXT: ret
%a = call <3 x i64> @llvm.llround.v3i64.v3f16(<3 x half> %x)
ret <3 x i64> %a
@@ -211,108 +82,22 @@ declare <3 x i64> @llvm.llround.v3i64.v3f16(<3 x half>)
define <4 x i64> @llround_v4f16(<4 x half> %x) nounwind {
; RV32-LABEL: llround_v4f16:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -32
-; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a1, a0, 1
-; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vfmv.f.s fa5, v8
-; RV32-NEXT: fcvt.s.h fa0, fa5
-; RV32-NEXT: call llroundf
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vmv.v.x v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 1
-; RV32-NEXT: vfmv.f.s fa5, v8
-; RV32-NEXT: fcvt.s.h fa0, fa5
-; RV32-NEXT: call llroundf
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl2r.v v8, (a2) # vscale x 16-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 2
-; RV32-NEXT: vfmv.f.s fa5, v8
-; RV32-NEXT: fcvt.s.h fa0, fa5
-; RV32-NEXT: call llroundf
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl2r.v v8, (a2) # vscale x 16-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 3
-; RV32-NEXT: vfmv.f.s fa5, v8
-; RV32-NEXT: fcvt.s.h fa0, fa5
-; RV32-NEXT: call llroundf
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl2r.v v8, (a2) # vscale x 16-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; RV32-NEXT: vslide1down.vx v8, v8, a0
-; RV32-NEXT: vslide1down.vx v8, v8, a1
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a1, a0, 1
-; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT: vfwcvt.f.f.v v10, v8
+; RV32-NEXT: fsrmi a0, 4
+; RV32-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV32-NEXT: vfwcvt.x.f.v v8, v10
+; RV32-NEXT: fsrm a0
; RV32-NEXT: ret
;
; RV64-LABEL: llround_v4f16:
; RV64: # %bb.0:
-; RV64-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 1
-; RV64-NEXT: vfmv.f.s fa5, v8
-; RV64-NEXT: vslidedown.vi v10, v8, 2
-; RV64-NEXT: vslidedown.vi v11, v8, 3
-; RV64-NEXT: fcvt.s.h fa5, fa5
-; RV64-NEXT: fcvt.l.s a0, fa5, rmm
-; RV64-NEXT: vfmv.f.s fa5, v9
-; RV64-NEXT: fcvt.s.h fa5, fa5
-; RV64-NEXT: fcvt.l.s a1, fa5, rmm
-; RV64-NEXT: vfmv.f.s fa5, v10
-; RV64-NEXT: fcvt.s.h fa5, fa5
-; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; RV64-NEXT: vmv.v.x v8, a0
-; RV64-NEXT: fcvt.l.s a0, fa5, rmm
-; RV64-NEXT: vsetvli zero, zero, e16, mf2, ta, ma
-; RV64-NEXT: vfmv.f.s fa5, v11
-; RV64-NEXT: fcvt.s.h fa5, fa5
-; RV64-NEXT: vsetvli zero, zero, e64, m2, ta, ma
-; RV64-NEXT: vslide1down.vx v8, v8, a1
-; RV64-NEXT: vslide1down.vx v8, v8, a0
-; RV64-NEXT: fcvt.l.s a0, fa5, rmm
-; RV64-NEXT: vslide1down.vx v8, v8, a0
+; RV64-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; RV64-NEXT: vfwcvt.f.f.v v10, v8
+; RV64-NEXT: fsrmi a0, 4
+; RV64-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64-NEXT: vfwcvt.x.f.v v8, v10
+; RV64-NEXT: fsrm a0
; RV64-NEXT: ret
%a = call <4 x i64> @llvm.llround.v4i64.v4f16(<4 x half> %x)
ret <4 x i64> %a
@@ -322,147 +107,22 @@ declare <4 x i64> @llvm.llround.v4i64.v4f16(<4 x half>)
define <8 x i64> @llround_v8f16(<8 x half> %x) nounwind {
; RV32-LABEL: llround_v8f16:
; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -208
-; RV32-NEXT: sw ra, 204(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 200(sp) # 4-byte Folded Spill
-; RV32-NEXT: addi s0, sp, 208
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: addi a0, sp, 192
-; RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vfmv.f.s fa5, v8
-; RV32-NEXT: fcvt.s.h fa0, fa5
-; RV32-NEXT: call llroundf
-; RV32-NEXT: sw a0, 64(sp)
-; RV32-NEXT: sw a1, 68(sp)
-; RV32-NEXT: addi a0, sp, 192
-; RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 7
-; RV32-NEXT: vfmv.f.s fa5, v8
-; RV32-NEXT: fcvt.s.h fa0, fa5
-; RV32-NEXT: call llroundf
-; RV32-NEXT: sw a0, 120(sp)
-; RV32-NEXT: sw a1, 124(sp)
-; RV32-NEXT: addi a0, sp, 192
-; RV32-NEXT: vl1r.v v8, (a0) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 6
-; ...
[truncated]
LGTM
 ; RUN: llc -mtriple=riscv32 -mattr=+v,+f,+d,+zvfhmin,+zvfbfmin -target-abi=ilp32d \
 ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32
-; RUN: llc -mtriple=riscv64 -mattr=+v,+f,+d,+zvfh -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+f,+d,+zvfhmin,+zvfbfmin -target-abi=lp64d \
Just making a note here that we're switching from zvfh to zvfhmin. We could add extra RUN lines and test both zvfh and zvfhmin, but I don't think it's that important. Happy to leave this up to you.
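A sketch of what those extra RUN lines might look like (the check-prefix names are hypothetical, not part of this patch), using FileCheck's --check-prefixes to share common checks between the two configurations:

; RUN: llc -mtriple=riscv64 -mattr=+v,+f,+d,+zvfh -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64,RV64-ZVFH
; RUN: llc -mtriple=riscv64 -mattr=+v,+f,+d,+zvfhmin,+zvfbfmin -target-abi=lp64d \
; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=RV64,RV64-ZVFHMIN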