
Commit ab6e02d

[RISCV] Match vwmulsu_vx with scalar splat input.
This is a more generic version of D119110 that uses MaskedValueIsZero to do the matching and SimplifyDemandedBits to remove any unneeded AND instructions. Tests were taken from D119110.

Reviewed By: Chenbing.Zheng

Differential Revision: https://reviews.llvm.org/D119622
1 parent d132b47 · commit ab6e02d
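
For reference, vwmulsu.vx multiplies each signed vector element by an unsigned scalar splat and keeps the product at twice the element width; the combine below fires when the splatted scalar can be proven to have its upper bits clear. The following is a minimal standalone C++ sketch of one lane's semantics, not taken from the LLVM sources; the helper name and the sample values are illustrative only.

#include <cstdint>
#include <cstdio>

// Scalar model of one vwmulsu.vx lane: the vector element is sign-extended,
// the scalar operand is zero-extended, and the product is kept at the
// doubled element width.
static int16_t vwmulsu_lane(int8_t vs2, uint8_t rs1) {
  return static_cast<int16_t>(static_cast<int16_t>(vs2) *
                              static_cast<int16_t>(rs1));
}

int main() {
  // -4 (signed element) * 200 (unsigned scalar) = -800, which only fits in
  // the widened 16-bit result.
  std::printf("%d\n", vwmulsu_lane(-4, 200));
  return 0;
}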

File tree

2 files changed: +263, -6 lines

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 17 additions & 4 deletions
@@ -7786,12 +7786,15 @@ static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG,
   if (ScalarBits < EltBits)
     return SDValue();
 
-  if (IsSignExt) {
-    if (DAG.ComputeNumSignBits(Op1) <= (ScalarBits - NarrowSize))
-      return SDValue();
+  // If the LHS is a sign extend, try to use vwmul.
+  if (IsSignExt && DAG.ComputeNumSignBits(Op1) > (ScalarBits - NarrowSize)) {
+    // Can use vwmul.
   } else {
+    // Otherwise try to use vwmulu or vwmulsu.
     APInt Mask = APInt::getBitsSetFrom(ScalarBits, NarrowSize);
-    if (!DAG.MaskedValueIsZero(Op1, Mask))
+    if (DAG.MaskedValueIsZero(Op1, Mask))
+      IsVWMULSU = IsSignExt;
+    else
       return SDValue();
   }
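
In plain terms, the rewritten combine picks the widening multiply from what is known about the splatted scalar operand: enough sign bits allow vwmul, known-zero upper bits allow vwmulu or vwmulsu depending on how the vector operand is extended. Below is a standalone C++ sketch of that decision under the assumption ScalarBits = 16 and NarrowSize = 8, with the hypothetical inputs numSignBits and highBitsKnownZero standing in for DAG.ComputeNumSignBits and DAG.MaskedValueIsZero on the splatted scalar.

enum class WidenKind { VWMUL, VWMULU, VWMULSU, None };

// Standalone model of the decision above. IsSignExt mirrors whether the
// *vector* operand is sign-extended; numSignBits / highBitsKnownZero stand
// in for what the DAG can prove about the splatted scalar operand.
constexpr WidenKind classify(bool IsSignExt, unsigned numSignBits,
                             bool highBitsKnownZero) {
  constexpr unsigned ScalarBits = 16, NarrowSize = 8;
  // The scalar is a sign-extended narrow value: plain vwmul works.
  if (IsSignExt && numSignBits > ScalarBits - NarrowSize)
    return WidenKind::VWMUL;
  // The scalar is a zero-extended narrow value: vwmulsu if the vector side
  // is sign-extended, vwmulu if it is zero-extended.
  if (highBitsKnownZero)
    return IsSignExt ? WidenKind::VWMULSU : WidenKind::VWMULU;
  return WidenKind::None; // no widening multiply can be formed
}

// "zext i8 %b to i16" gives 8 known sign bits and zero upper bits, so a
// sign-extended vector operand pairs with it as vwmulsu:
static_assert(classify(true, 8, true) == WidenKind::VWMULSU, "");
// "sext i8 %b to i16" gives at least 9 sign bits, so vwmul is used instead:
static_assert(classify(true, 9, true) == WidenKind::VWMUL, "");
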

@@ -8438,6 +8441,16 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
       return Gather;
     break;
   }
+  case RISCVISD::VMV_V_X_VL: {
+    // VMV.V.X only demands the vector element bitwidth from the scalar input.
+    unsigned ScalarSize = N->getOperand(0).getValueSizeInBits();
+    unsigned EltWidth = N->getValueType(0).getScalarSizeInBits();
+    if (ScalarSize > EltWidth)
+      if (SimplifyDemandedLowBitsHelper(0, EltWidth))
+        return SDValue(N, 0);
+
+    break;
+  }
   }
 
   return SDValue();
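
The new VMV_V_X_VL case feeds SimplifyDemandedBits the fact that vmv.v.x only reads the low element-width bits of its scalar operand, which is what lets the explicit AND in the tests below disappear. A small standalone C++ illustration of that fact follows; splat_lane_e8 is a hypothetical model written for this note, not LLVM code.

#include <cassert>
#include <cstdint>

// A vmv.v.x splat into 8-bit elements only observes the low 8 bits of its
// scalar operand, so masking the scalar first (e.g. "and i16 %y, 255")
// changes nothing and can be dropped.
static uint8_t splat_lane_e8(uint16_t scalar) {
  return static_cast<uint8_t>(scalar); // only the low 8 bits are observed
}

int main() {
  for (uint32_t y = 0; y <= 0xFFFF; ++y) {
    uint16_t masked = static_cast<uint16_t>(y & 0x00FF);
    assert(splat_lane_e8(static_cast<uint16_t>(y)) == splat_lane_e8(masked));
  }
  return 0;
}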

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll

Lines changed: 246 additions & 2 deletions
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
-; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-min=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 define <2 x i16> @vwmulsu_v2i16(<2 x i8>* %x, <2 x i8>* %y) {
 ; CHECK-LABEL: vwmulsu_v2i16:
@@ -681,3 +681,247 @@ define <16 x i64> @vwmulsu_vx_v16i64(<16 x i32>* %x, i32 %y) {
   %f = mul <16 x i64> %d, %e
   ret <16 x i64> %f
 }
+
+define <8 x i16> @vwmulsu_vx_v8i16_i8(<8 x i8>* %x, i8* %y) {
+; CHECK-LABEL: vwmulsu_vx_v8i16_i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    lbu a0, 0(a1)
+; CHECK-NEXT:    vwmulsu.vx v8, v9, a0
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = load i8, i8* %y
+  %c = zext i8 %b to i16
+  %d = insertelement <8 x i16> poison, i16 %c, i32 0
+  %e = shufflevector <8 x i16> %d, <8 x i16> poison, <8 x i32> zeroinitializer
+  %f = sext <8 x i8> %a to <8 x i16>
+  %g = mul <8 x i16> %e, %f
+  ret <8 x i16> %g
+}
+
+define <8 x i16> @vwmulsu_vx_v8i16_i8_swap(<8 x i8>* %x, i8* %y) {
+; CHECK-LABEL: vwmulsu_vx_v8i16_i8_swap:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    lb a0, 0(a1)
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, mu
+; CHECK-NEXT:    vzext.vf2 v9, v8
+; CHECK-NEXT:    vmul.vx v8, v9, a0
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = load i8, i8* %y
+  %c = sext i8 %b to i16
+  %d = insertelement <8 x i16> poison, i16 %c, i32 0
+  %e = shufflevector <8 x i16> %d, <8 x i16> poison, <8 x i32> zeroinitializer
+  %f = zext <8 x i8> %a to <8 x i16>
+  %g = mul <8 x i16> %e, %f
+  ret <8 x i16> %g
+}
+
+define <4 x i32> @vwmulsu_vx_v4i32_i8(<4 x i16>* %x, i8* %y) {
+; CHECK-LABEL: vwmulsu_vx_v4i32_i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    lbu a0, 0(a1)
+; CHECK-NEXT:    vwmul.vx v8, v9, a0
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i8, i8* %y
+  %c = zext i8 %b to i32
+  %d = insertelement <4 x i32> poison, i32 %c, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> poison, <4 x i32> zeroinitializer
+  %f = sext <4 x i16> %a to <4 x i32>
+  %g = mul <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <4 x i32> @vwmulsu_vx_v4i32_i16(<4 x i16>* %x, i16* %y) {
+; CHECK-LABEL: vwmulsu_vx_v4i32_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    lhu a0, 0(a1)
+; CHECK-NEXT:    vwmulsu.vx v8, v9, a0
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = load i16, i16* %y
+  %c = zext i16 %b to i32
+  %d = insertelement <4 x i32> poison, i32 %c, i32 0
+  %e = shufflevector <4 x i32> %d, <4 x i32> poison, <4 x i32> zeroinitializer
+  %f = sext <4 x i16> %a to <4 x i32>
+  %g = mul <4 x i32> %e, %f
+  ret <4 x i32> %g
+}
+
+define <2 x i64> @vwmulsu_vx_v2i64_i8(<2 x i32>* %x, i8* %y) {
+; RV32-LABEL: vwmulsu_vx_v2i64_i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lbu a1, 0(a1)
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vsext.vf2 v10, v8
+; RV32-NEXT:    vmul.vv v8, v9, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmulsu_vx_v2i64_i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v9, (a0)
+; RV64-NEXT:    lbu a0, 0(a1)
+; RV64-NEXT:    vwmul.vx v8, v9, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i8, i8* %y
+  %c = zext i8 %b to i64
+  %d = insertelement <2 x i64> poison, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> poison, <2 x i32> zeroinitializer
+  %f = sext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmulsu_vx_v2i64_i16(<2 x i32>* %x, i16* %y) {
+; RV32-LABEL: vwmulsu_vx_v2i64_i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lhu a1, 0(a1)
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vsext.vf2 v10, v8
+; RV32-NEXT:    vmul.vv v8, v9, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmulsu_vx_v2i64_i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v9, (a0)
+; RV64-NEXT:    lhu a0, 0(a1)
+; RV64-NEXT:    vwmul.vx v8, v9, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i16, i16* %y
+  %c = zext i16 %b to i64
+  %d = insertelement <2 x i64> poison, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> poison, <2 x i32> zeroinitializer
+  %f = sext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <2 x i64> @vwmulsu_vx_v2i64_i32(<2 x i32>* %x, i32* %y) {
+; RV32-LABEL: vwmulsu_vx_v2i64_i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -16
+; RV32-NEXT:    .cfi_def_cfa_offset 16
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV32-NEXT:    lw a1, 0(a1)
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    sw zero, 12(sp)
+; RV32-NEXT:    sw a1, 8(sp)
+; RV32-NEXT:    addi a0, sp, 8
+; RV32-NEXT:    vlse64.v v9, (a0), zero
+; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
+; RV32-NEXT:    vsext.vf2 v10, v8
+; RV32-NEXT:    vmul.vv v8, v9, v10
+; RV32-NEXT:    addi sp, sp, 16
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vwmulsu_vx_v2i64_i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    vsetivli zero, 2, e32, mf2, ta, mu
+; RV64-NEXT:    vle32.v v9, (a0)
+; RV64-NEXT:    lwu a0, 0(a1)
+; RV64-NEXT:    vwmulsu.vx v8, v9, a0
+; RV64-NEXT:    ret
+  %a = load <2 x i32>, <2 x i32>* %x
+  %b = load i32, i32* %y
+  %c = zext i32 %b to i64
+  %d = insertelement <2 x i64> poison, i64 %c, i64 0
+  %e = shufflevector <2 x i64> %d, <2 x i64> poison, <2 x i32> zeroinitializer
+  %f = sext <2 x i32> %a to <2 x i64>
+  %g = mul <2 x i64> %e, %f
+  ret <2 x i64> %g
+}
+
+define <8 x i16> @vwmulsu_vx_v8i16_i8_and(<8 x i8>* %x, i16 %y) {
+; CHECK-LABEL: vwmulsu_vx_v8i16_i8_and:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    vwmulsu.vx v8, v9, a1
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = and i16 %y, 255
+  %c = insertelement <8 x i16> poison, i16 %b, i32 0
+  %d = shufflevector <8 x i16> %c, <8 x i16> poison, <8 x i32> zeroinitializer
+  %e = sext <8 x i8> %a to <8 x i16>
+  %f = mul <8 x i16> %d, %e
+  ret <8 x i16> %f
+}
+
+define <8 x i16> @vwmulsu_vx_v8i16_i8_and1(<8 x i8>* %x, i16 %y) {
+; CHECK-LABEL: vwmulsu_vx_v8i16_i8_and1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, mu
+; CHECK-NEXT:    vle8.v v9, (a0)
+; CHECK-NEXT:    andi a0, a1, 254
+; CHECK-NEXT:    vwmulsu.vx v8, v9, a0
+; CHECK-NEXT:    ret
+  %a = load <8 x i8>, <8 x i8>* %x
+  %b = and i16 %y, 254
+  %c = insertelement <8 x i16> poison, i16 %b, i32 0
+  %d = shufflevector <8 x i16> %c, <8 x i16> poison, <8 x i32> zeroinitializer
+  %e = sext <8 x i8> %a to <8 x i16>
+  %f = mul <8 x i16> %d, %e
+  ret <8 x i16> %f
+}
+
+define <4 x i32> @vwmulsu_vx_v4i32_i16_and(<4 x i16>* %x, i32 %y) {
+; CHECK-LABEL: vwmulsu_vx_v4i32_i16_and:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vwmulsu.vx v8, v9, a1
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = and i32 %y, 65535
+  %c = insertelement <4 x i32> poison, i32 %b, i32 0
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> zeroinitializer
+  %e = sext <4 x i16> %a to <4 x i32>
+  %f = mul <4 x i32> %d, %e
+  ret <4 x i32> %f
+}
+
+define <4 x i32> @vwmulsu_vx_v4i32_i16_zext(<4 x i16>* %x, i16 %y) {
+; CHECK-LABEL: vwmulsu_vx_v4i32_i16_zext:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, mu
+; CHECK-NEXT:    vle16.v v9, (a0)
+; CHECK-NEXT:    vwmulsu.vx v8, v9, a1
+; CHECK-NEXT:    ret
+  %a = load <4 x i16>, <4 x i16>* %x
+  %b = zext i16 %y to i32
+  %c = insertelement <4 x i32> poison, i32 %b, i32 0
+  %d = shufflevector <4 x i32> %c, <4 x i32> poison, <4 x i32> zeroinitializer
+  %e = sext <4 x i16> %a to <4 x i32>
+  %f = mul <4 x i32> %d, %e
+  ret <4 x i32> %f
+}
