[X86][XOP] Add tests for missing demanded elts handling for xop shifts

RKSimon · RKSimon · commit 13362abf3d66 · 2022-01-12T12:43:13.000Z
Noticed while investigating how to improve funnel shift codegen
diff --git a/llvm/test/CodeGen/X86/xop-shifts.ll b/llvm/test/CodeGen/X86/xop-shifts.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s
+
+define <16 x i8> @demandedelts_vpshab(<16 x i8> %a0, <16 x i8> %a1) {
+; CHECK-LABEL: demandedelts_vpshab:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT:    vpshab %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %shuf_a0 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; lane 0 = %a0[0], lanes 1-15 = %a0[1]
+  %sh = call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %shuf_a0, <16 x i8> %a1)
+  %lane0 = shufflevector <16 x i8> %sh, <16 x i8> undef, <16 x i32> zeroinitializer ; every result lane = %sh[0]
+  ret <16 x i8> %lane0
+}
+
+define <4 x i32> @demandedelts_vpshld(<4 x i32> %a0, <4 x i32> %a1) {
+; CHECK-LABEL: demandedelts_vpshld:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; CHECK-NEXT:    vpshld %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; CHECK-NEXT:    retq
+  %splat_a1 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> zeroinitializer ; every lane = %a1[0]
+  %sh = call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %a0, <4 x i32> %splat_a1)
+  %lane0 = shufflevector <4 x i32> %sh, <4 x i32> undef, <4 x i32> zeroinitializer ; every result lane = %sh[0]
+  ret <4 x i32> %lane0
+}
+
+declare <16 x i8> @llvm.x86.xop.vpshab(<16 x i8>, <16 x i8>) nounwind readnone ; called by @demandedelts_vpshab
+declare <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpshad(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.xop.vpshld(<4 x i32>, <4 x i32>) nounwind readnone ; called by @demandedelts_vpshld
+declare <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64>, <2 x i64>) nounwind readnone