
Commit 89e968f

[X86] Pre-checkin test case for combining const operand to VNNI instruction.
1 parent da77db5 commit 89e968f
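
For context, every test below builds the same multiply-accumulate shape: the i8 (or i4) input vector is zero- or sign-extended to i32, multiplied element-wise by a constant vector, summed with llvm.vector.reduce.add, and added into a scalar accumulator. A follow-up combine is expected to fold the constant operand into a VNNI dot-product instruction such as VPDPBUSD whenever each constant element fits the instruction's byte operand range; the _exceed variants use 128 (zero-extended input) and 256 (sign-extended input), which presumably fall outside that range and should stay uncombined. A minimal sketch of the pattern, with a hypothetical function name that is not part of the committed file:

; Sketch only: widen the byte inputs, scale each lane by a small constant,
; and reduce; this is the shape a constant-operand VNNI combine would target.
define i32 @dot_4xi8_const_sketch(<4 x i8> %a, i32 %acc) {
entry:
  %ext = zext <4 x i8> %a to <4 x i32>
  %mul = mul nsw <4 x i32> %ext, <i32 1, i32 2, i32 3, i32 4>
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %mul)
  %res = add nsw i32 %sum, %acc
  ret i32 %res
}
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)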

1 file changed: +285, -0 lines

llvm/test/CodeGen/X86/dpbusd_const.ll

Lines changed: 285 additions & 0 deletions
@@ -0,0 +1,285 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni | FileCheck %s --check-prefixes=ALL,AVXVNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni -mattr=+avx512vl | FileCheck %s --check-prefixes=ALL,AVX512,AVX512VLVNNI

define i32 @mul_4xi8_zc_exceed(<4 x i8> %a, i32 %c) {
; ALL-LABEL: mul_4xi8_zc_exceed:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: addl %edi, %eax
; ALL-NEXT: retq
entry:
  %0 = zext <4 x i8> %a to <4 x i32>
  %1 = mul nsw <4 x i32> %0, <i32 0, i32 1, i32 2, i32 128>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  %op.extra = add nsw i32 %2, %c
  ret i32 %op.extra
}

define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) {
; ALL-LABEL: mul_4xi8_zc:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: addl %edi, %eax
; ALL-NEXT: retq
entry:
  %0 = zext <4 x i8> %a to <4 x i32>
  %1 = mul nsw <4 x i32> %0, <i32 0, i32 1, i32 2, i32 127>
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  %op.extra = add nsw i32 %2, %c
  ret i32 %op.extra
}

define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) {
; AVXVNNI-LABEL: mul_4xi4_cz:
; AVXVNNI: # %bb.0: # %entry
; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
; AVXVNNI-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT: vmovd %xmm0, %eax
; AVXVNNI-NEXT: addl %edi, %eax
; AVXVNNI-NEXT: retq
;
; AVX512VNNI-LABEL: mul_4xi4_cz:
; AVX512VNNI: # %bb.0: # %entry
; AVX512VNNI-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15]
; AVX512VNNI-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512VNNI-NEXT: vmovd %xmm0, %eax
; AVX512VNNI-NEXT: addl %edi, %eax
; AVX512VNNI-NEXT: retq
;
; AVX512VLVNNI-LABEL: mul_4xi4_cz:
; AVX512VLVNNI: # %bb.0: # %entry
; AVX512VLVNNI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512VLVNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax
; AVX512VLVNNI-NEXT: addl %edi, %eax
; AVX512VLVNNI-NEXT: retq
entry:
  %0 = zext <4 x i4> %a to <4 x i32>
  %1 = mul nsw <4 x i32> <i32 0, i32 1, i32 2, i32 127>, %0
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  %op.extra = add nsw i32 %2, %c
  ret i32 %op.extra
}

define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) {
; ALL-LABEL: mul_4xi8_cs:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vpmovsxbd %xmm0, %xmm0
; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: addl %edi, %eax
; ALL-NEXT: retq
entry:
  %0 = sext <4 x i8> %a to <4 x i32>
  %1 = mul nsw <4 x i32> <i32 0, i32 1, i32 2, i32 255>, %0
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  %op.extra = add nsw i32 %2, %c
  ret i32 %op.extra
}

define i32 @mul_4xi8_cs_exceed(<4 x i8> %a, i32 %c) {
; ALL-LABEL: mul_4xi8_cs_exceed:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vpmovsxbd %xmm0, %xmm0
; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; ALL-NEXT: vmovd %xmm0, %eax
; ALL-NEXT: addl %edi, %eax
; ALL-NEXT: retq
entry:
  %0 = sext <4 x i8> %a to <4 x i32>
  %1 = mul nsw <4 x i32> <i32 0, i32 1, i32 2, i32 256>, %0
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
  %op.extra = add nsw i32 %2, %c
  ret i32 %op.extra
}

define i32 @mul_16xi8_zc(<16 x i8> %a, i32 %c) {
; AVXVNNI-LABEL: mul_16xi8_zc:
; AVXVNNI: # %bb.0: # %entry
; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVXVNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT: vmovd %xmm0, %eax
; AVXVNNI-NEXT: addl %edi, %eax
; AVXVNNI-NEXT: vzeroupper
; AVXVNNI-NEXT: retq
;
; AVX512-LABEL: mul_16xi8_zc:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: addl %edi, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = zext <16 x i8> %a to <16 x i32>
  %1 = mul nsw <16 x i32> %0, <i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64>
  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
  %op.extra = add nsw i32 %2, %c
  ret i32 %op.extra
}

define i32 @mul_32xi8_zc(<32 x i8> %a, i32 %c) {
; AVXVNNI-LABEL: mul_32xi8_zc:
; AVXVNNI: # %bb.0: # %entry
; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVXVNNI-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18014407099482112,18014407099482112,18014407099482112,18014407099482112]
; AVXVNNI-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVXVNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm0, %ymm1
; AVXVNNI-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT: vmovd %xmm0, %eax
; AVXVNNI-NEXT: addl %edi, %eax
; AVXVNNI-NEXT: vzeroupper
; AVXVNNI-NEXT: retq
;
; AVX512-LABEL: mul_32xi8_zc:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18014407099482112,18014407099482112,18014407099482112,18014407099482112]
; AVX512-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX512-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: addl %edi, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = zext <32 x i8> %a to <32 x i32>
  %1 = mul nsw <32 x i32> %0, <i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64>
  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
  %op.extra = add nsw i32 %2, %c
  ret i32 %op.extra
}

define i32 @mul_64xi8_zc(<64 x i8> %a, i32 %c) {
; AVXVNNI-LABEL: mul_64xi8_zc:
; AVXVNNI: # %bb.0: # %entry
; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVXVNNI-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18014407099482112,18014407099482112,18014407099482112,18014407099482112]
; AVXVNNI-NEXT: vpmaddwd %ymm3, %ymm2, %ymm2
; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVXVNNI-NEXT: vpmaddwd %ymm3, %ymm0, %ymm0
; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVXVNNI-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVXVNNI-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVXVNNI-NEXT: {vex} vpdpwssd %ymm3, %ymm1, %ymm0
; AVXVNNI-NEXT: {vex} vpdpwssd %ymm3, %ymm4, %ymm2
; AVXVNNI-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVXVNNI-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVXVNNI-NEXT: vmovd %xmm0, %eax
; AVXVNNI-NEXT: addl %edi, %eax
; AVXVNNI-NEXT: vzeroupper
; AVXVNNI-NEXT: retq
;
; AVX512-LABEL: mul_64xi8_zc:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18014407099482112,18014407099482112,18014407099482112,18014407099482112]
; AVX512-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512-NEXT: vpmaddwd %ymm2, %ymm3, %ymm3
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm3
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512-NEXT: vpmaddwd %ymm2, %ymm4, %ymm4
; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512-NEXT: vpaddd %ymm4, %ymm1, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpaddd %zmm0, %zmm3, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: addl %edi, %eax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = zext <64 x i8> %a to <64 x i32>
  %1 = mul nsw <64 x i32> %0, <i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64, i32 0, i32 1, i32 2, i32 64>
  %2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %1)
  %op.extra = add nsw i32 %2, %c
  ret i32 %op.extra
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
