@@ -9,10 +9,15 @@ define <vscale x 2 x i8> @umulo_nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and z1.d, z1.d, #0xff
 ; CHECK-NEXT: and z0.d, z0.d, #0xff
-; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: lsr z1.d, z0.d, #8
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: mul z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: lsr z1.d, z2.d, #8
+; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0
 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
-; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: mov z2.d, p0/m, #0 // =0x0
+; CHECK-NEXT: mov z0.d, z2.d
 ; CHECK-NEXT: ret
   %a = call { <vscale x 2 x i8>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i8(<vscale x 2 x i8> %x, <vscale x 2 x i8> %y)
   %b = extractvalue { <vscale x 2 x i8>, <vscale x 2 x i1> } %a, 0
@@ -29,10 +34,15 @@ define <vscale x 4 x i8> @umulo_nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %
 ; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: and z1.s, z1.s, #0xff
 ; CHECK-NEXT: and z0.s, z0.s, #0xff
-; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: lsr z1.s, z0.s, #8
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: mul z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: lsr z1.s, z2.s, #8
+; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0
 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
-; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0
+; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: mov z2.s, p0/m, #0 // =0x0
+; CHECK-NEXT: mov z0.d, z2.d
 ; CHECK-NEXT: ret
   %a = call { <vscale x 4 x i8>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i8(<vscale x 4 x i8> %x, <vscale x 4 x i8> %y)
   %b = extractvalue { <vscale x 4 x i8>, <vscale x 4 x i1> } %a, 0
@@ -49,10 +59,15 @@ define <vscale x 8 x i8> @umulo_nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %
 ; CHECK-NEXT: ptrue p0.h
 ; CHECK-NEXT: and z1.h, z1.h, #0xff
 ; CHECK-NEXT: and z0.h, z0.h, #0xff
-; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
-; CHECK-NEXT: lsr z1.h, z0.h, #8
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: mul z2.h, p0/m, z2.h, z1.h
+; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: lsr z1.h, z2.h, #8
+; CHECK-NEXT: cmpne p1.h, p0/z, z0.h, #0
 ; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0
-; CHECK-NEXT: mov z0.h, p0/m, #0 // =0x0
+; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: mov z2.h, p0/m, #0 // =0x0
+; CHECK-NEXT: mov z0.d, z2.d
 ; CHECK-NEXT: ret
   %a = call { <vscale x 8 x i8>, <vscale x 8 x i1> } @llvm.umul.with.overflow.nxv8i8(<vscale x 8 x i8> %x, <vscale x 8 x i8> %y)
   %b = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i1> } %a, 0
@@ -149,10 +164,15 @@ define <vscale x 2 x i16> @umulo_nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i1
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and z1.d, z1.d, #0xffff
 ; CHECK-NEXT: and z0.d, z0.d, #0xffff
-; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: lsr z1.d, z0.d, #16
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: mul z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: lsr z1.d, z2.d, #16
+; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0
 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
-; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: mov z2.d, p0/m, #0 // =0x0
+; CHECK-NEXT: mov z0.d, z2.d
 ; CHECK-NEXT: ret
   %a = call { <vscale x 2 x i16>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %y)
   %b = extractvalue { <vscale x 2 x i16>, <vscale x 2 x i1> } %a, 0
@@ -169,10 +189,15 @@ define <vscale x 4 x i16> @umulo_nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i1
 ; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: and z1.s, z1.s, #0xffff
 ; CHECK-NEXT: and z0.s, z0.s, #0xffff
-; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
-; CHECK-NEXT: lsr z1.s, z0.s, #16
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: mul z2.s, p0/m, z2.s, z1.s
+; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT: lsr z1.s, z2.s, #16
+; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0
 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0
-; CHECK-NEXT: mov z0.s, p0/m, #0 // =0x0
+; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: mov z2.s, p0/m, #0 // =0x0
+; CHECK-NEXT: mov z0.d, z2.d
 ; CHECK-NEXT: ret
   %a = call { <vscale x 4 x i16>, <vscale x 4 x i1> } @llvm.umul.with.overflow.nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %y)
   %b = extractvalue { <vscale x 4 x i16>, <vscale x 4 x i1> } %a, 0
@@ -269,10 +294,15 @@ define <vscale x 2 x i32> @umulo_nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i3
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
 ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
-; CHECK-NEXT: lsr z1.d, z0.d, #32
+; CHECK-NEXT: movprfx z2, z0
+; CHECK-NEXT: mul z2.d, p0/m, z2.d, z1.d
+; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d
+; CHECK-NEXT: lsr z1.d, z2.d, #32
+; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0
 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0
-; CHECK-NEXT: mov z0.d, p0/m, #0 // =0x0
+; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: mov z2.d, p0/m, #0 // =0x0
+; CHECK-NEXT: mov z0.d, z2.d
 ; CHECK-NEXT: ret
   %a = call { <vscale x 2 x i32>, <vscale x 2 x i1> } @llvm.umul.with.overflow.nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y)
   %b = extractvalue { <vscale x 2 x i32>, <vscale x 2 x i1> } %a, 0
0 commit comments