Skip to content

Commit 4fee398

Browse files
[LLVM][SVE] Add isel for scalable vector bfloat copysign operations. (#130098)
1 parent 4508d6a commit 4fee398

File tree

4 files changed

+343
-29
lines changed

4 files changed

+343
-29
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1717,6 +1717,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
17171717
setOperationAction(ISD::BITCAST, VT, Custom);
17181718
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
17191719
setOperationAction(ISD::FABS, VT, Legal);
1720+
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
17201721
setOperationAction(ISD::FNEG, VT, Legal);
17211722
setOperationAction(ISD::FP_EXTEND, VT, Custom);
17221723
setOperationAction(ISD::FP_ROUND, VT, Custom);
@@ -10706,7 +10707,7 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
1070610707
// an SVE FCOPYSIGN.
1070710708
if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
1070810709
Subtarget->isSVEorStreamingSVEAvailable()) {
10709-
if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64)
10710+
if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
1071010711
return SDValue();
1071110712
EVT SVT = getPackedSVEVectorVT(VT);
1071210713

llvm/test/CodeGen/AArch64/sve-fcopysign.ll

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,19 @@ define <vscale x 2 x float> @copysign_nxv2f32_nxv2f64(<vscale x 2 x float> %a, <
185185
ret <vscale x 2 x float> %r
186186
}
187187

188+
define <vscale x 2 x float> @copysign_nxv2f32_nxv2bf16(<vscale x 2 x float> %a, <vscale x 2 x bfloat> %b) {
189+
; CHECK-LABEL: copysign_nxv2f32_nxv2bf16:
190+
; CHECK: // %bb.0:
191+
; CHECK-NEXT: lsl z1.s, z1.s, #16
192+
; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff
193+
; CHECK-NEXT: and z1.s, z1.s, #0x80000000
194+
; CHECK-NEXT: orr z0.d, z0.d, z1.d
195+
; CHECK-NEXT: ret
196+
%tmp0 = fpext <vscale x 2 x bfloat> %b to <vscale x 2 x float>
197+
%r = call <vscale x 2 x float> @llvm.copysign.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %tmp0)
198+
ret <vscale x 2 x float> %r
199+
}
200+
188201
;
189202
; llvm.copysign.nxv4f32
190203
;
@@ -230,6 +243,19 @@ define <vscale x 4 x float> @copysign_nxv4f32_nxv4f64(<vscale x 4 x float> %a, <
230243
ret <vscale x 4 x float> %r
231244
}
232245

246+
define <vscale x 4 x float> @copysign_nxv4f32_nxv4bf16(<vscale x 4 x float> %a, <vscale x 4 x bfloat> %b) {
247+
; CHECK-LABEL: copysign_nxv4f32_nxv4bf16:
248+
; CHECK: // %bb.0:
249+
; CHECK-NEXT: lsl z1.s, z1.s, #16
250+
; CHECK-NEXT: and z0.s, z0.s, #0x7fffffff
251+
; CHECK-NEXT: and z1.s, z1.s, #0x80000000
252+
; CHECK-NEXT: orr z0.d, z0.d, z1.d
253+
; CHECK-NEXT: ret
254+
%tmp0 = fpext <vscale x 4 x bfloat> %b to <vscale x 4 x float>
255+
%r = call <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %tmp0)
256+
ret <vscale x 4 x float> %r
257+
}
258+
233259
;
234260
; llvm.copysign.nxv2f64
235261
;
@@ -273,9 +299,137 @@ define <vscale x 2 x double> @copysign_nxv2f64_nxv2f64(<vscale x 2 x double> %a,
273299
ret <vscale x 2 x double> %r
274300
}
275301

302+
define <vscale x 2 x double> @copysign_nxv2f64_nxv2bf16(<vscale x 2 x double> %a, <vscale x 2 x bfloat> %b) {
303+
; CHECK-LABEL: copysign_nxv2f64_nxv2bf16:
304+
; CHECK: // %bb.0:
305+
; CHECK-NEXT: lsl z1.s, z1.s, #16
306+
; CHECK-NEXT: ptrue p0.d
307+
; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff
308+
; CHECK-NEXT: fcvt z1.d, p0/m, z1.s
309+
; CHECK-NEXT: and z1.d, z1.d, #0x8000000000000000
310+
; CHECK-NEXT: orr z0.d, z0.d, z1.d
311+
; CHECK-NEXT: ret
312+
%b.ext = fpext <vscale x 2 x bfloat> %b to <vscale x 2 x double>
313+
%r = call <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b.ext)
314+
ret <vscale x 2 x double> %r
315+
}
316+
317+
;
318+
; llvm.copysign.nxv2bf16
319+
;
320+
321+
define <vscale x 2 x bfloat> @copysign_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
322+
; CHECK-LABEL: copysign_nxv2bf16_nxv2bf16:
323+
; CHECK: // %bb.0:
324+
; CHECK-NEXT: and z1.h, z1.h, #0x8000
325+
; CHECK-NEXT: and z0.h, z0.h, #0x7fff
326+
; CHECK-NEXT: orr z0.d, z0.d, z1.d
327+
; CHECK-NEXT: ret
328+
%r = call <vscale x 2 x bfloat> @llvm.copysign.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
329+
ret <vscale x 2 x bfloat> %r
330+
}
331+
332+
define <vscale x 2 x bfloat> @copysign_nxv2bf16_nxv2f32(<vscale x 2 x bfloat> %a, <vscale x 2 x float> %b) {
333+
; CHECK-LABEL: copysign_nxv2bf16_nxv2f32:
334+
; CHECK: // %bb.0:
335+
; CHECK-NEXT: ptrue p0.d
336+
; CHECK-NEXT: and z0.h, z0.h, #0x7fff
337+
; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
338+
; CHECK-NEXT: and z1.h, z1.h, #0x8000
339+
; CHECK-NEXT: orr z0.d, z0.d, z1.d
340+
; CHECK-NEXT: ret
341+
%tmp0 = fptrunc <vscale x 2 x float> %b to <vscale x 2 x bfloat>
342+
%r = call <vscale x 2 x bfloat> @llvm.copysign.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %tmp0)
343+
ret <vscale x 2 x bfloat> %r
344+
}
345+
346+
; TODO: Cannot downconvert from double to bfloat
347+
;define <vscale x 2 x bfloat> @copysign_nxv2bf16_nxv2f64(<vscale x 2 x bfloat> %a, <vscale x 2 x double> %b) {
348+
; %tmp0 = fptrunc <vscale x 2 x double> %b to <vscale x 2 x bfloat>
349+
; %r = call <vscale x 2 x bfloat> @llvm.copysign.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %tmp0)
350+
; ret <vscale x 2 x bfloat> %r
351+
;}
352+
353+
;
354+
; llvm.copysign.nxv4bf16
355+
;
356+
357+
define <vscale x 4 x bfloat> @copysign_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
358+
; CHECK-LABEL: copysign_nxv4bf16_nxv4bf16:
359+
; CHECK: // %bb.0:
360+
; CHECK-NEXT: and z1.h, z1.h, #0x8000
361+
; CHECK-NEXT: and z0.h, z0.h, #0x7fff
362+
; CHECK-NEXT: orr z0.d, z0.d, z1.d
363+
; CHECK-NEXT: ret
364+
%r = call <vscale x 4 x bfloat> @llvm.copysign.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
365+
ret <vscale x 4 x bfloat> %r
366+
}
367+
368+
define <vscale x 4 x bfloat> @copysign_nxv4bf16_nxv4f32(<vscale x 4 x bfloat> %a, <vscale x 4 x float> %b) {
369+
; CHECK-LABEL: copysign_nxv4bf16_nxv4f32:
370+
; CHECK: // %bb.0:
371+
; CHECK-NEXT: ptrue p0.s
372+
; CHECK-NEXT: and z0.h, z0.h, #0x7fff
373+
; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
374+
; CHECK-NEXT: and z1.h, z1.h, #0x8000
375+
; CHECK-NEXT: orr z0.d, z0.d, z1.d
376+
; CHECK-NEXT: ret
377+
%b.trunc = fptrunc <vscale x 4 x float> %b to <vscale x 4 x bfloat>
378+
%r = call <vscale x 4 x bfloat> @llvm.copysign.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b.trunc)
379+
ret <vscale x 4 x bfloat> %r
380+
}
381+
382+
; TODO: Cannot downconvert from double to bfloat
383+
;define <vscale x 4 x bfloat> @copysign_nxv4bf16_nxv4f64(<vscale x 4 x bfloat> %a, <vscale x 4 x double> %b) {
384+
; %b.trunc = fptrunc <vscale x 4 x double> %b to <vscale x 4 x bfloat>
385+
; %r = call <vscale x 4 x bfloat> @llvm.copysign.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b.trunc)
386+
; ret <vscale x 4 x bfloat> %r
387+
;}
388+
389+
;
390+
; llvm.copysign.nxv8bf16
391+
;
392+
393+
define <vscale x 8 x bfloat> @copysign_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
394+
; CHECK-LABEL: copysign_nxv8bf16_nxv8bf16:
395+
; CHECK: // %bb.0:
396+
; CHECK-NEXT: and z1.h, z1.h, #0x8000
397+
; CHECK-NEXT: and z0.h, z0.h, #0x7fff
398+
; CHECK-NEXT: orr z0.d, z0.d, z1.d
399+
; CHECK-NEXT: ret
400+
%r = call <vscale x 8 x bfloat> @llvm.copysign.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
401+
ret <vscale x 8 x bfloat> %r
402+
}
403+
404+
define <vscale x 8 x bfloat> @copysign_nxv8bf16_nxv8f32(<vscale x 8 x bfloat> %a, <vscale x 8 x float> %b) {
405+
; CHECK-LABEL: copysign_nxv8bf16_nxv8f32:
406+
; CHECK: // %bb.0:
407+
; CHECK-NEXT: ptrue p0.s
408+
; CHECK-NEXT: and z0.h, z0.h, #0x7fff
409+
; CHECK-NEXT: bfcvt z2.h, p0/m, z2.s
410+
; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
411+
; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
412+
; CHECK-NEXT: and z1.h, z1.h, #0x8000
413+
; CHECK-NEXT: orr z0.d, z0.d, z1.d
414+
; CHECK-NEXT: ret
415+
%b.trunc = fptrunc <vscale x 8 x float> %b to <vscale x 8 x bfloat>
416+
%r = call <vscale x 8 x bfloat> @llvm.copysign.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b.trunc)
417+
ret <vscale x 8 x bfloat> %r
418+
}
419+
420+
; TODO: Cannot downconvert from double to bfloat
421+
;define <vscale x 8 x bfloat> @copysign_nxv8bf16_nxv8f64(<vscale x 8 x bfloat> %a, <vscale x 8 x double> %b) {
422+
; %b.trunc = fptrunc <vscale x 8 x double> %b to <vscale x 8 x bfloat>
423+
; %r = call <vscale x 8 x bfloat> @llvm.copysign.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b.trunc)
424+
; ret <vscale x 8 x bfloat> %r
425+
;}
426+
276427
declare <vscale x 2 x half> @llvm.copysign.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
277428
declare <vscale x 4 x half> @llvm.copysign.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
278429
declare <vscale x 8 x half> @llvm.copysign.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
279430
declare <vscale x 2 x float> @llvm.copysign.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
280431
declare <vscale x 4 x float> @llvm.copysign.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
281432
declare <vscale x 2 x double> @llvm.copysign.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
433+
declare <vscale x 2 x bfloat> @llvm.copysign.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
434+
declare <vscale x 4 x bfloat> @llvm.copysign.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
435+
declare <vscale x 8 x bfloat> @llvm.copysign.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll

Lines changed: 9 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -55,40 +55,21 @@ define void @test_copysign_f16(ptr %ap, ptr %bp) {
5555
define void @test_copysign_bf16(ptr %ap, ptr %bp) {
5656
; SVE-LABEL: test_copysign_bf16:
5757
; SVE: // %bb.0:
58-
; SVE-NEXT: sub sp, sp, #16
59-
; SVE-NEXT: .cfi_def_cfa_offset 16
60-
; SVE-NEXT: ldr h0, [x0]
61-
; SVE-NEXT: ldr h1, [x1]
62-
; SVE-NEXT: fmov w8, s0
63-
; SVE-NEXT: str h1, [sp, #12]
64-
; SVE-NEXT: ldrb w9, [sp, #13]
65-
; SVE-NEXT: and w8, w8, #0x7fff
66-
; SVE-NEXT: tst w9, #0x80
67-
; SVE-NEXT: fmov s0, w8
68-
; SVE-NEXT: eor w8, w8, #0x8000
69-
; SVE-NEXT: fmov s1, w8
70-
; SVE-NEXT: fcsel h0, h1, h0, ne
58+
; SVE-NEXT: ldr h0, [x1]
59+
; SVE-NEXT: ldr h1, [x0]
60+
; SVE-NEXT: and z0.h, z0.h, #0x8000
61+
; SVE-NEXT: and z1.h, z1.h, #0x7fff
62+
; SVE-NEXT: orr z0.d, z1.d, z0.d
7163
; SVE-NEXT: str h0, [x0]
72-
; SVE-NEXT: add sp, sp, #16
7364
; SVE-NEXT: ret
7465
;
7566
; SVE2-LABEL: test_copysign_bf16:
7667
; SVE2: // %bb.0:
77-
; SVE2-NEXT: sub sp, sp, #16
78-
; SVE2-NEXT: .cfi_def_cfa_offset 16
79-
; SVE2-NEXT: ldr h0, [x0]
68+
; SVE2-NEXT: mov z0.h, #32767 // =0x7fff
8069
; SVE2-NEXT: ldr h1, [x1]
81-
; SVE2-NEXT: fmov w8, s0
82-
; SVE2-NEXT: str h1, [sp, #12]
83-
; SVE2-NEXT: ldrb w9, [sp, #13]
84-
; SVE2-NEXT: and w8, w8, #0x7fff
85-
; SVE2-NEXT: tst w9, #0x80
86-
; SVE2-NEXT: fmov s0, w8
87-
; SVE2-NEXT: eor w8, w8, #0x8000
88-
; SVE2-NEXT: fmov s1, w8
89-
; SVE2-NEXT: fcsel h0, h1, h0, ne
90-
; SVE2-NEXT: str h0, [x0]
91-
; SVE2-NEXT: add sp, sp, #16
70+
; SVE2-NEXT: ldr h2, [x0]
71+
; SVE2-NEXT: bsl z2.d, z2.d, z1.d, z0.d
72+
; SVE2-NEXT: str h2, [x0]
9273
; SVE2-NEXT: ret
9374
;
9475
; NONEON-NOSVE-LABEL: test_copysign_bf16:

0 commit comments

Comments
 (0)