Skip to content

Commit 0f59720

Browse files
committed
AMDGPU: Fold fneg into bitcast of build_vector
The math libraries have a lot of code that performs manual sign-bit operations by bitcasting doubles to int2 and doing bit hacking on them. This is a bad canonical form that we should rewrite to use high-level sign operations directly on double. To avoid codegen regressions, we need to do a better job of moving fnegs to operate only on the high 32 bits. This is only halfway to fixing the real case.
1 parent 727ebef commit 0f59720

File tree

3 files changed

+87
-41
lines changed

3 files changed

+87
-41
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -556,7 +556,7 @@ bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
556556
//===----------------------------------------------------------------------===//
557557

558558
LLVM_READNONE
559-
static bool fnegFoldsIntoOp(unsigned Opc) {
559+
static bool fnegFoldsIntoOpcode(unsigned Opc) {
560560
switch (Opc) {
561561
case ISD::FADD:
562562
case ISD::FSUB:
@@ -583,11 +583,27 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
583583
case AMDGPUISD::FMED3:
584584
// TODO: handle llvm.amdgcn.fma.legacy
585585
return true;
586+
case ISD::BITCAST:
587+
llvm_unreachable("bitcast is special cased");
586588
default:
587589
return false;
588590
}
589591
}
590592

593+
static bool fnegFoldsIntoOp(const SDNode *N) {
594+
unsigned Opc = N->getOpcode();
595+
if (Opc == ISD::BITCAST) {
596+
// TODO: Is there a benefit to checking the conditions performFNegCombine
597+
// does? We don't for the other cases.
598+
SDValue BCSrc = N->getOperand(0);
599+
return BCSrc.getOpcode() == ISD::BUILD_VECTOR &&
600+
BCSrc.getNumOperands() == 2 &&
601+
BCSrc.getOperand(1).getValueSizeInBits() == 32;
602+
}
603+
604+
return fnegFoldsIntoOpcode(Opc);
605+
}
606+
591607
/// \p returns true if the operation will definitely need to use a 64-bit
592608
/// encoding, and thus will use a VOP3 encoding regardless of the source
593609
/// modifiers.
@@ -3773,7 +3789,7 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
37733789

37743790
if (NewLHS.hasOneUse()) {
37753791
unsigned Opc = NewLHS.getOpcode();
3776-
if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3792+
if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
37773793
ShouldFoldNeg = false;
37783794
if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
37793795
ShouldFoldNeg = false;
@@ -3915,8 +3931,6 @@ static unsigned inverseMinMax(unsigned Opc) {
39153931
/// \return true if it's profitable to try to push an fneg into its source
39163932
/// instruction.
39173933
bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
3918-
unsigned Opc = N0.getOpcode();
3919-
39203934
// If the input has multiple uses and we can either fold the negate down, or
39213935
// the other uses cannot, give up. This both prevents unprofitable
39223936
// transformations and infinite loops: we won't repeatedly try to fold around
@@ -3927,7 +3941,7 @@ bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
39273941
if (allUsesHaveSourceMods(N, 0))
39283942
return false;
39293943
} else {
3930-
if (fnegFoldsIntoOp(Opc) &&
3944+
if (fnegFoldsIntoOp(N0.getNode()) &&
39313945
(allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
39323946
return false;
39333947
}
@@ -4133,6 +4147,43 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
41334147
// TODO: Invert conditions of foldFreeOpFromSelect
41344148
return SDValue();
41354149
}
4150+
case ISD::BITCAST: {
4151+
SDLoc SL(N);
4152+
SDValue BCSrc = N0.getOperand(0);
4153+
if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
4154+
SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
4155+
if (HighBits.getValueType().getSizeInBits() != 32 ||
4156+
!fnegFoldsIntoOp(HighBits.getNode()))
4157+
return SDValue();
4158+
4159+
// f64 fneg only really needs to operate on the high half of the
4160+
// register, so try to force it to an f32 operation to help make use of
4161+
// source modifiers.
4162+
//
4163+
//
4164+
// fneg (f64 (bitcast (build_vector x, y))) ->
4165+
// f64 (bitcast (build_vector (bitcast i32:x to f32),
4166+
// (fneg (bitcast i32:y to f32))))
4167+
4168+
SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
4169+
SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
4170+
SDValue CastBack =
4171+
DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
4172+
4173+
SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
4174+
Ops.back() = CastBack;
4175+
DCI.AddToWorklist(NegHi.getNode());
4176+
SDValue Build =
4177+
DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
4178+
SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
4179+
4180+
if (!N0.hasOneUse())
4181+
DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
4182+
return Result;
4183+
}
4184+
4185+
return SDValue();
4186+
}
41364187
default:
41374188
return SDValue();
41384189
}

llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3026,11 +3026,10 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
30263026
; SI-NEXT: s_waitcnt lgkmcnt(0)
30273027
; SI-NEXT: s_and_b32 s4, 1, s4
30283028
; SI-NEXT: s_cselect_b32 s3, 0, s3
3029-
; SI-NEXT: s_cselect_b32 s2, 0, s2
30303029
; SI-NEXT: s_xor_b32 s3, s3, 0x80000000
30313030
; SI-NEXT: s_cmp_eq_u32 s4, 1
3032-
; SI-NEXT: s_cselect_b32 s3, 0, s3
30333031
; SI-NEXT: s_cselect_b32 s2, 0, s2
3032+
; SI-NEXT: s_cselect_b32 s3, 0, s3
30343033
; SI-NEXT: v_mov_b32_e32 v3, s1
30353034
; SI-NEXT: v_mov_b32_e32 v0, s2
30363035
; SI-NEXT: v_mov_b32_e32 v1, s3
@@ -3046,11 +3045,10 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
30463045
; VI-NEXT: s_waitcnt lgkmcnt(0)
30473046
; VI-NEXT: s_and_b32 s4, 1, s4
30483047
; VI-NEXT: s_cselect_b32 s3, 0, s3
3049-
; VI-NEXT: s_cselect_b32 s2, 0, s2
30503048
; VI-NEXT: s_xor_b32 s3, s3, 0x80000000
30513049
; VI-NEXT: s_cmp_eq_u32 s4, 1
3052-
; VI-NEXT: s_cselect_b32 s3, 0, s3
30533050
; VI-NEXT: s_cselect_b32 s2, 0, s2
3051+
; VI-NEXT: s_cselect_b32 s3, 0, s3
30543052
; VI-NEXT: v_mov_b32_e32 v3, s1
30553053
; VI-NEXT: v_mov_b32_e32 v0, s2
30563054
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -3071,7 +3069,6 @@ define double @v_fneg_select_infloop_regression_f64(double %arg, i1 %arg1) {
30713069
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
30723070
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
30733071
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
3074-
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
30753072
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
30763073
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
30773074
; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc

llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -397,21 +397,20 @@ define double @fneg_xor_select_f64(i1 %cond, double %arg0, double %arg1) {
397397
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
398398
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
399399
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
400-
; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
401400
; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
402-
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v2
401+
; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
402+
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
403403
; GCN-NEXT: s_setpc_b64 s[30:31]
404404
;
405405
; GFX11-LABEL: fneg_xor_select_f64:
406406
; GFX11: ; %bb.0:
407407
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
408408
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
409409
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
410-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
410+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
411411
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
412-
; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
413-
; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc_lo
414-
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v2
412+
; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
413+
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
415414
; GFX11-NEXT: s_setpc_b64 s[30:31]
416415
%select = select i1 %cond, double %arg0, double %arg1
417416
%fneg = fneg double %select
@@ -501,28 +500,29 @@ define double @select_fneg_select_fneg_f64(i1 %cond0, i1 %cond1, double %arg0, d
501500
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
502501
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
503502
; GCN-NEXT: v_and_b32_e32 v1, 1, v1
504-
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
505503
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
506-
; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v3
504+
; GCN-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc
505+
; GCN-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
507506
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
508-
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
507+
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
509508
; GCN-NEXT: s_setpc_b64 s[30:31]
510509
;
511510
; GFX11-LABEL: select_fneg_select_fneg_f64:
512511
; GFX11: ; %bb.0:
513512
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
514513
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
515-
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
516514
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
517-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
515+
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
516+
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
517+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
518518
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
519-
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_and_b32 v1, 1, v1
520-
; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo
521-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
519+
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
520+
; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
521+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
522522
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
523-
; GFX11-NEXT: v_xor_b32_e32 v5, 0x80000000, v3
523+
; GFX11-NEXT: v_xor_b32_e32 v3, 0x80000000, v2
524524
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
525-
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
525+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
526526
; GFX11-NEXT: s_setpc_b64 s[30:31]
527527
%fneg0 = fneg double %arg0
528528
%select0 = select i1 %cond0, double %arg1, double %fneg0
@@ -893,12 +893,12 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
893893
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
894894
; GCN-NEXT: v_and_b32_e32 v5, 1, v0
895895
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
896-
; GCN-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc
897-
; GCN-NEXT: v_cndmask_b32_e32 v2, v1, v3, vcc
898-
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v4
896+
; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
897+
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
898+
; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
899899
; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0
900-
; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
901-
; GCN-NEXT: v_mov_b32_e32 v0, v2
900+
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
901+
; GCN-NEXT: v_mov_b32_e32 v0, v3
902902
; GCN-NEXT: s_setpc_b64 s[30:31]
903903
;
904904
; GFX11-LABEL: cospiD_pattern1:
@@ -908,12 +908,13 @@ define double @cospiD_pattern1(i32 %arg, double %arg1, double %arg2) {
908908
; GFX11-NEXT: v_and_b32_e32 v5, 1, v0
909909
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
910910
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5
911-
; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc_lo
912-
; GFX11-NEXT: v_cndmask_b32_e32 v2, v1, v3, vcc_lo
911+
; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo
912+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
913913
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 1, v0
914-
; GFX11-NEXT: v_xor_b32_e32 v5, 0x80000000, v4
915-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
916-
; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_cndmask_b32 v1, v4, v5
914+
; GFX11-NEXT: v_mov_b32_e32 v0, v3
915+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
916+
; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
917+
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
917918
; GFX11-NEXT: s_setpc_b64 s[30:31]
918919
%i = and i32 %arg, 1
919920
%i3 = icmp eq i32 %i, 0
@@ -1390,17 +1391,14 @@ define double @fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64(float
13901391
; GCN-LABEL: fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64:
13911392
; GCN: ; %bb.0:
13921393
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1393-
; GCN-NEXT: v_add_f32_e32 v1, 2.0, v1
1394-
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
1394+
; GCN-NEXT: v_sub_f32_e32 v1, -2.0, v1
13951395
; GCN-NEXT: s_setpc_b64 s[30:31]
13961396
;
13971397
; GFX11-LABEL: fneg_f64_bitcast_build_vector_v2f32_foldable_sources_to_f64:
13981398
; GFX11: ; %bb.0:
13991399
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14001400
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1401-
; GFX11-NEXT: v_add_f32_e32 v1, 2.0, v1
1402-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1403-
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
1401+
; GFX11-NEXT: v_sub_f32_e32 v1, -2.0, v1
14041402
; GFX11-NEXT: s_setpc_b64 s[30:31]
14051403
%fadd = fadd nsz nnan float %elt1, 2.0
14061404
%insert.0 = insertelement <2 x float> poison, float %elt0, i32 0

0 commit comments

Comments
 (0)