diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a248eb7444b20..e99e4a0b3e482 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5681,6 +5681,12 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
   /// only the first Count elements of the vector are used.
   SDValue expandVecReduce(SDNode *Node, SelectionDAG &DAG) const;
 
+  /// Expand FCANONICALIZE into a STRICT_FMUL of the operand by 1.0, placed on
+  /// the entry chain. The node's flags are propagated and NoFPExcept is set;
+  /// the strict node's chain result is intentionally left unused.
+  SDValue expandFCanonicalizeWithStrictFmul(SDNode *Node, const SDLoc &DL,
+                                            SelectionDAG &DAG) const;
+
   /// Expand a VECREDUCE_SEQ_* into an explicit ordered calculation.
   SDValue expandVecReduceSeq(SDNode *Node, SelectionDAG &DAG) const;
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 528136a55f14a..9877015c96269 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3356,29 +3356,8 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     break;
   }
   case ISD::FCANONICALIZE: {
-    // This implements llvm.canonicalize.f* by multiplication with 1.0, as
-    // suggested in
-    // https://llvm.org/docs/LangRef.html#llvm-canonicalize-intrinsic.
-    // It uses strict_fp operations even outside a strict_fp context in order
-    // to guarantee that the canonicalization is not optimized away by later
-    // passes. The result chain introduced by that is intentionally ignored
-    // since no ordering requirement is intended here.
-
-    // Create strict multiplication by 1.0.
-    SDValue Operand = Node->getOperand(0);
-    EVT VT = Operand.getValueType();
-    SDValue One = DAG.getConstantFP(1.0, dl, VT);
-    SDValue Chain = DAG.getEntryNode();
-    SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other},
-                              {Chain, Operand, One});
-
-    // Propagate existing flags on canonicalize, and additionally set
-    // NoFPExcept.
-    SDNodeFlags CanonicalizeFlags = Node->getFlags();
-    CanonicalizeFlags.setNoFPExcept(true);
-    Mul->setFlags(CanonicalizeFlags);
-
-    Results.push_back(Mul);
+    SDValue Result = TLI.expandFCanonicalizeWithStrictFmul(Node, dl, DAG);
+    Results.push_back(Result);
     break;
   }
   case ISD::SIGN_EXTEND_INREG: {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index f908a66128ec8..a8feb22a4f191 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1309,6 +1309,12 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
       return;
     }
     break;
+  case ISD::FCANONICALIZE: {
+    SDLoc dl(Node);
+    SDValue Result = TLI.expandFCanonicalizeWithStrictFmul(Node, dl, DAG);
+    Results.push_back(Result);
+    return;
+  }
   }
 
   SDValue Unrolled = DAG.UnrollVectorOp(Node);
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e0597988e8907..1ec07917075f8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11581,6 +11581,34 @@ SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {
   return Res;
 }
 
+SDValue
+TargetLowering::expandFCanonicalizeWithStrictFmul(SDNode *Node, const SDLoc &DL,
+                                                  SelectionDAG &DAG) const {
+  // This implements llvm.canonicalize.f* by multiplication with 1.0, as
+  // suggested in
+  // https://llvm.org/docs/LangRef.html#llvm-canonicalize-intrinsic.
+  // It uses strict_fp operations even outside a strict_fp context in order
+  // to guarantee that the canonicalization is not optimized away by later
+  // passes. The result chain introduced by that is intentionally ignored
+  // since no ordering requirement is intended here.
+
+  // Create strict multiplication by 1.0.
+  SDValue Operand = Node->getOperand(0);
+  EVT VT = Operand.getValueType();
+  SDValue One = DAG.getConstantFP(1.0, DL, VT);
+  SDValue Chain = DAG.getEntryNode();
+
+  SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, DL, {VT, MVT::Other},
+                            {Chain, Operand, One});
+
+  // Propagate existing flags on canonicalize, and additionally set NoFPExcept.
+  SDNodeFlags Flags = Node->getFlags();
+  Flags.setNoFPExcept(true);
+  Mul->setFlags(Flags);
+
+  return Mul;
+}
+
 SDValue TargetLowering::expandVecReduceSeq(SDNode *Node, SelectionDAG &DAG) const {
   SDLoc dl(Node);
   SDValue AccOp = Node->getOperand(0);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5e35d5630d667..cc24ba995e31a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -316,8 +316,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
       setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
     }
-    setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom);
-    setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom);
     if (Subtarget.is64Bit()) {
       setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
       setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
@@ -348,9 +346,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
   if (!Subtarget.hasSSE2()) {
     setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
-    setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
-    setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom);
-    setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom);
+    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
     if (Subtarget.is64Bit()) {
       setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
       // Without SSE, i64->f64 goes through memory.
@@ -716,7 +712,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote); setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::f16, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); setOperationAction(ISD::LRINT, MVT::f16, Expand); @@ -871,7 +866,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); - setOperationAction(ISD::FCANONICALIZE , MVT::f80, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f80, Expand); if (isTypeLegal(MVT::f16)) { setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); @@ -934,7 +929,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (isTypeLegal(MVT::f80)) { setOperationAction(ISD::FP_ROUND, MVT::f80, Custom); setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f80, Expand); } setOperationAction(ISD::SETCC, MVT::f128, Custom); @@ -1070,11 +1065,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v4f32, Expand); setOperationAction(ISD::LOAD, MVT::v2f32, Custom); setOperationAction(ISD::STORE, MVT::v2f32, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Custom); + 
setOperationAction(ISD::FCANONICALIZE, MVT::v2f32, Expand); setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal); setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal); @@ -1137,7 +1132,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UMULO, MVT::v2i32, Custom); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v2f64, Expand); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); @@ -1473,7 +1468,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMINIMUM, VT, Custom); setOperationAction(ISD::FMAXIMUMNUM, VT, Custom); setOperationAction(ISD::FMINIMUMNUM, VT, Custom); - setOperationAction(ISD::FCANONICALIZE, VT, Custom); + setOperationAction(ISD::FCANONICALIZE, VT, Expand); } setOperationAction(ISD::LRINT, MVT::v8f32, Custom); @@ -1741,9 +1736,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::v8f16, Expand); + setOperationAction(ISD::FCANONICALIZE, MVT::v16f16, Expand); + setOperationAction(ISD::FCANONICALIZE, MVT::v32f16, Expand); // There is no byte sized k-register load or store without AVX512DQ. 
if (!Subtarget.hasDQI()) { @@ -1825,7 +1820,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FMA, VT, Legal); setOperationAction(ISD::STRICT_FMA, VT, Legal); setOperationAction(ISD::FCOPYSIGN, VT, Custom); - setOperationAction(ISD::FCANONICALIZE, VT, Custom); + setOperationAction(ISD::FCANONICALIZE, VT, Expand); } setOperationAction(ISD::LRINT, MVT::v16f32, Subtarget.hasDQI() ? Legal : Custom); @@ -33436,24 +33431,6 @@ static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget, return Op; } -static SDValue LowerFCanonicalize(SDValue Op, SelectionDAG &DAG) { - SDNode *N = Op.getNode(); - SDValue Operand = N->getOperand(0); - EVT VT = Operand.getValueType(); - SDLoc dl(N); - - SDValue One = DAG.getConstantFP(1.0, dl, VT); - - // TODO: Fix Crash for bf16 when generating strict_fmul as it - // leads to a error : SoftPromoteHalfResult #0: t11: bf16,ch = strict_fmul t0, - // ConstantFP:bf16, t5 LLVM ERROR: Do not know how to soft - // promote this operator's result! 
- SDValue Chain = DAG.getEntryNode(); - SDValue StrictFmul = DAG.getNode(ISD::STRICT_FMUL, dl, {VT, MVT::Other}, - {Chain, Operand, One}); - return StrictFmul; -} - static StringRef getInstrStrFromOpNo(const SmallVectorImpl &AsmStrs, unsigned OpNo) { const APInt Operand(32, OpNo); @@ -33593,7 +33570,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::FSHL: case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); - case ISD::FCANONICALIZE: return LowerFCanonicalize(Op, DAG); case ISD::STRICT_SINT_TO_FP: case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::STRICT_UINT_TO_FP: diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index ab476dd96c707..475e6201dd391 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -1,12 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -verify-machineinstrs -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH %s -; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32: -; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; GFX9: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel 
void @test_no_fold_canonicalize_loaded_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_no_fold_canonicalize_loaded_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_no_fold_canonicalize_loaded_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %v = load float, ptr addrspace(1) %gep, align 4 @@ -15,11 +37,31 @@ define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(ptr addrsp ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32: -; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_fmul_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f32_e32 v2, 0x41700000, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; 
+; GFX9-LABEL: test_fold_canonicalize_fmul_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_f32_e32 v1, 0x41700000, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -29,12 +71,31 @@ define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(ptr addrspace(1 ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_legacy_value_f32: -; GCN: v_mul_legacy_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_fmul_legacy_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_legacy_f32_e32 v2, 0x41700000, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_fmul_legacy_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_legacy_f32_e32 v1, 0x41700000, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr 
addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -44,12 +105,31 @@ define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(ptr addr ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32: -; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_sub_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_f32_e32 v2, 0x41700000, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_sub_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_f32_e32 v1, 0x41700000, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -59,12 +139,31 @@ define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(ptr addrspace(1) ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32: -; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_add_value_f32: +; 
VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 0x41700000, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_add_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 0x41700000, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -74,12 +173,31 @@ define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(ptr addrspace(1) ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32: -; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_sqrt_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sqrt_f32_e32 v2, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_sqrt_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sqrt_f32_e32 v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -89,12 +207,31 @@ define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(ptr addrspace(1 ret void } -; GCN-LABEL: test_fold_canonicalize_fceil_value_f32: -; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_fceil_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_ceil_f32_e32 v2, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_fceil_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_ceil_f32_e32 v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -104,12 +241,31 @@ define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(ptr addrspace( ret void } -; GCN-LABEL: 
test_fold_canonicalize_floor_value_f32: -; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_floor_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_floor_f32_e32 v2, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_floor_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_floor_f32_e32 v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -119,13 +275,33 @@ define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(ptr addrspace( ret void } -; GCN-LABEL: test_fold_canonicalize_fma_value_f32: -; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000 -; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]] -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_fma_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: 
v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_mov_b32 s0, 0x41700000 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_fma_f32 v2, v2, s0, s0 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_fma_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_mov_b32 s2, 0x41700000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_fma_f32 v1, v1, s2, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -135,12 +311,33 @@ define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(ptr addrspace(1) ret void } -; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32: -; GCN: v_mac_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+$}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_fmad_ftz_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x41700000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mac_f32_e32 v3, 0x41700000, v2 +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_fmad_ftz_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; 
GFX9-NEXT: v_mov_b32_e32 v2, 0x41700000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mac_f32_e32 v2, 0x41700000, v1 +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -150,15 +347,60 @@ define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(ptr addrspa ret void } -; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32: -; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} -; GCN-DENORM: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000 -; GCN-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]] -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] -; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(ptr addrspace(1) %arg) { +; VI-FLUSH-LABEL: test_fold_canonicalize_fmuladd_value_f32: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-FLUSH-NEXT: v_mov_b32_e32 v3, 0x41700000 +; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-FLUSH-NEXT: flat_load_dword v2, v[0:1] +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) +; VI-FLUSH-NEXT: v_mac_f32_e32 v3, 0x41700000, v2 +; VI-FLUSH-NEXT: flat_store_dword v[0:1], v3 +; VI-FLUSH-NEXT: s_endpgm +; +; VI-DENORM-LABEL: test_fold_canonicalize_fmuladd_value_f32: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1 +; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; 
VI-DENORM-NEXT: flat_load_dword v2, v[0:1] +; VI-DENORM-NEXT: s_mov_b32 s0, 0x41700000 +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) +; VI-DENORM-NEXT: v_fma_f32 v2, v2, s0, s0 +; VI-DENORM-NEXT: flat_store_dword v[0:1], v2 +; VI-DENORM-NEXT: s_endpgm +; +; GFX9-DENORM-LABEL: test_fold_canonicalize_fmuladd_value_f32: +; GFX9-DENORM: ; %bb.0: +; GFX9-DENORM-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-DENORM-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DENORM-NEXT: s_mov_b32 s2, 0x41700000 +; GFX9-DENORM-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DENORM-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) +; GFX9-DENORM-NEXT: v_fma_f32 v1, v1, s2, s2 +; GFX9-DENORM-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-DENORM-NEXT: s_endpgm +; +; GFX9-FLUSH-LABEL: test_fold_canonicalize_fmuladd_value_f32: +; GFX9-FLUSH: ; %bb.0: +; GFX9-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, 0x41700000 +; GFX9-FLUSH-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-FLUSH-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) +; GFX9-FLUSH-NEXT: v_mac_f32_e32 v2, 0x41700000, v1 +; GFX9-FLUSH-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-FLUSH-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -168,15 +410,32 @@ define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(ptr addrspac ret void } -; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32: -; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]], -; VI: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]] -; GFX9: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]] -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: 
test_fold_canonicalize_canonicalize_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_canonicalize_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -186,12 +445,36 @@ define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(ptr add ret void } -; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32: -; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) { +; VI-LABEL: test_fold_canonicalize_fpextend_value_f64_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dword v1, v[1:2] +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v3, 
vcc, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_fpextend_value_f64_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f64_f32_e32 v[1:2], v1 +; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -202,12 +485,36 @@ define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(ptr add ret void } -; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16: -; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(ptr addrspace(1) %arg, ptr addrspace(1) %out) { +; VI-LABEL: test_fold_canonicalize_fpextend_value_f32_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_ushort v1, v[1:2] +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_fpextend_value_f32_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; 
GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id %load = load half, ptr addrspace(1) %gep, align 2 @@ -218,12 +525,36 @@ define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(ptr add ret void } -; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16_flushf16: -; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16_flushf16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 { +; VI-LABEL: test_fold_canonicalize_fpextend_value_f32_f16_flushf16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_ushort v1, v[1:2] +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_fpextend_value_f32_f16_flushf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX9-NEXT: 
global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id %load = load half, ptr addrspace(1) %gep, align 2 @@ -234,12 +565,36 @@ define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16_flushf1 ret void } -; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64: -; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}] -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(ptr addrspace(1) %arg, ptr addrspace(1) %out) { +; VI-LABEL: test_fold_canonicalize_fpround_value_f32_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f32_f64_e32 v2, v[1:2] +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_fpround_value_f32_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f64_e32 v1, v[1:2] +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id %load = load double, ptr addrspace(1) %gep, align 8 @@ -250,12 +605,36 @@ define amdgpu_kernel void 
@test_fold_canonicalize_fpround_value_f32_f64(ptr addr ret void } -; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32: -; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} -; GCN-NOT: v_max -; GCN-NOT: v_mul -; GCN: {{flat|global}}_store_short v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) { +; VI-LABEL: test_fold_canonicalize_fpround_value_f16_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dword v1, v[1:2] +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: flat_store_short v[0:1], v3 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_fpround_value_f16_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -266,12 +645,36 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(ptr addr ret void } -; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32_flushf16: -; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} -; GCN-NOT: v_max -; GCN-NOT: v_mul -; GCN: {{flat|global}}_store_short v{{.+}}, [[V]] define amdgpu_kernel void 
@test_fold_canonicalize_fpround_value_f16_f32_flushf16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #2 { +; VI-LABEL: test_fold_canonicalize_fpround_value_f16_f32_flushf16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dword v1, v[1:2] +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f16_f32_e32 v3, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: flat_store_short v[0:1], v3 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_fpround_value_f16_f32_flushf16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -282,16 +685,43 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16 ret void } -; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32: -; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}} -; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}} -; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]] -; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}} -; GFX9: v_pack_b32_f16 [[V:v[0-9]+]], [[V1]], [[V0]] -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void 
@test_fold_canonicalize_fpround_value_v2f16_v2f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) { +; VI-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x3c00 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc +; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[1:2], v1, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %arg, i32 %id %load = load <2 x float>, ptr addrspace(1) %gep, align 8 @@ -302,10 +732,31 @@ define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(ptr ret void } -; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32: -; VI: v_mul_f32_e32 v{{[0-9]+}}, -1.0, v{{[0-9]+}} -; GFX9: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, 
-v{{[0-9]+}} define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_no_fold_canonicalize_fneg_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f32_e32 v2, -1.0, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_no_fold_canonicalize_fneg_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -315,12 +766,33 @@ define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(ptr addrspac ret void } -; GCN-LABEL: test_fold_canonicalize_fneg_value_f32: -; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_fneg_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 0, v2 +; VI-NEXT: 
v_xor_b32_e32 v2, 0x80000000, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_fneg_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 0, v1 +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -331,10 +803,31 @@ define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(ptr addrspace(1 ret void } -; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32: -; VI: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}| -; GFX9: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_no_fold_canonicalize_fabs_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f32_e64 v2, 1.0, |v2| +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_no_fold_canonicalize_fabs_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 
@llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -344,13 +837,32 @@ define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(ptr addrspac ret void } -; GCN-LABEL: test_no_fold_canonicalize_fcopysign_value_f32: -; VI: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}| -; GFX9: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| -; GCN-NOT: v_mul_ -; GCN-NOT: v_max_ define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(ptr addrspace(1) %arg, float %sign) { +; VI-LABEL: test_no_fold_canonicalize_fcopysign_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f32_e64 v2, 1.0, |v2| +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_no_fold_canonicalize_fcopysign_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e64 v1, |v1|, |v1| +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -362,12 +874,33 @@ define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(ptr add ret void } -; GCN-LABEL: test_fold_canonicalize_fabs_value_f32: -; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void 
@test_fold_canonicalize_fabs_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_fabs_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 0, v2 +; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_fabs_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 0, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -378,12 +911,34 @@ define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(ptr addrspace(1 ret void } -; GCN-LABEL: test_fold_canonicalize_sin_value_f32: -; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_sin_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f32_e32 v2, 
0.15915494, v2 +; VI-NEXT: v_fract_f32_e32 v2, v2 +; VI-NEXT: v_sin_f32_e32 v2, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_sin_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_f32_e32 v1, 0.15915494, v1 +; GFX9-NEXT: v_sin_f32_e32 v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -393,12 +948,34 @@ define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(ptr addrspace(1) ret void } -; GCN-LABEL: test_fold_canonicalize_cos_value_f32: -; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_cos_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f32_e32 v2, 0.15915494, v2 +; VI-NEXT: v_fract_f32_e32 v2, v2 +; VI-NEXT: v_cos_f32_e32 v2, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_cos_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_f32_e32 
v1, 0.15915494, v1 +; GFX9-NEXT: v_cos_f32_e32 v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -408,12 +985,34 @@ define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(ptr addrspace(1) ret void } -; GCN-LABEL: test_fold_canonicalize_sin_value_f16: -; GCN: v_sin_f16_e32 [[V0:v[0-9]+]], v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_short v{{.+}}, [[V0]] define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_sin_value_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 +; VI-NEXT: v_fract_f16_e32 v2, v2 +; VI-NEXT: v_sin_f16_e32 v2, v2 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_sin_value_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX9-NEXT: v_sin_f16_e32 v1, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id %load = load half, ptr addrspace(1) %gep, align 2 @@ -423,12 +1022,34 @@ define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(ptr addrspace(1) ret void } -; GCN-LABEL: 
test_fold_canonicalize_cos_value_f16: -; GCN: v_cos_f16_e32 [[V0:v[0-9]+]], v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_short v{{.+}}, [[V0]] define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_cos_value_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 +; VI-NEXT: v_fract_f16_e32 v2, v2 +; VI-NEXT: v_cos_f16_e32 v2, v2 +; VI-NEXT: flat_store_short v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_cos_value_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX9-NEXT: v_cos_f16_e32 v1, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id %load = load half, ptr addrspace(1) %gep, align 2 @@ -438,12 +1059,27 @@ define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(ptr addrspace(1) ret void } -; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32: -; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000 -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_qNaN_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_mov_b32_e32 v2, 
0x7fc00000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_qNaN_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000) @@ -451,17 +1087,35 @@ define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(ptr addrspace(1 ret void } -; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode: -; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] -; VI: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]] -; GFX9: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]] -; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, [[QUIET]] -; GCN-NOT: v_max -; GCN-NOT: v_mul -; GFX9: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_min_f32_e32 v2, 0, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode: +; GFX9: ; %bb.0: +; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v1, 0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -481,12 +1135,33 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_iee ; ret void ; } -; GCN-LABEL: test_fold_canonicalize_minnum_value_f32: -; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_minnum_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 0, v2 +; VI-NEXT: v_min_f32_e32 v2, 0, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_minnum_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 0, v1 +; GFX9-NEXT: v_min_f32_e32 v1, 0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, 
ptr addrspace(1) %gep, align 4 @@ -499,11 +1174,31 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(ptr addrspace ; FIXME: Should there be more checks here? minnum with NaN operand is simplified away. -; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32: -; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]] -; VI: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[LOAD]] -; GFX9: v_max_f32_e32 v{{[0-9]+}}, [[LOAD]], [[LOAD]] define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_sNaN_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_sNaN_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -523,20 +1218,37 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1 ; ret void ; } -; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode: -; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]] -; GFX9: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]] -; VI-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]] -; VI-FLUSH: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]] -; VI-DENORM: 
v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]] -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]] define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VI-NEXT: v_max_f32_e32 v2, 0, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v1, 0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -546,12 +1258,33 @@ define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32_iee ret void } -; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32: -; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} -; GCN-NOT: v_max -; GCN-NOT: v_mul -; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_maxnum_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) 
+; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 0, v2 +; VI-NEXT: v_max_f32_e32 v2, 0, v2 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_maxnum_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 0, v1 +; GFX9-NEXT: v_max_f32_e32 v1, 0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load = load float, ptr addrspace(1) %gep, align 4 @@ -562,12 +1295,33 @@ define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(ptr addrspace ret void } -; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64: -; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0 -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_maxnum_value_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f64 v[2:3], v[2:3], 0 +; VI-NEXT: v_max_f64 v[2:3], v[2:3], 0 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_maxnum_value_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 0 +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id %load = load double, ptr addrspace(1) %gep, align 8 @@ -578,50 +1332,130 @@ define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(ptr addrspace ret void } -; GCN-LABEL: test_fold_canonicalize_fmul_value_f32_no_ieee: -; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN-NEXT: ; return define amdgpu_ps float @test_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) { +; GCN-LABEL: test_fold_canonicalize_fmul_value_f32_no_ieee: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mul_f32_e32 v0, 0x41700000, v0 +; GCN-NEXT: ; return to shader part epilog entry: %v = fmul float %arg, 15.0 %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ret float %canonicalized } -; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee: -; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} -; GCN-NOT: v_mul -; GCN-NOT: v_max -; GCN-NEXT: ; return define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) { +; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: v_mul_f32_e32 v0, 0x41700000, v0 +; GCN-NEXT: ; return to shader part epilog entry: %v = fmul nnan float %arg, 15.0 %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ret float %canonicalized } -; GCN-LABEL: {{^}}test_fold_canonicalize_fdiv_value_f32_no_ieee: -; GCN: v_div_fixup_f32 -; GCN-NOT: v_max -; GCN-NOT: v_mul -; GCN: ; return define amdgpu_ps float @test_fold_canonicalize_fdiv_value_f32_no_ieee(float %arg0) { +; 
VI-FLUSH-LABEL: test_fold_canonicalize_fdiv_value_f32_no_ieee: +; VI-FLUSH: ; %bb.0: ; %entry +; VI-FLUSH-NEXT: s_mov_b32 s2, 0x41700000 +; VI-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2 +; VI-FLUSH-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 +; VI-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; VI-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; VI-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3 +; VI-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3 +; VI-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2 +; VI-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4 +; VI-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2 +; VI-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; VI-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, s2 +; VI-FLUSH-NEXT: ; return to shader part epilog +; +; VI-DENORM-LABEL: test_fold_canonicalize_fdiv_value_f32_no_ieee: +; VI-DENORM: ; %bb.0: ; %entry +; VI-DENORM-NEXT: s_mov_b32 s2, 0x41700000 +; VI-DENORM-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2 +; VI-DENORM-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 +; VI-DENORM-NEXT: v_rcp_f32_e32 v3, v1 +; VI-DENORM-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; VI-DENORM-NEXT: v_fma_f32 v3, v4, v3, v3 +; VI-DENORM-NEXT: v_mul_f32_e32 v4, v2, v3 +; VI-DENORM-NEXT: v_fma_f32 v5, -v1, v4, v2 +; VI-DENORM-NEXT: v_fma_f32 v4, v5, v3, v4 +; VI-DENORM-NEXT: v_fma_f32 v1, -v1, v4, v2 +; VI-DENORM-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; VI-DENORM-NEXT: v_div_fixup_f32 v0, v1, v0, s2 +; VI-DENORM-NEXT: ; return to shader part epilog +; +; GFX9-DENORM-LABEL: test_fold_canonicalize_fdiv_value_f32_no_ieee: +; GFX9-DENORM: ; %bb.0: ; %entry +; GFX9-DENORM-NEXT: s_mov_b32 s2, 0x41700000 +; GFX9-DENORM-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2 +; GFX9-DENORM-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 +; GFX9-DENORM-NEXT: v_rcp_f32_e32 v3, v1 +; GFX9-DENORM-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GFX9-DENORM-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX9-DENORM-NEXT: v_mul_f32_e32 v4, v2, v3 +; 
GFX9-DENORM-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GFX9-DENORM-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX9-DENORM-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX9-DENORM-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX9-DENORM-NEXT: v_div_fixup_f32 v0, v1, v0, s2 +; GFX9-DENORM-NEXT: ; return to shader part epilog +; +; GFX9-FLUSH-LABEL: test_fold_canonicalize_fdiv_value_f32_no_ieee: +; GFX9-FLUSH: ; %bb.0: ; %entry +; GFX9-FLUSH-NEXT: s_mov_b32 s2, 0x41700000 +; GFX9-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2 +; GFX9-FLUSH-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2 +; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v1 +; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; GFX9-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0 +; GFX9-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3 +; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3 +; GFX9-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2 +; GFX9-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4 +; GFX9-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2 +; GFX9-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; GFX9-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX9-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, s2 +; GFX9-FLUSH-NEXT: ; return to shader part epilog entry: %v = fdiv float 15.0, %arg0 %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ret float %canonicalized } -; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32 -; GFX9-DENORM: global_load_dword [[V:v[0-9]+]], -; GFX9-DENORM: global_store_dword v{{[0-9]+}}, [[V]], s{{\[[0-9]+:[0-9]+\]}} -; GFX9-DENORM-NOT: 1.0 -; GFX9-DENORM-NOT: v_max -; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} -; GFX9-FLUSH: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 { +; VI-LABEL: test_fold_canonicalize_load_nnan_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; 
VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mul_f32_e32 v3, 1.0, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_store_dword v[0:1], v3 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_load_nnan_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %v = load float, ptr addrspace(1) %gep, align 4 @@ -631,12 +1465,34 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(ptr addrsp ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64 -; GCN: {{flat|global}}_load_dwordx2 [[V:v\[[0-9:]+\]]], -; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[V]] -; GCN-NOT: v_mul_ -; GCN-NOT: v_max_ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 { +; VI-LABEL: test_fold_canonicalize_load_nnan_value_f64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; 
VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_load_nnan_value_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, ptr addrspace(1) %arg, i32 %id %v = load double, ptr addrspace(1) %gep, align 8 @@ -646,11 +1502,34 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(ptr addrsp ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16 -; GCN: {{flat|global}}_load_ushort [[V1:v[0-9]+]], -; GCN: v_max_f16_e32 [[V2:v[0-9]+]], [[V1]], [[V1]] -; GCN: {{flat|global}}_store_short v{{.+}}, [[V2]] define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 { +; VI-LABEL: test_fold_canonicalize_load_nnan_value_f16: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_max_f16_e32 v3, v0, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_store_short v[0:1], v3 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_load_nnan_value_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f16_e32 v1, v1, v1 +; 
GFX9-NEXT: global_store_short v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id %v = load half, ptr addrspace(1) %gep, align 2 @@ -660,13 +1539,46 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(ptr addrsp ret void } -; GCN-LABEL: {{^}}test_fold_canonicalize_select_value_f32: -; GCN: v_add_f32 -; GCN: v_add_f32 -; GCN: v_cndmask_b32 -; GCN-NOT: v_mul_ -; GCN-NOT: v_max_ define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(ptr addrspace(1) %arg) { +; VI-LABEL: test_fold_canonicalize_select_value_f32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v2, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v3, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: flat_load_dword v4, v[0:1] glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_f32_e32 v2, 0x41700000, v2 +; VI-NEXT: v_add_f32_e32 v3, 0x42000000, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: test_fold_canonicalize_select_value_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; kill: killed $vgpr0_vgpr1 +; GFX9-NEXT: global_load_dword v3, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_f32_e32 v1, 0x41700000, v1 +; GFX9-NEXT: v_add_f32_e32 v2, 0x42000000, v2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id %load0 = load volatile float, ptr addrspace(1) %gep, align 4 @@ -685,57 +1597,93 @@ define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(ptr addrspace ; passed through the minnum. ; FIXME: canonicalize doens't work correctly without ieee_mode -; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode: -; GFX9-NOT: v0 -; GFX9-NOT: v1 -; GFX9: v_min_f32_e32 v0, v0, v1 -; GFX9-NEXT: ; return to shader -; VI-FLUSH: v_min_f32_e32 v0, v0, v1 -; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; VI-FLUSH-NEXT: ; return -; VI-DENORM-NOT: v0 -; VI-DENORM: v_min_f32_e32 v0, v0, v1 -; VI-DENORM-NEXT: ; return define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) { +; VI-FLUSH-LABEL: test_fold_canonicalize_minnum_value_no_ieee_mode: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: v_min_f32_e32 v0, v0, v1 +; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VI-FLUSH-NEXT: ; return to shader part epilog +; +; VI-DENORM-LABEL: test_fold_canonicalize_minnum_value_no_ieee_mode: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: v_min_f32_e32 v0, v0, v1 +; VI-DENORM-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: test_fold_canonicalize_minnum_value_no_ieee_mode: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: ; return to shader part epilog %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1) %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ret float %canonicalized } -; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_ieee_mode: -; GFX9: v_min_f32_e32 v0, v0, v1 -; GFX9-NEXT: s_setpc_b64 -; VI-DAG: v_mul_f32_e32 v0, 1.0, v0 -; VI-DAG: v_mul_f32_e32 v1, 1.0, v1 -; VI: v_min_f32_e32 v0, v0, v1 -; VI-NEXT: s_setpc_b64 define float 
@test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) { +; VI-LABEL: test_fold_canonicalize_minnum_value_ieee_mode: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_min_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: test_fold_canonicalize_minnum_value_ieee_mode: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1) %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ret float %canonicalized } ; Canonicalizing flush necessary pre-gfx9 -; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode_nnan: -; GCN: v_min_f32_e32 v0, v0, v1 -; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: ; return define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode_nnan(float %arg0, float %arg1) #1 { +; VI-FLUSH-LABEL: test_fold_canonicalize_minnum_value_no_ieee_mode_nnan: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: v_min_f32_e32 v0, v0, v1 +; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VI-FLUSH-NEXT: ; return to shader part epilog +; +; VI-DENORM-LABEL: test_fold_canonicalize_minnum_value_no_ieee_mode_nnan: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: v_min_f32_e32 v0, v0, v1 +; VI-DENORM-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: test_fold_canonicalize_minnum_value_no_ieee_mode_nnan: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: ; return to shader part epilog %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1) %canonicalized = tail call float @llvm.canonicalize.f32(float %v) ret float %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16: -; GFX9-DAG: v_add_f16_e32 -; GFX9-DAG: v_mul_f16_e32 -; 
GFX9-NOT: v_max -; GFX9-NOT: v_pk_max define <2 x half> @v_test_canonicalize_build_vector_v2f16(<2 x half> %vec) { +; VI-LABEL: v_test_canonicalize_build_vector_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_f16_e32 v1, 1.0, v0 +; VI-NEXT: v_mul_f16_e32 v0, 4.0, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_build_vector_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v0 +; GFX9-NEXT: v_mul_f16_e32 v0, 4.0, v0 +; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %lo = extractelement <2 x half> %vec, i32 0 %hi = extractelement <2 x half> %vec, i32 1 %lo.op = fadd half %lo, 1.0 @@ -746,10 +1694,25 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(<2 x half> %vec) { ret <2 x half> %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_noncanon1_v2f16: -; GFX9: v_add_f16_e32 -; GFX9: v_pk_max define <2 x half> @v_test_canonicalize_build_vector_noncanon1_v2f16(<2 x half> %vec) { +; VI-LABEL: v_test_canonicalize_build_vector_noncanon1_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_f16_e32 v1, 1.0, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_build_vector_noncanon1_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v0 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_bfi_b32 v0, s4, v1, v0 +; GFX9-NEXT: v_pk_max_f16 v0, v0, 
v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %lo = extractelement <2 x half> %vec, i32 0 %lo.op = fadd half %lo, 1.0 %ins = insertelement <2 x half> %vec, half %lo.op, i32 0 @@ -757,10 +1720,26 @@ define <2 x half> @v_test_canonicalize_build_vector_noncanon1_v2f16(<2 x half> % ret <2 x half> %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_noncanon0_v2f16: -; GFX9: v_add_f16_sdwa -; GFX9: v_pk_max define <2 x half> @v_test_canonicalize_build_vector_noncanon0_v2f16(<2 x half> %vec) { +; VI-LABEL: v_test_canonicalize_build_vector_noncanon0_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 +; VI-NEXT: v_add_f16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_mul_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_build_vector_noncanon0_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX9-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %hi = extractelement <2 x half> %vec, i32 1 %hi.op = fadd half %hi, 1.0 %ins = insertelement <2 x half> %vec, half %hi.op, i32 1 @@ -768,11 +1747,12 @@ define <2 x half> @v_test_canonicalize_build_vector_noncanon0_v2f16(<2 x half> % ret <2 x half> %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_extract_element_v2f16: -; GFX9: s_waitcnt -; GFX9-NEXT: v_mul_f16_e32 v0, 4.0, v0 -; GFX9-NEXT: s_setpc_b64 define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) { +; GCN-LABEL: v_test_canonicalize_extract_element_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mul_f16_e32 v0, 4.0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %vec.op = fmul <2 x half> %vec, %elt = extractelement <2 x half> %vec.op, i32 0 %canonicalized = call half @llvm.canonicalize.f16(half %elt) @@ -787,22 +1767,73 @@ define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) { ; ret <2 x half> %canonicalized ; } -; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_vec_v2f16: -; GFX9: v_mul_f16 -; GFX9: v_pk_max_f16 v0, v0, v0 -; GFX9-NEXT: s_setpc_b64 define <2 x half> @v_test_canonicalize_insertelement_noncanon_vec_v2f16(<2 x half> %vec, half %val, i32 %idx) { +; VI-LABEL: v_test_canonicalize_insertelement_noncanon_vec_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f16_e32 v1, 0x4800, v1 +; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; VI-NEXT: s_mov_b32 s4, 0xffff +; VI-NEXT: v_or_b32_e32 v1, v1, v3 +; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; VI-NEXT: v_bfi_b32 v0, v2, v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; VI-NEXT: v_mul_f16_e32 v1, 1.0, v0 +; VI-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_insertelement_noncanon_vec_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mul_f16_e32 v1, 0x4800, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_pack_b32_f16 v1, v1, v1 +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; GFX9-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %ins.op = fmul half %val, 8.0 %ins = insertelement <2 x half> %vec, half %ins.op, i32 %idx %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) ret <2 x half> %canonicalized } -; GCN-LABEL: 
{{^}}v_test_canonicalize_insertelement_noncanon_insval_v2f16: -; GFX9: v_pk_mul_f16 -; GFX9: v_pk_max_f16 v0, v0, v0 -; GFX9-NEXT: s_setpc_b64 define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x half> %vec, half %val, i32 %idx) { +; VI-LABEL: v_test_canonicalize_insertelement_noncanon_insval_v2f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, 0x4400 +; VI-NEXT: v_mul_f16_sdwa v3, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 4.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: v_mov_b32_e32 v3, 16 +; VI-NEXT: s_mov_b32 s4, 0xffff +; VI-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; VI-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; VI-NEXT: v_bfi_b32 v0, v2, v1, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; VI-NEXT: v_mul_f16_e32 v1, 1.0, v0 +; VI-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_insertelement_noncanon_insval_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v1, v1, v1, s4 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 +; GFX9-NEXT: s_mov_b32 s4, 0xffff +; GFX9-NEXT: v_pk_mul_f16 v0, v0, 4.0 op_sel_hi:[1,0] +; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s4 +; GFX9-NEXT: v_bfi_b32 v0, v2, v1, v0 +; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] %vec.op = fmul <2 x half> %vec, %ins = insertelement <2 x half> %vec.op, half %val, i32 %idx %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins) @@ -815,101 +1846,157 @@ define <2 x half> 
@v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x ; ret <2 x half> %canonicalized ; } -; GCN-LABEL: {{^}}v_test_canonicalize_cubeid: -; GCN: s_waitcnt -; GCN-NEXT: v_cubeid_f32 v0, v0, v1, v2 -; GCN-NEXT: s_setpc_b64 define float @v_test_canonicalize_cubeid(float %a, float %b, float %c) { +; GCN-LABEL: v_test_canonicalize_cubeid: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cubeid_f32 v0, v0, v1, v2 +; GCN-NEXT: s_setpc_b64 s[30:31] %cvt = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c) %canonicalized = call float @llvm.canonicalize.f32(float %cvt) ret float %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_frexp_mant: -; GCN: s_waitcnt -; GCN-NEXT: v_frexp_mant_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 define float @v_test_canonicalize_frexp_mant(float %a) { +; GCN-LABEL: v_test_canonicalize_frexp_mant: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_frexp_mant_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %cvt = call float @llvm.amdgcn.frexp.mant.f32(float %a) %canonicalized = call float @llvm.canonicalize.f32(float %cvt) ret float %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_amdgcn_log: -; GCN: s_waitcnt -; GCN-NEXT: v_log_f32 -; GCN-NEXT: s_setpc_b64 define float @v_test_canonicalize_amdgcn_log(float %a) { +; GCN-LABEL: v_test_canonicalize_amdgcn_log: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_log_f32_e32 v0, v0 +; GCN-NEXT: s_setpc_b64 s[30:31] %log = call float @llvm.amdgcn.log.f32(float %a) %canonicalized = call float @llvm.canonicalize.f32(float %log) ret float %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_amdgcn_exp2: -; GCN: s_waitcnt -; GCN-NEXT: v_exp_f32 -; GCN-NEXT: s_setpc_b64 define float @v_test_canonicalize_amdgcn_exp2(float %a) { +; GCN-LABEL: v_test_canonicalize_amdgcn_exp2: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_exp_f32_e32 v0, v0 
+; GCN-NEXT: s_setpc_b64 s[30:31] %log = call float @llvm.amdgcn.exp2.f32(float %a) %canonicalized = call float @llvm.canonicalize.f32(float %log) ret float %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_minimum: -; GCN: s_waitcnt -; GCN-NEXT: v_min_f32_e32 [[MIN:v[0-9]+]], v0, v1 -; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 0x7fc00000 -; GCN-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v0, [[K]], [[MIN]], vcc -; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: s_setpc_b64 define float @v_test_canonicalize_minimum(float %a, float %b) { +; VI-FLUSH-LABEL: v_test_canonicalize_minimum: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-FLUSH-NEXT: v_min_f32_e32 v2, v0, v1 +; VI-FLUSH-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; VI-FLUSH-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; VI-FLUSH-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VI-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; VI-DENORM-LABEL: v_test_canonicalize_minimum: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-DENORM-NEXT: v_min_f32_e32 v2, v0, v1 +; VI-DENORM-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; VI-DENORM-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; VI-DENORM-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_minimum: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minimum.f32(float %a, float %b) %canonicalized = call float @llvm.canonicalize.f32(float %min) ret float %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_maximum: -; GCN: s_waitcnt -; GCN-NEXT: v_max_f32_e32 [[MIN:v[0-9]+]], v0, v1 -; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 0x7fc00000 -; GCN-NEXT: 
v_cmp_o_f32_e32 vcc, v0, v1 -; GCN-NEXT: v_cndmask_b32_e32 v0, [[K]], [[MIN]], vcc -; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: s_setpc_b64 define float @v_test_canonicalize_maximum(float %a, float %b) { +; VI-FLUSH-LABEL: v_test_canonicalize_maximum: +; VI-FLUSH: ; %bb.0: +; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-FLUSH-NEXT: v_max_f32_e32 v2, v0, v1 +; VI-FLUSH-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; VI-FLUSH-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; VI-FLUSH-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VI-FLUSH-NEXT: s_setpc_b64 s[30:31] +; +; VI-DENORM-LABEL: v_test_canonicalize_maximum: +; VI-DENORM: ; %bb.0: +; VI-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-DENORM-NEXT: v_max_f32_e32 v2, v0, v1 +; VI-DENORM-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; VI-DENORM-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; VI-DENORM-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; VI-DENORM-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_maximum: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.maximum.f32(float %a, float %b) %canonicalized = call float @llvm.canonicalize.f32(float %min) ret float %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_minimumnum: -; GCN: s_waitcnt -; VI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GCN-NEXT: v_min_f32_e32 v0, v0, v1 -; GCN-NEXT: s_setpc_b64 define float @v_test_canonicalize_minimumnum(float %a, float %b) { +; VI-LABEL: v_test_canonicalize_minimumnum: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; 
VI-NEXT: v_min_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_minimumnum: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.minimumnum.f32(float %a, float %b) %canonicalized = call float @llvm.canonicalize.f32(float %min) ret float %canonicalized } -; GCN-LABEL: {{^}}v_test_canonicalize_maximumnum: -; GCN: s_waitcnt -; VI-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GCN-NEXT: v_max_f32_e32 v0, v0, v1 -; GCN-NEXT: s_setpc_b64 define float @v_test_canonicalize_maximumnum(float %a, float %b) { +; VI-LABEL: v_test_canonicalize_maximumnum: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VI-NEXT: v_max_f32_e32 v0, v0, v1 +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_test_canonicalize_maximumnum: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] %min = call float @llvm.maximumnum.f32(float %a, float %b) %canonicalized = call float @llvm.canonicalize.f32(float %min) ret float %canonicalized @@ -917,7 +2004,6 @@ define float @v_test_canonicalize_maximumnum(float %a, float %b) { ; Avoid failing the test on FreeBSD11.0 which will match the GCN-NOT: 1.0 ; in the .amd_amdgpu_isa "amdgcn-unknown-freebsd11.0--gfx802" directive -; GCN: .amd_amdgpu_isa declare float @llvm.canonicalize.f32(float) #0 declare float @llvm.copysign.f32(float, float) #0 @@ -949,3 +2035,6 @@ declare float @llvm.amdgcn.exp2.f32(float) #0 attributes #0 = { nounwind readnone } attributes #1 = 
{ "no-nans-fp-math"="true" } attributes #2 = { "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN-DENORM: {{.*}} +; GCN-FLUSH: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 9ef48588a51ae..4abfd045bd878 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -199,8 +199,9 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 ; VI-LABEL: v_test_canonicalize_build_vector_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1394,10 +1395,11 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v2, v0, v1 +; VI-NEXT: v_mul_f16_e32 v2, 1.0, v0 +; VI-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1468,10 +1470,11 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr 
addrspace(1) % ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_f16_sdwa v1, |v0|, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e64 v0, |v0|, |v0| -; VI-NEXT: v_or_b32_e32 v2, v0, v1 +; VI-NEXT: v_mul_f16_e64 v2, |v0|, 1.0 +; VI-NEXT: v_mul_f16_sdwa v0, |v0|, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1545,10 +1548,11 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_f16_sdwa v1, -|v0|, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e64 v0, -|v0|, -|v0| -; VI-NEXT: v_or_b32_e32 v2, v0, v1 +; VI-NEXT: v_mul_f16_e64 v2, -|v0|, 1.0 +; VI-NEXT: v_mul_f16_sdwa v0, -|v0|, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1624,10 +1628,11 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) % ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0x3c00 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_f16_sdwa v1, -v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e64 v0, -v0, -v0 -; VI-NEXT: v_or_b32_e32 v2, v0, v1 +; VI-NEXT: v_mul_f16_e64 v2, -v0, 1.0 +; VI-NEXT: 
v_mul_f16_sdwa v0, -v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -1695,12 +1700,13 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, ; VI: ; %bb.0: ; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x3c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s3, s2, 16 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_max_f16_e64 v0, s2, s2 -; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_or_b32_e32 v2, v0, v1 +; VI-NEXT: v_mul_f16_e64 v1, s2, 1.0 +; VI-NEXT: s_lshr_b32 s2, s2, 16 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mul_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v2, v1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -2425,9 +2431,10 @@ define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 { ; VI-LABEL: v_test_canonicalize_var_v3f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; VI-NEXT: v_mul_f16_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -2463,12 +2470,13 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 { ; VI-LABEL: v_test_canonicalize_var_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: 
v_max_f16_sdwa v2, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v3 -; VI-NEXT: v_or_b32_e32 v1, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 +; VI-NEXT: v_mul_f16_sdwa v3, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f16_sdwa v2, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_canonicalize_var_v4f16: @@ -2883,9 +2891,11 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 ; VI-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0 -; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00 +; VI-NEXT: v_mul_f16_e64 v1, s4, 1.0 +; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; VI-NEXT: v_or_b32_e32 v0, v0, v2 +; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: @@ -2932,10 +2942,13 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal ; VI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mov_b32_e32 v3, 0x3c00 +; VI-NEXT: v_mul_f16_e64 v2, s4, 1.0 +; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00 +; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; VI-NEXT: v_or_b32_e32 v1, v2, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16: @@ -2985,10 +2998,13 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half ; VI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0x3c00 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mul_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3044,15 +3060,16 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 { ; VI-LABEL: v_test_canonicalize_var_v6f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v3, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v4, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v2, v2, v2 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v5 -; VI-NEXT: v_or_b32_e32 v1, v1, v4 -; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_mov_b32_e32 v3, 0x3c00 +; 
VI-NEXT: v_mul_f16_sdwa v4, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; VI-NEXT: v_mul_f16_sdwa v5, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f16_sdwa v3, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: v_or_b32_e32 v1, v1, v5 +; VI-NEXT: v_or_b32_e32 v2, v2, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_canonicalize_var_v6f16: @@ -3095,18 +3112,19 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 { ; VI-LABEL: v_test_canonicalize_var_v8f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v6, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v3, v3, v3 -; VI-NEXT: v_max_f16_e32 v2, v2, v2 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v7 -; VI-NEXT: v_or_b32_e32 v1, v1, v6 -; VI-NEXT: v_or_b32_e32 v2, v2, v5 -; VI-NEXT: v_or_b32_e32 v3, v3, v4 +; VI-NEXT: v_mov_b32_e32 v4, 0x3c00 +; VI-NEXT: v_mul_f16_sdwa v5, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; VI-NEXT: v_mul_f16_sdwa v6, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; VI-NEXT: v_mul_f16_sdwa v7, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 
v1, 1.0, v1 +; VI-NEXT: v_mul_f16_sdwa v4, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: v_or_b32_e32 v1, v1, v7 +; VI-NEXT: v_or_b32_e32 v2, v2, v6 +; VI-NEXT: v_or_b32_e32 v3, v3, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_canonicalize_var_v8f16: @@ -3155,24 +3173,25 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 { ; VI-LABEL: v_test_canonicalize_var_v12f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v6, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v7, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v8, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v9, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v10, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v11, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v5, v5, v5 -; VI-NEXT: v_max_f16_e32 v4, v4, v4 -; VI-NEXT: v_max_f16_e32 v3, v3, v3 -; VI-NEXT: v_max_f16_e32 v2, v2, v2 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v11 -; VI-NEXT: v_or_b32_e32 v1, v1, v10 -; VI-NEXT: v_or_b32_e32 v2, v2, v9 -; VI-NEXT: v_or_b32_e32 v3, v3, v8 -; VI-NEXT: v_or_b32_e32 v4, v4, v7 -; VI-NEXT: v_or_b32_e32 v5, v5, v6 +; VI-NEXT: v_mov_b32_e32 v6, 0x3c00 +; VI-NEXT: v_mul_f16_sdwa v7, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v5, 1.0, v5 +; VI-NEXT: v_mul_f16_sdwa v8, v4, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v4, 1.0, v4 +; 
VI-NEXT: v_mul_f16_sdwa v9, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; VI-NEXT: v_mul_f16_sdwa v10, v2, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; VI-NEXT: v_mul_f16_sdwa v11, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f16_sdwa v6, v0, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v6 +; VI-NEXT: v_or_b32_e32 v1, v1, v11 +; VI-NEXT: v_or_b32_e32 v2, v2, v10 +; VI-NEXT: v_or_b32_e32 v3, v3, v9 +; VI-NEXT: v_or_b32_e32 v4, v4, v8 +; VI-NEXT: v_or_b32_e32 v5, v5, v7 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_canonicalize_var_v12f16: @@ -3233,30 +3252,31 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 { ; VI-LABEL: v_test_canonicalize_var_v16f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v8, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v9, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v10, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v11, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v12, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v13, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v14, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v15, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v7, v7, v7 -; VI-NEXT: 
v_max_f16_e32 v6, v6, v6 -; VI-NEXT: v_max_f16_e32 v5, v5, v5 -; VI-NEXT: v_max_f16_e32 v4, v4, v4 -; VI-NEXT: v_max_f16_e32 v3, v3, v3 -; VI-NEXT: v_max_f16_e32 v2, v2, v2 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v15 -; VI-NEXT: v_or_b32_e32 v1, v1, v14 -; VI-NEXT: v_or_b32_e32 v2, v2, v13 -; VI-NEXT: v_or_b32_e32 v3, v3, v12 -; VI-NEXT: v_or_b32_e32 v4, v4, v11 -; VI-NEXT: v_or_b32_e32 v5, v5, v10 -; VI-NEXT: v_or_b32_e32 v6, v6, v9 -; VI-NEXT: v_or_b32_e32 v7, v7, v8 +; VI-NEXT: v_mov_b32_e32 v8, 0x3c00 +; VI-NEXT: v_mul_f16_sdwa v9, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v7, 1.0, v7 +; VI-NEXT: v_mul_f16_sdwa v10, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v6, 1.0, v6 +; VI-NEXT: v_mul_f16_sdwa v11, v5, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v5, 1.0, v5 +; VI-NEXT: v_mul_f16_sdwa v12, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v4, 1.0, v4 +; VI-NEXT: v_mul_f16_sdwa v13, v3, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; VI-NEXT: v_mul_f16_sdwa v14, v2, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; VI-NEXT: v_mul_f16_sdwa v15, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; VI-NEXT: v_mul_f16_sdwa v8, v0, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v8 +; VI-NEXT: v_or_b32_e32 v1, v1, v15 +; VI-NEXT: v_or_b32_e32 v2, v2, v14 +; VI-NEXT: v_or_b32_e32 v3, v3, v13 +; VI-NEXT: v_or_b32_e32 v4, v4, v12 +; VI-NEXT: v_or_b32_e32 v5, v5, v11 +; VI-NEXT: v_or_b32_e32 v6, v6, v10 +; 
VI-NEXT: v_or_b32_e32 v7, v7, v9 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_canonicalize_var_v16f16: @@ -3329,54 +3349,55 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 { ; VI-LABEL: v_test_canonicalize_var_v32f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v19, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_mov_b32_e32 v16, 0x3c00 +; VI-NEXT: v_mul_f16_sdwa v19, v0, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 ; VI-NEXT: v_or_b32_e32 v0, v0, v19 -; VI-NEXT: v_max_f16_sdwa v19, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 +; VI-NEXT: v_mul_f16_sdwa v19, v1, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; VI-NEXT: v_or_b32_e32 v1, v1, v19 -; VI-NEXT: v_max_f16_sdwa v19, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v2, v2, v2 +; VI-NEXT: v_mul_f16_sdwa v19, v2, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v2, 1.0, v2 ; VI-NEXT: v_or_b32_e32 v2, v2, v19 -; VI-NEXT: v_max_f16_sdwa v19, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v3, v3, v3 +; VI-NEXT: v_mul_f16_sdwa v19, v3, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v3, 1.0, v3 ; VI-NEXT: v_or_b32_e32 v3, v3, v19 -; VI-NEXT: v_max_f16_sdwa v19, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v4, v4, v4 +; VI-NEXT: v_mul_f16_sdwa v19, v4, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v4, 1.0, v4 ; VI-NEXT: v_or_b32_e32 v4, v4, v19 -; 
VI-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v5, v5, v5 +; VI-NEXT: v_mul_f16_sdwa v19, v5, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v5, 1.0, v5 ; VI-NEXT: v_or_b32_e32 v5, v5, v19 -; VI-NEXT: v_max_f16_sdwa v19, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v6, v6, v6 +; VI-NEXT: v_mul_f16_sdwa v19, v6, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v6, 1.0, v6 ; VI-NEXT: v_or_b32_e32 v6, v6, v19 -; VI-NEXT: v_max_f16_sdwa v19, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v7, v7, v7 +; VI-NEXT: v_mul_f16_sdwa v19, v7, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v7, 1.0, v7 ; VI-NEXT: v_or_b32_e32 v7, v7, v19 -; VI-NEXT: v_max_f16_sdwa v19, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v8, v8, v8 +; VI-NEXT: v_mul_f16_sdwa v19, v8, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v8, 1.0, v8 ; VI-NEXT: v_or_b32_e32 v8, v8, v19 -; VI-NEXT: v_max_f16_sdwa v19, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v9, v9, v9 +; VI-NEXT: v_mul_f16_sdwa v19, v9, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v9, 1.0, v9 ; VI-NEXT: v_or_b32_e32 v9, v9, v19 -; VI-NEXT: v_max_f16_sdwa v19, v10, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v10, v10, v10 +; VI-NEXT: v_mul_f16_sdwa v19, v10, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v10, 1.0, v10 ; VI-NEXT: v_or_b32_e32 v10, v10, v19 -; VI-NEXT: v_max_f16_sdwa 
v19, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v11, v11, v11 -; VI-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; VI-NEXT: v_mul_f16_sdwa v19, v11, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v11, 1.0, v11 +; VI-NEXT: v_mul_f16_sdwa v17, v15, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v15, 1.0, v15 +; VI-NEXT: v_mul_f16_sdwa v18, v14, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v14, 1.0, v14 ; VI-NEXT: v_or_b32_e32 v11, v11, v19 -; VI-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v15, v15, v15 -; VI-NEXT: v_max_f16_e32 v14, v14, v14 -; VI-NEXT: v_max_f16_e32 v13, v13, v13 -; VI-NEXT: v_max_f16_e32 v12, v12, v12 -; VI-NEXT: v_or_b32_e32 v12, v12, v19 -; VI-NEXT: v_or_b32_e32 v13, v13, v18 -; VI-NEXT: v_or_b32_e32 v14, v14, v17 -; VI-NEXT: v_or_b32_e32 v15, v15, v16 +; VI-NEXT: v_mul_f16_sdwa v19, v13, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v13, 1.0, v13 +; VI-NEXT: v_mul_f16_sdwa v16, v12, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v12, 1.0, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v16 +; VI-NEXT: v_or_b32_e32 v13, v13, v19 +; VI-NEXT: v_or_b32_e32 v14, v14, v18 +; VI-NEXT: v_or_b32_e32 v15, v15, v17 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_canonicalize_var_v32f16: @@ -3499,104 +3520,105 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 { ; 
VI-LABEL: v_test_canonicalize_var_v64f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v31, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 -; VI-NEXT: v_or_b32_e32 v0, v0, v31 -; VI-NEXT: v_max_f16_sdwa v31, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_or_b32_e32 v1, v1, v31 -; VI-NEXT: v_max_f16_sdwa v31, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v2, v2, v2 -; VI-NEXT: v_or_b32_e32 v2, v2, v31 -; VI-NEXT: v_max_f16_sdwa v31, v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v3, v3, v3 -; VI-NEXT: v_or_b32_e32 v3, v3, v31 -; VI-NEXT: v_max_f16_sdwa v31, v4, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v4, v4, v4 -; VI-NEXT: v_or_b32_e32 v4, v4, v31 -; VI-NEXT: v_max_f16_sdwa v31, v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v5, v5, v5 -; VI-NEXT: v_or_b32_e32 v5, v5, v31 -; VI-NEXT: v_max_f16_sdwa v31, v6, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v6, v6, v6 -; VI-NEXT: v_or_b32_e32 v6, v6, v31 -; VI-NEXT: v_max_f16_sdwa v31, v7, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v7, v7, v7 -; VI-NEXT: v_or_b32_e32 v7, v7, v31 -; VI-NEXT: v_max_f16_sdwa v31, v8, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v8, v8, v8 -; VI-NEXT: v_or_b32_e32 v8, v8, v31 -; VI-NEXT: v_max_f16_sdwa v31, v9, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v9, v9, v9 -; VI-NEXT: v_or_b32_e32 v9, v9, v31 -; VI-NEXT: v_max_f16_sdwa v31, v10, v10 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v10, v10, v10 -; VI-NEXT: v_or_b32_e32 v10, v10, v31 -; VI-NEXT: v_max_f16_sdwa v31, v11, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v11, v11, v11 -; VI-NEXT: v_or_b32_e32 v11, v11, v31 -; VI-NEXT: v_max_f16_sdwa v31, v12, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v12, v12, v12 -; VI-NEXT: v_or_b32_e32 v12, v12, v31 -; VI-NEXT: v_max_f16_sdwa v31, v13, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v13, v13, v13 -; VI-NEXT: v_or_b32_e32 v13, v13, v31 -; VI-NEXT: v_max_f16_sdwa v31, v14, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v14, v14, v14 -; VI-NEXT: v_or_b32_e32 v14, v14, v31 -; VI-NEXT: v_max_f16_sdwa v31, v15, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v15, v15, v15 -; VI-NEXT: v_or_b32_e32 v15, v15, v31 -; VI-NEXT: v_max_f16_sdwa v31, v16, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v16, v16, v16 -; VI-NEXT: v_or_b32_e32 v16, v16, v31 -; VI-NEXT: v_max_f16_sdwa v31, v17, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v17, v17, v17 -; VI-NEXT: v_or_b32_e32 v17, v17, v31 -; VI-NEXT: v_max_f16_sdwa v31, v18, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v18, v18, v18 -; VI-NEXT: v_or_b32_e32 v18, v18, v31 -; VI-NEXT: v_max_f16_sdwa v31, v19, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v19, v19, v19 -; VI-NEXT: v_or_b32_e32 v19, v19, v31 -; VI-NEXT: v_max_f16_sdwa v31, v20, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v20, v20, v20 -; VI-NEXT: 
v_or_b32_e32 v20, v20, v31 -; VI-NEXT: v_max_f16_sdwa v31, v21, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v21, v21, v21 -; VI-NEXT: v_or_b32_e32 v21, v21, v31 -; VI-NEXT: v_max_f16_sdwa v31, v22, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v22, v22, v22 -; VI-NEXT: v_or_b32_e32 v22, v22, v31 -; VI-NEXT: v_max_f16_sdwa v31, v23, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v23, v23, v23 -; VI-NEXT: v_or_b32_e32 v23, v23, v31 -; VI-NEXT: v_max_f16_sdwa v31, v24, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v24, v24, v24 -; VI-NEXT: v_or_b32_e32 v24, v24, v31 -; VI-NEXT: v_max_f16_sdwa v31, v25, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v25, v25, v25 -; VI-NEXT: v_or_b32_e32 v25, v25, v31 -; VI-NEXT: v_max_f16_sdwa v31, v26, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v26, v26, v26 -; VI-NEXT: v_or_b32_e32 v26, v26, v31 -; VI-NEXT: v_max_f16_sdwa v31, v27, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v27, v27, v27 -; VI-NEXT: v_or_b32_e32 v27, v27, v31 -; VI-NEXT: v_max_f16_sdwa v31, v28, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v28, v28, v28 -; VI-NEXT: v_or_b32_e32 v28, v28, v31 -; VI-NEXT: v_max_f16_sdwa v31, v29, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v29, v29, v29 -; VI-NEXT: v_or_b32_e32 v29, v29, v31 -; VI-NEXT: v_max_f16_sdwa v31, v30, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v30, v30, v30 -; VI-NEXT: v_or_b32_e32 v30, v30, v31 -; VI-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; VI-NEXT: v_mov_b32_e32 v31, 
0x3c00 +; VI-NEXT: v_mul_f16_sdwa v32, v0, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v1, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; VI-NEXT: v_or_b32_e32 v1, v1, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v2, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v3, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; VI-NEXT: v_or_b32_e32 v3, v3, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v4, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v4, 1.0, v4 +; VI-NEXT: v_or_b32_e32 v4, v4, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v5, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v5, 1.0, v5 +; VI-NEXT: v_or_b32_e32 v5, v5, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v6, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v6, 1.0, v6 +; VI-NEXT: v_or_b32_e32 v6, v6, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v7, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v7, 1.0, v7 +; VI-NEXT: v_or_b32_e32 v7, v7, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v8, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v8, 1.0, v8 +; VI-NEXT: v_or_b32_e32 v8, v8, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v9, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v9, 1.0, v9 +; VI-NEXT: v_or_b32_e32 v9, v9, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v10, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v10, 1.0, v10 +; VI-NEXT: 
v_or_b32_e32 v10, v10, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v11, 1.0, v11 +; VI-NEXT: v_or_b32_e32 v11, v11, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v12, 1.0, v12 +; VI-NEXT: v_or_b32_e32 v12, v12, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v13, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v13, 1.0, v13 +; VI-NEXT: v_or_b32_e32 v13, v13, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v14, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v14, 1.0, v14 +; VI-NEXT: v_or_b32_e32 v14, v14, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v15, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v15, 1.0, v15 +; VI-NEXT: v_or_b32_e32 v15, v15, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v16, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v16, 1.0, v16 +; VI-NEXT: v_or_b32_e32 v16, v16, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v17, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v17, 1.0, v17 +; VI-NEXT: v_or_b32_e32 v17, v17, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v18, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v18, 1.0, v18 +; VI-NEXT: v_or_b32_e32 v18, v18, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v19, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v19, 1.0, v19 +; VI-NEXT: v_or_b32_e32 v19, v19, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v20, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v20, 1.0, v20 +; VI-NEXT: v_or_b32_e32 v20, v20, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v21, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v21, 1.0, v21 +; VI-NEXT: v_or_b32_e32 v21, v21, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v22, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v22, 1.0, v22 +; VI-NEXT: v_or_b32_e32 v22, v22, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v23, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v23, 1.0, v23 +; VI-NEXT: v_or_b32_e32 v23, v23, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v24, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v24, 1.0, v24 +; VI-NEXT: v_or_b32_e32 v24, v24, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v25, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v25, 1.0, v25 +; VI-NEXT: v_or_b32_e32 v25, v25, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v26, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v26, 1.0, v26 +; VI-NEXT: v_or_b32_e32 v26, v26, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v27, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v27, 1.0, v27 +; VI-NEXT: v_or_b32_e32 v27, v27, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v28, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v28, 1.0, v28 +; VI-NEXT: v_or_b32_e32 v28, v28, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v29, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v29, 1.0, v29 +; VI-NEXT: v_or_b32_e32 v29, v29, v32 +; VI-NEXT: v_mul_f16_sdwa v32, v30, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v30, 1.0, v30 +; VI-NEXT: v_or_b32_e32 v30, v30, v32 +; VI-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_max_f16_sdwa v32, v31, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; 
VI-NEXT: v_max_f16_e32 v31, v31, v31 -; VI-NEXT: v_or_b32_e32 v31, v31, v32 +; VI-NEXT: v_mul_f16_sdwa v31, v32, v31 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_e32 v32, 1.0, v32 +; VI-NEXT: v_or_b32_e32 v31, v32, v31 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_canonicalize_var_v64f16: diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index bc541043f1fab..900f81811af15 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -2506,9 +2506,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1) ; GFX8-NEXT: v_mov_b32_e32 v1, 0x3c00 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX8-NEXT: v_mul_f16_e32 v0, 1.0, v0 -; GFX8-NEXT: v_or_b32_e32 v4, v0, v1 +; GFX8-NEXT: v_mul_f16_e32 v4, 1.0, v0 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v4, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v4 @@ -2886,14 +2886,15 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, 0x3c00 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-NEXT: v_or_b32_e32 v3, v0, v3 +; GFX8-NEXT: v_mul_f16_e32 v4, 1.0, v0 +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v4, v4, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_store_dword v[0:1], v3 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: flat_store_dword v[0:1], v4 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: test_canonicalize_value_v2f16_denorm: @@ -2957,8 +2958,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX6-NEXT: v_mov_b32_e32 v5, s1 ; GFX6-NEXT: v_mov_b32_e32 v4, s0 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX6-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX6-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0 +; GFX6-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0 ; GFX6-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX6-NEXT: s_endpgm ; @@ -2977,8 +2978,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX8-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0 +; GFX8-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: s_endpgm ; @@ -2990,8 +2991,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] ; GFX9-NEXT: s_endpgm ; @@ -3005,8 +3006,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f64 v[2:3], 
v[2:3], v[2:3] -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX11-NEXT: s_endpgm ; @@ -3020,8 +3021,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out) ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: global_load_b128 v[0:3], v0, s[0:1] ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX12-NEXT: v_mul_f64_e32 v[2:3], 1.0, v[2:3] +; GFX12-NEXT: v_mul_f64_e32 v[0:1], 1.0, v[0:1] ; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1] ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -3044,14 +3045,14 @@ define <2 x float> @v_test_canonicalize_v2f32_flush(<2 x float> %arg) #1 { ; GFX9-LABEL: v_test_canonicalize_v2f32_flush: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 1.0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_v2f32_flush: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_test_canonicalize_v2f32_flush: @@ -3061,7 +3062,7 @@ define <2 x float> @v_test_canonicalize_v2f32_flush(<2 x float> %arg) #1 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 +; GFX12-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %canon = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> %arg) ret <2 x float> 
%canon @@ -3080,16 +3081,16 @@ define <3 x float> @v_test_canonicalize_v3f32_flush(<3 x float> %arg) #1 { ; GFX9-LABEL: v_test_canonicalize_v3f32_flush: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_v3f32_flush: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GFX11-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX11-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1 +; GFX11-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_test_canonicalize_v3f32_flush: @@ -3099,8 +3100,8 @@ define <3 x float> @v_test_canonicalize_v3f32_flush(<3 x float> %arg) #1 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 -; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v2 +; GFX12-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1 +; GFX12-NEXT: v_mul_f32_e32 v2, 1.0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] %canon = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> %arg) ret <3 x float> %canon @@ -3120,17 +3121,17 @@ define <4 x float> @v_test_canonicalize_v4f32_flush(<4 x float> %arg) #1 { ; GFX9-LABEL: v_test_canonicalize_v4f32_flush: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: 
v_mul_f32_e32 v3, 1.0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_v4f32_flush: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 +; GFX11-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1 +; GFX11-NEXT: v_dual_mul_f32 v2, 1.0, v2 :: v_dual_mul_f32 v3, 1.0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_test_canonicalize_v4f32_flush: @@ -3140,8 +3141,8 @@ define <4 x float> @v_test_canonicalize_v4f32_flush(<4 x float> %arg) #1 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 -; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 +; GFX12-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1 +; GFX12-NEXT: v_dual_mul_f32 v2, 1.0, v2 :: v_dual_mul_f32 v3, 1.0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %canon = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %arg) ret <4 x float> %canon @@ -3165,23 +3166,23 @@ define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 { ; GFX9-LABEL: v_test_canonicalize_v8f32_flush: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f32_e32 v0, v0, v0 -; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 -; GFX9-NEXT: v_max_f32_e32 v2, v2, v2 -; GFX9-NEXT: v_max_f32_e32 v3, v3, v3 -; GFX9-NEXT: v_max_f32_e32 v4, v4, v4 -; GFX9-NEXT: v_max_f32_e32 v5, v5, v5 -; GFX9-NEXT: v_max_f32_e32 v6, v6, v6 -; GFX9-NEXT: v_max_f32_e32 v7, v7, v7 +; GFX9-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX9-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX9-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX9-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; GFX9-NEXT: v_mul_f32_e32 v4, 1.0, v4 +; GFX9-NEXT: v_mul_f32_e32 v5, 1.0, v5 +; GFX9-NEXT: v_mul_f32_e32 v6, 1.0, v6 +; GFX9-NEXT: 
v_mul_f32_e32 v7, 1.0, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_v8f32_flush: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 -; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 -; GFX11-NEXT: v_dual_max_f32 v4, v4, v4 :: v_dual_max_f32 v5, v5, v5 -; GFX11-NEXT: v_dual_max_f32 v6, v6, v6 :: v_dual_max_f32 v7, v7, v7 +; GFX11-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1 +; GFX11-NEXT: v_dual_mul_f32 v2, 1.0, v2 :: v_dual_mul_f32 v3, 1.0, v3 +; GFX11-NEXT: v_dual_mul_f32 v4, 1.0, v4 :: v_dual_mul_f32 v5, 1.0, v5 +; GFX11-NEXT: v_dual_mul_f32 v6, 1.0, v6 :: v_dual_mul_f32 v7, 1.0, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_test_canonicalize_v8f32_flush: @@ -3191,10 +3192,10 @@ define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_max_num_f32 v0, v0, v0 :: v_dual_max_num_f32 v1, v1, v1 -; GFX12-NEXT: v_dual_max_num_f32 v2, v2, v2 :: v_dual_max_num_f32 v3, v3, v3 -; GFX12-NEXT: v_dual_max_num_f32 v4, v4, v4 :: v_dual_max_num_f32 v5, v5, v5 -; GFX12-NEXT: v_dual_max_num_f32 v6, v6, v6 :: v_dual_max_num_f32 v7, v7, v7 +; GFX12-NEXT: v_dual_mul_f32 v0, 1.0, v0 :: v_dual_mul_f32 v1, 1.0, v1 +; GFX12-NEXT: v_dual_mul_f32 v2, 1.0, v2 :: v_dual_mul_f32 v3, 1.0, v3 +; GFX12-NEXT: v_dual_mul_f32 v4, 1.0, v4 :: v_dual_mul_f32 v5, 1.0, v5 +; GFX12-NEXT: v_dual_mul_f32 v6, 1.0, v6 :: v_dual_mul_f32 v7, 1.0, v7 ; GFX12-NEXT: s_setpc_b64 s[30:31] %canon = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> %arg) ret <8 x float> %canon @@ -3204,22 +3205,22 @@ define <2 x double> @v_test_canonicalize_v2f64(<2 x double> %arg) #1 { ; GFX678-LABEL: v_test_canonicalize_v2f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX678-NEXT: v_max_f64 
v[0:1], v[0:1], v[0:1] -; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX678-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0 +; GFX678-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0 ; GFX678-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_canonicalize_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_test_canonicalize_v2f64: @@ -3229,8 +3230,8 @@ define <2 x double> @v_test_canonicalize_v2f64(<2 x double> %arg) #1 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX12-NEXT: v_mul_f64_e32 v[0:1], 1.0, v[0:1] +; GFX12-NEXT: v_mul_f64_e32 v[2:3], 1.0, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] %canon = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> %arg) ret <2 x double> %canon @@ -3240,25 +3241,25 @@ define <3 x double> @v_test_canonicalize_v3f64(<3 x double> %arg) #1 { ; GFX678-LABEL: v_test_canonicalize_v3f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX678-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX678-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX678-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0 +; GFX678-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0 +; GFX678-NEXT: v_mul_f64 v[4:5], v[4:5], 1.0 ; GFX678-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: 
v_test_canonicalize_v3f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_mul_f64 v[4:5], v[4:5], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_v3f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_mul_f64 v[4:5], v[4:5], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_test_canonicalize_v3f64: @@ -3268,9 +3269,9 @@ define <3 x double> @v_test_canonicalize_v3f64(<3 x double> %arg) #1 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] +; GFX12-NEXT: v_mul_f64_e32 v[0:1], 1.0, v[0:1] +; GFX12-NEXT: v_mul_f64_e32 v[2:3], 1.0, v[2:3] +; GFX12-NEXT: v_mul_f64_e32 v[4:5], 1.0, v[4:5] ; GFX12-NEXT: s_setpc_b64 s[30:31] %canon = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> %arg) ret <3 x double> %canon @@ -3280,28 +3281,28 @@ define <4 x double> @v_test_canonicalize_v4f64(<4 x double> %arg) #1 { ; GFX678-LABEL: v_test_canonicalize_v4f64: ; GFX678: ; %bb.0: ; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX678-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX678-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX678-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX678-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX678-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0 +; GFX678-NEXT: v_mul_f64 
v[2:3], v[2:3], 1.0 +; GFX678-NEXT: v_mul_f64 v[4:5], v[4:5], 1.0 +; GFX678-NEXT: v_mul_f64 v[6:7], v[6:7], 1.0 ; GFX678-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_test_canonicalize_v4f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX9-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0 +; GFX9-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0 +; GFX9-NEXT: v_mul_f64 v[4:5], v[4:5], 1.0 +; GFX9-NEXT: v_mul_f64 v[6:7], v[6:7], 1.0 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_test_canonicalize_v4f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] -; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] -; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[4:5] -; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[6:7] +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 1.0 +; GFX11-NEXT: v_mul_f64 v[2:3], v[2:3], 1.0 +; GFX11-NEXT: v_mul_f64 v[4:5], v[4:5], 1.0 +; GFX11-NEXT: v_mul_f64 v[6:7], v[6:7], 1.0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_test_canonicalize_v4f64: @@ -3311,10 +3312,10 @@ define <4 x double> @v_test_canonicalize_v4f64(<4 x double> %arg) #1 { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] -; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] -; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[4:5] -; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[6:7], v[6:7] +; GFX12-NEXT: v_mul_f64_e32 v[0:1], 1.0, v[0:1] +; GFX12-NEXT: v_mul_f64_e32 v[2:3], 1.0, v[2:3] +; GFX12-NEXT: v_mul_f64_e32 v[4:5], 1.0, v[4:5] +; GFX12-NEXT: v_mul_f64_e32 v[6:7], 1.0, v[6:7] ; GFX12-NEXT: s_setpc_b64 s[30:31] %canon = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %arg) ret <4 x double> %canon diff --git 
a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll index 2f08931f2287e..78ee94e8c372c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll @@ -1291,22 +1291,26 @@ define <3 x half> @v_max3_v3f16_maximumnum_maximumnum__v_v_v_0(<3 x half> %a, <3 ; GFX8-LABEL: v_max3_v3f16_maximumnum_maximumnum__v_v_v_0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v6, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_mul_f16_e32 v6, 1.0, v2 +; GFX8-NEXT: v_mul_f16_e32 v7, 1.0, v0 +; GFX8-NEXT: v_max_f16_e32 v6, v7, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, 0x3c00 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v3 +; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v6, v7, v6 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v5 +; GFX8-NEXT: v_max_f16_e32 v1, v1, v2 +; GFX8-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-NEXT: 
v_mul_f16_sdwa v2, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v6 +; GFX8-NEXT: v_mul_f16_e32 v3, 1.0, v4 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: v_max3_v3f16_maximumnum_maximumnum__v_v_v_0: @@ -1467,28 +1471,33 @@ define <4 x half> @v_max3_v4f16_maximumnum_maximumnum__v_v_v_0(<4 x half> %a, <4 ; GFX8-LABEL: v_max3_v4f16_maximumnum_maximumnum__v_v_v_0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v6, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_mul_f16_e32 v6, 1.0, v2 +; GFX8-NEXT: v_mul_f16_e32 v7, 1.0, v0 ; GFX8-NEXT: v_max_f16_e32 v6, v7, v6 -; GFX8-NEXT: v_max_f16_sdwa v7, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v7, 0x3c00 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_max_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_max_f16_e32 v7, v8, v7 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: 
v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_max_f16_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_max_f16_e32 v1, v1, v5 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v3 +; GFX8-NEXT: v_mul_f16_e32 v8, 1.0, v1 +; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_max_f16_e32 v2, v8, v2 +; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f16_sdwa v3, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; GFX8-NEXT: v_mul_f16_e32 v3, 1.0, v5 +; GFX8-NEXT: v_max_f16_e32 v2, v2, v3 +; GFX8-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f16_sdwa v3, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_e32 v3, 1.0, v6 +; GFX8-NEXT: v_mul_f16_e32 v4, 1.0, v4 +; GFX8-NEXT: v_max_f16_e32 v3, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: v_max3_v4f16_maximumnum_maximumnum__v_v_v_0: diff --git a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll index 969c6c3980fc3..0a7f7810051c2 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll @@ -1291,22 +1291,26 
@@ define <3 x half> @v_min3_v3f16_minimumnum_minimumnum__v_v_v_0(<3 x half> %a, <3 ; GFX8-LABEL: v_min3_v3f16_minimumnum_minimumnum__v_v_v_0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v6, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_mul_f16_e32 v6, 1.0, v2 +; GFX8-NEXT: v_mul_f16_e32 v7, 1.0, v0 +; GFX8-NEXT: v_min_f16_e32 v6, v7, v6 +; GFX8-NEXT: v_mov_b32_e32 v7, 0x3c00 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v3 +; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; GFX8-NEXT: v_min_f16_e32 v1, v1, v2 -; GFX8-NEXT: v_max_f16_e32 v3, v5, v5 -; GFX8-NEXT: v_min_f16_e32 v6, v7, v6 -; GFX8-NEXT: v_max_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX8-NEXT: v_max_f16_e32 v3, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v5 +; GFX8-NEXT: v_min_f16_e32 v1, v1, v2 +; GFX8-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f16_sdwa v2, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v6 +; GFX8-NEXT: 
v_mul_f16_e32 v3, 1.0, v4 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: v_min3_v3f16_minimumnum_minimumnum__v_v_v_0: @@ -1467,28 +1471,33 @@ define <4 x half> @v_min3_v4f16_minimumnum_minimumnum__v_v_v_0(<4 x half> %a, <4 ; GFX8-LABEL: v_min3_v4f16_minimumnum_minimumnum__v_v_v_0: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_max_f16_sdwa v6, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v7, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX8-NEXT: v_mul_f16_e32 v6, 1.0, v2 +; GFX8-NEXT: v_mul_f16_e32 v7, 1.0, v0 ; GFX8-NEXT: v_min_f16_e32 v6, v7, v6 -; GFX8-NEXT: v_max_f16_sdwa v7, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_mov_b32_e32 v7, 0x3c00 +; GFX8-NEXT: v_mul_f16_sdwa v2, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_min_f16_e32 v0, v0, v2 -; GFX8-NEXT: v_max_f16_e32 v2, v3, v3 -; GFX8-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-NEXT: v_min_f16_e32 v7, v8, v7 -; GFX8-NEXT: v_min_f16_e32 v1, v1, v2 -; GFX8-NEXT: v_max_f16_sdwa v2, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_sdwa v3, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-NEXT: v_min_f16_sdwa v2, v7, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_sdwa v3, v6, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_min_f16_e32 v1, v1, v5 -; GFX8-NEXT: v_min_f16_e32 v0, v0, v4 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v3 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v3 +; GFX8-NEXT: v_mul_f16_e32 v8, 1.0, v1 +; GFX8-NEXT: v_mul_f16_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_sdwa v1, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX8-NEXT: v_min_f16_e32 v2, v8, v2 +; GFX8-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; GFX8-NEXT: v_mul_f16_sdwa v3, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; GFX8-NEXT: v_mul_f16_e32 v3, 1.0, v5 +; GFX8-NEXT: v_min_f16_e32 v2, v2, v3 +; GFX8-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-NEXT: v_mul_f16_sdwa v3, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_min_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_f16_e32 v3, 1.0, v6 +; GFX8-NEXT: v_mul_f16_e32 v4, 1.0, v4 +; GFX8-NEXT: v_min_f16_e32 v3, v3, v4 +; GFX8-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: v_min3_v4f16_minimumnum_minimumnum__v_v_v_0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index 863240cc591c3..62f081c38b0ce 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -763,17 +763,17 @@ define amdgpu_kernel void @maxnum_v3f16( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 +; VI-NEXT: v_mul_f16_e64 v0, s8, 1.0 +; VI-NEXT: 
v_mul_f16_e64 v1, s2, 1.0 ; VI-NEXT: s_lshr_b32 s0, s8, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 +; VI-NEXT: v_mul_f16_e64 v1, s0, 1.0 ; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e64 v1, s9, s9 -; VI-NEXT: v_max_f16_e64 v2, s3, s3 +; VI-NEXT: v_mul_f16_e64 v1, s9, 1.0 +; VI-NEXT: v_mul_f16_e64 v2, s3, 1.0 ; VI-NEXT: v_max_f16_e32 v1, v2, v1 ; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -907,22 +907,22 @@ define amdgpu_kernel void @maxnum_v4f16( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s9, s9 -; VI-NEXT: v_max_f16_e64 v1, s3, s3 +; VI-NEXT: v_mul_f16_e64 v0, s9, 1.0 +; VI-NEXT: v_mul_f16_e64 v1, s3, 1.0 ; VI-NEXT: s_lshr_b32 s0, s9, 16 ; VI-NEXT: v_max_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 +; VI-NEXT: v_mul_f16_e64 v1, s0, 1.0 ; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0 ; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: v_mul_f16_e64 v0, s8, 1.0 +; VI-NEXT: v_mul_f16_e64 v2, s2, 1.0 ; VI-NEXT: s_lshr_b32 s0, s8, 16 ; VI-NEXT: v_max_f16_e32 v0, v2, v0 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0 ; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v3, s0, s0 +; VI-NEXT: v_mul_f16_e64 v3, s0, 1.0 ; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 
@@ -1042,15 +1042,15 @@ define amdgpu_kernel void @fmax_v4f16_imm_a( ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: v_max_f16_e64 v1, s3, s3 -; VI-NEXT: v_max_f16_e64 v3, s0, s0 -; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: v_mul_f16_e64 v1, s3, 1.0 +; VI-NEXT: v_mul_f16_e64 v3, s0, 1.0 +; VI-NEXT: v_mul_f16_e64 v2, s2, 1.0 ; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1 ; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s0, s2, 16 ; VI-NEXT: v_or_b32_e32 v1, v1, v0 ; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0 ; VI-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 7e8c30161c1c8..2fa7c4d3e45cf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -827,17 +827,17 @@ define amdgpu_kernel void @minnum_v3f16( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v1, s2, s2 +; VI-NEXT: v_mul_f16_e64 v0, s8, 1.0 +; VI-NEXT: v_mul_f16_e64 v1, s2, 1.0 ; VI-NEXT: s_lshr_b32 s0, s8, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 +; VI-NEXT: v_mul_f16_e64 v1, s0, 1.0 ; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e64 v1, s9, s9 -; VI-NEXT: v_max_f16_e64 v2, s3, s3 +; VI-NEXT: v_mul_f16_e64 v1, s9, 1.0 +; VI-NEXT: v_mul_f16_e64 v2, s3, 1.0 ; VI-NEXT: 
v_min_f16_e32 v1, v2, v1 ; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -970,22 +970,22 @@ define amdgpu_kernel void @minnum_v4f16( ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_max_f16_e64 v0, s9, s9 -; VI-NEXT: v_max_f16_e64 v1, s3, s3 +; VI-NEXT: v_mul_f16_e64 v0, s9, 1.0 +; VI-NEXT: v_mul_f16_e64 v1, s3, 1.0 ; VI-NEXT: s_lshr_b32 s0, s9, 16 ; VI-NEXT: v_min_f16_e32 v0, v1, v0 -; VI-NEXT: v_max_f16_e64 v1, s0, s0 +; VI-NEXT: v_mul_f16_e64 v1, s0, 1.0 ; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0 ; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v0, v1 -; VI-NEXT: v_max_f16_e64 v0, s8, s8 -; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: v_mul_f16_e64 v0, s8, 1.0 +; VI-NEXT: v_mul_f16_e64 v2, s2, 1.0 ; VI-NEXT: s_lshr_b32 s0, s8, 16 ; VI-NEXT: v_min_f16_e32 v0, v2, v0 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0 ; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: v_max_f16_e64 v3, s0, s0 +; VI-NEXT: v_mul_f16_e64 v3, s0, 1.0 ; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -1104,15 +1104,15 @@ define amdgpu_kernel void @fmin_v4f16_imm_a( ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s0, s3, 16 -; VI-NEXT: v_max_f16_e64 v1, s3, s3 -; VI-NEXT: v_max_f16_e64 v3, s0, s0 -; VI-NEXT: v_max_f16_e64 v2, s2, s2 +; VI-NEXT: v_mul_f16_e64 v1, s3, 1.0 +; VI-NEXT: v_mul_f16_e64 v3, s0, 1.0 +; VI-NEXT: v_mul_f16_e64 v2, s2, 1.0 ; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1 ; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: s_lshr_b32 s0, s2, 16 ; VI-NEXT: 
v_or_b32_e32 v1, v1, v0 ; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2 -; VI-NEXT: v_max_f16_e64 v2, s0, s0 +; VI-NEXT: v_mul_f16_e64 v2, s0, 1.0 ; VI-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index 32e0d393a1001..8696e61891e13 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -1088,27 +1088,34 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; SDAG-VI: ; %bb.0: ; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2 ; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3 ; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5 ; SDAG-VI-NEXT: v_mac_f32_e32 v8, v6, v7 ; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2 -; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 ; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v8 -; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v4 -; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v5 -; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0 -; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v1 -; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v2 -; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0x3c00 -; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v3 -; SDAG-VI-NEXT: v_min_f16_e32 v1, 1.0, v1 +; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3 +; 
SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v4 +; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v5 +; SDAG-VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; SDAG-VI-NEXT: v_mul_f16_e64 v3, 0, 1.0 +; SDAG-VI-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; SDAG-VI-NEXT: v_max_f16_e32 v0, v0, v3 +; SDAG-VI-NEXT: v_max_f16_e32 v2, v2, v3 +; SDAG-VI-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; SDAG-VI-NEXT: v_max_f16_e32 v1, v1, v3 +; SDAG-VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; SDAG-VI-NEXT: v_mul_f16_e64 v3, 1.0, 1.0 +; SDAG-VI-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; SDAG-VI-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; SDAG-VI-NEXT: v_min_f16_e32 v2, v2, v3 +; SDAG-VI-NEXT: v_min_f16_e32 v1, v1, v3 ; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0 ; SDAG-VI-NEXT: s_setpc_b64 s[30:31] ; @@ -1308,7 +1315,11 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v1 ; SDAG-VI-NEXT: v_max_f16_e32 v2, 0, v2 ; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v3 +; SDAG-VI-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0x3c00 +; SDAG-VI-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; SDAG-VI-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; SDAG-VI-NEXT: v_mul_f16_e32 v2, 1.0, v2 ; SDAG-VI-NEXT: v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; SDAG-VI-NEXT: v_min_f16_e32 v3, 1.0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll index 4f73e8e9c1883..ba7b1fc2b9a49 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll @@ -3636,16 +3636,17 @@ define <3 x half> @v_maximumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX8-SDAG-LABEL: v_maximumnum_v3f16: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, 
v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v3 -; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v3, 0x3c00 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v4, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: v_maximumnum_v3f16: @@ -3926,20 +3927,21 @@ define <4 x half> @v_maximumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX8-SDAG-LABEL: v_maximumnum_v4f16: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 
-; GFX8-SDAG-NEXT: v_max_f16_sdwa v6, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0x3c00 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v5, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v3 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_max_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v2 -; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: v_maximumnum_v4f16: @@ -4260,27 +4262,28 @@ define <6 x half> @v_maximumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX8-SDAG-LABEL: v_maximumnum_v6f16: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v6, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v7, 
v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v7, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v6, 0x3c00 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v7, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2 ; GFX8-SDAG-NEXT: v_max_f16_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v5 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v8, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v4 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v4, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
GFX8-SDAG-NEXT: v_mul_f16_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v3 -; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: v_maximumnum_v6f16: @@ -4552,34 +4555,35 @@ define <8 x half> @v_maximumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX8-SDAG-LABEL: v_maximumnum_v8f16: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v10, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v8, 0x3c00 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v9, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v7, 1.0, v7 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3 ; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v10, v9 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v10, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v11, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v11, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v12, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v7 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v7, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v6, 1.0, v6 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v7, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v6 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v6, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v5 +; GFX8-SDAG-NEXT: 
v_mul_f16_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v6, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v4 -; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v10 -; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v9 -; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v9 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: v_maximumnum_v8f16: @@ -4964,62 +4968,63 @@ define <16 x half> @v_maximumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX8-SDAG-LABEL: v_maximumnum_v16f16: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v16, 0x3c00 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v17, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v7, v16 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v15, 1.0, v15 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v7, 1.0, v7 ; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; 
GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v24, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v15, v15, v15 -; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-SDAG-NEXT: v_max_f16_e32 v14, v14, v14 -; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX8-SDAG-NEXT: v_max_f16_e32 v13, v13, v13 -; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-SDAG-NEXT: v_max_f16_e32 v12, v12, v12 -; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-SDAG-NEXT: v_max_f16_e32 v11, v11, v11 -; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v10, v10, v10 -; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v9, v9, v9 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v8, v8, v8 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v15 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v15, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v14, 1.0, v14 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v6, 1.0, v6 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v14 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v14, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v13, 1.0, v13 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v14, v18, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v13 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v13, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v12, 1.0, v12 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v13, v18, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v12 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v12, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v11, 1.0, v11 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v12, v18, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v11 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v11, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v10, 1.0, v10 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v11, v18, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v10 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v9, 1.0, v9 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v9 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v9, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; 
GFX8-SDAG-NEXT: v_mul_f16_sdwa v16, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v8, 1.0, v8 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v10, v18, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v8 -; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v23 -; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v22 -; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v21 -; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v20 -; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v19 -; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v18 -; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v17 -; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v11 +; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v12 +; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v13 +; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v14 +; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v15 +; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v17 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: v_maximumnum_v16f16: @@ -5820,144 +5825,121 @@ define <32 x half> @v_maximumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX8-SDAG-LABEL: v_maximumnum_v32f16: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v45, off, s[0:3], s32 
offset:28 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: v_max_f16_sdwa v38, v27, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v39, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v48, v26, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v49, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v50, v25, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v51, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v40, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v41, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v58, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v59, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v17, v17, v17 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v52, v24, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v53, v8, v8 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v54, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v55, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v42, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v43, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v44, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v45, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v46, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v47, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v56, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v57, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v39, v49, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v48, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v51, v41, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v40, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v17 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v49, v53, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v50, v55, 
v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v52, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v53, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v54, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v55, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v40 -; GFX8-SDAG-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v32, v30, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v33, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: 
v_max_f16_sdwa v34, v29, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v35, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v36, v28, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v37, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v31, 0x3c00 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v32, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v14, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v30, 1.0, v30 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v14, 1.0, v14 ; GFX8-SDAG-NEXT: v_max_f16_sdwa v32, v33, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v33, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v35, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v36, v37, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v37, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v15, v15, v15 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v33, v35, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_e32 v30, v30, v30 -; GFX8-SDAG-NEXT: v_max_f16_e32 v14, v14, v14 -; GFX8-SDAG-NEXT: v_max_f16_e32 v29, v29, v29 -; GFX8-SDAG-NEXT: v_max_f16_e32 v13, v13, v13 -; GFX8-SDAG-NEXT: v_max_f16_e32 v28, v28, v28 -; GFX8-SDAG-NEXT: v_max_f16_e32 v12, v12, v12 -; GFX8-SDAG-NEXT: v_max_f16_e32 v27, v27, v27 -; 
GFX8-SDAG-NEXT: v_max_f16_e32 v11, v11, v11 -; GFX8-SDAG-NEXT: v_max_f16_e32 v26, v26, v26 -; GFX8-SDAG-NEXT: v_max_f16_e32 v10, v10, v10 -; GFX8-SDAG-NEXT: v_max_f16_e32 v25, v25, v25 -; GFX8-SDAG-NEXT: v_max_f16_e32 v9, v9, v9 -; GFX8-SDAG-NEXT: v_max_f16_e32 v24, v24, v24 -; GFX8-SDAG-NEXT: v_max_f16_e32 v8, v8, v8 -; GFX8-SDAG-NEXT: v_max_f16_e32 v23, v23, v23 -; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-SDAG-NEXT: v_max_f16_e32 v22, v22, v22 -; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX8-SDAG-NEXT: v_max_f16_e32 v21, v21, v21 -; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-SDAG-NEXT: v_max_f16_e32 v20, v20, v20 -; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-SDAG-NEXT: v_max_f16_e32 v19, v19, v19 -; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v18, v18, v18 -; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v16, v16, v16 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-SDAG-NEXT: v_max_f16_e32 v14, v14, v30 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v30, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v13, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v29, 1.0, v29 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v13, 1.0, v13 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v30, v33, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v13, v13, v29 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v29, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v28, 1.0, v28 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v12, 1.0, v12 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v29, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v12, v12, v28 +; GFX8-SDAG-NEXT: 
v_mul_f16_sdwa v28, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v11, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v27, 1.0, v27 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v11, 1.0, v11 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v28, v33, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v11, v11, v27 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v27, v26, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v10, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v26, 1.0, v26 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v10, 1.0, v10 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v27, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v10, v10, v26 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v26, v25, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v9, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v25, 1.0, v25 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v9, 1.0, v9 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v26, v33, v26 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v9, v9, v25 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v25, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v24, 1.0, v24 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v8, 1.0, v8 ; GFX8-SDAG-NEXT: v_max_f16_e32 v8, v8, v24 +; GFX8-SDAG-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v25, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: 
v_mul_f16_sdwa v33, v15, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v15, 1.0, v15 +; GFX8-SDAG-NEXT: v_or_b32_e32 v8, v8, v25 +; GFX8-SDAG-NEXT: v_or_b32_e32 v9, v9, v26 +; GFX8-SDAG-NEXT: v_or_b32_e32 v10, v10, v27 +; GFX8-SDAG-NEXT: v_or_b32_e32 v11, v11, v28 +; GFX8-SDAG-NEXT: v_or_b32_e32 v12, v12, v29 +; GFX8-SDAG-NEXT: v_or_b32_e32 v13, v13, v30 +; GFX8-SDAG-NEXT: v_or_b32_e32 v14, v14, v32 +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v24, 1.0, v24 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v33, v33, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_max_f16_e32 v15, v15, v24 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v24, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v23, 1.0, v23 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v7, 1.0, v7 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v24, v34, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v23 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v23, v22, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v6, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v22, 1.0, v22 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v6, 1.0, v6 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v34, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v22 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v22, v21, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v21, 1.0, v21 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v34, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v21 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v21, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v20, 1.0, v20 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v20 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v20, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v3, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v19, 1.0, v19 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v34, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v19 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v19, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v2, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v18, 1.0, v18 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v18 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v17, 1.0, v17 +; 
GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v17 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v17, v16, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v31, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v16, 1.0, v16 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v34, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v31, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v16 -; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v33 -; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v54 -; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v53 -; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v52 -; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v51 -; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v50 -; GFX8-SDAG-NEXT: v_or_b32_e32 v8, v8, v49 -; GFX8-SDAG-NEXT: v_or_b32_e32 v9, v9, v48 -; GFX8-SDAG-NEXT: v_or_b32_e32 v10, v10, v39 -; GFX8-SDAG-NEXT: v_or_b32_e32 v11, v11, v38 -; GFX8-SDAG-NEXT: v_or_b32_e32 v12, v12, v36 -; GFX8-SDAG-NEXT: v_or_b32_e32 v13, v13, v34 -; GFX8-SDAG-NEXT: v_or_b32_e32 v14, v14, v32 -; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v35, v31, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v31, v31, v31 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v35, v37, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_e32 v15, v15, v31 -; GFX8-SDAG-NEXT: v_or_b32_e32 v15, v15, v35 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v17 +; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v18 +; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v19 +; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v20 +; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v22 +; GFX8-SDAG-NEXT: 
v_or_b32_e32 v6, v6, v23 +; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v24 +; GFX8-SDAG-NEXT: v_or_b32_e32 v15, v15, v33 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: v_maximumnum_v32f16: diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll index 558006d2b6957..f50e83bc4a616 100644 --- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll @@ -3461,16 +3461,17 @@ define <3 x half> @v_minimumnum_v3f16(<3 x half> %x, <3 x half> %y) { ; GFX8-SDAG-LABEL: v_minimumnum_v3f16: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v3 -; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v3, 0x3c00 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v4, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: 
v_minimumnum_v3f16: @@ -3751,20 +3752,21 @@ define <4 x half> @v_minimumnum_v4f16(<4 x half> %x, <4 x half> %y) { ; GFX8-SDAG-LABEL: v_minimumnum_v4f16: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v4, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v5, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v6, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0x3c00 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v5, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v3 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_min_f16_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v2 -; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: v_minimumnum_v4f16: @@ -4085,27 +4087,28 @@ define <6 x half> @v_minimumnum_v6f16(<6 x half> %x, <6 x half> %y) { ; GFX8-SDAG-LABEL: v_minimumnum_v6f16: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v6, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v7, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v7, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v6, 0x3c00 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v7, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2 ; GFX8-SDAG-NEXT: v_min_f16_sdwa v7, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: 
v_max_f16_e32 v3, v3, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v2, v2, v5 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v5, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v8, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v4 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v4, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v6, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v5, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_min_f16_sdwa v4, v6, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v3 -; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v8 -; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v7 -; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v7 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: v_minimumnum_v6f16: @@ -4377,34 +4380,35 @@ define <8 x half> @v_minimumnum_v8f16(<8 x half> %x, <8 x half> %y) { ; GFX8-SDAG-LABEL: v_minimumnum_v8f16: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v8, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v8, v9, v8 dst_sel:WORD_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v9, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v10, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v8, 0x3c00 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v9, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v7, 1.0, v7 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3 ; GFX8-SDAG-NEXT: v_min_f16_sdwa v9, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v10, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v11, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v10, v11, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v11, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v12, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v11, v12, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v3, v3, v7 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v7, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v2, v8 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v6, 1.0, v6 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v7, v10, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v2, v2, v6 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v6, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v5 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v5, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v8, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v6, v10, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_min_f16_sdwa v5, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v4 -; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v11 -; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v10 -; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v9 -; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v8 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v7 +; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v9 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: v_minimumnum_v8f16: @@ -4789,62 +4793,63 @@ define <16 x half> @v_minimumnum_v16f16(<16 x half> %x, <16 x half> %y) { ; GFX8-SDAG-LABEL: v_minimumnum_v16f16: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v16, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 
src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v16, v17, v16 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v17, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v16, 0x3c00 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v17, v15, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v7, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v15, 1.0, v15 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v7, 1.0, v7 ; GFX8-SDAG-NEXT: v_min_f16_sdwa v17, v18, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v18, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v18, v19, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v19, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v19, v20, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v20, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v20, v21, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; 
GFX8-SDAG-NEXT: v_max_f16_sdwa v21, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v21, v22, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v22, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v22, v23, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v23, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v24, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v15, v15, v15 -; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-SDAG-NEXT: v_max_f16_e32 v14, v14, v14 -; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX8-SDAG-NEXT: v_max_f16_e32 v13, v13, v13 -; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-SDAG-NEXT: v_max_f16_e32 v12, v12, v12 -; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-SDAG-NEXT: v_max_f16_e32 v11, v11, v11 -; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v10, v10, v10 -; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v9, v9, v9 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v8, v8, v8 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v23, v24, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v7, v7, v15 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v15, v14, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v14, 1.0, v14 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v6, 1.0, v6 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v15, v18, v15 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v6, v6, v14 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v14, v13, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v5, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v13, 1.0, v13 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v14, v18, v14 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v5, v5, v13 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v13, v12, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v4, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v12, 1.0, v12 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v13, v18, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v4, v4, v12 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v12, v11, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v11, 1.0, v11 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v12, v18, v12 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v3, v3, v11 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v11, v10, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v10, 1.0, v10 
+; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v11, v18, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v2, v2, v10 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v10, v9, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v1, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v9, 1.0, v9 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1 ; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v9 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v9, v8, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v16, v0, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v8, 1.0, v8 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v10, v18, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_min_f16_sdwa v9, v16, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v8 -; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v23 -; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v22 -; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v21 -; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v20 -; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v19 -; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v18 -; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v17 -; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v16 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v9 +; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v10 +; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v11 +; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v12 +; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v13 +; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v14 +; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v15 +; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v17 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: v_minimumnum_v16f16: @@ -5645,144 +5650,121 @@ define <32 x half> 
@v_minimumnum_v32f16(<32 x half> %x, <32 x half> %y) { ; GFX8-SDAG-LABEL: v_minimumnum_v32f16: ; GFX8-SDAG: ; %bb.0: ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX8-SDAG-NEXT: v_max_f16_sdwa v38, v27, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v39, v11, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v48, v26, v26 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v49, v10, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v50, v25, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v51, v9, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: 
v_max_f16_sdwa v40, v22, v22 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v41, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v58, v17, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v59, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v17, v17, v17 -; GFX8-SDAG-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v52, v24, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v53, v8, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v54, v23, v23 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v55, v7, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v42, v21, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v43, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v44, v20, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v45, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v46, v19, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v47, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v56, v18, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v57, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v38, v39, v38 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_min_f16_sdwa v39, v49, v48 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_min_f16_sdwa v48, v51, v50 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_min_f16_sdwa v51, v41, v40 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_min_f16_sdwa v40, v59, v58 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v17 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v49, v53, v52 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_min_f16_sdwa v50, v55, v54 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_min_f16_sdwa v52, v43, v42 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_min_f16_sdwa v53, v45, v44 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_min_f16_sdwa v54, v47, v46 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_min_f16_sdwa v55, v57, v56 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v40 -; GFX8-SDAG-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded 
Reload -; GFX8-SDAG-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v32, v30, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v33, v14, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v34, v29, v29 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v35, v13, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v36, v28, v28 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_sdwa v37, v12, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v31, 0x3c00 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v32, v30, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v14, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v30, 1.0, v30 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v14, 1.0, v14 ; GFX8-SDAG-NEXT: v_min_f16_sdwa v32, v33, v32 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v33, v16, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v34, v35, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v35, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v36, 
v37, v36 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_sdwa v37, v15, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v15, v15, v15 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v33, v35, v33 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_max_f16_e32 v30, v30, v30 -; GFX8-SDAG-NEXT: v_max_f16_e32 v14, v14, v14 -; GFX8-SDAG-NEXT: v_max_f16_e32 v29, v29, v29 -; GFX8-SDAG-NEXT: v_max_f16_e32 v13, v13, v13 -; GFX8-SDAG-NEXT: v_max_f16_e32 v28, v28, v28 -; GFX8-SDAG-NEXT: v_max_f16_e32 v12, v12, v12 -; GFX8-SDAG-NEXT: v_max_f16_e32 v27, v27, v27 -; GFX8-SDAG-NEXT: v_max_f16_e32 v11, v11, v11 -; GFX8-SDAG-NEXT: v_max_f16_e32 v26, v26, v26 -; GFX8-SDAG-NEXT: v_max_f16_e32 v10, v10, v10 -; GFX8-SDAG-NEXT: v_max_f16_e32 v25, v25, v25 -; GFX8-SDAG-NEXT: v_max_f16_e32 v9, v9, v9 -; GFX8-SDAG-NEXT: v_max_f16_e32 v24, v24, v24 -; GFX8-SDAG-NEXT: v_max_f16_e32 v8, v8, v8 -; GFX8-SDAG-NEXT: v_max_f16_e32 v23, v23, v23 -; GFX8-SDAG-NEXT: v_max_f16_e32 v7, v7, v7 -; GFX8-SDAG-NEXT: v_max_f16_e32 v22, v22, v22 -; GFX8-SDAG-NEXT: v_max_f16_e32 v6, v6, v6 -; GFX8-SDAG-NEXT: v_max_f16_e32 v21, v21, v21 -; GFX8-SDAG-NEXT: v_max_f16_e32 v5, v5, v5 -; GFX8-SDAG-NEXT: v_max_f16_e32 v20, v20, v20 -; GFX8-SDAG-NEXT: v_max_f16_e32 v4, v4, v4 -; GFX8-SDAG-NEXT: v_max_f16_e32 v19, v19, v19 -; GFX8-SDAG-NEXT: v_max_f16_e32 v3, v3, v3 -; GFX8-SDAG-NEXT: v_max_f16_e32 v18, v18, v18 -; GFX8-SDAG-NEXT: v_max_f16_e32 v2, v2, v2 -; GFX8-SDAG-NEXT: v_max_f16_e32 v16, v16, v16 -; GFX8-SDAG-NEXT: v_max_f16_e32 v0, v0, v0 ; GFX8-SDAG-NEXT: v_min_f16_e32 v14, v14, v30 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v30, v29, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v13, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v29, 1.0, v29 +; GFX8-SDAG-NEXT: v_mul_f16_e32 
v13, 1.0, v13 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v30, v33, v30 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v13, v13, v29 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v29, v28, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v12, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v28, 1.0, v28 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v12, 1.0, v12 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v29, v33, v29 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v12, v12, v28 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v28, v27, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v11, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v27, 1.0, v27 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v11, 1.0, v11 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v28, v33, v28 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v11, v11, v27 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v27, v26, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v10, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v26, 1.0, v26 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v10, 1.0, v10 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v27, v33, v27 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v10, v10, v26 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v26, v25, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v9, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v25, 1.0, v25 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v9, 1.0, v9 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v26, v33, v26 
dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v9, v9, v25 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v25, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v8, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v24, 1.0, v24 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v8, 1.0, v8 ; GFX8-SDAG-NEXT: v_min_f16_e32 v8, v8, v24 +; GFX8-SDAG-NEXT: buffer_load_dword v24, off, s[0:3], s32 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v25, v33, v25 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v33, v15, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v15, 1.0, v15 +; GFX8-SDAG-NEXT: v_or_b32_e32 v8, v8, v25 +; GFX8-SDAG-NEXT: v_or_b32_e32 v9, v9, v26 +; GFX8-SDAG-NEXT: v_or_b32_e32 v10, v10, v27 +; GFX8-SDAG-NEXT: v_or_b32_e32 v11, v11, v28 +; GFX8-SDAG-NEXT: v_or_b32_e32 v12, v12, v29 +; GFX8-SDAG-NEXT: v_or_b32_e32 v13, v13, v30 +; GFX8-SDAG-NEXT: v_or_b32_e32 v14, v14, v32 +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v24, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v24, 1.0, v24 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v33, v33, v34 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_min_f16_e32 v15, v15, v24 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v24, v23, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v7, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v23, 1.0, v23 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v7, 1.0, v7 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v24, v34, v24 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v7, v7, v23 +; GFX8-SDAG-NEXT: 
v_mul_f16_sdwa v23, v22, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v6, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v22, 1.0, v22 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v6, 1.0, v6 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v23, v34, v23 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v6, v6, v22 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v22, v21, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v5, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v21, 1.0, v21 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v5, 1.0, v5 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v22, v34, v22 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v5, v5, v21 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v21, v20, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v4, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v20, 1.0, v20 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v4, 1.0, v4 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v21, v34, v21 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v4, v4, v20 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v20, v19, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v3, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v19, 1.0, v19 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v3, 1.0, v3 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v20, v34, v20 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v3, v3, v19 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v19, v18, v31 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v2, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v18, 1.0, v18 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v2, 1.0, v2 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v19, v34, v19 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v2, v2, v18 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v18, v17, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v34, v1, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v17, 1.0, v17 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v1, 1.0, v1 +; GFX8-SDAG-NEXT: v_min_f16_e32 v1, v1, v17 +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v17, v16, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_sdwa v31, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-SDAG-NEXT: v_mul_f16_e32 v16, 1.0, v16 +; GFX8-SDAG-NEXT: v_mul_f16_e32 v0, 1.0, v0 +; GFX8-SDAG-NEXT: v_min_f16_sdwa v18, v34, v18 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-SDAG-NEXT: v_min_f16_sdwa v17, v31, v17 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-SDAG-NEXT: v_min_f16_e32 v0, v0, v16 -; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v33 -; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v55 -; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v54 -; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v53 -; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v52 -; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v51 -; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v50 -; GFX8-SDAG-NEXT: v_or_b32_e32 v8, v8, v49 -; GFX8-SDAG-NEXT: v_or_b32_e32 v9, v9, v48 -; GFX8-SDAG-NEXT: v_or_b32_e32 v10, v10, v39 -; GFX8-SDAG-NEXT: v_or_b32_e32 v11, v11, v38 -; GFX8-SDAG-NEXT: v_or_b32_e32 v12, v12, v36 -; GFX8-SDAG-NEXT: v_or_b32_e32 v13, v13, v34 -; GFX8-SDAG-NEXT: v_or_b32_e32 v14, v14, v32 -; 
GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX8-SDAG-NEXT: v_max_f16_sdwa v35, v31, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-SDAG-NEXT: v_max_f16_e32 v31, v31, v31 -; GFX8-SDAG-NEXT: v_min_f16_sdwa v35, v37, v35 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-SDAG-NEXT: v_min_f16_e32 v15, v15, v31 -; GFX8-SDAG-NEXT: v_or_b32_e32 v15, v15, v35 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v17 +; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v18 +; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v19 +; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v20 +; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v4, v21 +; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v5, v22 +; GFX8-SDAG-NEXT: v_or_b32_e32 v6, v6, v23 +; GFX8-SDAG-NEXT: v_or_b32_e32 v7, v7, v24 +; GFX8-SDAG-NEXT: v_or_b32_e32 v15, v15, v33 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: v_minimumnum_v32f16: diff --git a/llvm/test/CodeGen/AMDGPU/reduction.ll b/llvm/test/CodeGen/AMDGPU/reduction.ll index 7f9044ae164d5..a55081981408c 100644 --- a/llvm/test/CodeGen/AMDGPU/reduction.ll +++ b/llvm/test/CodeGen/AMDGPU/reduction.ll @@ -566,13 +566,14 @@ define half @reduction_maxnum_v4f16(<4 x half> %vec4) { ; VI-LABEL: reduction_maxnum_v4f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_mul_f16_e32 v2, 1.0, v1 +; VI-NEXT: v_mul_f16_e32 v3, 1.0, v0 ; VI-NEXT: v_max_f16_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x3c00 +; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NEXT: 
v_max_f16_e32 v0, v0, v2 +; VI-NEXT: v_max_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> @@ -596,13 +597,14 @@ define half @reduction_minnum_v4f16(<4 x half> %vec4) { ; VI-LABEL: reduction_minnum_v4f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_mul_f16_e32 v2, 1.0, v1 +; VI-NEXT: v_mul_f16_e32 v3, 1.0, v0 ; VI-NEXT: v_min_f16_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x3c00 +; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NEXT: v_min_f16_e32 v0, v0, v2 +; VI-NEXT: v_min_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> @@ -628,13 +630,14 @@ define half @reduction_fast_max_pattern_v4f16(<4 x half> %vec4) { ; VI-LABEL: reduction_fast_max_pattern_v4f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_mul_f16_e32 v2, 1.0, v1 +; VI-NEXT: v_mul_f16_e32 v3, 1.0, v0 ; VI-NEXT: v_max_f16_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x3c00 +; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: 
v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_max_f16_e32 v0, v0, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v2 +; VI-NEXT: v_max_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> @@ -662,13 +665,14 @@ define half @reduction_fast_min_pattern_v4f16(<4 x half> %vec4) { ; VI-LABEL: reduction_fast_min_pattern_v4f16: ; VI: ; %bb.0: ; %entry ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_sdwa v3, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; VI-NEXT: v_max_f16_e32 v1, v1, v1 -; VI-NEXT: v_max_f16_e32 v0, v0, v0 +; VI-NEXT: v_mul_f16_e32 v2, 1.0, v1 +; VI-NEXT: v_mul_f16_e32 v3, 1.0, v0 ; VI-NEXT: v_min_f16_e32 v2, v3, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x3c00 +; VI-NEXT: v_mul_f16_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_min_f16_e32 v0, v0, v1 -; VI-NEXT: v_min_f16_e32 v0, v0, v2 +; VI-NEXT: v_min_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] entry: %rdx.shuf = shufflevector <4 x half> %vec4, <4 x half> poison, <4 x i32> diff --git a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll index e02f931c4d31e..8b4640a1e33b1 100644 --- a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll +++ b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll @@ -199,17 +199,8 @@ define <8 x half> @canonicalize_v8f16(<8 x half> %a) nounwind { define <4 x float> @canonicalize_v4f32(<4 x float> %a) { ; Z16-LABEL: canonicalize_v4f32: ; Z16: # %bb.0: -; Z16-NEXT: vrepf %v0, %v24, 3 -; Z16-NEXT: vgmf %v1, 2, 8 -; Z16-NEXT: vrepf %v2, %v24, 2 -; Z16-NEXT: meebr %f0, %f1 -; Z16-NEXT: meebr %f2, %f1 -; Z16-NEXT: 
vrepf %v3, %v24, 1 -; Z16-NEXT: vmrhf %v0, %v2, %v0 -; Z16-NEXT: wfmsb %f2, %v24, %f1 -; Z16-NEXT: wfmsb %f1, %f3, %f1 -; Z16-NEXT: vmrhf %v1, %v2, %v1 -; Z16-NEXT: vmrhg %v24, %v1, %v0 +; Z16-NEXT: vgmf %v0, 2, 8 +; Z16-NEXT: vfmsb %v24, %v24, %v0 ; Z16-NEXT: br %r14 %canonicalized = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> %a) ret <4 x float> %canonicalized @@ -219,14 +210,8 @@ define <4 x double> @canonicalize_v4f64(<4 x double> %a) { ; Z16-LABEL: canonicalize_v4f64: ; Z16: # %bb.0: ; Z16-NEXT: vgmg %v0, 2, 11 -; Z16-NEXT: vrepg %v2, %v24, 1 -; Z16-NEXT: wfmdb %f1, %v24, %f0 -; Z16-NEXT: mdbr %f2, %f0 -; Z16-NEXT: vmrhg %v24, %v1, %v2 -; Z16-NEXT: vrepg %v2, %v26, 1 -; Z16-NEXT: wfmdb %f1, %v26, %f0 -; Z16-NEXT: wfmdb %f0, %f2, %f0 -; Z16-NEXT: vmrhg %v26, %v1, %v0 +; Z16-NEXT: vfmdb %v24, %v24, %v0 +; Z16-NEXT: vfmdb %v26, %v26, %v0 ; Z16-NEXT: br %r14 %canonicalized = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> %a) ret <4 x double> %canonicalized @@ -344,17 +329,8 @@ define void @canonicalize_ptr_v4f32(ptr %out) { ; Z16-LABEL: canonicalize_ptr_v4f32: ; Z16: # %bb.0: ; Z16-NEXT: vl %v0, 0(%r2), 3 -; Z16-NEXT: vrepf %v1, %v0, 3 -; Z16-NEXT: vgmf %v2, 2, 8 -; Z16-NEXT: vrepf %v3, %v0, 2 -; Z16-NEXT: meebr %f1, %f2 -; Z16-NEXT: meebr %f3, %f2 -; Z16-NEXT: vmrhf %v1, %v3, %v1 -; Z16-NEXT: wfmsb %f3, %f0, %f2 -; Z16-NEXT: vrepf %v0, %v0, 1 -; Z16-NEXT: meebr %f0, %f2 -; Z16-NEXT: vmrhf %v0, %v3, %v0 -; Z16-NEXT: vmrhg %v0, %v0, %v1 +; Z16-NEXT: vgmf %v1, 2, 8 +; Z16-NEXT: vfmsb %v0, %v0, %v1 ; Z16-NEXT: vst %v0, 0(%r2), 3 ; Z16-NEXT: br %r14 %val = load <4 x float>, ptr %out @@ -366,17 +342,11 @@ define void @canonicalize_ptr_v4f32(ptr %out) { define void @canonicalize_ptr_v4f64(ptr %out) { ; Z16-LABEL: canonicalize_ptr_v4f64: ; Z16: # %bb.0: +; Z16-NEXT: vl %v0, 0(%r2), 4 ; Z16-NEXT: vl %v1, 16(%r2), 4 ; Z16-NEXT: vgmg %v2, 2, 11 -; Z16-NEXT: wfmdb %f3, %f1, %f2 -; Z16-NEXT: vrepg %v1, %v1, 1 -; Z16-NEXT: mdbr %f1, %f2 -; Z16-NEXT: vl %v0, 
0(%r2), 4 -; Z16-NEXT: vmrhg %v1, %v3, %v1 -; Z16-NEXT: wfmdb %f3, %f0, %f2 -; Z16-NEXT: vrepg %v0, %v0, 1 -; Z16-NEXT: mdbr %f0, %f2 -; Z16-NEXT: vmrhg %v0, %v3, %v0 +; Z16-NEXT: vfmdb %v1, %v1, %v2 +; Z16-NEXT: vfmdb %v0, %v0, %v2 ; Z16-NEXT: vst %v0, 0(%r2), 4 ; Z16-NEXT: vst %v1, 16(%r2), 4 ; Z16-NEXT: br %r14