From 683fae7878c6c9250bf7142a2fd16170aa734f71 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Tue, 8 Jul 2025 01:05:36 -0700 Subject: [PATCH 1/4] Precommit test for #121311 --- .../CodeGen/WebAssembly/simd-relaxed-fma.ll | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll new file mode 100644 index 0000000000000..ea3ee2a33cfa4 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 + +; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128,+relaxed-simd | FileCheck %s +target triple = "wasm32" +define void @fma_seperate(ptr %a, ptr %b, ptr %c, ptr %dest) { +; CHECK-LABEL: fma_seperate: +; CHECK: .functype fma_seperate (i32, i32, i32, i32) -> () +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: v128.load $push1=, 0($1):p2align=0 +; CHECK-NEXT: v128.load $push0=, 0($0):p2align=0 +; CHECK-NEXT: f32x4.mul $push2=, $pop1, $pop0 +; CHECK-NEXT: v128.load $push3=, 0($2):p2align=0 +; CHECK-NEXT: f32x4.add $push4=, $pop2, $pop3 +; CHECK-NEXT: v128.store 0($3):p2align=0, $pop4 +; CHECK-NEXT: return +entry: + %0 = load <4 x float>, ptr %a, align 1 + %1 = load <4 x float>, ptr %b, align 1 + %2 = load <4 x float>, ptr %c, align 1 + %mul.i = fmul fast <4 x float> %1, %0 + %add.i = fadd fast <4 x float> %mul.i, %2 + store <4 x float> %add.i, ptr %dest, align 1 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define void @fma_llvm(ptr %a, ptr %b, ptr %c, ptr %dest) { +; CHECK-LABEL: fma_llvm: +; CHECK: .functype fma_llvm (i32, i32, i32, i32) -> () +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: v128.load 
$push25=, 0($0):p2align=0 +; CHECK-NEXT: local.tee $push24=, $6=, $pop25 +; CHECK-NEXT: f32x4.extract_lane $push2=, $pop24, 0 +; CHECK-NEXT: v128.load $push23=, 0($1):p2align=0 +; CHECK-NEXT: local.tee $push22=, $5=, $pop23 +; CHECK-NEXT: f32x4.extract_lane $push1=, $pop22, 0 +; CHECK-NEXT: v128.load $push21=, 0($2):p2align=0 +; CHECK-NEXT: local.tee $push20=, $4=, $pop21 +; CHECK-NEXT: f32x4.extract_lane $push0=, $pop20, 0 +; CHECK-NEXT: call $push3=, fmaf, $pop2, $pop1, $pop0 +; CHECK-NEXT: f32x4.splat $push4=, $pop3 +; CHECK-NEXT: f32x4.extract_lane $push7=, $6, 1 +; CHECK-NEXT: f32x4.extract_lane $push6=, $5, 1 +; CHECK-NEXT: f32x4.extract_lane $push5=, $4, 1 +; CHECK-NEXT: call $push8=, fmaf, $pop7, $pop6, $pop5 +; CHECK-NEXT: f32x4.replace_lane $push9=, $pop4, 1, $pop8 +; CHECK-NEXT: f32x4.extract_lane $push12=, $6, 2 +; CHECK-NEXT: f32x4.extract_lane $push11=, $5, 2 +; CHECK-NEXT: f32x4.extract_lane $push10=, $4, 2 +; CHECK-NEXT: call $push13=, fmaf, $pop12, $pop11, $pop10 +; CHECK-NEXT: f32x4.replace_lane $push14=, $pop9, 2, $pop13 +; CHECK-NEXT: f32x4.extract_lane $push17=, $6, 3 +; CHECK-NEXT: f32x4.extract_lane $push16=, $5, 3 +; CHECK-NEXT: f32x4.extract_lane $push15=, $4, 3 +; CHECK-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15 +; CHECK-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18 +; CHECK-NEXT: v128.store 0($3):p2align=0, $pop19 +; CHECK-NEXT: return +entry: + %0 = load <4 x float>, ptr %a, align 1 + %1 = load <4 x float>, ptr %b, align 1 + %2 = load <4 x float>, ptr %c, align 1 + %fma = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2) + store <4 x float> %fma, ptr %dest, align 1 + ret void +} From b1c4c01dd18259980d8faae6a9e4f71cb30208c6 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Tue, 8 Jul 2025 01:49:37 -0700 Subject: [PATCH 2/4] [WASM] Optimize fma when relaxed and ffast-math Fixes #121311, which folds a series of multiply and add to wasm.fma when we have -mrelaxed-simd and -ffast-math. 
Also attempted to use wasm.fma instead of the built-in llvm.fma --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 1 + .../WebAssembly/WebAssemblyISelLowering.cpp | 41 +++++++++++++++++++ .../CodeGen/WebAssembly/simd-relaxed-fma.ll | 39 ++++-------------- 3 files changed, 50 insertions(+), 31 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index a3675eecfea3f..ec566b168bc3d 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -475,6 +475,7 @@ struct SDNodeFlags { bool hasAllowReassociation() const { return Flags & AllowReassociation; } bool hasNoFPExcept() const { return Flags & NoFPExcept; } bool hasUnpredictable() const { return Flags & Unpredictable; } + bool hasFastMath() const { return Flags & FastMathFlags; } bool operator==(const SDNodeFlags &Other) const { return Flags == Other.Flags; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index bf2e04caa0a61..ef0146f28aba1 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -182,6 +182,12 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( // SIMD-specific configuration if (Subtarget->hasSIMD128()) { + // Enable fma optimization for wasm relaxed simd + if (Subtarget->hasRelaxedSIMD()) { + setTargetDAGCombine(ISD::FADD); + setTargetDAGCombine(ISD::FMA); + } + // Combine partial.reduce.add before legalization gets confused. 
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); @@ -3412,6 +3418,37 @@ static SDValue performSETCCCombine(SDNode *N, return SDValue(); } +static SDValue performFAddCombine(SDNode *N, SelectionDAG &DAG) { + assert(N->getOpcode() == ISD::FADD); + using namespace llvm::SDPatternMatch; + if (!N->getFlags().hasFastMath()) + return SDValue(); + + SDLoc DL(N); + SDValue A, B, C; + EVT VecVT = N->getValueType(0); + if (sd_match(N, m_FAdd(m_Value(A), m_FMul(m_Value(B), m_Value(C))))) + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, VecVT, + {DAG.getConstant(Intrinsic::wasm_relaxed_madd, DL, MVT::i32), A, B, C}); + + return SDValue(); +} + +static SDValue performFMACombine(SDNode *N, SelectionDAG &DAG) { + assert(N->getOpcode() == ISD::FMA); + if (!N->getFlags().hasFastMath()) + return SDValue(); + + SDLoc DL(N); + SDValue A = N->getOperand(0), B = N->getOperand(1), C = N->getOperand(2); + EVT VecVT = N->getValueType(0); + + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, VecVT, + {DAG.getConstant(Intrinsic::wasm_relaxed_madd, DL, MVT::i32), A, B, C}); +} + static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::MUL); EVT VT = N->getValueType(0); @@ -3529,6 +3566,10 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, return AnyAllCombine; return performLowerPartialReduction(N, DCI.DAG); } + case ISD::FADD: + return performFAddCombine(N, DCI.DAG); + case ISD::FMA: + return performFMACombine(N, DCI.DAG); case ISD::MUL: return performMulCombine(N, DCI.DAG); } diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll index ea3ee2a33cfa4..fe5e8573f12b4 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll @@ -6,12 +6,11 @@ define void @fma_seperate(ptr %a, ptr %b, ptr %c, ptr %dest) { ; CHECK-LABEL: fma_seperate: ; CHECK: .functype fma_seperate (i32, i32, i32, i32) -> () ; CHECK-NEXT: # %bb.0: # %entry +; 
CHECK-NEXT: v128.load $push2=, 0($2):p2align=0 ; CHECK-NEXT: v128.load $push1=, 0($1):p2align=0 ; CHECK-NEXT: v128.load $push0=, 0($0):p2align=0 -; CHECK-NEXT: f32x4.mul $push2=, $pop1, $pop0 -; CHECK-NEXT: v128.load $push3=, 0($2):p2align=0 -; CHECK-NEXT: f32x4.add $push4=, $pop2, $pop3 -; CHECK-NEXT: v128.store 0($3):p2align=0, $pop4 +; CHECK-NEXT: f32x4.relaxed_madd $push3=, $pop2, $pop1, $pop0 +; CHECK-NEXT: v128.store 0($3):p2align=0, $pop3 ; CHECK-NEXT: return entry: %0 = load <4 x float>, ptr %a, align 1 @@ -28,33 +27,11 @@ define void @fma_llvm(ptr %a, ptr %b, ptr %c, ptr %dest) { ; CHECK-LABEL: fma_llvm: ; CHECK: .functype fma_llvm (i32, i32, i32, i32) -> () ; CHECK-NEXT: # %bb.0: # %entry -; CHECK-NEXT: v128.load $push25=, 0($0):p2align=0 -; CHECK-NEXT: local.tee $push24=, $6=, $pop25 -; CHECK-NEXT: f32x4.extract_lane $push2=, $pop24, 0 -; CHECK-NEXT: v128.load $push23=, 0($1):p2align=0 -; CHECK-NEXT: local.tee $push22=, $5=, $pop23 -; CHECK-NEXT: f32x4.extract_lane $push1=, $pop22, 0 -; CHECK-NEXT: v128.load $push21=, 0($2):p2align=0 -; CHECK-NEXT: local.tee $push20=, $4=, $pop21 -; CHECK-NEXT: f32x4.extract_lane $push0=, $pop20, 0 -; CHECK-NEXT: call $push3=, fmaf, $pop2, $pop1, $pop0 -; CHECK-NEXT: f32x4.splat $push4=, $pop3 -; CHECK-NEXT: f32x4.extract_lane $push7=, $6, 1 -; CHECK-NEXT: f32x4.extract_lane $push6=, $5, 1 -; CHECK-NEXT: f32x4.extract_lane $push5=, $4, 1 -; CHECK-NEXT: call $push8=, fmaf, $pop7, $pop6, $pop5 -; CHECK-NEXT: f32x4.replace_lane $push9=, $pop4, 1, $pop8 -; CHECK-NEXT: f32x4.extract_lane $push12=, $6, 2 -; CHECK-NEXT: f32x4.extract_lane $push11=, $5, 2 -; CHECK-NEXT: f32x4.extract_lane $push10=, $4, 2 -; CHECK-NEXT: call $push13=, fmaf, $pop12, $pop11, $pop10 -; CHECK-NEXT: f32x4.replace_lane $push14=, $pop9, 2, $pop13 -; CHECK-NEXT: f32x4.extract_lane $push17=, $6, 3 -; CHECK-NEXT: f32x4.extract_lane $push16=, $5, 3 -; CHECK-NEXT: f32x4.extract_lane $push15=, $4, 3 -; CHECK-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15 
-; CHECK-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18 -; CHECK-NEXT: v128.store 0($3):p2align=0, $pop19 +; CHECK-NEXT: v128.load $push2=, 0($0):p2align=0 +; CHECK-NEXT: v128.load $push1=, 0($1):p2align=0 +; CHECK-NEXT: v128.load $push0=, 0($2):p2align=0 +; CHECK-NEXT: f32x4.relaxed_madd $push3=, $pop2, $pop1, $pop0 +; CHECK-NEXT: v128.store 0($3):p2align=0, $pop3 ; CHECK-NEXT: return entry: %0 = load <4 x float>, ptr %a, align 1 From 4aa43a19f628d5518b0cd9775b70be850ee316a8 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Tue, 8 Jul 2025 17:05:44 -0700 Subject: [PATCH 3/4] [WASM] Fix nits for PR 147487 - Fix inefficient wasm test case. - Added scalar test case and more floating type. - Remove total ffast checking -> allowContract --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 1 - .../WebAssembly/WebAssemblyISelLowering.cpp | 22 ++- .../CodeGen/WebAssembly/simd-relaxed-fma.ll | 135 +++++++++++++----- 3 files changed, 114 insertions(+), 44 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index ec566b168bc3d..a3675eecfea3f 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -475,7 +475,6 @@ struct SDNodeFlags { bool hasAllowReassociation() const { return Flags & AllowReassociation; } bool hasNoFPExcept() const { return Flags & NoFPExcept; } bool hasUnpredictable() const { return Flags & Unpredictable; } - bool hasFastMath() const { return Flags & FastMathFlags; } bool operator==(const SDNodeFlags &Other) const { return Flags == Other.Flags; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index ef0146f28aba1..a79fb0781cd4d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -3421,12 +3421,21 @@ static SDValue performSETCCCombine(SDNode *N, static SDValue 
performFAddCombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::FADD); using namespace llvm::SDPatternMatch; - if (!N->getFlags().hasFastMath()) + + EVT VecVT = N->getValueType(0); + + // WebAssembly doesn't have scalar fma yet + // https://github.com/WebAssembly/design/issues/1391 + if (!VecVT.isVector()) + return SDValue(); + + // Allows fp fusing + if (!N->getFlags().hasAllowContract()) return SDValue(); SDLoc DL(N); SDValue A, B, C; - EVT VecVT = N->getValueType(0); + if (sd_match(N, m_FAdd(m_Value(A), m_FMul(m_Value(B), m_Value(C))))) return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, VecVT, @@ -3437,12 +3446,17 @@ static SDValue performFAddCombine(SDNode *N, SelectionDAG &DAG) { static SDValue performFMACombine(SDNode *N, SelectionDAG &DAG) { assert(N->getOpcode() == ISD::FMA); - if (!N->getFlags().hasFastMath()) + + EVT VecVT = N->getValueType(0); + if (!VecVT.isVector()) + return SDValue(); + + // Allows fp fusing + if (!N->getFlags().hasAllowContract()) return SDValue(); SDLoc DL(N); SDValue A = N->getOperand(0), B = N->getOperand(1), C = N->getOperand(2); - EVT VecVT = N->getValueType(0); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, VecVT, diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll index fe5e8573f12b4..882b0538af74f 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll @@ -1,43 +1,100 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128,+relaxed-simd | FileCheck %s +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128,+relaxed-simd | FileCheck %s target triple = "wasm32" -define void @fma_seperate(ptr %a, ptr %b, ptr %c, ptr %dest) { -; 
CHECK-LABEL: fma_seperate: -; CHECK: .functype fma_seperate (i32, i32, i32, i32) -> () -; CHECK-NEXT: # %bb.0: # %entry -; CHECK-NEXT: v128.load $push2=, 0($2):p2align=0 -; CHECK-NEXT: v128.load $push1=, 0($1):p2align=0 -; CHECK-NEXT: v128.load $push0=, 0($0):p2align=0 -; CHECK-NEXT: f32x4.relaxed_madd $push3=, $pop2, $pop1, $pop0 -; CHECK-NEXT: v128.store 0($3):p2align=0, $pop3 -; CHECK-NEXT: return -entry: - %0 = load <4 x float>, ptr %a, align 1 - %1 = load <4 x float>, ptr %b, align 1 - %2 = load <4 x float>, ptr %c, align 1 - %mul.i = fmul fast <4 x float> %1, %0 - %add.i = fadd fast <4 x float> %mul.i, %2 - store <4 x float> %add.i, ptr %dest, align 1 - ret void -} - -; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) -define void @fma_llvm(ptr %a, ptr %b, ptr %c, ptr %dest) { -; CHECK-LABEL: fma_llvm: -; CHECK: .functype fma_llvm (i32, i32, i32, i32) -> () -; CHECK-NEXT: # %bb.0: # %entry -; CHECK-NEXT: v128.load $push2=, 0($0):p2align=0 -; CHECK-NEXT: v128.load $push1=, 0($1):p2align=0 -; CHECK-NEXT: v128.load $push0=, 0($2):p2align=0 -; CHECK-NEXT: f32x4.relaxed_madd $push3=, $pop2, $pop1, $pop0 -; CHECK-NEXT: v128.store 0($3):p2align=0, $pop3 -; CHECK-NEXT: return -entry: - %0 = load <4 x float>, ptr %a, align 1 - %1 = load <4 x float>, ptr %b, align 1 - %2 = load <4 x float>, ptr %c, align 1 - %fma = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2) - store <4 x float> %fma, ptr %dest, align 1 - ret void +define <4 x float> @fma_vector_4xf32_seperate(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: fma_vector_4xf32_seperate: +; CHECK: .functype fma_vector_4xf32_seperate (v128, v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: f32x4.relaxed_madd $push0=, $2, $1, $0 +; CHECK-NEXT: return $pop0 +entry: + %mul.i = fmul fast <4 x float> %b, %a + %add.i = fadd fast <4 x float> %mul.i, %c + ret <4 x float> %add.i +} + +define <4 x 
float> @fma_vector_4xf32_llvm(<4 x float> %a, <4 x float> %b, <4 x float> %c) { +; CHECK-LABEL: fma_vector_4xf32_llvm: +; CHECK: .functype fma_vector_4xf32_llvm (v128, v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: f32x4.relaxed_madd $push0=, $0, $1, $2 +; CHECK-NEXT: return $pop0 +entry: + %fma = tail call fast <4 x float> @llvm.fma(<4 x float> %a, <4 x float> %b, <4 x float> %c) + ret <4 x float> %fma +} + + +define <2 x double> @fma_vector_2xf64_seperate(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: fma_vector_2xf64_seperate: +; CHECK: .functype fma_vector_2xf64_seperate (v128, v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: f64x2.relaxed_madd $push0=, $2, $1, $0 +; CHECK-NEXT: return $pop0 +entry: + %mul.i = fmul fast <2 x double> %b, %a + %add.i = fadd fast <2 x double> %mul.i, %c + ret <2 x double> %add.i +} + +define <2 x double> @fma_vector_2xf64_llvm(<2 x double> %a, <2 x double> %b, <2 x double> %c) { +; CHECK-LABEL: fma_vector_2xf64_llvm: +; CHECK: .functype fma_vector_2xf64_llvm (v128, v128, v128) -> (v128) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: f64x2.relaxed_madd $push0=, $0, $1, $2 +; CHECK-NEXT: return $pop0 +entry: + %fma = tail call fast <2 x double> @llvm.fma(<2 x double> %a, <2 x double> %b, <2 x double> %c) + ret <2 x double> %fma +} + + +define float @fma_scalar_f32_seperate(float %a, float %b, float %c) { +; CHECK-LABEL: fma_scalar_f32_seperate: +; CHECK: .functype fma_scalar_f32_seperate (f32, f32, f32) -> (f32) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: f32.mul $push0=, $1, $0 +; CHECK-NEXT: f32.add $push1=, $pop0, $2 +; CHECK-NEXT: return $pop1 +entry: + %mul.i = fmul fast float %b, %a + %add.i = fadd fast float %mul.i, %c + ret float %add.i +} + +define float @fma_scalar_f32_llvm(float %a, float %b, float %c) { +; CHECK-LABEL: fma_scalar_f32_llvm: +; CHECK: .functype fma_scalar_f32_llvm (f32, f32, f32) -> (f32) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: 
call $push0=, fmaf, $0, $1, $2 +; CHECK-NEXT: return $pop0 +entry: + %fma = tail call fast float @llvm.fma(float %a, float %b, float %c) + ret float %fma +} + + +define double @fma_scalar_f64_seperate(double %a, double %b, double %c) { +; CHECK-LABEL: fma_scalar_f64_seperate: +; CHECK: .functype fma_scalar_f64_seperate (f64, f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: f64.mul $push0=, $1, $0 +; CHECK-NEXT: f64.add $push1=, $pop0, $2 +; CHECK-NEXT: return $pop1 +entry: + %mul.i = fmul fast double %b, %a + %add.i = fadd fast double %mul.i, %c + ret double %add.i +} + +define double @fma_scalar_f64_llvm(double %a, double %b, double %c) { +; CHECK-LABEL: fma_scalar_f64_llvm: +; CHECK: .functype fma_scalar_f64_llvm (f64, f64, f64) -> (f64) +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: call $push0=, fma, $0, $1, $2 +; CHECK-NEXT: return $pop0 +entry: + %fma = tail call fast double @llvm.fma(double %a, double %b, double %c) + ret double %fma } From 02569b69a380994e9a9182f904660e4096a3df21 Mon Sep 17 00:00:00 2001 From: Jasmine Tang Date: Tue, 8 Jul 2025 17:37:26 -0700 Subject: [PATCH 4/4] [WASM] Add more vector widths for PR 147487 - Added support for <8 x f32>. - Refactored out condition for relaxed simd to a separate function. 
--- .../WebAssembly/WebAssemblyISelLowering.cpp | 48 ++++++++++++------- .../CodeGen/WebAssembly/simd-relaxed-fma.ll | 30 ++++++++++++ 2 files changed, 60 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index a79fb0781cd4d..15038df6d5f6c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGenTypes/MachineValueType.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" @@ -3417,25 +3418,40 @@ static SDValue performSETCCCombine(SDNode *N, } return SDValue(); } +static bool canRelaxSimd(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + EVT VecVT = N->getValueType(0); -static SDValue performFAddCombine(SDNode *N, SelectionDAG &DAG) { + // INFO: WebAssembly doesn't have scalar fma yet + // https://github.com/WebAssembly/design/issues/1391 + if (!VecVT.isVector()) + return false; + + // Allows fp fusing + if (!N->getFlags().hasAllowContract()) + return false; + + if (N->getValueType(0).bitsGT(MVT::f128)) + return false; + + return true; +} +static SDValue performFAddCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { assert(N->getOpcode() == ISD::FADD); using namespace llvm::SDPatternMatch; - EVT VecVT = N->getValueType(0); - - // WebAssembly doesn't have scalar fma yet + // INFO: WebAssembly doesn't have scalar fma yet // https://github.com/WebAssembly/design/issues/1391 + EVT VecVT = N->getValueType(0); if (!VecVT.isVector()) return SDValue(); - // Allows fp fusing - if (!N->getFlags().hasAllowContract()) + if (!canRelaxSimd(N, DCI)) return SDValue(); SDLoc DL(N); SDValue A, B, C; - + SelectionDAG &DAG = DCI.DAG; if (sd_match(N, m_FAdd(m_Value(A), 
m_FMul(m_Value(B), m_Value(C))))) return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, VecVT, @@ -3444,22 +3460,18 @@ static SDValue performFAddCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue performFMACombine(SDNode *N, SelectionDAG &DAG) { +static SDValue performFMACombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { assert(N->getOpcode() == ISD::FMA); - EVT VecVT = N->getValueType(0); - if (!VecVT.isVector()) - return SDValue(); - - // Allows fp fusing - if (!N->getFlags().hasAllowContract()) + if (!canRelaxSimd(N, DCI)) return SDValue(); SDLoc DL(N); SDValue A = N->getOperand(0), B = N->getOperand(1), C = N->getOperand(2); - + SelectionDAG &DAG = DCI.DAG; return DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, VecVT, + ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), {DAG.getConstant(Intrinsic::wasm_relaxed_madd, DL, MVT::i32), A, B, C}); } @@ -3581,9 +3593,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, return performLowerPartialReduction(N, DCI.DAG); } case ISD::FADD: - return performFAddCombine(N, DCI.DAG); + return performFAddCombine(N, DCI); case ISD::FMA: - return performFMACombine(N, DCI.DAG); + return performFMACombine(N, DCI); case ISD::MUL: return performMulCombine(N, DCI.DAG); } diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll index 882b0538af74f..e4bd6a3a8cda6 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll @@ -26,6 +26,36 @@ entry: } +define <8 x float> @fma_vector_8xf32_seperate(<8 x float> %a, <8 x float> %b, <8 x float> %c) { +; CHECK-LABEL: fma_vector_8xf32_seperate: +; CHECK: .functype fma_vector_8xf32_seperate (i32, v128, v128, v128, v128, v128, v128) -> () +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: f32x4.relaxed_madd $push0=, $6, $4, $2 +; CHECK-NEXT: v128.store 16($0), $pop0 +; CHECK-NEXT: f32x4.relaxed_madd $push1=, $5, $3, $1 +; CHECK-NEXT: v128.store 0($0), 
$pop1 +; CHECK-NEXT: return +entry: + %mul.i = fmul fast <8 x float> %b, %a + %add.i = fadd fast <8 x float> %mul.i, %c + ret <8 x float> %add.i +} + +define <8 x float> @fma_vector_8xf32_llvm(<8 x float> %a, <8 x float> %b, <8 x float> %c) { +; CHECK-LABEL: fma_vector_8xf32_llvm: +; CHECK: .functype fma_vector_8xf32_llvm (i32, v128, v128, v128, v128, v128, v128) -> () +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: f32x4.relaxed_madd $push0=, $2, $4, $6 +; CHECK-NEXT: v128.store 16($0), $pop0 +; CHECK-NEXT: f32x4.relaxed_madd $push1=, $1, $3, $5 +; CHECK-NEXT: v128.store 0($0), $pop1 +; CHECK-NEXT: return +entry: + %fma = tail call fast <8 x float> @llvm.fma(<8 x float> %a, <8 x float> %b, <8 x float> %c) + ret <8 x float> %fma +} + + define <2 x double> @fma_vector_2xf64_seperate(<2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: fma_vector_2xf64_seperate: ; CHECK: .functype fma_vector_2xf64_seperate (v128, v128, v128) -> (v128)