Skip to content

Commit 0f941d3

Browse files
shiltian and rampitec committed
[AMDGPU] Add support for v_tanh_bf16 on gfx1250
Co-authored-by: Mekhanoshin, Stanislav <Stanislav.Mekhanoshin@amd.com>
1 parent e3e7393 commit 0f941d3

31 files changed

+900
-3
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -664,6 +664,8 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64, "LiLi*3Li", "n
664664
TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
665665
TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")
666666

667+
TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts")
668+
667669
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
668670
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
669671
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -497,6 +497,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
497497
Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
498498
return Builder.CreateCall(F, { Src });
499499
}
500+
case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
501+
return emitBuiltinWithOneOverloadedType<1>(*this, E,
502+
Intrinsic::amdgcn_tanh);
500503
case AMDGPU::BI__builtin_amdgcn_uicmp:
501504
case AMDGPU::BI__builtin_amdgcn_uicmpl:
502505
case AMDGPU::BI__builtin_amdgcn_sicmp:

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,25 @@ void test_s_monitor_sleep() {
2424
__builtin_amdgcn_s_monitor_sleep(10);
2525
}
2626

27+
// CHECK-LABEL: @test_tanh_bf16(
28+
// CHECK-NEXT: entry:
29+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
30+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
31+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
32+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
33+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
34+
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
35+
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
36+
// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.tanh.bf16(bfloat [[TMP0]])
37+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
38+
// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
39+
// CHECK-NEXT: ret void
40+
//
41+
void test_tanh_bf16(global __bf16* out, __bf16 a)
42+
{
43+
*out = __builtin_amdgcn_tanh_bf16(a);
44+
}
45+
2746
// CHECK-LABEL: @test_cvt_f16_fp8(
2847
// CHECK-NEXT: entry:
2948
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -588,6 +588,10 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
588588
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
589589
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
590590

591+
def int_amdgcn_tanh : DefaultAttrsIntrinsic<
592+
[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
593+
>;
594+
591595
def int_amdgcn_cvt_pk_f16_fp8 : DefaultAttrsIntrinsic<
592596
[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrSpeculatable]
593597
>, ClangBuiltin<"__builtin_amdgcn_cvt_pk_f16_fp8">;

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -535,6 +535,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
535535
"Use true 16-bit registers"
536536
>;
537537

538+
def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
539+
"HasBF16TransInsts",
540+
"true",
541+
"Has bf16 transcendental instructions"
542+
>;
543+
538544
def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts",
539545
"HasBF16ConversionInsts",
540546
"true",
@@ -1946,6 +1952,7 @@ def FeatureISAVersion12_50 : FeatureSet<
19461952
FeatureDPPSrc1SGPR,
19471953
FeatureBitOp3Insts,
19481954
FeatureTransposeLoadF4F6Insts,
1955+
FeatureBF16TransInsts,
19491956
FeatureBF16ConversionInsts,
19501957
FeatureCvtPkF16F32Inst,
19511958
FeatureMinimum3Maximum3PKF16,
@@ -2413,6 +2420,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
24132420
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
24142421
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
24152422

2423+
def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
2424+
AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
2425+
24162426
def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">,
24172427
AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>;
24182428

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4001,7 +4001,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
40014001
case Intrinsic::amdgcn_rsq:
40024002
case Intrinsic::amdgcn_rcp_legacy:
40034003
case Intrinsic::amdgcn_rsq_legacy:
4004-
case Intrinsic::amdgcn_rsq_clamp: {
4004+
case Intrinsic::amdgcn_rsq_clamp:
4005+
case Intrinsic::amdgcn_tanh: {
40054006
// FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
40064007
SDValue Src = N->getOperand(1);
40074008
return Src.isUndef() ? Src : SDValue();
@@ -6184,7 +6185,8 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
61846185
case Intrinsic::amdgcn_rsq:
61856186
case Intrinsic::amdgcn_rcp_legacy:
61866187
case Intrinsic::amdgcn_rsq_legacy:
6187-
case Intrinsic::amdgcn_rsq_clamp: {
6188+
case Intrinsic::amdgcn_rsq_clamp:
6189+
case Intrinsic::amdgcn_tanh: {
61886190
if (SNaN)
61896191
return true;
61906192

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -700,7 +700,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
700700
break;
701701
}
702702
case Intrinsic::amdgcn_sqrt:
703-
case Intrinsic::amdgcn_rsq: {
703+
case Intrinsic::amdgcn_rsq:
704+
case Intrinsic::amdgcn_tanh: {
704705
Value *Src = II.getArgOperand(0);
705706
if (isa<PoisonValue>(Src))
706707
return IC.replaceInstUsesWith(II, Src);

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4546,6 +4546,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45464546
case Intrinsic::amdgcn_rcp_legacy:
45474547
case Intrinsic::amdgcn_rsq_legacy:
45484548
case Intrinsic::amdgcn_rsq_clamp:
4549+
case Intrinsic::amdgcn_tanh:
45494550
case Intrinsic::amdgcn_fmul_legacy:
45504551
case Intrinsic::amdgcn_fma_legacy:
45514552
case Intrinsic::amdgcn_frexp_mant:

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,7 @@ class AMDGPUSubtarget {
5959
bool HasCvtPkF16F32Inst = false;
6060
bool HasF32ToF16BF16ConversionSRInsts = false;
6161
bool EnableRealTrue16Insts = false;
62+
bool HasBF16TransInsts = false;
6263
bool HasBF16ConversionInsts = false;
6364
bool HasMadMixInsts = false;
6465
bool HasMadMacF32Insts = false;
@@ -202,6 +203,8 @@ class AMDGPUSubtarget {
202203
// supported and the support for fake True16 instructions is removed.
203204
bool useRealTrue16Insts() const;
204205

206+
bool hasBF16TransInsts() const { return HasBF16TransInsts; }
207+
205208
bool hasBF16ConversionInsts() const {
206209
return HasBF16ConversionInsts;
207210
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -13658,6 +13658,7 @@ bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
1365813658
case Intrinsic::amdgcn_frexp_mant:
1365913659
case Intrinsic::amdgcn_fdot2:
1366013660
case Intrinsic::amdgcn_trig_preop:
13661+
case Intrinsic::amdgcn_tanh:
1366113662
return true;
1366213663
default:
1366313664
break;

0 commit comments

Comments
 (0)