Skip to content

[AMDGPU] Add support for v_tanh_bf16 on gfx1250 #147425

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,8 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_barrier_arrive_rtn_b64, "LiLi*3Li", "n
TARGET_BUILTIN(__builtin_amdgcn_s_setprio_inc_wg, "vIs", "n", "setprio-inc-wg-inst")
TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts")

// bf16 hyperbolic tangent: takes and returns a __bf16 value. Gated on the
// "bf16-trans-insts" target feature rather than on "gfx1250-insts".
TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts")

TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
Expand Down
3 changes: 3 additions & 0 deletions clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,9 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType });
return Builder.CreateCall(F, { Src });
}
// The tanh builtin maps straight onto the type-overloaded llvm.amdgcn.tanh
// intrinsic; for a __bf16 operand this produces a call to
// @llvm.amdgcn.tanh.bf16.
case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
return emitBuiltinWithOneOverloadedType<1>(*this, E,
Intrinsic::amdgcn_tanh);
case AMDGPU::BI__builtin_amdgcn_uicmp:
case AMDGPU::BI__builtin_amdgcn_uicmpl:
case AMDGPU::BI__builtin_amdgcn_sicmp:
Expand Down
19 changes: 19 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,25 @@ void test_s_monitor_sleep() {
__builtin_amdgcn_s_monitor_sleep(10);
}

// CHECK-LABEL: @test_tanh_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.tanh.bf16(bfloat [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
// CHECK-NEXT: ret void
//
void test_tanh_bf16(global __bf16* out, __bf16 a)
{
// The autogenerated assertions above expect this builtin call to become a
// single call to @llvm.amdgcn.tanh.bf16 whose result is stored through out.
*out = __builtin_amdgcn_tanh_bf16(a);
}

// CHECK-LABEL: @test_cvt_f16_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,10 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

// llvm.amdgcn.tanh: hyperbolic tangent. Overloaded on any floating-point
// type; the single operand has the same type as the result. Declared as
// touching no memory and safe to speculate.
def int_amdgcn_tanh : DefaultAttrsIntrinsic<
[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pk_f16_fp8 : DefaultAttrsIntrinsic<
[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrSpeculatable]
>, ClangBuiltin<"__builtin_amdgcn_cvt_pk_f16_fp8">;
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
"Use true 16-bit registers"
>;

// Subtarget feature "bf16-trans-insts": when enabled it sets the subtarget
// field HasBF16TransInsts to true, marking the bf16 transcendental
// instructions as available.
def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
"HasBF16TransInsts",
"true",
"Has bf16 transcendental instructions"
>;

def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts",
"HasBF16ConversionInsts",
"true",
Expand Down Expand Up @@ -1946,6 +1952,7 @@ def FeatureISAVersion12_50 : FeatureSet<
FeatureDPPSrc1SGPR,
FeatureBitOp3Insts,
FeatureTransposeLoadF4F6Insts,
FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
FeatureCvtPkF16F32Inst,
FeatureMinimum3Maximum3PKF16,
Expand Down Expand Up @@ -2413,6 +2420,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;

// Predicate for instructions that need the bf16 transcendental instructions:
// code generation checks Subtarget->hasBF16TransInsts(), and the assembler
// requires FeatureBF16TransInsts.
def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
AssemblerPredicate<(all_of FeatureBF16TransInsts)>;

def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">,
AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>;

Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4001,7 +4001,8 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp: {
case Intrinsic::amdgcn_rsq_clamp:
case Intrinsic::amdgcn_tanh: {
// FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
SDValue Src = N->getOperand(1);
return Src.isUndef() ? Src : SDValue();
Expand Down Expand Up @@ -6184,7 +6185,8 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp: {
case Intrinsic::amdgcn_rsq_clamp:
case Intrinsic::amdgcn_tanh: {
if (SNaN)
return true;

Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -700,7 +700,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
break;
}
case Intrinsic::amdgcn_sqrt:
case Intrinsic::amdgcn_rsq: {
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_tanh: {
Value *Src = II.getArgOperand(0);
if (isa<PoisonValue>(Src))
return IC.replaceInstUsesWith(II, Src);
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4546,6 +4546,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp:
case Intrinsic::amdgcn_tanh:
case Intrinsic::amdgcn_fmul_legacy:
case Intrinsic::amdgcn_fma_legacy:
case Intrinsic::amdgcn_frexp_mant:
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class AMDGPUSubtarget {
bool HasCvtPkF16F32Inst = false;
bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
bool HasBF16TransInsts = false;
bool HasBF16ConversionInsts = false;
bool HasMadMixInsts = false;
bool HasMadMacF32Insts = false;
Expand Down Expand Up @@ -202,6 +203,8 @@ class AMDGPUSubtarget {
// supported and the support for fake True16 instructions is removed.
bool useRealTrue16Insts() const;

// Returns the HasBF16TransInsts flag, i.e. whether this subtarget provides
// the bf16 transcendental instructions.
bool hasBF16TransInsts() const { return HasBF16TransInsts; }

bool hasBF16ConversionInsts() const {
return HasBF16ConversionInsts;
}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13658,6 +13658,7 @@ bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_fdot2:
case Intrinsic::amdgcn_trig_preop:
case Intrinsic::amdgcn_tanh:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't tested

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was tested, but not by this instruction, since this intrinsic is also used for other types. Should I remove them, then?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't follow; this shouldn't be hard to test. For example, in fcanonicalize-elimination.ll you should just need a canonicalize call after these, which should fold out.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't do it at the moment because support for v_cvt_pk_bf16_f32 on gfx1250 has not been upstreamed yet. Without it, llvm.canonicalize.bf16 doesn't work.

I'll add the test downstream, though.

return true;
default:
break;
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2743,6 +2743,7 @@ def VOP_F16_F16 : VOPProfile<[f16, f16, untyped, untyped]>;
def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>;
// One-source profile with a bf16 result and a bf16 operand; the remaining
// two operand slots are unused.
def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>;

def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,10 @@ defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, AMDGPUlogf16>;
defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>;
defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;

// v_tanh_bf16 is selected from int_amdgcn_tanh using the bf16 -> bf16
// profile, and is only available when the subtarget has the bf16
// transcendental instructions.
let SubtargetPredicate = HasBF16TransInsts in {
defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
}
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
defm V_FREXP_EXP_I16_F16 : VOP1Inst_t16_with_profiles <"v_frexp_exp_i16_f16",
Expand Down Expand Up @@ -1099,6 +1103,7 @@ defm V_CVT_NORM_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x064>;
defm V_CVT_F16_F32 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00a>;
defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>;

defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>;
Expand Down
81 changes: 81 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=SDAG-REAL16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=SDAG-FAKE16 %s
; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GISEL-REAL16 %s
; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GISEL-FAKE16 %s

; FIXME: GlobalISel does not work with bf16 yet, so the two GlobalISel run
; lines above are disabled (spelled "xUN").

declare bfloat @llvm.amdgcn.tanh.bf16(bfloat) #0

; tanh of a bf16 kernel argument: the source is loaded as a scalar and used
; directly as the SGPR operand of v_tanh_bf16.
define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
; SDAG-REAL16-LABEL: tanh_bf16:
; SDAG-REAL16: ; %bb.0:
; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
; SDAG-REAL16-NEXT: v_tanh_bf16_e32 v0.l, s2
; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: tanh_bf16:
; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: v_tanh_bf16_e32 v0, s2
; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
; SDAG-FAKE16-NEXT: s_endpgm
%tanh = call bfloat @llvm.amdgcn.tanh.bf16(bfloat %src) #0
store bfloat %tanh, ptr addrspace(1) %out, align 2
ret void
}

; tanh of the constant 4.0: the constant appears directly as the source
; operand of v_tanh_bf16, with no separate materialization.
define amdgpu_kernel void @tanh_bf16_constant_4(ptr addrspace(1) %out) #1 {
; SDAG-REAL16-LABEL: tanh_bf16_constant_4:
; SDAG-REAL16: ; %bb.0:
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-REAL16-NEXT: v_tanh_bf16_e32 v0.l, 4.0
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: tanh_bf16_constant_4:
; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-FAKE16-NEXT: v_tanh_bf16_e32 v0, 4.0
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
; SDAG-FAKE16-NEXT: s_endpgm
%tanh = call bfloat @llvm.amdgcn.tanh.bf16(bfloat 4.0) #0
store bfloat %tanh, ptr addrspace(1) %out, align 2
ret void
}

; tanh of the constant 100.0: unlike 4.0, this value is emitted as the 16-bit
; literal 0x42c8, which is the bf16 bit pattern of 100.0.
define amdgpu_kernel void @tanh_bf16_constant_100(ptr addrspace(1) %out) #1 {
; SDAG-REAL16-LABEL: tanh_bf16_constant_100:
; SDAG-REAL16: ; %bb.0:
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-REAL16-NEXT: v_tanh_bf16_e32 v0.l, 0x42c8
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
; SDAG-REAL16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: tanh_bf16_constant_100:
; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; SDAG-FAKE16-NEXT: v_tanh_bf16_e32 v0, 0x42c8
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
; SDAG-FAKE16-NEXT: s_endpgm
%tanh = call bfloat @llvm.amdgcn.tanh.bf16(bfloat 100.0) #0
store bfloat %tanh, ptr addrspace(1) %out, align 2
ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
45 changes: 45 additions & 0 deletions llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
Original file line number Diff line number Diff line change
@@ -1,6 +1,51 @@
// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s

v_tanh_bf16 v5, v1
// GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e]

v_tanh_bf16 v5, v127
// GFX1250: v_tanh_bf16_e32 v5, v127 ; encoding: [0x7f,0x95,0x0a,0x7e]

v_tanh_bf16 v5, s1
// GFX1250: v_tanh_bf16_e32 v5, s1 ; encoding: [0x01,0x94,0x0a,0x7e]

v_tanh_bf16 v5, s105
// GFX1250: v_tanh_bf16_e32 v5, s105 ; encoding: [0x69,0x94,0x0a,0x7e]

v_tanh_bf16 v5, vcc_lo
// GFX1250: v_tanh_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0x94,0x0a,0x7e]

v_tanh_bf16 v5, vcc_hi
// GFX1250: v_tanh_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0x94,0x0a,0x7e]

v_tanh_bf16 v5, ttmp15
// GFX1250: v_tanh_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0x94,0x0a,0x7e]

v_tanh_bf16 v5, m0
// GFX1250: v_tanh_bf16_e32 v5, m0 ; encoding: [0x7d,0x94,0x0a,0x7e]

v_tanh_bf16 v5, exec_lo
// GFX1250: v_tanh_bf16_e32 v5, exec_lo ; encoding: [0x7e,0x94,0x0a,0x7e]

v_tanh_bf16 v5, exec_hi
// GFX1250: v_tanh_bf16_e32 v5, exec_hi ; encoding: [0x7f,0x94,0x0a,0x7e]

v_tanh_bf16 v5, null
// GFX1250: v_tanh_bf16_e32 v5, null ; encoding: [0x7c,0x94,0x0a,0x7e]

v_tanh_bf16 v5, -1
// GFX1250: v_tanh_bf16_e32 v5, -1 ; encoding: [0xc1,0x94,0x0a,0x7e]

v_tanh_bf16 v5, 0.5
// GFX1250: v_tanh_bf16_e32 v5, 0.5 ; encoding: [0xf0,0x94,0x0a,0x7e]

v_tanh_bf16 v5, src_scc
// GFX1250: v_tanh_bf16_e32 v5, src_scc ; encoding: [0xfd,0x94,0x0a,0x7e]

v_tanh_bf16 v127, 0x8000
// GFX1250: v_tanh_bf16_e32 v127, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00]

v_cvt_f32_bf16 v5, v1
// GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e]

Expand Down
48 changes: 48 additions & 0 deletions llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
Original file line number Diff line number Diff line change
@@ -1,6 +1,54 @@
// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s

v_tanh_bf16 v5, v1
// GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e]

v_tanh_bf16 v5, v127
// GFX1250: v_tanh_bf16_e32 v5, v127 ; encoding: [0x7f,0x95,0x0a,0x7e]

v_tanh_bf16 v5, s1
// GFX1250: v_tanh_bf16_e32 v5, s1 ; encoding: [0x01,0x94,0x0a,0x7e]

v_tanh_bf16 v5, s105
// GFX1250: v_tanh_bf16_e32 v5, s105 ; encoding: [0x69,0x94,0x0a,0x7e]

v_tanh_bf16 v5, vcc_lo
// GFX1250: v_tanh_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0x94,0x0a,0x7e]

v_tanh_bf16 v5, vcc_hi
// GFX1250: v_tanh_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0x94,0x0a,0x7e]

v_tanh_bf16 v5, ttmp15
// GFX1250: v_tanh_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0x94,0x0a,0x7e]

v_tanh_bf16 v5, m0
// GFX1250: v_tanh_bf16_e32 v5, m0 ; encoding: [0x7d,0x94,0x0a,0x7e]

v_tanh_bf16 v5, exec_lo
// GFX1250: v_tanh_bf16_e32 v5, exec_lo ; encoding: [0x7e,0x94,0x0a,0x7e]

v_tanh_bf16 v5, exec_hi
// GFX1250: v_tanh_bf16_e32 v5, exec_hi ; encoding: [0x7f,0x94,0x0a,0x7e]

v_tanh_bf16 v5, null
// GFX1250: v_tanh_bf16_e32 v5, null ; encoding: [0x7c,0x94,0x0a,0x7e]

v_tanh_bf16 v5, -1
// GFX1250: v_tanh_bf16_e32 v5, -1 ; encoding: [0xc1,0x94,0x0a,0x7e]

v_tanh_bf16 v5, 0.5
// GFX1250: v_tanh_bf16_e32 v5, 0.5 ; encoding: [0xf0,0x94,0x0a,0x7e]

v_tanh_bf16 v5, src_scc
// GFX1250: v_tanh_bf16_e32 v5, src_scc ; encoding: [0xfd,0x94,0x0a,0x7e]

v_tanh_bf16 v127, 0x8000
// GFX1250: v_tanh_bf16_e32 v127, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00]

v_tanh_bf16 v5.h, v1.h
// GFX1250: v_tanh_bf16_e32 v5.h, v1.h ; encoding: [0x81,0x95,0x0a,0x7f]

v_cvt_f32_bf16 v5, v1
// GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e]

Expand Down
Loading
Loading