Skip to content

Commit 1b01d0f

Browse files
shiltian and rampitec committed
[AMDGPU] Add support for v_tanh_bf16 on gfx1250
Co-authored-by: Mekhanoshin, Stanislav <Stanislav.Mekhanoshin@amd.com>
1 parent e3e7393 commit 1b01d0f

24 files changed

+868 -0 lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -588,6 +588,10 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
588588
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
589589
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
590590

591+
def int_amdgcn_tanh : DefaultAttrsIntrinsic<
592+
[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
593+
>;
594+
591595
def int_amdgcn_cvt_pk_f16_fp8 : DefaultAttrsIntrinsic<
592596
[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrSpeculatable]
593597
>, ClangBuiltin<"__builtin_amdgcn_cvt_pk_f16_fp8">;

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -535,6 +535,12 @@ def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
535535
"Use true 16-bit registers"
536536
>;
537537

538+
def FeatureBF16TransInsts : SubtargetFeature<"bf16-trans-insts",
539+
"HasBF16TransInsts",
540+
"true",
541+
"Has bf16 transcendental instructions"
542+
>;
543+
538544
def FeatureBF16ConversionInsts : SubtargetFeature<"bf16-cvt-insts",
539545
"HasBF16ConversionInsts",
540546
"true",
@@ -1946,6 +1952,7 @@ def FeatureISAVersion12_50 : FeatureSet<
19461952
FeatureDPPSrc1SGPR,
19471953
FeatureBitOp3Insts,
19481954
FeatureTransposeLoadF4F6Insts,
1955+
FeatureBF16TransInsts,
19491956
FeatureBF16ConversionInsts,
19501957
FeatureCvtPkF16F32Inst,
19511958
FeatureMinimum3Maximum3PKF16,
@@ -2413,6 +2420,9 @@ def UseFakeTrue16Insts : True16PredicateClass<"Subtarget->hasTrue16BitInsts() &&
24132420
// FIXME When we default to RealTrue16 instead of Fake, change the line as follows.
24142421
// AssemblerPredicate<(all_of FeatureTrue16BitInsts, (not FeatureRealTrue16Insts))>;
24152422

2423+
def HasBF16TransInsts : Predicate<"Subtarget->hasBF16TransInsts()">,
2424+
AssemblerPredicate<(all_of FeatureBF16TransInsts)>;
2425+
24162426
def HasBF16ConversionInsts : Predicate<"Subtarget->hasBF16ConversionInsts()">,
24172427
AssemblerPredicate<(all_of FeatureBF16ConversionInsts)>;
24182428

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,7 @@ class AMDGPUSubtarget {
5959
bool HasCvtPkF16F32Inst = false;
6060
bool HasF32ToF16BF16ConversionSRInsts = false;
6161
bool EnableRealTrue16Insts = false;
62+
bool HasBF16TransInsts = false;
6263
bool HasBF16ConversionInsts = false;
6364
bool HasMadMixInsts = false;
6465
bool HasMadMacF32Insts = false;
@@ -202,6 +203,8 @@ class AMDGPUSubtarget {
202203
// supported and the support for fake True16 instructions is removed.
203204
bool useRealTrue16Insts() const;
204205

206+
bool hasBF16TransInsts() const { return HasBF16TransInsts; }
207+
205208
bool hasBF16ConversionInsts() const {
206209
return HasBF16ConversionInsts;
207210
}

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2743,6 +2743,7 @@ def VOP_F16_F16 : VOPProfile<[f16, f16, untyped, untyped]>;
27432743
def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>;
27442744
def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
27452745
def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>;
2746+
def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>;
27462747

27472748
def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
27482749
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -522,6 +522,10 @@ defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, AMDGPUlogf16>;
522522
defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>;
523523
defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
524524
defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
525+
526+
let SubtargetPredicate = HasBF16TransInsts in {
527+
defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
528+
}
525529
} // End TRANS = 1, SchedRW = [WriteTrans32]
526530
defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
527531
defm V_FREXP_EXP_I16_F16 : VOP1Inst_t16_with_profiles <"v_frexp_exp_i16_f16",
@@ -1099,6 +1103,7 @@ defm V_CVT_NORM_U16_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x064>;
10991103
defm V_CVT_F16_F32 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00a>;
11001104
defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>;
11011105

1106+
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
11021107
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
11031108
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
11041109
defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>;
Lines changed: 81 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,81 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=SDAG-REAL16 %s
3+
; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=SDAG-FAKE16 %s
4+
; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GISEL-REAL16 %s
5+
; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GISEL-FAKE16 %s
6+
7+
; FIXME: GlobalISel does not work with bf16
8+
9+
declare bfloat @llvm.amdgcn.tanh.bf16(bfloat) #0
10+
11+
define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
12+
; SDAG-REAL16-LABEL: tanh_bf16:
13+
; SDAG-REAL16: ; %bb.0:
14+
; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
15+
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
16+
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
17+
; SDAG-REAL16-NEXT: v_tanh_bf16_e32 v0.l, s2
18+
; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
19+
; SDAG-REAL16-NEXT: s_endpgm
20+
;
21+
; SDAG-FAKE16-LABEL: tanh_bf16:
22+
; SDAG-FAKE16: ; %bb.0:
23+
; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
24+
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
25+
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
26+
; SDAG-FAKE16-NEXT: v_tanh_bf16_e32 v0, s2
27+
; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
28+
; SDAG-FAKE16-NEXT: s_endpgm
29+
%tanh = call bfloat @llvm.amdgcn.tanh.bf16(bfloat %src) #0
30+
store bfloat %tanh, ptr addrspace(1) %out, align 2
31+
ret void
32+
}
33+
34+
define amdgpu_kernel void @tanh_bf16_constant_4(ptr addrspace(1) %out) #1 {
35+
; SDAG-REAL16-LABEL: tanh_bf16_constant_4:
36+
; SDAG-REAL16: ; %bb.0:
37+
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
38+
; SDAG-REAL16-NEXT: v_tanh_bf16_e32 v0.l, 4.0
39+
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
40+
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
41+
; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
42+
; SDAG-REAL16-NEXT: s_endpgm
43+
;
44+
; SDAG-FAKE16-LABEL: tanh_bf16_constant_4:
45+
; SDAG-FAKE16: ; %bb.0:
46+
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
47+
; SDAG-FAKE16-NEXT: v_tanh_bf16_e32 v0, 4.0
48+
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
49+
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
50+
; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
51+
; SDAG-FAKE16-NEXT: s_endpgm
52+
%tanh = call bfloat @llvm.amdgcn.tanh.bf16(bfloat 4.0) #0
53+
store bfloat %tanh, ptr addrspace(1) %out, align 2
54+
ret void
55+
}
56+
57+
define amdgpu_kernel void @tanh_bf16_constant_100(ptr addrspace(1) %out) #1 {
58+
; SDAG-REAL16-LABEL: tanh_bf16_constant_100:
59+
; SDAG-REAL16: ; %bb.0:
60+
; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
61+
; SDAG-REAL16-NEXT: v_tanh_bf16_e32 v0.l, 0x42c8
62+
; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
63+
; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
64+
; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
65+
; SDAG-REAL16-NEXT: s_endpgm
66+
;
67+
; SDAG-FAKE16-LABEL: tanh_bf16_constant_100:
68+
; SDAG-FAKE16: ; %bb.0:
69+
; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
70+
; SDAG-FAKE16-NEXT: v_tanh_bf16_e32 v0, 0x42c8
71+
; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
72+
; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
73+
; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
74+
; SDAG-FAKE16-NEXT: s_endpgm
75+
%tanh = call bfloat @llvm.amdgcn.tanh.bf16(bfloat 100.0) #0
76+
store bfloat %tanh, ptr addrspace(1) %out, align 2
77+
ret void
78+
}
79+
80+
attributes #0 = { nounwind readnone }
81+
attributes #1 = { nounwind }

llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s

Lines changed: 45 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,51 @@
11
// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
22
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s
33

4+
v_tanh_bf16 v5, v1
5+
// GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e]
6+
7+
v_tanh_bf16 v5, v127
8+
// GFX1250: v_tanh_bf16_e32 v5, v127 ; encoding: [0x7f,0x95,0x0a,0x7e]
9+
10+
v_tanh_bf16 v5, s1
11+
// GFX1250: v_tanh_bf16_e32 v5, s1 ; encoding: [0x01,0x94,0x0a,0x7e]
12+
13+
v_tanh_bf16 v5, s105
14+
// GFX1250: v_tanh_bf16_e32 v5, s105 ; encoding: [0x69,0x94,0x0a,0x7e]
15+
16+
v_tanh_bf16 v5, vcc_lo
17+
// GFX1250: v_tanh_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0x94,0x0a,0x7e]
18+
19+
v_tanh_bf16 v5, vcc_hi
20+
// GFX1250: v_tanh_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0x94,0x0a,0x7e]
21+
22+
v_tanh_bf16 v5, ttmp15
23+
// GFX1250: v_tanh_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0x94,0x0a,0x7e]
24+
25+
v_tanh_bf16 v5, m0
26+
// GFX1250: v_tanh_bf16_e32 v5, m0 ; encoding: [0x7d,0x94,0x0a,0x7e]
27+
28+
v_tanh_bf16 v5, exec_lo
29+
// GFX1250: v_tanh_bf16_e32 v5, exec_lo ; encoding: [0x7e,0x94,0x0a,0x7e]
30+
31+
v_tanh_bf16 v5, exec_hi
32+
// GFX1250: v_tanh_bf16_e32 v5, exec_hi ; encoding: [0x7f,0x94,0x0a,0x7e]
33+
34+
v_tanh_bf16 v5, null
35+
// GFX1250: v_tanh_bf16_e32 v5, null ; encoding: [0x7c,0x94,0x0a,0x7e]
36+
37+
v_tanh_bf16 v5, -1
38+
// GFX1250: v_tanh_bf16_e32 v5, -1 ; encoding: [0xc1,0x94,0x0a,0x7e]
39+
40+
v_tanh_bf16 v5, 0.5
41+
// GFX1250: v_tanh_bf16_e32 v5, 0.5 ; encoding: [0xf0,0x94,0x0a,0x7e]
42+
43+
v_tanh_bf16 v5, src_scc
44+
// GFX1250: v_tanh_bf16_e32 v5, src_scc ; encoding: [0xfd,0x94,0x0a,0x7e]
45+
46+
v_tanh_bf16 v127, 0x8000
47+
// GFX1250: v_tanh_bf16_e32 v127, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00]
48+
449
v_cvt_f32_bf16 v5, v1
550
// GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e]
651

llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s

Lines changed: 48 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,54 @@
11
// NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5
22
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s
33

4+
v_tanh_bf16 v5, v1
5+
// GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e]
6+
7+
v_tanh_bf16 v5, v127
8+
// GFX1250: v_tanh_bf16_e32 v5, v127 ; encoding: [0x7f,0x95,0x0a,0x7e]
9+
10+
v_tanh_bf16 v5, s1
11+
// GFX1250: v_tanh_bf16_e32 v5, s1 ; encoding: [0x01,0x94,0x0a,0x7e]
12+
13+
v_tanh_bf16 v5, s105
14+
// GFX1250: v_tanh_bf16_e32 v5, s105 ; encoding: [0x69,0x94,0x0a,0x7e]
15+
16+
v_tanh_bf16 v5, vcc_lo
17+
// GFX1250: v_tanh_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0x94,0x0a,0x7e]
18+
19+
v_tanh_bf16 v5, vcc_hi
20+
// GFX1250: v_tanh_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0x94,0x0a,0x7e]
21+
22+
v_tanh_bf16 v5, ttmp15
23+
// GFX1250: v_tanh_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0x94,0x0a,0x7e]
24+
25+
v_tanh_bf16 v5, m0
26+
// GFX1250: v_tanh_bf16_e32 v5, m0 ; encoding: [0x7d,0x94,0x0a,0x7e]
27+
28+
v_tanh_bf16 v5, exec_lo
29+
// GFX1250: v_tanh_bf16_e32 v5, exec_lo ; encoding: [0x7e,0x94,0x0a,0x7e]
30+
31+
v_tanh_bf16 v5, exec_hi
32+
// GFX1250: v_tanh_bf16_e32 v5, exec_hi ; encoding: [0x7f,0x94,0x0a,0x7e]
33+
34+
v_tanh_bf16 v5, null
35+
// GFX1250: v_tanh_bf16_e32 v5, null ; encoding: [0x7c,0x94,0x0a,0x7e]
36+
37+
v_tanh_bf16 v5, -1
38+
// GFX1250: v_tanh_bf16_e32 v5, -1 ; encoding: [0xc1,0x94,0x0a,0x7e]
39+
40+
v_tanh_bf16 v5, 0.5
41+
// GFX1250: v_tanh_bf16_e32 v5, 0.5 ; encoding: [0xf0,0x94,0x0a,0x7e]
42+
43+
v_tanh_bf16 v5, src_scc
44+
// GFX1250: v_tanh_bf16_e32 v5, src_scc ; encoding: [0xfd,0x94,0x0a,0x7e]
45+
46+
v_tanh_bf16 v127, 0x8000
47+
// GFX1250: v_tanh_bf16_e32 v127, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00]
48+
49+
v_tanh_bf16 v5.h, v1.h
50+
// GFX1250: v_tanh_bf16_e32 v5.h, v1.h ; encoding: [0x81,0x95,0x0a,0x7f]
51+
452
v_cvt_f32_bf16 v5, v1
553
// GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e]
654

llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s

Lines changed: 56 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,62 @@
22
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s
33
// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s
44

5+
v_tanh_bf16 v5, v1 quad_perm:[3,2,1,0]
6+
// GFX1250: v_tanh_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x1b,0x00,0xff]
7+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
8+
9+
v_tanh_bf16 v5, v1 quad_perm:[0,1,2,3]
10+
// GFX1250: v_tanh_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x00,0xff]
11+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
12+
13+
v_tanh_bf16 v5, v1 row_mirror
14+
// GFX1250: v_tanh_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x40,0x01,0xff]
15+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
16+
17+
v_tanh_bf16 v5, v1 row_half_mirror
18+
// GFX1250: v_tanh_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x41,0x01,0xff]
19+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
20+
21+
v_tanh_bf16 v5, v1 row_shl:1
22+
// GFX1250: v_tanh_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x01,0x01,0xff]
23+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
24+
25+
v_tanh_bf16 v5, v1 row_shl:15
26+
// GFX1250: v_tanh_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x0f,0x01,0xff]
27+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
28+
29+
v_tanh_bf16 v5, v1 row_shr:1
30+
// GFX1250: v_tanh_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x11,0x01,0xff]
31+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
32+
33+
v_tanh_bf16 v5, v1 row_shr:15
34+
// GFX1250: v_tanh_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x1f,0x01,0xff]
35+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
36+
37+
v_tanh_bf16 v5, v1 row_ror:1
38+
// GFX1250: v_tanh_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x21,0x01,0xff]
39+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
40+
41+
v_tanh_bf16 v5, v1 row_ror:15
42+
// GFX1250: v_tanh_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x2f,0x01,0xff]
43+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
44+
45+
v_tanh_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
46+
// GFX1250: v_tanh_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x50,0x01,0xff]
47+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
48+
49+
v_tanh_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
50+
// GFX1250: v_tanh_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x5f,0x01,0x01]
51+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
52+
53+
v_tanh_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
54+
// GFX1250: v_tanh_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x60,0x09,0x13]
55+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
56+
57+
v_tanh_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
58+
// GFX1250: v_tanh_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
59+
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
60+
561
v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0]
662
// GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff]
763
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU

0 commit comments

Comments
 (0)