Skip to content

Commit d258457

Browse files
shiltian and rampitec authored
[AMDGPU] Add support for v_cvt_f32_fp8 on gfx1250 (#147579)
Co-authored-by: Mekhanoshin, Stanislav <Stanislav.Mekhanoshin@amd.com>
1 parent 57d2d89 commit d258457

27 files changed: +655 -18 lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,7 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8, "V16fV2iV4iV16fiIiI
429429

430430
TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_bf8, "fiIi", "nc", "fp8-conversion-insts")
431431
TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8, "fiIi", "nc", "fp8-conversion-insts")
432+
// e5m3 variant of __builtin_amdgcn_cvt_f32_fp8, gated on the "fp8e5m3-insts" target feature.
// Uses the same "fiIi" prototype string as the fp8/bf8 to f32 conversions above; the last argument must be a constant integer.
TARGET_BUILTIN(__builtin_amdgcn_cvt_f32_fp8_e5m3, "fiIi", "nc", "fp8e5m3-insts")
432433
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_bf8, "V2fiIb", "nc", "fp8-conversion-insts")
433434
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f32_fp8, "V2fiIb", "nc", "fp8-conversion-insts")
434435
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f32, "iffiIb", "nc", "fp8-conversion-insts")

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@
108108
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
109109
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
110110
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
111-
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+transpose-load-f4f6-insts,+wavefrontsize32"
111+
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+transpose-load-f4f6-insts,+wavefrontsize32"
112112

113113
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
114114

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,3 +139,41 @@ void test_cvt_pk_f16_bf8(global half2* out, short a)
139139
{
140140
out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a);
141141
}
142+
143+
// CHECK-LABEL: @test_cvt_f32_fp8_e5m3(
144+
// CHECK-NEXT: entry:
145+
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
146+
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
147+
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
148+
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
149+
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
150+
// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4
151+
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
152+
// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP0]], i32 0)
153+
// CHECK-NEXT: [[CONV:%.*]] = fptosi float [[TMP1]] to i32
154+
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
155+
// CHECK-NEXT: store i32 [[CONV]], ptr addrspace(1) [[TMP2]], align 4
156+
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
157+
// CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP3]], i32 1)
158+
// CHECK-NEXT: [[CONV1:%.*]] = fptosi float [[TMP4]] to i32
159+
// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
160+
// CHECK-NEXT: store i32 [[CONV1]], ptr addrspace(1) [[TMP5]], align 4
161+
// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
162+
// CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP6]], i32 2)
163+
// CHECK-NEXT: [[CONV2:%.*]] = fptosi float [[TMP7]] to i32
164+
// CHECK-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
165+
// CHECK-NEXT: store i32 [[CONV2]], ptr addrspace(1) [[TMP8]], align 4
166+
// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4
167+
// CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.cvt.f32.fp8.e5m3(i32 [[TMP9]], i32 3)
168+
// CHECK-NEXT: [[CONV3:%.*]] = fptosi float [[TMP10]] to i32
169+
// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
170+
// CHECK-NEXT: store i32 [[CONV3]], ptr addrspace(1) [[TMP11]], align 4
171+
// CHECK-NEXT: ret void
172+
//
173+
// Calls __builtin_amdgcn_cvt_f32_fp8_e5m3 once for each byte_sel immediate 0..3,
// storing every result through `out`. `out` points to int, so each float result
// is converted before the store (the fptosi in the FileCheck assertions above).
void test_cvt_f32_fp8_e5m3(global int* out, int a)
174+
{
175+
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 0);
176+
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 1);
177+
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 2);
178+
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 3);
179+
}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,8 @@ void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
2727
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
2828
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
2929
}
30+
31+
// Negative test: the second argument is the runtime parameter `a` rather than a
// constant, and the diagnostic directive on the call line requires the
// "must be a constant integer" error.
void test_cvt_f32_fp8_e5m3(global int* out, int a)
32+
{
33+
*out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, a); // expected-error {{'__builtin_amdgcn_cvt_f32_fp8_e5m3' must be a constant integer}}
34+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3402,6 +3402,12 @@ def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">,
34023402
[llvm_i32_ty, llvm_i32_ty],
34033403
[IntrNoMem, ImmArg<ArgIndex<1>>]>;
34043404

3405+
// llvm.amdgcn.cvt.f32.fp8.e5m3 float vdst, int srcA, imm byte_sel [0..3]
// Same operand types and attributes as int_amdgcn_cvt_f32_fp8 above: byte_sel is
// operand index 1 and is declared ImmArg, so it must be an immediate; the
// intrinsic is marked IntrNoMem.
3406+
def int_amdgcn_cvt_f32_fp8_e5m3 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8_e5m3">,
3407+
DefaultAttrsIntrinsic<[llvm_float_ty],
3408+
[llvm_i32_ty, llvm_i32_ty],
3409+
[IntrNoMem, ImmArg<ArgIndex<1>>]>;
3410+
34053411
// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel
34063412
// word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes.
34073413
def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">,

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -819,6 +819,12 @@ def FeatureFP8ConversionInsts : SubtargetFeature<"fp8-conversion-insts",
819819
"Has fp8 and bf8 conversion instructions"
820820
>;
821821

822+
// Subtarget feature with the feature string "fp8e5m3-insts"; the remaining
// arguments name the HasFP8E5M3Insts flag, the value "true", and the
// description text.
def FeatureFP8E5M3Insts : SubtargetFeature<"fp8e5m3-insts",
823+
"HasFP8E5M3Insts",
824+
"true",
825+
"Has fp8 e5m3 format support"
826+
>;
827+
822828
def FeatureCvtFP8VOP1Bug : SubtargetFeature<"cvt-fp8-vop1-bug",
823829
"HasCvtFP8Vop1Bug",
824830
"true",
@@ -1937,6 +1943,7 @@ def FeatureISAVersion12_50 : FeatureSet<
19371943
FeatureAtomicBufferPkAddBF16Inst,
19381944
FeatureFlatAtomicFaddF32Inst,
19391945
FeatureFP8ConversionInsts,
1946+
FeatureFP8E5M3Insts,
19401947
FeaturePackedTID,
19411948
FeatureVcmpxPermlaneHazard,
19421949
FeatureSALUFloatInsts,
@@ -2573,6 +2580,12 @@ def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">,
25732580
def HasFP8ConversionInsts : Predicate<"Subtarget->hasFP8ConversionInsts()">,
25742581
AssemblerPredicate<(all_of FeatureFP8ConversionInsts)>;
25752582

2583+
// Holds when Subtarget->hasFP8E5M3Insts() is true; the assembler-side
// condition requires FeatureFP8E5M3Insts.
def HasFP8E5M3Insts : Predicate<"Subtarget->hasFP8E5M3Insts()">,
2584+
AssemblerPredicate<(all_of FeatureFP8E5M3Insts)>;
2585+
2586+
// Negation of HasFP8E5M3Insts: checks !Subtarget->hasFP8E5M3Insts(), and the
// assembler-side condition requires FeatureFP8E5M3Insts to be absent.
def NotHasFP8E5M3Insts : Predicate<"!Subtarget->hasFP8E5M3Insts()">,
2587+
AssemblerPredicate<(all_of (not FeatureFP8E5M3Insts))>;
2588+
25762589
def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
25772590
AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>;
25782591

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4600,6 +4600,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
46004600
case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
46014601
case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
46024602
case Intrinsic::amdgcn_cvt_f32_fp8:
4603+
case Intrinsic::amdgcn_cvt_f32_fp8_e5m3:
46034604
case Intrinsic::amdgcn_cvt_f32_bf8:
46044605
case Intrinsic::amdgcn_cvt_off_f32_i4:
46054606
case Intrinsic::amdgcn_cvt_pk_f32_fp8:

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9566,13 +9566,13 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
95669566
}
95679567
}
95689568

9569-
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel))
9569+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp) && !IsVOP3CvtSrDpp)
95709570
addOptionalImmOperand(Inst, Operands, OptionalIdx,
9571-
AMDGPUOperand::ImmTyByteSel);
9571+
AMDGPUOperand::ImmTyClamp);
95729572

9573-
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
9573+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel))
95749574
addOptionalImmOperand(Inst, Operands, OptionalIdx,
9575-
AMDGPUOperand::ImmTyClamp);
9575+
AMDGPUOperand::ImmTyByteSel);
95769576

95779577
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::omod))
95789578
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
165165
bool HasMAIInsts = false;
166166
bool HasFP8Insts = false;
167167
bool HasFP8ConversionInsts = false;
168+
bool HasFP8E5M3Insts = false;
168169
bool HasCvtFP8Vop1Bug = false;
169170
bool HasPkFmacF16Inst = false;
170171
bool HasAtomicFMinFMaxF32GlobalInsts = false;
@@ -861,6 +862,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
861862

862863
bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
863864

865+
/// \returns the current value of the HasFP8E5M3Insts subtarget flag.
bool hasFP8E5M3Insts() const {
  return HasFP8E5M3Insts;
}
866+
864867
bool hasPkFmacF16Inst() const {
865868
return HasPkFmacF16Inst;
866869
}

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -676,13 +676,14 @@ let HasClamp = 0, HasOMod = 0, HasExtDPP = 0, HasExtVOP3DPP = 0,
676676
}
677677
}
678678

679-
class VOPProfile_Base_CVT_F_F8_ByteSel<ValueType DstVT> : VOPProfile<[DstVT, i32, untyped, untyped]> {
679+
class VOPProfile_Base_CVT_F_F8_ByteSel<ValueType DstVT, bit _HasClamp = 0> :
680+
VOPProfile<[DstVT, i32, untyped, untyped]> {
681+
let HasClamp = _HasClamp;
680682
let HasFP8SrcByteSel = 1;
681683
let HasOpSel = 0;
682684
let HasExtDPP = 1;
683685
let HasExtVOP3DPP = 1;
684686
let HasExtSDWA = 0;
685-
let HasClamp = 0;
686687
let HasOMod = 0;
687688
let HasModifiers = 0;
688689
}
@@ -695,7 +696,12 @@ def V_CVT_F16_F8_Fake16_Profile : VOP3_Profile_Fake16<V_CVT_F16_F8_Profile>;
695696

696697
let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
697698
mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
698-
defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
699+
// FIXME: This differs from downstream due to changes that haven't been upstreamed yet.
700+
let SubtargetPredicate = isGFX12PlusNot12_50 in
701+
defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
702+
let SubtargetPredicate = isGFX125xOnly in
703+
defm V_CVT_F32_FP8_gfx1250 : VOP1Inst<"v_cvt_f32_fp8_gfx1250", VOPProfile_Base_CVT_F_F8_ByteSel<f32, 1>>;
704+
699705
defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
700706

701707
let True16Predicate = UseFakeTrue16Insts in {
@@ -714,9 +720,19 @@ class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst, bit HasOpSe
714720
(inst $src0, (as_i32timm $byte_sel)))
715721
>;
716722

717-
let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in {
718-
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>;
719-
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>;
723+
// Selection patterns for the byte_sel forms of the fp8/bf8 to f32 conversions.
// Everything inside shares OtherPredicates = [HasFP8ConversionInsts]; each
// pattern picks its own SubtargetPredicate.
let OtherPredicates = [HasFP8ConversionInsts] in {
724+
// FIXME: This differs from downstream due to changes that haven't been upstreamed yet.
725+
let SubtargetPredicate = isGFX12PlusNot12_50 in
726+
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>;
727+
// Under isGFX125xOnly both intrinsics select V_CVT_F32_FP8_gfx1250_e64; the
// only difference is the clamp operand: DSTCLAMP.NONE for int_amdgcn_cvt_f32_fp8
// and DSTCLAMP.ENABLE for int_amdgcn_cvt_f32_fp8_e5m3.
let SubtargetPredicate = isGFX125xOnly in {
728+
def : GCNPat<(int_amdgcn_cvt_f32_fp8 i32:$src0, timm:$byte_sel),
729+
(V_CVT_F32_FP8_gfx1250_e64 $src0, DSTCLAMP.NONE, (as_i32timm $byte_sel))>;
730+
// NOTE(review): this e5m3 pattern is gated on HasFP8ConversionInsts and
// isGFX125xOnly only, not on HasFP8E5M3Insts - confirm that is intended.
def : GCNPat<(int_amdgcn_cvt_f32_fp8_e5m3 i32:$src0, timm:$byte_sel),
731+
(V_CVT_F32_FP8_gfx1250_e64 $src0, DSTCLAMP.ENABLE, (as_i32timm $byte_sel))>;
732+
}
733+
// FIXME: This differs from downstream due to changes that haven't been upstreamed yet.
734+
let SubtargetPredicate = isGFX12Plus in
735+
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>;
720736
}
721737

722738
class Cvt_PK_F32_F8_Pat_OpSel<SDPatternOperator node, int index,
@@ -1038,7 +1054,9 @@ multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250<
10381054
VOP1_Real_FULL_with_name<GFX1250Gen, op, opName#"_fake16", asmName>;
10391055
}
10401056

1041-
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
1057+
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Not12_50Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
1058+
defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX1250Gen, 0x06c, "V_CVT_F32_FP8_gfx1250", "v_cvt_f32_fp8">;
1059+
10421060
defm V_CVT_F32_BF8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;
10431061

10441062
defm V_CVT_PK_F32_FP8_fake16 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8_fake16", "v_cvt_pk_f32_fp8">;

0 commit comments

Comments
 (0)