
Commit eda3161

changpeng and rampitec authored
AMDGPU: Implement tensor load and store instructions for gfx1250 (#146636)
Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
1 parent 1995fd9 commit eda3161

19 files changed: +625 −17 lines

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 5 additions & 0 deletions
@@ -640,6 +640,11 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
 // GFX1250+ only builtins.
 //===----------------------------------------------------------------------===//
 
+TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
+
 TARGET_BUILTIN(__builtin_amdgcn_global_load_tr4_b64_v2i32, "V2iV2i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_global_load_tr8_b64_v2i32, "V2iV2i*1", "nc", "gfx1250-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_global_load_tr6_b96_v3i32, "V3iV3i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
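In clang's builtin-prototype encoding, "v" is a void return, "V4i"/"V8i" are 4- and 8-element int vectors, "Ii" is an int argument that must be an integer constant expression, and "nc" marks the builtins nothrow/const. A minimal sketch of what that means at the C level; the wrapper names are hypothetical and not part of the patch:

    // Illustrative wrappers showing the decoded signatures. The final cpol
    // argument must be an integer constant expression at each call site.
    typedef int v4i __attribute__((ext_vector_type(4)));
    typedef int v8i __attribute__((ext_vector_type(8)));

    static inline void tensor_load_all_groups(v4i sg0, v8i sg1, v4i sg2, v4i sg3) {
        __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, /*cpol=*/0);
    }
    static inline void tensor_load_up_to_2d(v4i sg0, v8i sg1) {
        __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, /*cpol=*/0);
    }

The CodeGen and Sema tests below exercise exactly these shapes.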
Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
+
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 0)
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_tensor_load_to_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
+{
+  __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d2(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 27)
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_tensor_load_to_lds_d2(v4i sg0, v8i sg1)
+{
+  __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, 27);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 22)
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_tensor_store_from_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
+{
+  __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, 22);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d2(
+// CHECK-GFX1250-NEXT:  entry:
+// CHECK-GFX1250-NEXT:    tail call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 0)
+// CHECK-GFX1250-NEXT:    ret void
+//
+void test_amdgcn_tensor_store_from_lds_d2(v4i sg0, v8i sg1)
+{
+  __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, 0);
+}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl

Lines changed: 11 additions & 0 deletions
@@ -1,6 +1,9 @@
 // REQUIRES: amdgpu-registered-target
 // RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
 
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+
 void test_setprio_inc_wg(short a) {
   __builtin_amdgcn_s_setprio_inc_wg(a); // expected-error {{'__builtin_amdgcn_s_setprio_inc_wg' must be a constant integer}}
 }
@@ -16,3 +19,11 @@ void test__builtin_amdgcn_cvt_f16_fp8(int a, int b) {
 void test__builtin_amdgcn_cvt_f16_bf8(int a, int b) {
   __builtin_amdgcn_cvt_f16_bf8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_bf8' must be a constant integer}}
 }
+
+void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
+{
+  __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
+  __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds_d2' must be a constant integer}}
+  __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
+  __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
+}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 35 additions & 0 deletions
@@ -3580,6 +3580,41 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
   [IntrNoMem, IntrSpeculatable]
 >;
 
+class AMDGPUTensorLoadStore:
+  Intrinsic<
+    [],
+    [llvm_v4i32_ty, // D# group 0
+     llvm_v8i32_ty, // D# group 1
+     llvm_v4i32_ty, // D# group 2
+     llvm_v4i32_ty, // D# group 3
+     llvm_i32_ty],  // cachepolicy:
+                    //   bits [0-2] = th
+                    //   bits [3-4] = scope
+    [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
+    "", [SDNPMemOperand]
+>;
+
+class AMDGPUTensorLoadStoreD2:
+  Intrinsic<
+    [],
+    [llvm_v4i32_ty, // D# group 0
+     llvm_v8i32_ty, // D# group 1
+     llvm_i32_ty],  // cachepolicy:
+                    //   bits [0-2] = th
+                    //   bits [3-4] = scope
+    [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<2>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
+    "", [SDNPMemOperand]
+>;
+
+def int_amdgcn_tensor_load_to_lds :
+  ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds">, AMDGPUTensorLoadStore;
+def int_amdgcn_tensor_store_from_lds :
+  ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds">, AMDGPUTensorLoadStore;
+def int_amdgcn_tensor_load_to_lds_d2 :
+  ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds_d2">, AMDGPUTensorLoadStoreD2;
+def int_amdgcn_tensor_store_from_lds_d2 :
+  ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds_d2">, AMDGPUTensorLoadStoreD2;
+
 /// Emit an addrspacecast without null pointer checking.
 /// Should only be inserted by a pass based on analysis of an addrspacecast's src.
 def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
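Per the comments above, the cachepolicy immediate packs the temporal hint into bits [0-2] and the scope into bits [3-4]. A minimal sketch of composing that immediate, with masks and shifts inferred from those comments (the authoritative encodings live in AMDGPU::CPol):

    #include <stdint.h>

    // Field layout inferred from the intrinsic comments: th in bits [0-2],
    // scope in bits [3-4]. Values here are illustrative.
    static inline uint32_t make_cachepolicy(uint32_t th, uint32_t scope) {
        return (th & 0x7u) | ((scope & 0x3u) << 3);
    }

    // Example: make_cachepolicy(3, 3) == 27, the immediate used in the
    // tensor_load_to_lds_d2 codegen test above.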

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 30 additions & 0 deletions
@@ -3348,6 +3348,20 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     MI.eraseFromParent();
     return;
   }
+  case Intrinsic::amdgcn_tensor_load_to_lds:
+  case Intrinsic::amdgcn_tensor_store_from_lds: {
+    constrainOpWithReadfirstlane(B, MI, 1);
+    constrainOpWithReadfirstlane(B, MI, 2);
+    constrainOpWithReadfirstlane(B, MI, 3);
+    constrainOpWithReadfirstlane(B, MI, 4);
+    return;
+  }
+  case Intrinsic::amdgcn_tensor_load_to_lds_d2:
+  case Intrinsic::amdgcn_tensor_store_from_lds_d2: {
+    constrainOpWithReadfirstlane(B, MI, 1);
+    constrainOpWithReadfirstlane(B, MI, 2);
+    return;
+  }
   default: {
     if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
             AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -5354,6 +5368,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   }
   case Intrinsic::amdgcn_pops_exiting_wave_id:
     return getDefaultMappingSOP(MI);
+  case Intrinsic::amdgcn_tensor_load_to_lds_d2:
+  case Intrinsic::amdgcn_tensor_store_from_lds_d2:
+  case Intrinsic::amdgcn_tensor_load_to_lds:
+  case Intrinsic::amdgcn_tensor_store_from_lds: {
+    // Lie and claim everything is legal, even though all operands need to be
+    // SGPRs. applyMapping will have to deal with it with readfirstlane.
+    for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
+      if (MI.getOperand(I).isReg()) {
+        Register Reg = MI.getOperand(I).getReg();
+        auto OpBank = getRegBankID(Reg, MRI);
+        unsigned Size = getSizeInBits(Reg, MRI, *TRI);
+        OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
+      }
+    }
+    break;
+  }
   case Intrinsic::amdgcn_s_prefetch_data: {
     OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
     OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
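constrainOpWithReadfirstlane forces each descriptor-group operand through v_readfirstlane_b32, because the instruction wants these operands in SGPRs even when the incoming values live in VGPRs. A minimal scalar model of the readfirstlane semantics this relies on (a wave32 sketch; names are illustrative, not from the patch):

    #include <stdint.h>

    // Illustrative model: readfirstlane broadcasts the value held by the
    // first active lane of the wave, yielding a uniform (SGPR-like) value.
    uint32_t readfirstlane(const uint32_t lane_values[32], uint32_t exec_mask) {
        for (int lane = 0; lane < 32; ++lane)
            if (exec_mask & (1u << lane))
                return lane_values[lane];
        return 0; // result is undefined when no lane is active
    }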

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 21 additions & 6 deletions
@@ -1784,6 +1784,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
   bool validateMIMGAddrSize(const MCInst &Inst, const SMLoc &IDLoc);
   bool validateMIMGD16(const MCInst &Inst);
   bool validateMIMGDim(const MCInst &Inst, const OperandVector &Operands);
+  bool validateTensorR128(const MCInst &Inst);
   bool validateMIMGMSAA(const MCInst &Inst);
   bool validateOpSel(const MCInst &Inst);
   bool validateTrue16OpSel(const MCInst &Inst);
@@ -4280,6 +4281,18 @@ bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
   return true;
 }
 
+bool AMDGPUAsmParser::validateTensorR128(const MCInst &Inst) {
+  const unsigned Opc = Inst.getOpcode();
+  const MCInstrDesc &Desc = MII.get(Opc);
+
+  if ((Desc.TSFlags & SIInstrFlags::TENSOR_CNT) == 0)
+    return true;
+
+  int R128Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128);
+
+  return R128Idx < 0 || !Inst.getOperand(R128Idx).getImm();
+}
+
 static bool IsRevOpcode(const unsigned Opcode)
 {
   switch (Opcode) {
@@ -5113,14 +5126,11 @@ bool AMDGPUAsmParser::validateTHAndScopeBits(const MCInst &Inst,
     return PrintError("scope and th combination is not valid");
   }
 
-  bool IsStore = TID.mayStore();
-  bool IsAtomic =
-      TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet);
-
-  if (IsAtomic) {
+  unsigned THType = AMDGPU::getTemporalHintType(TID);
+  if (THType == AMDGPU::CPol::TH_TYPE_ATOMIC) {
     if (!(CPol & AMDGPU::CPol::TH_TYPE_ATOMIC))
       return PrintError("invalid th value for atomic instructions");
-  } else if (IsStore) {
+  } else if (THType == AMDGPU::CPol::TH_TYPE_STORE) {
     if (!(CPol & AMDGPU::CPol::TH_TYPE_STORE))
       return PrintError("invalid th value for store instructions");
   } else {
@@ -5205,6 +5215,11 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
       Error(IDLoc, "missing dim operand");
       return false;
     }
+    if (!validateTensorR128(Inst)) {
+      Error(getImmLoc(AMDGPUOperand::ImmTyD16, Operands),
+            "instruction must set modifier r128=0");
+      return false;
+    }
     if (!validateMIMGMSAA(Inst)) {
       Error(getImmLoc(AMDGPUOperand::ImmTyDim, Operands),
             "invalid dim; must be MSAA type");

llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp

Lines changed: 3 additions & 7 deletions
@@ -173,13 +173,12 @@ void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
 
   const unsigned Opcode = MI->getOpcode();
   const MCInstrDesc &TID = MII.get(Opcode);
-  bool IsStore = TID.mayStore();
-  bool IsAtomic =
-      TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet);
+  unsigned THType = AMDGPU::getTemporalHintType(TID);
+  bool IsStore = (THType == AMDGPU::CPol::TH_TYPE_STORE);
 
   O << " th:";
 
-  if (IsAtomic) {
+  if (THType == AMDGPU::CPol::TH_TYPE_ATOMIC) {
     O << "TH_ATOMIC_";
     if (TH & AMDGPU::CPol::TH_ATOMIC_CASCADE) {
       if (Scope >= AMDGPU::CPol::SCOPE_DEV)
@@ -196,9 +195,6 @@ void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
   if (!IsStore && TH == AMDGPU::CPol::TH_RESERVED)
     O << formatHex(TH);
   else {
-    // This will default to printing load variants when neither MayStore nor
-    // MayLoad flag is present which is the case with instructions like
-    // image_get_resinfo.
     O << (IsStore ? "TH_STORE_" : "TH_LOAD_");
     switch (TH) {
     case AMDGPU::CPol::TH_NT:

llvm/lib/Target/AMDGPU/MIMGInstructions.td

Lines changed: 94 additions & 0 deletions
@@ -2019,3 +2019,97 @@ def : MIMGG16Mapping<IMAGE_SAMPLE_CD_O_nortn, IMAGE_SAMPLE_CD_O_G16_nortn>;
 def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL_O_nortn, IMAGE_SAMPLE_CD_CL_O_G16_nortn>;
 def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_O_nortn, IMAGE_SAMPLE_C_CD_O_G16_nortn>;
 def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O_nortn, IMAGE_SAMPLE_C_CD_CL_O_G16_nortn>;
+
+//===----------------------------------------------------------------------===//
+// VIMAGE Tensor Instructions
+//===----------------------------------------------------------------------===//
+
+class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> :
+    InstSI<(outs ), (ins ), "", []>,
+    SIMCInstr<opName#!if(_UpTo2D, "_D2", ""), SIEncodingFamily.NONE> {
+
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  string Mnemonic = opName;
+
+  let VALU = 1;
+  let maybeAtomic = 0;
+  let TENSOR_CNT = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let Uses = [EXEC, TENSORcnt];
+  let Defs = [TENSORcnt];
+  let SchedRW = [WriteVMEM, WriteLDS];
+  let UseNamedOperandTable = 1;
+  let hasSideEffects = 0;
+
+  bit UpTo2D = _UpTo2D;
+  let InOperandList = !if(UpTo2D, (ins SReg_128:$vaddr0, SReg_256:$vaddr1, R128A16:$r128, CPol:$cpol),
+                                  (ins SReg_128:$vaddr0, SReg_256:$vaddr1, SReg_128:$vaddr2,
+                                       SReg_128:$vaddr3, R128A16:$r128, CPol:$cpol));
+  string AsmOperands = " $vaddr0, $vaddr1"#!if(UpTo2D, "", ", $vaddr2, $vaddr3")#"$r128$cpol";
+}
+
+let SubtargetPredicate = isGFX1250Plus in {
+def TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">;
+def TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">;
+def TENSOR_LOAD_TO_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>;
+def TENSOR_STORE_FROM_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>;
+} // End SubtargetPredicate = isGFX1250Plus.
+
+class TensorPat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat <
+  (node v4i32:$vaddr0, v8i32:$vaddr1, v4i32:$vaddr2, v4i32:$vaddr3, (i32 timm:$cpol)),
+  (inst $vaddr0, $vaddr1, $vaddr2, $vaddr3, 0, $cpol)
+>;
+
+class TensorD2Pat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat <
+  (node v4i32:$vaddr0, v8i32:$vaddr1, (i32 timm:$cpol)),
+  (inst $vaddr0, $vaddr1, 0, $cpol)
+>;
+
+let SubtargetPredicate = isGFX1250Plus in {
+def : TensorPat <TENSOR_LOAD_TO_LDS, int_amdgcn_tensor_load_to_lds>;
+def : TensorPat <TENSOR_STORE_FROM_LDS, int_amdgcn_tensor_store_from_lds>;
+def : TensorD2Pat <TENSOR_LOAD_TO_LDS_D2, int_amdgcn_tensor_load_to_lds_d2>;
+def : TensorD2Pat <TENSOR_STORE_FROM_LDS_D2, int_amdgcn_tensor_store_from_lds_d2>;
+}
+
+class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = ps.Mnemonic> :
+    InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands, []>,
+    VIMAGEe<op> {
+
+  // copy relevant pseudo op flags
+  let SubtargetPredicate = ps.SubtargetPredicate;
+  let TSFlags = ps.TSFlags;
+  let mayLoad = ps.mayLoad;
+  let mayStore = ps.mayStore;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+  let SchedRW = ps.SchedRW;
+
+  // D# group 2 and 3 set to NULL for 2D or less.
+  let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
+  let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
+
+  // set to 0 based on SPG.
+  let vaddr4 = 0;
+  let rsrc = 0;
+  let vdata = 0;
+  let d16 = 0;
+  let a16 = 0;
+  let tfe = 0;
+  let dmask = 1; // sp3
+  let dim = 1; // sp3
+}
+
+multiclass VIMAGE_TENSOR_Real_gfx1250<bits<8> op> {
+  let AssemblerPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250" in {
+    foreach DSuffix = ["_D2", ""] in {
+      defvar ps = !cast<VIMAGE_TENSOR_Pseudo>(NAME # DSuffix);
+      def DSuffix # _gfx1250 : VIMAGE_TENSOR_Real<op, ps, ps.Mnemonic>,
+        SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX1250>;
+    }
+  }
+}
+
+defm TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Real_gfx1250<0xc4>;
+defm TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Real_gfx1250<0xc5>;

llvm/lib/Target/AMDGPU/SIDefines.h

Lines changed: 1 addition & 2 deletions
@@ -107,8 +107,7 @@ enum : uint64_t {
   DisableWQM = UINT64_C(1) << 36,
   Gather4 = UINT64_C(1) << 37,
 
-  // Reserved, must be 0.
-  Reserved0 = UINT64_C(1) << 38,
+  TENSOR_CNT = UINT64_C(1) << 38,
 
   SCALAR_STORE = UINT64_C(1) << 39,
   FIXED_SIZE = UINT64_C(1) << 40,

llvm/lib/Target/AMDGPU/SIInstrFormats.td

Lines changed: 4 additions & 2 deletions
@@ -68,6 +68,9 @@ class InstSI <dag outs, dag ins, string asm = "",
 
   field bit Gather4 = 0;
 
+  // wait count to manage tensor loads/stores.
+  field bit TENSOR_CNT = 0;
+
   // This is an s_store_dword* instruction that requires a cache flush
   // on wave termination. It is necessary to distinguish from mayStore
   // SMEM instructions like the cache flush ones.
@@ -201,8 +204,7 @@ class InstSI <dag outs, dag ins, string asm = "",
   let TSFlags{36} = DisableWQM;
   let TSFlags{37} = Gather4;
 
-  // Reserved, must be 0.
-  let TSFlags{38} = 0;
+  let TSFlags{38} = TENSOR_CNT;
 
   let TSFlags{39} = ScalarStore;
   let TSFlags{40} = FixedSize;
