Skip to content

Commit 4729242

Browse files
authored
AMDGPU: Add MC layer support for load transpose instructions for gfx1250 (#146024)
Co-authored with @jayfoad
1 parent 7dfcf48 commit 4729242

File tree

8 files changed

+436
-13
lines changed

8 files changed

+436
-13
lines changed

clang/test/CodeGenOpenCL/amdgpu-features.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@
108108
// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
109109
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
110110
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
111-
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+wavefrontsize32"
111+
// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+transpose-load-f4f6-insts,+wavefrontsize32"
112112

113113
// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
114114

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1094,6 +1094,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts",
10941094
"Has v_bitop3_b32/v_bitop3_b16 instructions"
10951095
>;
10961096

1097+
def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts",
1098+
"HasTransposeLoadF4F6Insts",
1099+
"true",
1100+
"Has ds_load_tr4/tr6 and global_load_tr4/tr6 instructions"
1101+
>;
1102+
10971103
def FeaturePrngInst : SubtargetFeature<"prng-inst",
10981104
"HasPrngInst",
10991105
"true",
@@ -1933,6 +1939,7 @@ def FeatureISAVersion12_50 : FeatureSet<
19331939
FeatureScalarDwordx3Loads,
19341940
FeatureDPPSrc1SGPR,
19351941
FeatureBitOp3Insts,
1942+
FeatureTransposeLoadF4F6Insts,
19361943
FeatureBF16ConversionInsts,
19371944
FeatureCvtPkF16F32Inst,
19381945
FeatureMinimum3Maximum3PKF16,
@@ -2627,6 +2634,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
26272634
def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">,
26282635
AssemblerPredicate<(all_of FeatureBitOp3Insts)>;
26292636

2637+
def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">,
2638+
AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>;
2639+
26302640
def HasPrngInst : Predicate<"Subtarget->hasPrngInst()">,
26312641
AssemblerPredicate<(all_of FeaturePrngInst)>;
26322642

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,19 @@ multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
783783
defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">;
784784
} // let SubtargetPredicate = isGFX12Plus
785785

786+
let SubtargetPredicate = isGFX1250Plus in {
787+
788+
let WaveSizePredicate = isWave32, mayStore = 0 in {
789+
let OtherPredicates = [HasTransposeLoadF4F6Insts] in {
790+
defm DS_LOAD_TR4_B64 : DS_1A_RET_NoM0<"ds_load_tr4_b64", VReg_64>;
791+
defm DS_LOAD_TR6_B96 : DS_1A_RET_NoM0<"ds_load_tr6_b96", VReg_96>;
792+
} // let OtherPredicates = [HasTransposeLoadF4F6Insts]
793+
defm DS_LOAD_TR8_B64 : DS_1A_RET_NoM0<"ds_load_tr8_b64", VReg_64>;
794+
defm DS_LOAD_TR16_B128 : DS_1A_RET_NoM0<"ds_load_tr16_b128", VReg_128>;
795+
} // let WaveSizePredicate = isWave32, mayStore = 0
796+
797+
} // let SubtargetPredicate = isGFX1250Plus
798+
786799
let WaveSizePredicate = isWave64, SubtargetPredicate = HasGFX950Insts, mayStore = 0 in {
787800
defm DS_READ_B64_TR_B4 : DS_1A_RET_NoM0<"ds_read_b64_tr_b4", VReg_64>;
788801
defm DS_READ_B64_TR_B8 : DS_1A_RET_NoM0<"ds_read_b64_tr_b8", VReg_64>;
@@ -1332,6 +1345,11 @@ defm DS_PK_ADD_BF16 : DS_Real_gfx12<0x09b>;
13321345
defm DS_PK_ADD_RTN_BF16 : DS_Real_gfx12<0x0ab>;
13331346
defm DS_BPERMUTE_FI_B32 : DS_Real_gfx12<0x0cd>;
13341347

1348+
defm DS_LOAD_TR4_B64 : DS_Real_gfx12<0x0fa>;
1349+
defm DS_LOAD_TR6_B96 : DS_Real_gfx12<0x0fb>;
1350+
defm DS_LOAD_TR16_B128 : DS_Real_gfx12<0x0fc>;
1351+
defm DS_LOAD_TR8_B64 : DS_Real_gfx12<0x0fd>;
1352+
13351353
defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx12<0x0e0,
13361354
"ds_bvh_stack_push4_pop1_rtn_b32", true>;
13371355
defm DS_BVH_STACK_PUSH8_POP1_RTN_B32 : DS_Real_gfx12<0x0e1>;
@@ -1345,6 +1363,10 @@ let AssemblerPredicate = isGFX12Plus in {
13451363
def : AMDGPUMnemonicAlias<"ds_subrev_rtn_u64", "ds_rsub_rtn_u64">;
13461364
}
13471365

1366+
// Aliases that have existed since these instructions were introduced.
1367+
def : MnemonicAlias<"ds_load_tr_b64", "ds_load_tr8_b64">, Requires<[isGFX1250Plus]>;
1368+
def : MnemonicAlias<"ds_load_tr_b128", "ds_load_tr16_b128">, Requires<[isGFX1250Plus]>;
1369+
13481370
//===----------------------------------------------------------------------===//
13491371
// GFX11.
13501372
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 77 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,19 +1092,23 @@ let SubtargetPredicate = isGFX12Plus in {
10921092
}
10931093

10941094
let WaveSizePredicate = isWave32 in {
1095-
let Mnemonic = "global_load_tr_b128" in
1096-
defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w32", VReg_128>;
1097-
let Mnemonic = "global_load_tr_b64" in
1098-
defm GLOBAL_LOAD_TR_B64_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w32", VReg_64>;
1099-
}
1100-
let WaveSizePredicate = isWave64 in {
1101-
let Mnemonic = "global_load_tr_b128" in
1102-
defm GLOBAL_LOAD_TR_B128_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VReg_64>;
1103-
let Mnemonic = "global_load_tr_b64" in
1104-
defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>;
1095+
defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128", VReg_128>;
1096+
defm GLOBAL_LOAD_TR_B64_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b64", VReg_64>;
11051097
}
11061098
} // End SubtargetPredicate = isGFX12Plus
11071099

1100+
let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in {
1101+
let Mnemonic = "global_load_tr_b128" in
1102+
defm GLOBAL_LOAD_TR_B128_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VReg_64>;
1103+
let Mnemonic = "global_load_tr_b64" in
1104+
defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>;
1105+
}
1106+
1107+
let WaveSizePredicate = isWave32, SubtargetPredicate = HasTransposeLoadF4F6Insts in { // tr4/tr6 loads are gated by the transpose-load-f4f6-insts feature, like the DS tr4/tr6 variants
1108+
defm GLOBAL_LOAD_TR6_B96 : FLAT_Global_Load_Pseudo <"global_load_tr6_b96", VReg_96>;
1109+
defm GLOBAL_LOAD_TR4_B64 : FLAT_Global_Load_Pseudo <"global_load_tr4_b64", VReg_64>;
1110+
} // let WaveSizePredicate = isWave32, SubtargetPredicate = HasTransposeLoadF4F6Insts
1111+
11081112
let SubtargetPredicate = isGFX10Plus in {
11091113
defm GLOBAL_ATOMIC_FCMPSWAP :
11101114
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>;
@@ -2809,6 +2813,13 @@ multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op,
28092813
defm _SADDR : VFLAT_Real_gfx12<op, name>;
28102814
}
28112815

2816+
multiclass VGLOBAL_Real_AllAddr_gfx1200<bits<8> op> {
2817+
let AssemblerPredicate = isGFX12Not12_50 in {
2818+
defm "" : VFLAT_Real_gfx12<op>;
2819+
defm _SADDR : VFLAT_Real_gfx12<op>;
2820+
}
2821+
}
2822+
28122823
multiclass VGLOBAL_Real_AllAddr_gfx12_w64<bits<8> op,
28132824
string name = get_FLAT_ps<NAME>.Mnemonic> :
28142825
VFLAT_Aliases_gfx12<name> {
@@ -2951,8 +2962,8 @@ defm GLOBAL_ATOMIC_FMIN : VGLOBAL_Real_Atomics_gfx12<0x051, "global_a
29512962
defm GLOBAL_ATOMIC_FMAX : VGLOBAL_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">;
29522963
defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>;
29532964

2954-
defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx12<0x057>;
2955-
defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx12<0x058>;
2965+
defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x057>;
2966+
defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x058>;
29562967

29572968
defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x057>;
29582969
defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x058>;
@@ -2992,6 +3003,60 @@ defm SCRATCH_STORE_SHORT_D16_HI : VSCRATCH_Real_AllAddr_gfx12<0x25, "scratch_
29923003
defm SCRATCH_LOAD_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x53>;
29933004
defm SCRATCH_STORE_BLOCK : VSCRATCH_Real_AllAddr_gfx12<0x54>;
29943005

3006+
//===----------------------------------------------------------------------===//
3007+
// GFX1250
3008+
//===----------------------------------------------------------------------===//
3009+
3010+
multiclass VFLAT_Real_gfx1250<bits<8> op,
3011+
string name = get_FLAT_ps<NAME>.Mnemonic> {
3012+
defvar ps = !cast<FLAT_Pseudo>(NAME);
3013+
def _gfx1250 : VFLAT_Real<op, ps, name>,
3014+
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX1250> {
3015+
let AssemblerPredicate = isGFX125xOnly;
3016+
let DecoderNamespace = "GFX1250";
3017+
3018+
let Inst{25-24} = {ps.is_flat_global, ps.is_flat_scratch};
3019+
}
3020+
}
3021+
3022+
multiclass VFLAT_Aliases_gfx1250<string name> {
3023+
defvar ps = get_FLAT_ps<NAME>;
3024+
if !ne(ps.Mnemonic, name) then
3025+
def : MnemonicAlias<ps.Mnemonic, name>, Requires<[isGFX125xOnly]>;
3026+
}
3027+
3028+
multiclass VFLAT_Real_Base_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> :
3029+
VFLAT_Aliases_gfx1250<name> {
3030+
defm "" : VFLAT_Real_gfx1250<op, name>;
3031+
}
3032+
3033+
multiclass VFLAT_Real_RTN_gfx1250<bits<8> op, string name> {
3034+
defm _RTN : VFLAT_Real_gfx1250<op, name>;
3035+
}
3036+
3037+
multiclass VFLAT_Real_SADDR_gfx1250<bits<8> op, string name> {
3038+
defm _SADDR : VFLAT_Real_gfx1250<op, name>;
3039+
}
3040+
3041+
multiclass VFLAT_Real_SADDR_RTN_gfx1250<bits<8> op, string name> {
3042+
defm _SADDR_RTN : VFLAT_Real_gfx1250<op, name>;
3043+
}
3044+
3045+
multiclass VFLAT_Real_AllAddr_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> :
3046+
VFLAT_Real_Base_gfx1250<op, name>,
3047+
VFLAT_Real_SADDR_gfx1250<op, name>;
3048+
3049+
multiclass VFLAT_Real_Atomics_gfx1250<bits<8> op, string name = get_FLAT_ps<NAME>.Mnemonic> :
3050+
VFLAT_Real_AllAddr_gfx1250<op, name>,
3051+
VFLAT_Real_RTN_gfx1250<op, name>,
3052+
VFLAT_Real_SADDR_RTN_gfx1250<op, name>;
3053+
3054+
defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">;
3055+
defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">;
3056+
3057+
defm GLOBAL_LOAD_TR4_B64 : VFLAT_Real_AllAddr_gfx1250<0x073>;
3058+
defm GLOBAL_LOAD_TR6_B96 : VFLAT_Real_AllAddr_gfx1250<0x074>;
3059+
29953060
def True16D16Table : GenericTable {
29963061
let FilterClass = "True16D16Table";
29973062
let CppTypeName = "True16D16Info";

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
231231
bool HasPseudoScalarTrans = false;
232232
bool HasRestrictedSOffset = false;
233233
bool HasBitOp3Insts = false;
234+
bool HasTransposeLoadF4F6Insts = false;
234235
bool HasPrngInst = false;
235236
bool HasBVHDualAndBVH8Insts = false;
236237
bool HasPermlane16Swap = false;
@@ -1372,6 +1373,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
13721373
return HasMinimum3Maximum3PKF16;
13731374
}
13741375

1376+
bool hasTransposeLoadF4F6Insts() const { return HasTransposeLoadF4F6Insts; }
1377+
13751378
/// \returns true if the target has s_wait_xcnt insertion. Supported for
13761379
/// GFX1250.
13771380
bool hasWaitXCnt() const { return HasWaitXcnt; }

llvm/lib/TargetParser/TargetParser.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -443,6 +443,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
443443
Features["gfx1250-insts"] = true;
444444
Features["bitop3-insts"] = true;
445445
Features["prng-inst"] = true;
446+
Features["transpose-load-f4f6-insts"] = true;
446447
Features["fp8-conversion-insts"] = true;
447448
Features["permlane16-swap"] = true;
448449
Features["ashr-pk-insts"] = true;

0 commit comments

Comments
 (0)