Skip to content

Commit 45ca943

Browse files
committed
[AMDGPU] Select no-return atomic intrinsics in tblgen
This is to avoid relying on the post-isel hook. This change also enables the saddr pattern selection for atomic intrinsics in GlobalISel. Differential Revision: https://reviews.llvm.org/D123583
1 parent ed58a01 commit 45ca943

File tree

7 files changed

+131
-70
lines changed

7 files changed

+131
-70
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstructions.td

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,31 @@ defm atomic_store_#as : binary_atomic_op<atomic_store>;
540540
// TODO: Add GISelPredicateCode for the ret and noret PatFrags once
541541
// GlobalISelEmitter allows pattern matches where src and dst def count
542542
// mismatch.
543+
544+
multiclass ret_noret_op {
545+
let PredicateCode = [{ return !(SDValue(N, 0).use_empty()); }],
546+
GISelPredicateCode = [{ return true; }] in {
547+
def "_ret" : PatFrag<(ops node:$ptr, node:$data),
548+
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
549+
}
550+
551+
let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
552+
GISelPredicateCode = [{ return false; }] in {
553+
def "_noret" : PatFrag<(ops node:$ptr, node:$data),
554+
(!cast<SDPatternOperator>(NAME) node:$ptr, node:$data)>;
555+
}
556+
}
557+
558+
defm int_amdgcn_flat_atomic_fadd : ret_noret_op;
559+
defm int_amdgcn_flat_atomic_fadd_v2bf16 : ret_noret_op;
560+
defm int_amdgcn_flat_atomic_fmin : ret_noret_op;
561+
defm int_amdgcn_flat_atomic_fmax : ret_noret_op;
562+
defm int_amdgcn_global_atomic_fadd : ret_noret_op;
563+
defm int_amdgcn_global_atomic_fadd_v2bf16 : ret_noret_op;
564+
defm int_amdgcn_global_atomic_fmin : ret_noret_op;
565+
defm int_amdgcn_global_atomic_fmax : ret_noret_op;
566+
defm int_amdgcn_ds_fadd_v2bf16 : ret_noret_op;
567+
543568
multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
544569
let PredicateCode = [{ return (SDValue(N, 0).use_empty()); }],
545570
GISelPredicateCode = [{ return false; }] in {

llvm/lib/Target/AMDGPU/BUFInstructions.td

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1188,9 +1188,9 @@ let SubtargetPredicate = isGFX90APlus in {
11881188
let SubtargetPredicate = isGFX90AOnly;
11891189
}
11901190

1191-
defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
1192-
defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
1193-
defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
1191+
defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_add_f64", VReg_64, f64>;
1192+
defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_min_f64", VReg_64, f64>;
1193+
defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Pseudo_Atomics<"buffer_atomic_max_f64", VReg_64, f64>;
11941194
} // End SubtargetPredicate = isGFX90APlus
11951195

11961196
def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
@@ -1381,10 +1381,11 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
13811381
// buffer_atomic patterns
13821382
//===----------------------------------------------------------------------===//
13831383

1384-
multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst> {
1384+
multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> {
13851385
foreach RtnMode = ["ret", "noret"] in {
13861386

1387-
defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode # "_" # vt.Size);
1387+
defvar Op = !cast<SDPatternOperator>(OpPrefix # "_" # RtnMode
1388+
# !if(isIntr, "", "_" # vt.Size));
13881389
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
13891390

13901391
def : GCNPat<
@@ -1592,6 +1593,9 @@ defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, v2f16, "BUFFER_ATOMIC_P
15921593
}
15931594

15941595
let SubtargetPredicate = isGFX90APlus in {
1596+
defm : BufferAtomicPat<"int_amdgcn_global_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64", 1>;
1597+
defm : BufferAtomicPat<"int_amdgcn_global_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64", 1>;
1598+
defm : BufferAtomicPat<"int_amdgcn_global_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64", 1>;
15951599
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32">;
15961600
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
15971601

llvm/lib/Target/AMDGPU/DSInstructions.td

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1025,9 +1025,13 @@ def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>;
10251025
let SubtargetPredicate = isGFX940Plus in {
10261026
def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>;
10271027
def : GCNPat <
1028-
(v2i16 (int_amdgcn_ds_fadd_v2bf16 i32:$ptr, v2i16:$src)),
1028+
(v2i16 (int_amdgcn_ds_fadd_v2bf16_ret i32:$ptr, v2i16:$src)),
10291029
(DS_PK_ADD_RTN_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
10301030
>;
1031+
def : GCNPat <
1032+
(v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)),
1033+
(DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
1034+
>;
10311035
}
10321036

10331037
def : Pat <

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 64 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -711,17 +711,17 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
711711
} // End SubtargetPredicate = isGFX7GFX10
712712

713713
let SubtargetPredicate = isGFX90APlus in {
714-
defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64, int_amdgcn_flat_atomic_fadd>;
715-
defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmin>;
716-
defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64, int_amdgcn_flat_atomic_fmax>;
717-
defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64, int_amdgcn_global_atomic_fadd>;
718-
defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
719-
defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
714+
defm FLAT_ATOMIC_ADD_F64 : FLAT_Atomic_Pseudo<"flat_atomic_add_f64", VReg_64, f64>;
715+
defm FLAT_ATOMIC_MIN_F64 : FLAT_Atomic_Pseudo<"flat_atomic_min_f64", VReg_64, f64>;
716+
defm FLAT_ATOMIC_MAX_F64 : FLAT_Atomic_Pseudo<"flat_atomic_max_f64", VReg_64, f64>;
717+
defm GLOBAL_ATOMIC_ADD_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_add_f64", VReg_64, f64>;
718+
defm GLOBAL_ATOMIC_MIN_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_min_f64", VReg_64, f64>;
719+
defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
720720
} // End SubtargetPredicate = isGFX90APlus
721721

722722
let SubtargetPredicate = isGFX940Plus in {
723-
defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32, int_amdgcn_flat_atomic_fadd>;
724-
defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_flat_atomic_fadd>;
723+
defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>;
724+
defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16>;
725725
defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2f16>;
726726
defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2f16>;
727727
} // End SubtargetPredicate = isGFX940Plus
@@ -897,15 +897,15 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
897897
defm GLOBAL_ATOMIC_FCMPSWAP :
898898
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, null_frag, v2f32, VReg_64>;
899899
defm GLOBAL_ATOMIC_FMIN :
900-
FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32, int_amdgcn_global_atomic_fmin>;
900+
FLAT_Global_Atomic_Pseudo<"global_atomic_fmin", VGPR_32, f32>;
901901
defm GLOBAL_ATOMIC_FMAX :
902-
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32, int_amdgcn_global_atomic_fmax>;
902+
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax", VGPR_32, f32>;
903903
defm GLOBAL_ATOMIC_FCMPSWAP_X2 :
904904
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", VReg_64, f64, null_frag, v2f64, VReg_128>;
905905
defm GLOBAL_ATOMIC_FMIN_X2 :
906-
FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64, int_amdgcn_global_atomic_fmin>;
906+
FLAT_Global_Atomic_Pseudo<"global_atomic_fmin_x2", VReg_64, f64>;
907907
defm GLOBAL_ATOMIC_FMAX_X2 :
908-
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64, int_amdgcn_global_atomic_fmax>;
908+
FLAT_Global_Atomic_Pseudo<"global_atomic_fmax_x2", VReg_64, f64>;
909909
} // End SubtargetPredicate = isGFX10Plus, is_flat_global = 1
910910

911911
let is_flat_global = 1 in {
@@ -920,10 +920,10 @@ let OtherPredicates = [HasAtomicFaddInsts] in {
920920

921921
let OtherPredicates = [isGFX90APlus] in {
922922
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN <
923-
"global_atomic_add_f32", VGPR_32, f32, int_amdgcn_global_atomic_fadd
923+
"global_atomic_add_f32", VGPR_32, f32
924924
>;
925925
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
926-
"global_atomic_pk_add_f16", VGPR_32, v2f16, int_amdgcn_global_atomic_fadd
926+
"global_atomic_pk_add_f16", VGPR_32, v2f16
927927
>;
928928
} // End OtherPredicates = [isGFX90APlus]
929929
} // End is_flat_global = 1
@@ -1029,13 +1029,30 @@ multiclass FlatAtomicPat <string inst, string node, ValueType vt,
10291029
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
10301030
}
10311031

1032+
multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
1033+
ValueType data_vt = vt, bit isIntr = 0> {
1034+
defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
1035+
defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
1036+
1037+
def : GCNPat <(vt (rtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
1038+
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
1039+
1040+
def : GCNPat <(vt (noRtnNode (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
1041+
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
1042+
}
1043+
1044+
multiclass FlatSignedIntrPat <string inst, string node, ValueType vt,
1045+
ValueType data_vt = vt> {
1046+
defm : FlatSignedAtomicPat<inst, node, vt, data_vt, /* isIntr */ 1>;
1047+
}
1048+
10321049
class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
10331050
(node (GlobalOffset i64:$vaddr, i16:$offset), vt:$data),
10341051
(inst VReg_64:$vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
10351052
>;
10361053

1037-
class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
1038-
ValueType data_vt = vt> : GCNPat <
1054+
class FlatSignedAtomicPatRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
1055+
ValueType data_vt = vt> : GCNPat <
10391056
(vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
10401057
(inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
10411058
>;
@@ -1237,7 +1254,7 @@ multiclass GlobalFLATAtomicStorePats<FLAT_Pseudo inst, SDPatternOperator node, V
12371254

12381255
multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator node,
12391256
ValueType vt, ValueType data_vt = vt> {
1240-
def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
1257+
def : FlatSignedAtomicPatRtn <!cast<FLAT_Pseudo>(nortn_inst_name#"_RTN"), node, vt, data_vt> {
12411258
let AddedComplexity = 10;
12421259
}
12431260

@@ -1247,13 +1264,12 @@ multiclass GlobalFLATAtomicPatsRtn<string nortn_inst_name, SDPatternOperator nod
12471264
}
12481265

12491266
multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
1250-
ValueType data_vt = vt> {
1251-
defvar rtnNode = !cast<PatFrags>(node#"_ret_"#vt.Size);
1252-
defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size);
1267+
ValueType data_vt = vt, bit isIntr = 0> {
1268+
defvar rtnNode = !cast<PatFrags>(node # "_ret" # !if(isIntr, "", "_" # vt.Size));
1269+
defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_" # vt.Size));
12531270

12541271
let AddedComplexity = 10 in {
1255-
def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(inst), noRtnNode, vt, data_vt>;
1256-
def : FlatSignedAtomicPat <!cast<FLAT_Pseudo>(inst#"_RTN"), rtnNode, vt, data_vt>;
1272+
defm : FlatSignedAtomicPat <inst, node, vt, data_vt, isIntr>;
12571273
}
12581274

12591275
let AddedComplexity = 11 in {
@@ -1262,6 +1278,11 @@ multiclass GlobalFLATAtomicPats<string inst, string node, ValueType vt,
12621278
}
12631279
}
12641280

1281+
multiclass GlobalFLATAtomicIntrPats<string inst, string node, ValueType vt,
1282+
ValueType data_vt = vt> {
1283+
defm : GlobalFLATAtomicPats<inst, node, vt, data_vt, /* isIntr */ 1>;
1284+
}
1285+
12651286
multiclass GlobalFLATNoRtnAtomicPats<FLAT_Pseudo inst, SDPatternOperator node,
12661287
ValueType vt> {
12671288
def : FlatSignedAtomicPatNoRtn <inst, node, vt> {
@@ -1427,6 +1448,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f3
14271448
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
14281449
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>;
14291450
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>;
1451+
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
1452+
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
1453+
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", f64>;
1454+
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>;
14301455
}
14311456

14321457
let OtherPredicates = [HasAtomicFaddInsts] in {
@@ -1440,19 +1465,26 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16
14401465
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
14411466
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
14421467
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
1443-
def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F64_RTN, atomic_load_fadd_flat_ret_64, f64>;
1444-
def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F64, atomic_load_fadd_flat_noret_64, f64>;
1445-
def : FlatSignedAtomicPat <FLAT_ATOMIC_MIN_F64_RTN, atomic_load_fmin_flat_ret_64, f64>;
1446-
def : FlatSignedAtomicPat <FLAT_ATOMIC_MIN_F64, atomic_load_fmin_flat_noret_64, f64>;
1447-
def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64_RTN, atomic_load_fmax_flat_ret_64, f64>;
1448-
def : FlatSignedAtomicPat <FLAT_ATOMIC_MAX_F64, atomic_load_fmax_flat_noret_64, f64>;
1468+
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", f32>;
1469+
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", f64>;
1470+
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", v2f16>;
1471+
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>;
1472+
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>;
1473+
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
1474+
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_flat", f64>;
1475+
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_flat", f64>;
1476+
defm : FlatSignedIntrPat <"FLAT_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", f64>;
1477+
defm : FlatSignedIntrPat <"FLAT_ATOMIC_MIN_F64", "int_amdgcn_flat_atomic_fmin", f64>;
1478+
defm : FlatSignedIntrPat <"FLAT_ATOMIC_MAX_F64", "int_amdgcn_flat_atomic_fmax", f64>;
14491479
}
14501480

14511481
let OtherPredicates = [isGFX940Plus] in {
1452-
def : FlatSignedAtomicPat <FLAT_ATOMIC_ADD_F32_RTN, atomic_load_fadd_flat_32, f32>;
1453-
def : FlatSignedAtomicPat <FLAT_ATOMIC_PK_ADD_F16_RTN, atomic_load_fadd_v2f16_flat_32, v2f16>;
1454-
def : FlatSignedAtomicPat <FLAT_ATOMIC_PK_ADD_BF16_RTN, int_amdgcn_flat_atomic_fadd_v2bf16, v2i16>;
1455-
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_PK_ADD_BF16", int_amdgcn_global_atomic_fadd_v2bf16, v2i16>;
1482+
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
1483+
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_v2f16_flat", v2f16>;
1484+
defm : FlatSignedIntrPat <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", f32>;
1485+
defm : FlatSignedIntrPat <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", v2f16>;
1486+
defm : FlatSignedIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
1487+
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
14561488
}
14571489

14581490
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10

llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -319,10 +319,10 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret(double addrspace(1)* %pt
319319
; GFX90A-LABEL: global_atomic_fadd_f64_noret:
320320
; GFX90A: ; %bb.0: ; %main_body
321321
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
322+
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
322323
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
323-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
324-
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
325-
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
324+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
325+
; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v2, v[0:1], s[0:1] glc
326326
; GFX90A-NEXT: s_endpgm
327327
main_body:
328328
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
@@ -333,10 +333,10 @@ define amdgpu_kernel void @global_atomic_fmin_f64_noret(double addrspace(1)* %pt
333333
; GFX90A-LABEL: global_atomic_fmin_f64_noret:
334334
; GFX90A: ; %bb.0: ; %main_body
335335
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
336+
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
336337
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
337-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
338-
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
339-
; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
338+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
339+
; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc
340340
; GFX90A-NEXT: s_endpgm
341341
main_body:
342342
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)
@@ -347,10 +347,10 @@ define amdgpu_kernel void @global_atomic_fmax_f64_noret(double addrspace(1)* %pt
347347
; GFX90A-LABEL: global_atomic_fmax_f64_noret:
348348
; GFX90A: ; %bb.0: ; %main_body
349349
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
350+
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
350351
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
351-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
352-
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
353-
; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
352+
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
353+
; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc
354354
; GFX90A-NEXT: s_endpgm
355355
main_body:
356356
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %ptr, double %data)

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,10 @@ define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(float addrspace(1)* %pt
7474
; GFX90A: ; %bb.0:
7575
; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x8
7676
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
77+
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
7778
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
78-
; GFX90A-NEXT: v_mov_b32_e32 v2, s2
79-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
80-
; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2048 glc
79+
; GFX90A-NEXT: v_mov_b32_e32 v0, s2
80+
; GFX90A-NEXT: global_atomic_add_f32 v0, v1, v0, s[0:1] offset:2048 glc
8181
; GFX90A-NEXT: s_endpgm
8282
%gep = getelementptr float, float addrspace(1)* %ptr, i64 512
8383
%ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* %gep, float %data)

0 commit comments

Comments
 (0)