Skip to content

Commit c962f2b

Browse files
changpeng, rampitec, shiltian
authored
AMDGPU: Implement builtins for gfx1250 wmma instructions (#148991)
Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com> Co-authored-by: Shilei Tian <Shilei.Tian@amd.com>
1 parent 01dd892 commit c962f2b

File tree

4 files changed

+913
-1
lines changed

4 files changed

+913
-1
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -676,5 +676,45 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
676676
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
677677
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts")
678678

679+
// GFX1250 WMMA and SWMMAC builtins
680+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
681+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, "V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
682+
TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x32_bf16, "V8yIbV16yIbV16yIsV8yIbIb", "nc", "gfx1250-insts,wavefrontsize32")
683+
TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16, "V8yIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
684+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
685+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
686+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
687+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
688+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
689+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
690+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
691+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
692+
TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x64_iu8, "V8iIbV8iIbV8iV8iIbIb", "nc", "gfx1250-insts,wavefrontsize32")
693+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
694+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
695+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
696+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
697+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
698+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
699+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
700+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
701+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_f16, "V8fIbV16hIbV16hIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
702+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x32_f16, "V8hIbV16hIbV16hIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
703+
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_32x16x128_f4, "V16fV16iV8iIsV16f", "nc", "gfx1250-insts,wavefrontsize32")
704+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
705+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x64_bf16, "V8yIbV16yIbV32yV8yiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
706+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
707+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
708+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
709+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
710+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
711+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
712+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
713+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
714+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
715+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x128_iu8, "V8iIbV8iIbV16iV8iiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
716+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_f16, "V8fIbV16hIbV32hV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
717+
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x64_f16, "V8hIbV16hIbV32hV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
718+
679719
#undef BUILTIN
680720
#undef TARGET_BUILTIN

clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp

Lines changed: 198 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -822,7 +822,46 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
822822
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
823823
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
824824
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
825-
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
825+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
826+
// GFX1250 WMMA and SWMMAC builtins
827+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
828+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
829+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
830+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
831+
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
832+
case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
833+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
834+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
835+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
836+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
837+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
838+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
839+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
840+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
841+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
842+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
843+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
844+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
845+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
846+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
847+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
848+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
849+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
850+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
851+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
852+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
853+
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
854+
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
855+
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
856+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
857+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
858+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
859+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
860+
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
861+
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
862+
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
863+
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
864+
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: {
826865

827866
// These operations perform a matrix multiplication and accumulation of
828867
// the form:
@@ -837,6 +876,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
837876
// "false".
838877
bool AppendFalseForOpselArg = false;
839878
unsigned BuiltinWMMAOp;
879+
// Need return type when D and C are of different types.
880+
bool NeedReturnType = false;
840881

841882
switch (BuiltinID) {
842883
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
@@ -975,6 +1016,160 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
9751016
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
9761017
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
9771018
break;
1019+
// GFX1250 WMMA and SWMMAC builtins
1020+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
1021+
ArgsForMatchingMatrixTypes = {5, 1};
1022+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32;
1023+
break;
1024+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
1025+
ArgsForMatchingMatrixTypes = {5, 1};
1026+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_bf16;
1027+
break;
1028+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
1029+
ArgsForMatchingMatrixTypes = {5, 1};
1030+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_f16;
1031+
break;
1032+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
1033+
ArgsForMatchingMatrixTypes = {5, 1};
1034+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x32_f16;
1035+
break;
1036+
case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
1037+
ArgsForMatchingMatrixTypes = {5, 1};
1038+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16;
1039+
break;
1040+
case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
1041+
NeedReturnType = true;
1042+
ArgsForMatchingMatrixTypes = {1, 5};
1043+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16;
1044+
break;
1045+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
1046+
ArgsForMatchingMatrixTypes = {3, 0};
1047+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8;
1048+
break;
1049+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
1050+
ArgsForMatchingMatrixTypes = {3, 0};
1051+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8;
1052+
break;
1053+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
1054+
ArgsForMatchingMatrixTypes = {3, 0};
1055+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8;
1056+
break;
1057+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
1058+
ArgsForMatchingMatrixTypes = {3, 0};
1059+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8;
1060+
break;
1061+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
1062+
ArgsForMatchingMatrixTypes = {3, 0};
1063+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8;
1064+
break;
1065+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
1066+
ArgsForMatchingMatrixTypes = {3, 0};
1067+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8;
1068+
break;
1069+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
1070+
ArgsForMatchingMatrixTypes = {3, 0};
1071+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8;
1072+
break;
1073+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
1074+
ArgsForMatchingMatrixTypes = {3, 0};
1075+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8;
1076+
break;
1077+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
1078+
ArgsForMatchingMatrixTypes = {3, 0};
1079+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8;
1080+
break;
1081+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
1082+
ArgsForMatchingMatrixTypes = {3, 0};
1083+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8;
1084+
break;
1085+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
1086+
ArgsForMatchingMatrixTypes = {3, 0};
1087+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8;
1088+
break;
1089+
case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
1090+
ArgsForMatchingMatrixTypes = {3, 0};
1091+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8;
1092+
break;
1093+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
1094+
ArgsForMatchingMatrixTypes = {3, 0};
1095+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8;
1096+
break;
1097+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
1098+
ArgsForMatchingMatrixTypes = {3, 0};
1099+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8;
1100+
break;
1101+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
1102+
ArgsForMatchingMatrixTypes = {3, 0};
1103+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8;
1104+
break;
1105+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
1106+
ArgsForMatchingMatrixTypes = {3, 0};
1107+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8;
1108+
break;
1109+
case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
1110+
ArgsForMatchingMatrixTypes = {4, 1};
1111+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8;
1112+
break;
1113+
case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
1114+
ArgsForMatchingMatrixTypes = {3, 0, 1};
1115+
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
1116+
break;
1117+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
1118+
ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1119+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;
1120+
break;
1121+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
1122+
ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1123+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16;
1124+
break;
1125+
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
1126+
ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1127+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x64_f16;
1128+
break;
1129+
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
1130+
ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1131+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16;
1132+
break;
1133+
case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
1134+
ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1135+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16;
1136+
break;
1137+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
1138+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1139+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8;
1140+
break;
1141+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
1142+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1143+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8;
1144+
break;
1145+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
1146+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1147+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8;
1148+
break;
1149+
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
1150+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1151+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8;
1152+
break;
1153+
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
1154+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1155+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8;
1156+
break;
1157+
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
1158+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1159+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8;
1160+
break;
1161+
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
1162+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1163+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8;
1164+
break;
1165+
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
1166+
ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
1167+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8;
1168+
break;
1169+
case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8:
1170+
ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
1171+
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8;
1172+
break;
9781173
}
9791174

9801175
SmallVector<Value *, 6> Args;
@@ -984,6 +1179,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
9841179
Args.push_back(Builder.getFalse());
9851180

9861181
SmallVector<llvm::Type *, 6> ArgTypes;
1182+
if (NeedReturnType)
1183+
ArgTypes.push_back(ConvertType(E->getType()));
9871184
for (auto ArgIdx : ArgsForMatchingMatrixTypes)
9881185
ArgTypes.push_back(Args[ArgIdx]->getType());
9891186

0 commit comments

Comments
 (0)