Skip to content

Commit c75ab01

Browse files
committed
RFC02658:CLANG: DMF VSX Vector bfloat16 GER 2x (rank-2 update)
1 parent d5da826 commit c75ab01

File tree

3 files changed

+204
-0
lines changed

3 files changed

+204
-0
lines changed

clang/include/clang/Basic/BuiltinsPPC.def

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,6 +1092,27 @@ UNALIASED_CUSTOM_BUILTIN(mma_dmxvi8gerx4spp, "vW1024*W256V", true,
10921092
"mma,paired-vector-memops")
10931093
UNALIASED_CUSTOM_BUILTIN(mma_pmdmxvi8gerx4spp, "vW1024*W256Vi255i15i15", true,
10941094
"mma,paired-vector-memops")
1095+
UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf16ger2, "vW512*VV",
1096+
"mma,paired-vector-memops")
1097+
UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf32ger, "vW512*VV",
1098+
"mma,paired-vector-memops")
1099+
UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf64ger, "vW512*W256V",
1100+
"mma,paired-vector-memops")
1101+
UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmxvf16ger2, "vW512*VVi15i15i3",
1102+
"mma,paired-vector-memops")
1103+
UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmxvf32ger, "vW512*VVi15i15",
1104+
"mma,paired-vector-memops")
1105+
UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmxvf64ger, "vW512*W256Vi15i3",
1106+
"mma,paired-vector-memops")
1107+
UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvbf16ger2, "vW512*VV",
1108+
"mma,paired-vector-memops")
1109+
UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmxvbf16ger2, "vW512*VVi15i15i3",
1110+
"mma,paired-vector-memops")
1111+
UNALIASED_CUSTOM_MMA_BUILTIN(mma_dmxvbf16gerx2, "vW1024*W256V",
1112+
"mma,paired-vector-memops")
1113+
UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmdmxvbf16gerx2, "vW1024*W256Vi255i15i3",
1114+
"mma,paired-vector-memops")
1115+
10951116

10961117
// MMA builtins with positive/negative multiply/accumulate.
10971118
UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf16ger2, "vW512*VV",
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
// Updated, then manually adjusted to commonize the checks for AIX and LoP.
// RUN: %clang_cc1 -O3 -triple powerpc64le-unknown-unknown -target-cpu future \
// RUN:   -emit-llvm %s -o - | FileCheck %s
// RUN: %clang_cc1 -O3 -triple powerpc64-ibm-aix -target-cpu future \
// RUN:   -emit-llvm %s -o - | FileCheck %s

// CHECK-LABEL: void @test_dmxvbf16gerx2(
9+
// CHECK-NEXT: [[ENTRY:.*:]]
10+
// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
11+
// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]])
12+
// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6:![0-9]+]]
13+
// CHECK-NEXT: ret void
14+
//
15+
void test_dmxvbf16gerx2(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
16+
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
17+
__vector_pair vp = *((__vector_pair *)vpp);
18+
__builtin_mma_dmxvbf16gerx2(&vdmr, vp, vc);
19+
*((__dmr1024 *)resp) = vdmr;
20+
}
21+
22+
// CHECK-LABEL: void @test_dmxvbf16gerx2nn(
23+
// CHECK-NEXT: [[ENTRY:.*:]]
24+
// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
25+
// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
26+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]])
27+
// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
28+
// CHECK-NEXT: ret void
29+
//
30+
void test_dmxvbf16gerx2nn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
31+
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
32+
__vector_pair vp = *((__vector_pair *)vpp);
33+
__builtin_mma_dmxvbf16gerx2nn(&vdmr, vp, vc);
34+
*((__dmr1024 *)resp) = vdmr;
35+
}
36+
37+
// CHECK-LABEL: void @test_dmxvbf16gerx2np(
38+
// CHECK-NEXT: [[ENTRY:.*:]]
39+
// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
40+
// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
41+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]])
42+
// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
43+
// CHECK-NEXT: ret void
44+
//
45+
void test_dmxvbf16gerx2np(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
46+
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
47+
__vector_pair vp = *((__vector_pair *)vpp);
48+
__builtin_mma_dmxvbf16gerx2np(&vdmr, vp, vc);
49+
*((__dmr1024 *)resp) = vdmr;
50+
}
51+
52+
// CHECK-LABEL: void @test_dmxvbf16gerx2pn(
53+
// CHECK-NEXT: [[ENTRY:.*:]]
54+
// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
55+
// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
56+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]])
57+
// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
58+
// CHECK-NEXT: ret void
59+
//
60+
void test_dmxvbf16gerx2pn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
61+
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
62+
__vector_pair vp = *((__vector_pair *)vpp);
63+
__builtin_mma_dmxvbf16gerx2pn(&vdmr, vp, vc);
64+
*((__dmr1024 *)resp) = vdmr;
65+
}
66+
67+
// CHECK-LABEL: void @test_dmxvbf16gerx2pp(
68+
// CHECK-NEXT: [[ENTRY:.*:]]
69+
// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
70+
// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
71+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]])
72+
// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
73+
// CHECK-NEXT: ret void
74+
//
75+
void test_dmxvbf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
76+
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
77+
__vector_pair vp = *((__vector_pair *)vpp);
78+
__builtin_mma_dmxvbf16gerx2pp(&vdmr, vp, vc);
79+
*((__dmr1024 *)resp) = vdmr;
80+
}
81+
82+
// CHECK-LABEL: void @test_pmdmxvbf16gerx2(
83+
// CHECK-NEXT: [[ENTRY:.*:]]
84+
// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
85+
// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0)
86+
// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
87+
// CHECK-NEXT: ret void
88+
//
89+
void test_pmdmxvbf16gerx2(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
90+
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
91+
__vector_pair vp = *((__vector_pair *)vpp);
92+
__builtin_mma_pmdmxvbf16gerx2(&vdmr, vp, vc, 0, 0, 0);
93+
*((__dmr1024 *)resp) = vdmr;
94+
}
95+
96+
// CHECK-LABEL: void @test_pmdmxvbf16gerx2nn(
97+
// CHECK-NEXT: [[ENTRY:.*:]]
98+
// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
99+
// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
100+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0)
101+
// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
102+
// CHECK-NEXT: ret void
103+
//
104+
void test_pmdmxvbf16gerx2nn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
105+
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
106+
__vector_pair vp = *((__vector_pair *)vpp);
107+
__builtin_mma_pmdmxvbf16gerx2nn(&vdmr, vp, vc, 0, 0, 0);
108+
*((__dmr1024 *)resp) = vdmr;
109+
}
110+
111+
// CHECK-LABEL: void @test_pmdmxvbf16gerx2np(
112+
// CHECK-NEXT: [[ENTRY:.*:]]
113+
// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
114+
// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
115+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0)
116+
// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
117+
// CHECK-NEXT: ret void
118+
//
119+
void test_pmdmxvbf16gerx2np(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
120+
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
121+
__vector_pair vp = *((__vector_pair *)vpp);
122+
__builtin_mma_pmdmxvbf16gerx2np(&vdmr, vp, vc, 0, 0, 0);
123+
*((__dmr1024 *)resp) = vdmr;
124+
}
125+
126+
// CHECK-LABEL: void @test_pmdmxvbf16gerx2pn(
127+
// CHECK-NEXT: [[ENTRY:.*:]]
128+
// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
129+
// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
130+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0)
131+
// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
132+
// CHECK-NEXT: ret void
133+
//
134+
void test_pmdmxvbf16gerx2pn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
135+
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
136+
__vector_pair vp = *((__vector_pair *)vpp);
137+
__builtin_mma_pmdmxvbf16gerx2pn(&vdmr, vp, vc, 0, 0, 0);
138+
*((__dmr1024 *)resp) = vdmr;
139+
}
140+
141+
// CHECK-LABEL: void @test_pmdmxvbf16gerx2pp(
142+
// CHECK-NEXT: [[ENTRY:.*:]]
143+
// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]]
144+
// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]]
145+
// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0)
146+
// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]]
147+
// CHECK-NEXT: ret void
148+
//
149+
void test_pmdmxvbf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) {
150+
__dmr1024 vdmr = *((__dmr1024 *)vdmrp);
151+
__vector_pair vp = *((__vector_pair *)vpp);
152+
__builtin_mma_pmdmxvbf16gerx2pp(&vdmr, vp, vc, 0, 0, 0);
153+
*((__dmr1024 *)resp) = vdmr;
154+
}

// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
// CHECK: [[META3]] = !{!"__vector_pair", [[META4:![0-9]+]], i64 0}
// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"}
// CHECK: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0}
// CHECK: [[META7]] = !{!"__dmr1024", [[META4]], i64 0}

clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,32 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc)
1212
__builtin_mma_dmxvi8gerx4spp(&vdmr, vp, vc);
1313
__builtin_mma_pmdmxvi8gerx4spp(&vdmr, vp, vc, 0, 0, 0);
1414

15+
// DMF VSX Vector bfloat16 GER 2x builtins.
16+
__builtin_mma_dmxvbf16gerx2(&vdmr, vp, vc);
17+
__builtin_mma_dmxvbf16gerx2nn(&vdmr, vp, vc);
18+
__builtin_mma_dmxvbf16gerx2np(&vdmr, vp, vc);
19+
__builtin_mma_dmxvbf16gerx2pn(&vdmr, vp, vc);
20+
__builtin_mma_dmxvbf16gerx2pp(&vdmr, vp, vc);
21+
__builtin_mma_pmdmxvbf16gerx2(&vdmr, vp, vc, 0, 0, 0);
22+
__builtin_mma_pmdmxvbf16gerx2nn(&vdmr, vp, vc, 0, 0, 0);
23+
__builtin_mma_pmdmxvbf16gerx2np(&vdmr, vp, vc, 0, 0, 0);
24+
__builtin_mma_pmdmxvbf16gerx2pn(&vdmr, vp, vc, 0, 0, 0);
25+
__builtin_mma_pmdmxvbf16gerx2pp(&vdmr, vp, vc, 0, 0, 0);
26+
1527
// CHECK: error: '__builtin_mma_dmxvi8gerx4' needs target feature mma,paired-vector-memops
1628
// CHECK: error: '__builtin_mma_pmdmxvi8gerx4' needs target feature mma,paired-vector-memops
1729
// CHECK: error: '__builtin_mma_dmxvi8gerx4pp' needs target feature mma,paired-vector-memops
1830
// CHECK: error: '__builtin_mma_pmdmxvi8gerx4pp' needs target feature mma,paired-vector-memops
1931
// CHECK: error: '__builtin_mma_dmxvi8gerx4spp' needs target feature mma,paired-vector-memops
2032
// CHECK: error: '__builtin_mma_pmdmxvi8gerx4spp' needs target feature mma,paired-vector-memops
33+
// CHECK: error: '__builtin_mma_dmxvbf16gerx2' needs target feature mma,paired-vector-memops
34+
// CHECK: error: '__builtin_mma_dmxvbf16gerx2nn' needs target feature mma,paired-vector-memops
35+
// CHECK: error: '__builtin_mma_dmxvbf16gerx2np' needs target feature mma,paired-vector-memops
36+
// CHECK: error: '__builtin_mma_dmxvbf16gerx2pn' needs target feature mma,paired-vector-memops
37+
// CHECK: error: '__builtin_mma_dmxvbf16gerx2pp' needs target feature mma,paired-vector-memops
38+
// CHECK: error: '__builtin_mma_pmdmxvbf16gerx2' needs target feature mma,paired-vector-memops
39+
// CHECK: error: '__builtin_mma_pmdmxvbf16gerx2nn' needs target feature mma,paired-vector-memops
40+
// CHECK: error: '__builtin_mma_pmdmxvbf16gerx2np' needs target feature mma,paired-vector-memops
41+
// CHECK: error: '__builtin_mma_pmdmxvbf16gerx2pn' needs target feature mma,paired-vector-memops
42+
// CHECK: error: '__builtin_mma_pmdmxvbf16gerx2pp' needs target feature mma,paired-vector-memops
2143
}

0 commit comments

Comments
 (0)