From c75ab01db8a950c6b837703f835f01bdedf82d38 Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Fri, 4 Jul 2025 14:53:15 -0500 Subject: [PATCH 1/3] RFC02658:CLANG: DMF VSX Vector bfloat16 GER 2x (rank-2 update) --- clang/include/clang/Basic/BuiltinsPPC.def | 21 +++ .../PowerPC/builtins-dmf-vsx-vector-float.c | 161 ++++++++++++++++++ .../PowerPC/ppc-future-mma-builtin-err.c | 22 +++ 3 files changed, 204 insertions(+) create mode 100644 clang/test/CodeGen/PowerPC/builtins-dmf-vsx-vector-float.c diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def index 354531e83991d..4fff12e9ca7c7 100644 --- a/clang/include/clang/Basic/BuiltinsPPC.def +++ b/clang/include/clang/Basic/BuiltinsPPC.def @@ -1092,6 +1092,27 @@ UNALIASED_CUSTOM_BUILTIN(mma_dmxvi8gerx4spp, "vW1024*W256V", true, "mma,paired-vector-memops") UNALIASED_CUSTOM_BUILTIN(mma_pmdmxvi8gerx4spp, "vW1024*W256Vi255i15i15", true, "mma,paired-vector-memops") +UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf16ger2, "vW512*VV", + "mma,paired-vector-memops") +UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf32ger, "vW512*VV", + "mma,paired-vector-memops") +UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf64ger, "vW512*W256V", + "mma,paired-vector-memops") +UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmxvf16ger2, "vW512*VVi15i15i3", + "mma,paired-vector-memops") +UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmxvf32ger, "vW512*VVi15i15", + "mma,paired-vector-memops") +UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmxvf64ger, "vW512*W256Vi15i3", + "mma,paired-vector-memops") +UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvbf16ger2, "vW512*VV", + "mma,paired-vector-memops") +UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmxvbf16ger2, "vW512*VVi15i15i3", + "mma,paired-vector-memops") +UNALIASED_CUSTOM_MMA_BUILTIN(mma_dmxvbf16gerx2, "vW1024*W256V", + "mma,paired-vector-memops") +UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmdmxvbf16gerx2, "vW1024*W256Vi255i15i3", + "mma,paired-vector-memops") + // MMA builtins with positive/negative multiply/accumulate. 
UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf16ger2, "vW512*VV", diff --git a/clang/test/CodeGen/PowerPC/builtins-dmf-vsx-vector-float.c b/clang/test/CodeGen/PowerPC/builtins-dmf-vsx-vector-float.c new file mode 100644 index 0000000000000..953815ecc42b6 --- /dev/null +++ b/clang/test/CodeGen/PowerPC/builtins-dmf-vsx-vector-float.c @@ -0,0 +1,161 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// Updates were then manually applied to commonize the checks for AIX and LoP. +// RUN: %clang_cc1 -O3 -triple powerpc64le-unknown-unknown -target-cpu future \ +// RUN: -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -O3 -triple powerpc64-ibm-aix -target-cpu future \ +// RUN: -emit-llvm %s -o - | FileCheck %s + +// CHECK-LABEL: void @test_dmxvbf16gerx2( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6:![0-9]+]] +// CHECK-NEXT: ret void +// +void test_dmxvbf16gerx2(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_dmxvbf16gerx2(&vdmr, vp, vc); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_dmxvbf16gerx2nn( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void
test_dmxvbf16gerx2nn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_dmxvbf16gerx2nn(&vdmr, vp, vc); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_dmxvbf16gerx2np( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_dmxvbf16gerx2np(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_dmxvbf16gerx2np(&vdmr, vp, vc); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_dmxvbf16gerx2pn( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_dmxvbf16gerx2pn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_dmxvbf16gerx2pn(&vdmr, vp, vc); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_dmxvbf16gerx2pp( +// CHECK-NEXT: [[ENTRY:.*:]] +// 
CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_dmxvbf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_dmxvbf16gerx2pp(&vdmr, vp, vc); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_pmdmxvbf16gerx2( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_pmdmxvbf16gerx2(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmdmxvbf16gerx2(&vdmr, vp, vc, 0, 0, 0); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_pmdmxvbf16gerx2nn( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// 
CHECK-NEXT: ret void +// +void test_pmdmxvbf16gerx2nn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmdmxvbf16gerx2nn(&vdmr, vp, vc, 0, 0, 0); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_pmdmxvbf16gerx2np( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_pmdmxvbf16gerx2np(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmdmxvbf16gerx2np(&vdmr, vp, vc, 0, 0, 0); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_pmdmxvbf16gerx2pn( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_pmdmxvbf16gerx2pn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmdmxvbf16gerx2pn(&vdmr, vp, vc, 0, 0, 0); 
+ *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_pmdmxvbf16gerx2pp( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_pmdmxvbf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmdmxvbf16gerx2pp(&vdmr, vp, vc, 0, 0, 0); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} +// CHECK: [[META3]] = !{!"__vector_pair", [[META4:![0-9]+]], i64 0} +// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} +// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} +// CHECK: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK: [[META7]] = !{!"__dmr1024", [[META4]], i64 0} diff --git a/clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c b/clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c index 1b8d345ac7ec7..7023fa265aa1e 100644 --- a/clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c +++ b/clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c @@ -12,10 +12,32 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) __builtin_mma_dmxvi8gerx4spp(&vdmr, vp, vc); __builtin_mma_pmdmxvi8gerx4spp(&vdmr, vp, vc, 0, 0, 0); + // DMF VSX Vector bfloat16 GER 2x builtins. 
+ __builtin_mma_dmxvbf16gerx2(&vdmr, vp, vc); + __builtin_mma_dmxvbf16gerx2nn(&vdmr, vp, vc); + __builtin_mma_dmxvbf16gerx2np(&vdmr, vp, vc); + __builtin_mma_dmxvbf16gerx2pn(&vdmr, vp, vc); + __builtin_mma_dmxvbf16gerx2pp(&vdmr, vp, vc); + __builtin_mma_pmdmxvbf16gerx2(&vdmr, vp, vc, 0, 0, 0); + __builtin_mma_pmdmxvbf16gerx2nn(&vdmr, vp, vc, 0, 0, 0); + __builtin_mma_pmdmxvbf16gerx2np(&vdmr, vp, vc, 0, 0, 0); + __builtin_mma_pmdmxvbf16gerx2pn(&vdmr, vp, vc, 0, 0, 0); + __builtin_mma_pmdmxvbf16gerx2pp(&vdmr, vp, vc, 0, 0, 0); + // CHECK: error: '__builtin_mma_dmxvi8gerx4' needs target feature mma,paired-vector-memops // CHECK: error: '__builtin_mma_pmdmxvi8gerx4' needs target feature mma,paired-vector-memops // CHECK: error: '__builtin_mma_dmxvi8gerx4pp' needs target feature mma,paired-vector-memops // CHECK: error: '__builtin_mma_pmdmxvi8gerx4pp' needs target feature mma,paired-vector-memops // CHECK: error: '__builtin_mma_dmxvi8gerx4spp' needs target feature mma,paired-vector-memops // CHECK: error: '__builtin_mma_pmdmxvi8gerx4spp' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_dmxvbf16gerx2' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_dmxvbf16gerx2nn' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_dmxvbf16gerx2np' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_dmxvbf16gerx2pn' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_dmxvbf16gerx2pp' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_pmdmxvbf16gerx2' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_pmdmxvbf16gerx2nn' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_pmdmxvbf16gerx2np' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_pmdmxvbf16gerx2pn' needs target feature mma,paired-vector-memops +// CHECK: error: 
'__builtin_mma_pmdmxvbf16gerx2pp' needs target feature mma,paired-vector-memops } From 211cfc9e355436f031a600813fc448e5fb0fb881 Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Mon, 7 Jul 2025 14:31:16 -0500 Subject: [PATCH 2/3] add dmf vsx vector 16-bit floating point builtin --- clang/include/clang/Basic/BuiltinsPPC.def | 4 + .../PowerPC/builtins-dmf-vsx-vector-float.c | 148 ++++++++++++++++++ .../PowerPC/ppc-future-mma-builtin-err.c | 38 ++++- 3 files changed, 184 insertions(+), 6 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def index 4fff12e9ca7c7..a75a2278328b6 100644 --- a/clang/include/clang/Basic/BuiltinsPPC.def +++ b/clang/include/clang/Basic/BuiltinsPPC.def @@ -1112,6 +1112,10 @@ UNALIASED_CUSTOM_MMA_BUILTIN(mma_dmxvbf16gerx2, "vW1024*W256V", "mma,paired-vector-memops") UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmdmxvbf16gerx2, "vW1024*W256Vi255i15i3", "mma,paired-vector-memops") +UNALIASED_CUSTOM_MMA_BUILTIN(mma_dmxvf16gerx2, "vW1024*W256V", + "mma,paired-vector-memops") +UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmdmxvf16gerx2, "vW1024*W256Vi255i15i3", + "mma,paired-vector-memops") // MMA builtins with positive/negative multiply/accumulate. 
diff --git a/clang/test/CodeGen/PowerPC/builtins-dmf-vsx-vector-float.c b/clang/test/CodeGen/PowerPC/builtins-dmf-vsx-vector-float.c index 953815ecc42b6..8fc9a68a5a613 100644 --- a/clang/test/CodeGen/PowerPC/builtins-dmf-vsx-vector-float.c +++ b/clang/test/CodeGen/PowerPC/builtins-dmf-vsx-vector-float.c @@ -153,6 +153,154 @@ void test_pmdmxvbf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector uns *((__dmr1024 *)resp) = vdmr; } +// CHECK-LABEL: void @test_dmxvf16gerx2( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6:![0-9]+]] +// CHECK-NEXT: ret void +// +void test_dmxvf16gerx2(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_dmxvf16gerx2(&vdmr, vp, vc); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_dmxvf16gerx2nn( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_dmxvf16gerx2nn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_dmxvf16gerx2nn(&vdmr, vp, vc); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_dmxvf16gerx2np( 
+// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_dmxvf16gerx2np(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_dmxvf16gerx2np(&vdmr, vp, vc); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_dmxvf16gerx2pn( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_dmxvf16gerx2pn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_dmxvf16gerx2pn(&vdmr, vp, vc); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_dmxvf16gerx2pp( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]]) +// 
CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_dmxvf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_dmxvf16gerx2pp(&vdmr, vp, vc); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_pmdmxvf16gerx2( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2(<256 x i1> [[TMP0]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_pmdmxvf16gerx2(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmdmxvf16gerx2(&vdmr, vp, vc, 0, 0, 0); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_pmdmxvf16gerx2nn( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2nn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_pmdmxvf16gerx2nn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmdmxvf16gerx2nn(&vdmr, vp, vc, 0, 0, 0); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void 
@test_pmdmxvf16gerx2np( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2np(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_pmdmxvf16gerx2np(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmdmxvf16gerx2np(&vdmr, vp, vc, 0, 0, 0); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_pmdmxvf16gerx2pn( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2pn(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_pmdmxvf16gerx2pn(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmdmxvf16gerx2pn(&vdmr, vp, vc, 0, 0, 0); + *((__dmr1024 *)resp) = vdmr; +} + +// CHECK-LABEL: void @test_pmdmxvf16gerx2pp( +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP1:%.*]] = load <256 x i1>, ptr [[VPP:%.*]], align 32, !tbaa [[TBAA2]] +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> 
@llvm.ppc.mma.pmdmxvf16gerx2pp(<1024 x i1> [[TMP0]], <256 x i1> [[TMP1]], <16 x i8> [[VC:%.*]], i32 0, i32 0, i32 0) +// CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP:%.*]], align 128, !tbaa [[TBAA6]] +// CHECK-NEXT: ret void +// +void test_pmdmxvf16gerx2pp(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc, unsigned char *resp) { + __dmr1024 vdmr = *((__dmr1024 *)vdmrp); + __vector_pair vp = *((__vector_pair *)vpp); + __builtin_mma_pmdmxvf16gerx2pp(&vdmr, vp, vc, 0, 0, 0); + *((__dmr1024 *)resp) = vdmr; +} + // CHECK: [[TBAA2]] = !{[[META3:![0-9]+]], [[META3]], i64 0} // CHECK: [[META3]] = !{!"__vector_pair", [[META4:![0-9]+]], i64 0} // CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} diff --git a/clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c b/clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c index 7023fa265aa1e..b70dd739652b8 100644 --- a/clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c +++ b/clang/test/CodeGen/PowerPC/ppc-future-mma-builtin-err.c @@ -12,7 +12,15 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) __builtin_mma_dmxvi8gerx4spp(&vdmr, vp, vc); __builtin_mma_pmdmxvi8gerx4spp(&vdmr, vp, vc, 0, 0, 0); +// CHECK: error: '__builtin_mma_dmxvi8gerx4' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_pmdmxvi8gerx4' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_dmxvi8gerx4pp' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_pmdmxvi8gerx4pp' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_dmxvi8gerx4spp' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_pmdmxvi8gerx4spp' needs target feature mma,paired-vector-memops + // DMF VSX Vector bfloat16 GER 2x builtins. 
+ __builtin_mma_dmxvbf16gerx2(&vdmr, vp, vc); __builtin_mma_dmxvbf16gerx2nn(&vdmr, vp, vc); __builtin_mma_dmxvbf16gerx2np(&vdmr, vp, vc); @@ -24,12 +32,6 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) __builtin_mma_pmdmxvbf16gerx2pn(&vdmr, vp, vc, 0, 0, 0); __builtin_mma_pmdmxvbf16gerx2pp(&vdmr, vp, vc, 0, 0, 0); -// CHECK: error: '__builtin_mma_dmxvi8gerx4' needs target feature mma,paired-vector-memops -// CHECK: error: '__builtin_mma_pmdmxvi8gerx4' needs target feature mma,paired-vector-memops -// CHECK: error: '__builtin_mma_dmxvi8gerx4pp' needs target feature mma,paired-vector-memops -// CHECK: error: '__builtin_mma_pmdmxvi8gerx4pp' needs target feature mma,paired-vector-memops -// CHECK: error: '__builtin_mma_dmxvi8gerx4spp' needs target feature mma,paired-vector-memops -// CHECK: error: '__builtin_mma_pmdmxvi8gerx4spp' needs target feature mma,paired-vector-memops // CHECK: error: '__builtin_mma_dmxvbf16gerx2' needs target feature mma,paired-vector-memops // CHECK: error: '__builtin_mma_dmxvbf16gerx2nn' needs target feature mma,paired-vector-memops // CHECK: error: '__builtin_mma_dmxvbf16gerx2np' needs target feature mma,paired-vector-memops @@ -40,4 +42,28 @@ void test_mma(unsigned char *vdmrp, unsigned char *vpp, vector unsigned char vc) // CHECK: error: '__builtin_mma_pmdmxvbf16gerx2np' needs target feature mma,paired-vector-memops // CHECK: error: '__builtin_mma_pmdmxvbf16gerx2pn' needs target feature mma,paired-vector-memops // CHECK: error: '__builtin_mma_pmdmxvbf16gerx2pp' needs target feature mma,paired-vector-memops + + // DMF VSX Vector 16-bit Floating-point GER 2x builtins.
+ + __builtin_mma_dmxvf16gerx2(&vdmr, vp, vc); + __builtin_mma_dmxvf16gerx2nn(&vdmr, vp, vc); + __builtin_mma_dmxvf16gerx2np(&vdmr, vp, vc); + __builtin_mma_dmxvf16gerx2pn(&vdmr, vp, vc); + __builtin_mma_dmxvf16gerx2pp(&vdmr, vp, vc); + __builtin_mma_pmdmxvf16gerx2(&vdmr, vp, vc, 0, 0, 0); + __builtin_mma_pmdmxvf16gerx2nn(&vdmr, vp, vc, 0, 0, 0); + __builtin_mma_pmdmxvf16gerx2np(&vdmr, vp, vc, 0, 0, 0); + __builtin_mma_pmdmxvf16gerx2pn(&vdmr, vp, vc, 0, 0, 0); + __builtin_mma_pmdmxvf16gerx2pp(&vdmr, vp, vc, 0, 0, 0); + +// CHECK: error: '__builtin_mma_dmxvf16gerx2' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_dmxvf16gerx2nn' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_dmxvf16gerx2np' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_dmxvf16gerx2pn' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_dmxvf16gerx2pp' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_pmdmxvf16gerx2' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_pmdmxvf16gerx2nn' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_pmdmxvf16gerx2np' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_pmdmxvf16gerx2pn' needs target feature mma,paired-vector-memops +// CHECK: error: '__builtin_mma_pmdmxvf16gerx2pp' needs target feature mma,paired-vector-memops } From 4cfc00bee22486b777fc5cc1efc475b2dc62949c Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Wed, 9 Jul 2025 14:40:28 -0500 Subject: [PATCH 3/3] fix err in rebase --- clang/include/clang/Basic/BuiltinsPPC.def | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def index a75a2278328b6..3cae3aa3509c2 100644 --- a/clang/include/clang/Basic/BuiltinsPPC.def +++ 
b/clang/include/clang/Basic/BuiltinsPPC.def @@ -1092,6 +1092,8 @@ UNALIASED_CUSTOM_BUILTIN(mma_dmxvi8gerx4spp, "vW1024*W256V", true, "mma,paired-vector-memops") UNALIASED_CUSTOM_BUILTIN(mma_pmdmxvi8gerx4spp, "vW1024*W256Vi255i15i15", true, "mma,paired-vector-memops") + +// MMA builtins with positive/negative multiply/accumulate. UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf16ger2, "vW512*VV", "mma,paired-vector-memops") UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf32ger, "vW512*VV", @@ -1117,25 +1119,6 @@ UNALIASED_CUSTOM_MMA_BUILTIN(mma_dmxvf16gerx2, "vW1024*W256V", UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmdmxvf16gerx2, "vW1024*W256Vi255i15i3", "mma,paired-vector-memops") - -// MMA builtins with positive/negative multiply/accumulate. -UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf16ger2, "vW512*VV", - "mma,paired-vector-memops") -UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf32ger, "vW512*VV", - "mma,paired-vector-memops") -UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvf64ger, "vW512*W256V", - "mma,paired-vector-memops") -UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmxvf16ger2, "vW512*VVi15i15i3", - "mma,paired-vector-memops") -UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmxvf32ger, "vW512*VVi15i15", - "mma,paired-vector-memops") -UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmxvf64ger, "vW512*W256Vi15i3", - "mma,paired-vector-memops") -UNALIASED_CUSTOM_MMA_BUILTIN(mma_xvbf16ger2, "vW512*VV", - "mma,paired-vector-memops") -UNALIASED_CUSTOM_MMA_BUILTIN(mma_pmxvbf16ger2, "vW512*VVi15i15i3", - "mma,paired-vector-memops") - // FIXME: Obviously incomplete. #undef BUILTIN