[SYCL][Joint Matrix] Implement bfloat16 32x64x32 combination support (#13590)

YuriPlyakhin · web-flow · commit 3fbf1011c4b9 · 2025-02-11T16:36:00.000-08:00
diff --git a/sycl/test-e2e/Matrix/Inputs/element_wise_all_ops_impl.hpp b/sycl/test-e2e/Matrix/Inputs/element_wise_all_ops_impl.hpp
@@ -229,8 +229,11 @@ int main() {
         test_ewops_c<float, 16, 16>();
 // This combination is not currently supported for sub group size = 32 in IGC
 #if (!defined(SG_SZ) || SG_SZ != 32)
+        test_ewops_ab<bfloat16, 1, 32, use::a, layout::row_major, 1>();
         test_ewops_ab<bfloat16, 32, 16, use::a, layout::row_major, 1>();
+        test_ewops_ab<bfloat16, 32, 32, use::a, layout::row_major, 1>();
         test_ewops_ab<bfloat16, 16, 64, use::b, layout::ext_intel_packed, 2>();
+        test_ewops_ab<bfloat16, 32, 64, use::b, layout::ext_intel_packed, 2>();
         test_ewops_c<float, 1, 64>();
         test_ewops_c<float, 32, 64>();
 #endif
diff --git a/sycl/test-e2e/Matrix/Inputs/element_wise_ops_impl.hpp b/sycl/test-e2e/Matrix/Inputs/element_wise_ops_impl.hpp
@@ -133,7 +133,9 @@ int main() {
       // IGC
       passed &= test<bfloat16, float, 16, 16, 16, 2, class pvc_bf16_16x16x16>();
       passed &= test<bfloat16, float, 1, 64, 16, 2, class pvc_bf16_1x64x16>();
+      passed &= test<bfloat16, float, 1, 64, 32, 2, class pvc_bf16_1x64x32>();
       passed &= test<bfloat16, float, 32, 64, 16, 2, class pvc_bf16_32x64x16>();
+      passed &= test<bfloat16, float, 32, 64, 32, 2, class pvc_bf16_32x64x32>();
 #endif
       break;
     }
diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/Inputs/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -483,7 +483,17 @@ int main(
            MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
       test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16,
            MCache1, NCache1, KCache1, MCache2, NCache2, KCache2>(matrix_size);
-#endif
+// `#ifndef PREFETCH` is a workaround for GSD-10535.
+#ifndef PREFETCH
+      // The test is commented out due to flaky results: GSD-10537.
+      // test<bfloat16, float, VnniFactor, /*TM*/ 1, /*TN*/ 64, /*TK*/ 32,
+      // MCache1,
+      //      NCache1, /*KCache1*/ 32, MCache2, NCache2, KCache2>(matrix_size);
+#endif // PREFETCH
+      test<bfloat16, float, VnniFactor, /*TM*/ 32, /*TN*/ 64, /*TK*/ 32,
+           MCache1, NCache1, /*KCache1*/ 32, MCache2, NCache2, KCache2>(
+          matrix_size);
+#endif // (!defined(SG_SZ) || SG_SZ != 32)
       break;
     }
 
diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_rowmajorA_rowmajorB_impl.hpp b/sycl/test-e2e/Matrix/Inputs/joint_matrix_rowmajorA_rowmajorB_impl.hpp
@@ -127,8 +127,12 @@ int main() {
                               bfloat16, float>();
         res += gemm_row_major<1, 64, 16, class bf16_1x64x16, bfloat16, bfloat16,
                               float>();
+        res += gemm_row_major<1, 64, 32, class bf16_1x64x32, bfloat16, bfloat16,
+                              float>();
         res += gemm_row_major<32, 64, 16, class bf16_32x64x16, bfloat16,
                               bfloat16, float>();
+        res += gemm_row_major<32, 64, 32, class bf16_32x64x32, bfloat16,
+                              bfloat16, float>();
       }
       break;
     }
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp
@@ -11,11 +11,12 @@
 
 // REQUIRES: aspect-ext_intel_matrix
 
-// RUN: %{build} -mllvm -inline-threshold=2000 %fp-model-precise -o %t.out -DMANUAL_UNROLL -DVNNI
+// RUN: %{build} -mllvm -inline-threshold=5000 %fp-model-precise -o %t.out -DMANUAL_UNROLL -DVNNI
 // RUN: %{run} %t.out
 
 // -mllvm -inline-threshold=2000 added as a workaround,
 // since IGC doesn't support some variants of IR for Joint Matrix currently
+// -inline-threshold increased to 5000 to work around a bug in IGC: GSD-10534
 // -ffp-model=precise is added to not depend on compiler defaults.
 
 #include "common.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
@@ -11,11 +11,12 @@
 
 // REQUIRES: aspect-ext_intel_matrix, gpu
 
-// RUN: %{build} -mllvm -inline-threshold=2000 %fp-model-precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL -DVNNI
+// RUN: %{build} -mllvm -inline-threshold=5000 %fp-model-precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL -DVNNI
 // RUN: %{run} %t_gpu.out
 
 // -mllvm -inline-threshold=2000 added as a workaround,
 // since IGC doesn't support some variants of IR for Joint Matrix currently
+// -inline-threshold increased to 5000 to work around a bug in IGC: GSD-10534
 // -ffp-model=precise is added to not depend on compiler defaults.
 
 #include "common.hpp"

Original file line number	Diff line number	Diff line change
`@@ -127,8 +127,12 @@ int main() {`
`127`	`127`	`bfloat16, float>();`
`128`	`128`	`res += gemm_row_major<1, 64, 16, class bf16_1x64x16, bfloat16, bfloat16,`
`129`	`129`	`float>();`
	`130`	`+ res += gemm_row_major<1, 64, 32, class bf16_1x64x32, bfloat16, bfloat16,`
	`131`	`+ float>();`
`130`	`132`	`res += gemm_row_major<32, 64, 16, class bf16_32x64x16, bfloat16,`
`131`	`133`	`bfloat16, float>();`
	`134`	`+ res += gemm_row_major<32, 64, 32, class bf16_32x64x32, bfloat16,`
	`135`	`+ bfloat16, float>();`
`132`	`136`	`}`
`133`	`137`	`break;`
`134`	`138`	`}`