@@ -926,6 +926,78 @@ def fun(float(N, K) I, float(N) O0) -> (O) {
926
926
)TC" );
927
927
}
928
928
929
+ /*
930
+ * Check that a 2D mean with these parameters does not produce a library call,
931
+ * even with matchLibraryCalls(true): the call is not produced because the band
932
+ * is tiled by 32 and 512 threads are mapped to the band.
933
+ * In practice, check that kCUBReductionName does not appear in the code returned by codegenMapped.
934
+ */
935
+ TEST_F (PolyhedralMapperTest, Mean2DNonParametric_512threads) {
936
+ string tc = R"TC(
937
+ def fun(float(36864, 1024) I) -> (O) {
938
+ O(n) +=! I(n, r_n)
939
+ O(n) = O(n) / (1024)
940
+ }
941
+ )TC" ;
942
+ auto mappingOptions =
943
+ DefaultOptions ()
944
+ .outerScheduleFusionStrategy (tc::FusionStrategy::Preserve3Coincident)
945
+ .outerScheduleAllowSkewing (false )
946
+ .outerSchedulePositiveOrthant (true )
947
+ .intraTileScheduleFusionStrategy (tc::FusionStrategy::Min)
948
+ .intraTileScheduleAllowSkewing (false )
949
+ .intraTileSchedulePositiveOrthant (true )
950
+ .fixParametersBeforeScheduling (false )
951
+ .tile (18 , 32 )
952
+ .unroll (16 )
953
+ .tileImperfectlyNested (false )
954
+ .matchLibraryCalls (true )
955
+ .mapToThreads ({512 })
956
+ .mapToBlocks ({16384 })
957
+ .useSharedMemory (true )
958
+ .usePrivateMemory (false )
959
+ .unrollCopyShared (true );
960
+
961
+ auto code = codegenMapped (tc, mappingOptions);
962
+ using tc::code::cuda::kCUBReductionName ;
963
+ EXPECT_TRUE (code.find (kCUBReductionName ) == std::string::npos);
964
+ }
965
+
966
+ /*
967
+ * Check that a 2D mean with these parameters produces a reduction library call
968
+ * when 32 threads are mapped. In practice, check that kCUBReductionName appears in the code returned by codegenMapped.
969
+ */
970
+ TEST_F (PolyhedralMapperTest, Mean2DNonParametric_32threads) {
971
+ string tc = R"TC(
972
+ def fun(float(36864, 1024) I) -> (O) {
973
+ O(n) +=! I(n, r_n)
974
+ O(n) = O(n) / (1024)
975
+ }
976
+ )TC" ;
977
+ auto mappingOptions =
978
+ DefaultOptions ()
979
+ .outerScheduleFusionStrategy (tc::FusionStrategy::Preserve3Coincident)
980
+ .outerScheduleAllowSkewing (false )
981
+ .outerSchedulePositiveOrthant (true )
982
+ .intraTileScheduleFusionStrategy (tc::FusionStrategy::Min)
983
+ .intraTileScheduleAllowSkewing (false )
984
+ .intraTileSchedulePositiveOrthant (true )
985
+ .fixParametersBeforeScheduling (false )
986
+ .tile (18 , 32 )
987
+ .unroll (16 )
988
+ .tileImperfectlyNested (false )
989
+ .matchLibraryCalls (true )
990
+ .mapToThreads ({32 })
991
+ .mapToBlocks ({16384 })
992
+ .useSharedMemory (true )
993
+ .usePrivateMemory (false )
994
+ .unrollCopyShared (true );
995
+
996
+ auto code = codegenMapped (tc, mappingOptions);
997
+ using tc::code::cuda::kCUBReductionName ;
998
+ EXPECT_TRUE (code.find (kCUBReductionName ) != std::string::npos);
999
+ }
1000
+
929
1001
static const string kTcMM = R"TC(
930
1002
def fun(float(M, K) A, float(K, N) B) -> (C) {
931
1003
C(m, n) +=! A(m, r_k) * B(r_k, n)
0 commit comments