@@ -833,6 +833,73 @@ def group_norm(
833
833
checkRtol (outputs[0 ] - y, {I}, D * H * W, 1e-6 );
834
834
}
835
835
836
+ // For some obscure reason, the succession of these 2 kernels results in an
837
+ // out of bounds memory acces on Volta but not on Pascal.
838
+ TEST (Kronecker, IllegalAccessVoltaBug) {
839
+ constexpr static auto Kronecker3_1 = " Kronecker3_1" ;
840
+ constexpr static auto TC = R"TC(
841
+ def Kronecker3_1(float(D2, N2) W2, float(M, N0, N1, N2) X) -> (XW2) {
842
+ XW2(m, n0, n1, d2) +=! X(m, n0, n1, r_n2) * W2(d2, r_n2)
843
+ }
844
+ )TC" ;
845
+ at::Tensor W2 = at::CUDA (at::kFloat ).rand ({16 , 32 });
846
+ at::Tensor X = at::CUDA (at::kFloat ).rand ({256 , 32 , 32 , 32 });
847
+
848
+ auto options1 =
849
+ tc::CudaMappingOptions::makeNaiveMappingOptions ()
850
+ .outerScheduleFusionStrategy (tc::FusionStrategy::Preserve3Coincident)
851
+ .outerScheduleAllowSkewing (false )
852
+ .outerSchedulePositiveOrthant (true )
853
+ .intraTileScheduleFusionStrategy (
854
+ tc::FusionStrategy::Preserve3Coincident)
855
+ .intraTileScheduleAllowSkewing (false )
856
+ .intraTileSchedulePositiveOrthant (true )
857
+ .fixParametersBeforeScheduling (true )
858
+ .tile (256 )
859
+ .unroll (16 )
860
+ .tileImperfectlyNested (false )
861
+ .matchLibraryCalls (true )
862
+ .mapToThreads (16 , 8 )
863
+ .mapToBlocks (4 , 2 )
864
+ .useSharedMemory (true )
865
+ .usePrivateMemory (true )
866
+ .unrollCopyShared (true )
867
+ .useReadOnlyCache (false );
868
+
869
+ std::vector<at::Tensor> inputs1{W2, X};
870
+ auto pExecutor1 =
871
+ tc::aten::compile<tc::CudaBackend>(TC, Kronecker3_1, inputs1, options1);
872
+ auto outputs1 = tc::aten::prepareOutputs (TC, Kronecker3_1, inputs1);
873
+ tc::aten::run (*pExecutor1, inputs1, outputs1);
874
+
875
+ auto options2 =
876
+ tc::CudaMappingOptions::makeNaiveMappingOptions ()
877
+ .outerScheduleFusionStrategy (tc::FusionStrategy::Preserve3Coincident)
878
+ .outerScheduleAllowSkewing (false )
879
+ .outerSchedulePositiveOrthant (true )
880
+ .intraTileScheduleFusionStrategy (
881
+ tc::FusionStrategy::Preserve3Coincident)
882
+ .intraTileScheduleAllowSkewing (false )
883
+ .intraTileSchedulePositiveOrthant (true )
884
+ .fixParametersBeforeScheduling (true )
885
+ .tile (2 , 32 )
886
+ .unroll (32 )
887
+ .tileImperfectlyNested (false )
888
+ .matchLibraryCalls (true )
889
+ .mapToThreads (64 )
890
+ .mapToBlocks (2 , 1 , 64 )
891
+ .useSharedMemory (false )
892
+ .usePrivateMemory (true )
893
+ .unrollCopyShared (true )
894
+ .useReadOnlyCache (false );
895
+
896
+ std::vector<at::Tensor> inputs2{W2, X};
897
+ auto pExecutor2 =
898
+ tc::aten::compile<tc::CudaBackend>(TC, Kronecker3_1, inputs2, options2);
899
+ auto outputs2 = tc::aten::prepareOutputs (TC, Kronecker3_1, inputs2);
900
+ tc::aten::run (*pExecutor2, inputs2, outputs2);
901
+ }
902
+
836
903
int main (int argc, char ** argv) {
837
904
::testing::InitGoogleTest (&argc, argv);
838
905
::gflags::ParseCommandLineFlags (&argc, &argv, true );
0 commit comments