Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit bc83925

Browse files
nicolasvasilache ftynse
authored and committed
[Bug] Illegal memory access on Volta
This captures a hard-to-reproduce illegal memory access on Volta. This seems to happen on Kronecker_1 consistently during tuning. When I tried to reproduce it, I couldn't originally see an issue. It is only when I put together the last 2 options in a sequence that I saw a reproducible problem, only on Volta. A second set of eyes is most welcome here.
1 parent 54a5b8c commit bc83925

File tree

1 file changed

+67
-0
lines changed

1 file changed

+67
-0
lines changed

test/cuda/test_tc_mapper_bugs.cc

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -833,6 +833,73 @@ def group_norm(
833833
checkRtol(outputs[0] - y, {I}, D * H * W, 1e-6);
834834
}
835835

836+
// For some obscure reason, the succession of these 2 kernels results in an
837+
// out-of-bounds memory access on Volta but not on Pascal.
838+
TEST(Kronecker, IllegalAccessVoltaBug) {
// Compiles and runs the same TC twice, back to back, on the same inputs but
// with two different sets of mapping options. No output values are checked:
// the test only exercises compilation and execution of both kernels.
839+
// Entry-point name handed to compile/prepareOutputs together with the TC
// source below.
constexpr static auto Kronecker3_1 = "Kronecker3_1";
840+
constexpr static auto TC = R"TC(
841+
def Kronecker3_1(float(D2, N2) W2, float(M, N0, N1, N2) X) -> (XW2) {
842+
XW2(m, n0, n1, d2) +=! X(m, n0, n1, r_n2) * W2(d2, r_n2)
843+
}
844+
)TC";
845+
// Random float CUDA inputs. Per the TC signature, W2 is (D2, N2) = (16, 32)
// and X is (M, N0, N1, N2) = (256, 32, 32, 32).
at::Tensor W2 = at::CUDA(at::kFloat).rand({16, 32});
846+
at::Tensor X = at::CUDA(at::kFloat).rand({256, 32, 32, 32});
847+
848+
// First configuration: tile(256), unroll(16), threads (16, 8), blocks (4, 2),
// shared memory enabled.
auto options1 =
849+
tc::CudaMappingOptions::makeNaiveMappingOptions()
850+
.outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
851+
.outerScheduleAllowSkewing(false)
852+
.outerSchedulePositiveOrthant(true)
853+
.intraTileScheduleFusionStrategy(
854+
tc::FusionStrategy::Preserve3Coincident)
855+
.intraTileScheduleAllowSkewing(false)
856+
.intraTileSchedulePositiveOrthant(true)
857+
.fixParametersBeforeScheduling(true)
858+
.tile(256)
859+
.unroll(16)
860+
.tileImperfectlyNested(false)
861+
.matchLibraryCalls(true)
862+
.mapToThreads(16, 8)
863+
.mapToBlocks(4, 2)
864+
.useSharedMemory(true)
865+
.usePrivateMemory(true)
866+
.unrollCopyShared(true)
867+
.useReadOnlyCache(false);
868+
869+
// Compile with the first configuration, prepare the outputs, and run.
std::vector<at::Tensor> inputs1{W2, X};
870+
auto pExecutor1 =
871+
tc::aten::compile<tc::CudaBackend>(TC, Kronecker3_1, inputs1, options1);
872+
auto outputs1 = tc::aten::prepareOutputs(TC, Kronecker3_1, inputs1);
873+
tc::aten::run(*pExecutor1, inputs1, outputs1);
874+
875+
// Second configuration: differs from the first in tile(2, 32), unroll(32),
// threads (64), blocks (2, 1, 64), and shared memory disabled.
auto options2 =
876+
tc::CudaMappingOptions::makeNaiveMappingOptions()
877+
.outerScheduleFusionStrategy(tc::FusionStrategy::Preserve3Coincident)
878+
.outerScheduleAllowSkewing(false)
879+
.outerSchedulePositiveOrthant(true)
880+
.intraTileScheduleFusionStrategy(
881+
tc::FusionStrategy::Preserve3Coincident)
882+
.intraTileScheduleAllowSkewing(false)
883+
.intraTileSchedulePositiveOrthant(true)
884+
.fixParametersBeforeScheduling(true)
885+
.tile(2, 32)
886+
.unroll(32)
887+
.tileImperfectlyNested(false)
888+
.matchLibraryCalls(true)
889+
.mapToThreads(64)
890+
.mapToBlocks(2, 1, 64)
891+
.useSharedMemory(false)
892+
.usePrivateMemory(true)
893+
.unrollCopyShared(true)
894+
.useReadOnlyCache(false);
895+
896+
// Compile and run the same TC on the same tensors with the second
// configuration.
// NOTE(review): presumably the illegal access only surfaces once this second
// kernel runs after the first one - confirm on Volta hardware.
std::vector<at::Tensor> inputs2{W2, X};
897+
auto pExecutor2 =
898+
tc::aten::compile<tc::CudaBackend>(TC, Kronecker3_1, inputs2, options2);
899+
auto outputs2 = tc::aten::prepareOutputs(TC, Kronecker3_1, inputs2);
900+
tc::aten::run(*pExecutor2, inputs2, outputs2);
901+
}
902+
836903
int main(int argc, char** argv) {
837904
::testing::InitGoogleTest(&argc, argv);
838905
::gflags::ParseCommandLineFlags(&argc, &argv, true);

0 commit comments

Comments
 (0)