partialTargetTiles: ensure that tiling schedule applies to entire domain

nicolasvasilache · Sven Verdoolaege · commit 4fa54d56aecf · 2018-05-10T14:26:23.000+02:00
The tiling schedule should be total on the domain.
Otherwise, it may drop some domain elements from consideration,
skewing the result of the function.
In the extreme case, the domain of the schedule is disjoint from
the domain, resulting in the perceived absence of any tiles and
therefore also of partial tiles.
This would cause the new test case to fail.

The problem is that in infixScheduleMupa, the combination
of a zero-dimensional isl::multi_union_pw_aff with explicit domain
(the initial value) and a zero-dimensional isl::multi_union_pw_aff
without explicit domain (the one from a band inserted by
ScheduleTree::makeEmptyBand) would result in
a zero-dimensional isl::multi_union_pw_aff with an _empty_ domain.
Arguably, this is a bug in isl.
Work around this issue by setting an explicit domain
on the partial schedule of the band created by ScheduleTree::makeEmptyBand.
This explicit domain is also more in line with the rest
of the code base and allows the partial schedule to be converted
to an isl::union_map directly, if needed.
diff --git a/tc/core/polyhedral/schedule_tree.cc b/tc/core/polyhedral/schedule_tree.cc
@@ -212,7 +212,8 @@ ScheduleTreeUPtr ScheduleTree::makeEmptyBand(const ScheduleTree* root) {
   auto domain = root->elemAs<ScheduleTreeElemDomain>();
   CHECK(domain);
   auto space = domain->domain_.get_space().set_from_params();
-  auto zero = isl::multi_union_pw_aff::zero(space);
+  auto mv = isl::multi_val::zero(space);
+  auto zero = isl::multi_union_pw_aff(domain->domain_, mv);
   return ScheduleTree::makeBand(zero);
 }
 
diff --git a/tc/core/polyhedral/separation.cc b/tc/core/polyhedral/separation.cc
@@ -39,6 +39,7 @@ isl::union_set partialTargetTiles(
   // Mapping between prefix values and target values
   // for some common domain element
   // P -> T
+  CHECK(domain.is_subset(scheduleMap.domain()));
   auto target = domain.apply(scheduleMap).unwrap();
   // Mapping between prefix values and target values
   // for some common domain element, extended to complete target tiles.
diff --git a/test/cuda/test_tc_mapper_bugs.cc b/test/cuda/test_tc_mapper_bugs.cc
@@ -774,6 +774,65 @@ TEST(Convolution, NestedExpressions) {
   CHECK_EQ(at::Scalar(B[10]).toFloat(), 1);
 }
 
+// Previous versions of TC would map the reduction in the code below
+// to CUB, despite the fact that not every thread gets assigned
+// an instance of the reduction.
+// Check that this no longer happens.
+TEST(GroupNorm, ReductionDeadlockBug) {
+  auto group_norm = "group_norm";
+  auto TC = std::string(R"TC(
+def group_norm(
+    float(N, G, D, H, W) I, float(G, D) gamma, float(G, D) beta)
+    -> (O, mean, var)
+{
+    mean(n, g) +=! I(n, g, r_d, r_h, r_w)
+    var(n, g) +=! I(n, g, r_d, r_h, r_w) * I(n, g, r_d, r_h, r_w)
+    O(n, g, d, h, w) = gamma(g, d)
+      * ( I(n, g, d, h, w) - mean(n, g) * 1.0 )
+      * rsqrt( var(n, g) * 1.0
+            - mean(n, g) * mean(n, g) * 1.0 * 1.0
+            + 1e-5)
+      + beta(g, d)
+}
+  )TC");
+
+  uint32_t N = 4, C = 8, G = 4, D = C / G, H = 6, W = 6;
+  at::Tensor I = at::CUDA(at::kFloat).rand({N, G, D, H, W});
+  at::Tensor gamma = at::CUDA(at::kFloat).rand({G, D}).fill_(1.0f);
+  at::Tensor beta = at::CUDA(at::kFloat).rand({G, D}).fill_(0.0f);
+  std::vector<at::Tensor> inputs = {I, gamma, beta};
+  auto options = tc::CudaMappingOptions::makeNaiveMappingOptions()
+                     .outerScheduleFusionStrategy(tc::FusionStrategy::Min)
+                     .outerScheduleAllowSkewing(false)
+                     .outerSchedulePositiveOrthant(true)
+                     .intraTileScheduleFusionStrategy(tc::FusionStrategy::Min)
+                     .intraTileScheduleAllowSkewing(false)
+                     .intraTileSchedulePositiveOrthant(true)
+                     .tile(2, 6, 8, 48)
+                     .unroll(4)
+                     .tileImperfectlyNested(false)
+                     .matchLibraryCalls(true)
+                     .mapToThreads(6, 12)
+                     .mapToBlocks(8)
+                     .useSharedMemory(true)
+                     .usePrivateMemory(true)
+                     .unrollCopyShared(false);
+  auto pExecutor =
+      tc::aten::compile<tc::CudaBackend>(TC, group_norm, inputs, options);
+  auto outputs = tc::aten::prepareOutputs(TC, group_norm, inputs);
+  tc::aten::run(*pExecutor, inputs, outputs);
+  cudaDeviceSynchronize();
+
+  auto v = I.view({N, G, -1});
+  auto mean = v.mean(-1, true);
+  auto var = v.var(-1, true).view({N, G, 1});
+  auto x = (v - mean) / (var + 1e-5f).sqrt();
+  auto y = x.view({N, G, D, H, W});
+  cudaDeviceSynchronize();
+
+  checkRtol(outputs[0] - y, {I}, D * H * W, 1e-6);
+}
+
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   ::gflags::ParseCommandLineFlags(&argc, &argv, true);

Original file line number	Diff line number	Diff line change
`@@ -212,7 +212,8 @@ ScheduleTreeUPtr ScheduleTree::makeEmptyBand(const ScheduleTree* root) {`
`212`	`212`	`auto domain = root->elemAs<ScheduleTreeElemDomain>();`
`213`	`213`	`CHECK(domain);`
`214`	`214`	`auto space = domain->domain_.get_space().set_from_params();`
`215`		`- auto zero = isl::multi_union_pw_aff::zero(space);`
	`215`	`+ auto mv = isl::multi_val::zero(space);`
	`216`	`+ auto zero = isl::multi_union_pw_aff(domain->domain_, mv);`
`216`	`217`	`return ScheduleTree::makeBand(zero);`
`217`	`218`	`}`
`218`	`219`