promoteToRegistersBelow: sort tensor reference groups

ftynse · ftynse · commit 1e5ad912b951 · 2018-07-26T10:32:56.000+02:00
Follow the same strategy as with shared memory promotion: first, sort
tensors in decreasing order of the total number of references; then, for
each tensor, sort its groups based on the number of references in each
group.  Tensor groups with more references are expected to benefit more
from promotion as more global memory accesses may be avoided thanks to
explicit caching in faster layers of the memory hierarchy.  Note that
since there is no limit on the number of registers to use, all groups
that can be promoted into registers are promoted, and the sorting has no
effect on the outcome.  Such a limit will be introduced next.
diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -656,8 +656,8 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
   auto mapping =
       collectMappingsTo<mapping::ThreadId>(scop).intersect(blockMapping);
   auto schedule = partialSchedule(scop.scheduleRoot(), scope);
-  auto groupMap = TensorReferenceGroup::accessedWithin(
-      schedule.intersect_domain(mapping), scop.body);
+  auto groupLists = sortTensorGroupMap(TensorReferenceGroup::accessedWithin(
+      schedule.intersect_domain(mapping), scop.body));
 
   auto threadSchedule = mscop.threadMappingSchedule(mscop.schedule());
   auto blockSchedule = mscop.blockMappingSchedule(mscop.schedule());
@@ -673,10 +673,10 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
   // identical dimensions without affecting the result of the checks.
   partialSchedMupa = partialSchedMupa.flat_range_product(blockSchedule);
 
-  for (auto& tensorGroups : groupMap) {
+  for (auto& tensorGroups : groupLists) {
     auto tensorId = tensorGroups.first;
-
-    // TODO: sorting of groups and counting the number of promoted elements
+    sortTensorGroups(tensorGroups.second);
+    // TODO: counting the number of promoted elements
 
     for (auto& group : tensorGroups.second) {
       auto sizes = group->approximationSizes();