This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 74d3e85

cuda::MappedScop: introduce maxPrivateElements mapping option
This mapping option controls the maximum number of elements per thread that are promoted to private memory (ideally registers, although this cannot be guaranteed at the CUDA level). The value is optional in the protocol buffers. When it is not provided, query the number of registers per block from the CUDA device properties and divide it by the number of threads in the block to obtain the per-thread limit.

Note that using all registers in a single block will likely limit the occupancy of SMs, potentially degrading performance. This effect is the primary motivation for introducing the limit: it lets the caller require the mapper to use fewer registers, potentially increasing occupancy. Since register allocation is performed by the downstream compiler, this option is only a recommendation and is expressed in terms of (untyped) elements rather than actual registers. At the CUDA level, it would be impossible to account for all the registers required by the main computation (that is, the registers needed to hold data loaded from memory during operations), which also contribute to the register pressure of the kernel.

Although limiting the number of promoted elements to the number of registers available per thread may seem too constraining for occupancy, it is strictly better than the current approach, where we may promote even more elements, which then get spilled to slow local memory.
1 parent c200a4e commit 74d3e85
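For context, a minimal caller-side sketch of how the new option composes with the existing CudaMappingOptions builder API; the helper function name and the particular values are illustrative only, not part of this commit:

#include "tc/core/cuda/cuda_mapping_options.h"

// Hypothetical example: cap register promotion at 100 elements per thread.
tc::CudaMappingOptions makeOptionsWithRegisterBudget() {
  return tc::CudaMappingOptions::makeNaiveMappingOptions()
      .useSharedMemory(false)
      .usePrivateMemory(true)    // enable promotion to private memory
      .privateDepth(5)           // schedule depth at which promotion happens
      .maxPrivateElements(100);  // new option: per-thread element budget
}

If maxPrivateElements() is never called, the mapper derives the budget from the number of registers per block reported by the device divided by the number of threads per block, as shown in the mapped_scop.cc change below.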

File tree

8 files changed: +54 -7 lines changed


tc/core/cuda/cuda_mapping_options.cc

Lines changed: 5 additions & 0 deletions
@@ -299,6 +299,11 @@ CudaMappingOptions& CudaMappingOptions::sharedDepth(uint32_t depth) {
   return *this;
 }
 
+CudaMappingOptions& CudaMappingOptions::maxPrivateElements(uint64_t nElements) {
+  ownedProto_.set_max_private_elements(nElements);
+  return *this;
+}
+
 CudaMappingOptions& CudaMappingOptions::mapToThreads(
     const std::string& commaSeparatedSizes) {
   auto sizes = parseCommaSeparatedIntegers<uint64_t>(commaSeparatedSizes);

tc/core/cuda/cuda_mapping_options.h

Lines changed: 1 addition & 0 deletions
@@ -197,6 +197,7 @@ class CudaMappingOptions {
   CudaMappingOptions& useReadOnlyCache(bool b);
   CudaMappingOptions& privateDepth(uint32_t depth);
   CudaMappingOptions& sharedDepth(uint32_t depth);
+  CudaMappingOptions& maxPrivateElements(uint64_t nElements);
   ///@}
 
   /// Static constructors for predefined strategies.

tc/core/cuda/cuda_mapping_options_cpp_printer.cc

Lines changed: 4 additions & 0 deletions
@@ -40,6 +40,10 @@ CudaMappingOptionsCppPrinter& operator<<(
   }
   prn.printValueOption("privateDepth", cudaOptions.proto().private_depth());
   prn.printValueOption("sharedDepth", cudaOptions.proto().shared_depth());
+  if (cudaOptions.proto().has_max_private_elements()) {
+    prn.printValueOption(
+        "maxPrivateElements", cudaOptions.proto().max_private_elements());
+  }
   prn.endStmt();
   return prn;
 }

tc/core/polyhedral/cuda/mapped_scop.cc

Lines changed: 8 additions & 1 deletion
@@ -1086,7 +1086,14 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
 
   // 9. Promote to registers below the loops mapped to threads.
   if (cudaOptions.proto().use_private_memory()) {
-    promoteToRegistersAtDepth(*mappedScop, cudaOptions.proto().private_depth());
+    auto blockSizes = cudaOptions.block.extractVector();
+    auto nThreadsPerBlock = std::accumulate(
+        blockSizes.begin(), blockSizes.end(), 1, std::multiplies<size_t>());
+    auto nElementsPerThread = cudaOptions.proto().has_max_private_elements()
+        ? cudaOptions.proto().max_private_elements()
+        : queryRegistersPerBlock() / nThreadsPerBlock;
+    promoteToRegistersAtDepth(
+        *mappedScop, cudaOptions.proto().private_depth(), nElementsPerThread);
   }
 
   LOG_IF(INFO, FLAGS_debug_tc_mapper)
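As a worked example of the fallback arithmetic above, a small standalone sketch; the 65536 registers-per-block figure and the 32x16 block are assumptions standing in for whatever queryRegistersPerBlock() and the mapping options report on a given device:

#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // Assumed device limit; in the mapper this value comes from
  // queryRegistersPerBlock() (CUDA exposes it as cudaDeviceProp::regsPerBlock).
  const size_t registersPerBlock = 65536;

  // Assume the kernel is mapped to a 32x16 thread block.
  std::vector<size_t> blockSizes = {32, 16};
  auto nThreadsPerBlock = std::accumulate(
      blockSizes.begin(), blockSizes.end(), size_t(1), std::multiplies<size_t>());

  // Default per-thread element budget when max_private_elements is unset:
  // 65536 / 512 = 128 elements per thread.
  std::cout << registersPerBlock / nThreadsPerBlock << std::endl;
  return 0;
}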

tc/core/polyhedral/cuda/memory_promotion_heuristic.cc

Lines changed: 0 additions & 1 deletion
@@ -685,7 +685,6 @@ size_t promoteToRegistersBelow(
   for (auto& tensorGroups : groupLists) {
     auto tensorId = tensorGroups.first;
     sortTensorGroups(tensorGroups.second);
-    // TODO: counting the number of promoted elements
 
     for (auto& group : tensorGroups.second) {
       auto sizes = group->approximationSizes();

tc/proto/mapping_options.proto

Lines changed: 3 additions & 0 deletions
@@ -74,6 +74,9 @@ message CudaMappingOptionsProto {
   optional uint32 private_depth = 9;
   // Depth of promotion to shared memory, ignored if use_shared_memory is false.
   optional uint32 shared_depth = 10;
+  // Maximum number of elements to promote to registers per thread. If not
+  // provided, the number of 32-bit registers per thread will be used.
+  optional uint64 max_private_elements = 11;
 }
 
 message CpuMappingOptionsProto {

tensor_comprehensions/pybinds/tclib.cc

Lines changed: 5 additions & 0 deletions
@@ -672,6 +672,11 @@ PYBIND11_MODULE(tclib, m) {
           "usePrivateMemory",
           &tc::CudaMappingOptions::usePrivateMemory,
           "Create thread-local copies of data in private memory")
+      .def(
+          "maxPrivateElements",
+          &tc::CudaMappingOptions::maxPrivateElements,
+          "The maximum number of elements per thread for which thread-local "
+          "copies are created")
       .def(
           "unrollCopyShared",
           &tc::CudaMappingOptions::unrollCopyShared,

test/test_cuda_mapper_memory_promotion.cc

Lines changed: 28 additions & 5 deletions
@@ -539,7 +539,8 @@ TEST_F(MatMulBias, RegisterPromotion) {
           .tile(32, 32, 32)
           .privateDepth(5)
           .useSharedMemory(false)
-          .usePrivateMemory(true);
+          .usePrivateMemory(true)
+          .maxPrivateElements(100);
 
   auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
   auto declPos = code.find("float _O_0");
@@ -567,7 +568,8 @@ TEST_F(MatMulBias, RegisterPromotionSharedPreference) {
           .tile(32, 32, 32)
           .maxSharedMemory(32768)
           .useSharedMemory(true)
-          .usePrivateMemory(true);
+          .usePrivateMemory(true)
+          .maxPrivateElements(100);
 
   auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
 
@@ -587,14 +589,35 @@ TEST_F(MatMulBias, RegistersAtRoot) {
           .usePrivateMemory(false);
 
   auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
-  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot());
+  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 4);
   auto code = emitCode(mscop);
 
   // Expecting 4 elements because we map the loop i in O[i][j] to 8 threads
   // after tiling by 32.
   expectFourOElementsPromoted(code);
 }
 
+TEST_F(MatMulBias, RegistersAtRootNotEnoughAvailable) {
+  // Disable automatic promotion to registers because we are going to call it
+  // manually. Require sufficient unrolling to actually hit registers.
+  auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions()
+                            .unroll(512)
+                            .useSharedMemory(false)
+                            .usePrivateMemory(false);
+
+  auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
+  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 3);
+  auto code = emitCode(mscop);
+
+  // Not expecting O to be promoted because 4 elements must be promoted and
+  // only 3 were indicated as available in promoteToRegistersBelow.
+  auto oDeclPos = code.find("float _O_0;");
+  EXPECT_TRUE(oDeclPos == std::string::npos)
+      << "not expected O to be promoted to registers";
+
+  expectNoABCPromotion(code);
+}
+
 TEST_F(MatMulBias, RegistersAtRootNotEnoughUnroll) {
   // Disable automatic promotion to registers because we are going to call it
   // manually. Require no unrolling so as to make promotion to registers
@@ -605,7 +628,7 @@ TEST_F(MatMulBias, RegistersAtRootNotEnoughUnroll) {
           .usePrivateMemory(false);
 
   auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
-  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot());
+  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 100);
   auto code = emitCode(mscop);
   auto oDeclPos = code.find("float _O_0;");
 
@@ -631,7 +654,7 @@ TEST_F(MatMulBias, RegistersBelowFirstBand) {
       mscop->scop().scheduleRoot(), ScheduleTreeType::Band);
   ASSERT_GT(nodes.size(), 0u);
   auto node = nodes[0];
-  promoteToRegistersBelow(*mscop, node);
+  promoteToRegistersBelow(*mscop, node, 100);
   auto code = emitCode(mscop);
 
   // Expecting 4 elements because we map the loop i in O[i][j] to 8 threads
