diff --git a/tc/core/cuda/cuda.cc b/tc/core/cuda/cuda.cc
index 108e058cd..57baa2ed7 100644
--- a/tc/core/cuda/cuda.cc
+++ b/tc/core/cuda/cuda.cc
@@ -30,7 +30,8 @@ DEFINE_bool(use_nvprof, false, "Start / stop nvprof");

 namespace {

-std::tuple<std::vector<std::string>, std::vector<size_t>> init() {
+std::tuple<std::vector<std::string>, std::vector<size_t>, std::vector<size_t>>
+init() {
   int deviceCount = 0;
   auto err_id = cudaGetDeviceCount(&deviceCount);
   if (err_id == 35 or err_id == 30) {
@@ -44,14 +45,16 @@ std::tuple<std::vector<std::string>, std::vector<size_t>> init() {
   }
   std::vector<std::string> gpuNames;
   std::vector<size_t> sharedMemSizes;
+  std::vector<size_t> registersPerBlock;
   gpuNames.reserve(deviceCount);
   for (int i = 0; i < deviceCount; ++i) {
     cudaDeviceProp deviceProp;
     TC_CUDA_RUNTIMEAPI_ENFORCE(cudaGetDeviceProperties(&deviceProp, i));
     gpuNames.emplace_back(deviceProp.name);
     sharedMemSizes.emplace_back(deviceProp.sharedMemPerBlock);
+    registersPerBlock.emplace_back(deviceProp.regsPerBlock);
   }
-  return std::make_tuple(gpuNames, sharedMemSizes);
+  return std::make_tuple(gpuNames, sharedMemSizes, registersPerBlock);
 }

 } // namespace
@@ -61,8 +64,8 @@ CudaGPUInfo& CudaGPUInfo::GPUInfo() {
   static thread_local bool inited = false;
   if (!inited) {
     auto infos = init();
-    pInfo = std::unique_ptr<CudaGPUInfo>(
-        new CudaGPUInfo(std::get<0>(infos), std::get<1>(infos)));
+    pInfo = std::unique_ptr<CudaGPUInfo>(new CudaGPUInfo(
+        std::get<0>(infos), std::get<1>(infos), std::get<2>(infos)));
     inited = true;
   }
   return *pInfo;
@@ -102,4 +105,11 @@ size_t CudaGPUInfo::SharedMemorySize() const {
   }
   return sharedMemSizes_.at(CurrentGPUId());
 }
+
+size_t CudaGPUInfo::RegistersPerBlock() const {
+  if (NumberGPUs() == 0) {
+    return 0; // no registers if no GPUs
+  }
+  return registersPerBlock_.at(CurrentGPUId());
+}
 } // namespace tc
diff --git a/tc/core/cuda/cuda.h b/tc/core/cuda/cuda.h
index a9fe1383a..fa5e68b98 100644
--- a/tc/core/cuda/cuda.h
+++ b/tc/core/cuda/cuda.h
@@ -98,8 +98,11 @@ struct WithCudaDevice {
 class CudaGPUInfo {
   CudaGPUInfo(
       const std::vector<std::string>& gpuNames,
-      const std::vector<size_t>& sharedMemSizes)
-      : gpuNames_(gpuNames), sharedMemSizes_(sharedMemSizes) {}
+      const std::vector<size_t>& sharedMemSizes,
+      const std::vector<size_t>& registersPerBlock)
+      : gpuNames_(gpuNames),
+        sharedMemSizes_(sharedMemSizes),
+        registersPerBlock_(registersPerBlock) {}

  public:
   static CudaGPUInfo& GPUInfo();
@@ -112,9 +115,11 @@ class CudaGPUInfo {
   std::string GetGPUName(int id = -1) const;
   std::string getCudaDeviceStr() const;
   size_t SharedMemorySize() const;
+  size_t RegistersPerBlock() const;

   std::vector<std::string> gpuNames_;
   std::vector<size_t> sharedMemSizes_;
+  std::vector<size_t> registersPerBlock_;
 };

 struct CudaProfiler {
diff --git a/tc/core/cuda/cuda_mapping_options.cc b/tc/core/cuda/cuda_mapping_options.cc
index 09d7edf8c..ba911aa90 100644
--- a/tc/core/cuda/cuda_mapping_options.cc
+++ b/tc/core/cuda/cuda_mapping_options.cc
@@ -299,6 +299,11 @@ CudaMappingOptions& CudaMappingOptions::sharedDepth(uint32_t depth) {
   return *this;
 }

+CudaMappingOptions& CudaMappingOptions::maxPrivateElements(uint64_t nElements) {
+  ownedProto_.set_max_private_elements(nElements);
+  return *this;
+}
+
 CudaMappingOptions& CudaMappingOptions::mapToThreads(
     const std::string& commaSeparatedSizes) {
   auto sizes = parseCommaSeparatedIntegers<uint64_t>(commaSeparatedSizes);
diff --git a/tc/core/cuda/cuda_mapping_options.h b/tc/core/cuda/cuda_mapping_options.h
index aa8530307..ab6ce5e11 100644
--- a/tc/core/cuda/cuda_mapping_options.h
+++ b/tc/core/cuda/cuda_mapping_options.h
@@ -197,6 +197,7 @@ class CudaMappingOptions {
   CudaMappingOptions& useReadOnlyCache(bool b);
   CudaMappingOptions& privateDepth(uint32_t depth);
   CudaMappingOptions& sharedDepth(uint32_t depth);
+  CudaMappingOptions& maxPrivateElements(uint64_t nElements);
   ///@}

   /// Static constructors for predefined strategies.
diff --git a/tc/core/cuda/cuda_mapping_options_cpp_printer.cc b/tc/core/cuda/cuda_mapping_options_cpp_printer.cc
index 9ffa95bcc..a223fcb80 100644
--- a/tc/core/cuda/cuda_mapping_options_cpp_printer.cc
+++ b/tc/core/cuda/cuda_mapping_options_cpp_printer.cc
@@ -40,6 +40,10 @@ CudaMappingOptionsCppPrinter& operator<<(
   }
   prn.printValueOption("privateDepth", cudaOptions.proto().private_depth());
   prn.printValueOption("sharedDepth", cudaOptions.proto().shared_depth());
+  if (cudaOptions.proto().has_max_private_elements()) {
+    prn.printValueOption(
+        "maxPrivateElements", cudaOptions.proto().max_private_elements());
+  }
   prn.endStmt();
   return prn;
 }
diff --git a/tc/core/gpu.h b/tc/core/gpu.h
index 6846304fd..06c8d680d 100644
--- a/tc/core/gpu.h
+++ b/tc/core/gpu.h
@@ -36,4 +36,15 @@ inline size_t querySharedMemorySize() {
 #endif
 }

+/// Get the maximum number of registers per block provided by the GPU device
+/// active in the current thread. The call is forwarded to the GPU driver.
+/// If the thread has no associated GPU, return 0.
+inline size_t queryRegistersPerBlock() {
+#if TC_WITH_CUDA && !defined(NO_CUDA_SDK)
+  return CudaGPUInfo::GPUInfo().RegistersPerBlock();
+#else
+  return 0;
+#endif
+}
+
 } // namespace tc
diff --git a/tc/core/polyhedral/cuda/mapped_scop.cc b/tc/core/polyhedral/cuda/mapped_scop.cc
index b0b129cbc..faa7d9361 100644
--- a/tc/core/polyhedral/cuda/mapped_scop.cc
+++ b/tc/core/polyhedral/cuda/mapped_scop.cc
@@ -1086,7 +1086,14 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(

   // 9. Promote to registers below the loops mapped to threads.
   if (cudaOptions.proto().use_private_memory()) {
-    promoteToRegistersAtDepth(*mappedScop, cudaOptions.proto().private_depth());
+    auto blockSizes = cudaOptions.block.extractVector();
+    auto nThreadsPerBlock = std::accumulate(
+        blockSizes.begin(), blockSizes.end(), 1, std::multiplies<size_t>());
+    auto nElementsPerThread = cudaOptions.proto().has_max_private_elements()
+        ? cudaOptions.proto().max_private_elements()
+        : queryRegistersPerBlock() / nThreadsPerBlock;
+    promoteToRegistersAtDepth(
+        *mappedScop, cudaOptions.proto().private_depth(), nElementsPerThread);
   }

   LOG_IF(INFO, FLAGS_debug_tc_mapper)
diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
index b1bca6923..f73246666 100644
--- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
+++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -446,6 +446,51 @@ bool isInThreadMappedScope(
   return false;
 }

+static std::vector<std::pair<isl::id, TensorGroupsInfo>> sortTensorGroupMap(
+    TensorGroups&& groupMap) {
+  // Prepare groups for sorting, to have specified order necessary for
+  // reproducibility and tests.
+  using TensorGroupList = std::pair<isl::id, TensorGroupsInfo>;
+  std::vector<TensorGroupList> groupLists(
+      std::make_move_iterator(groupMap.begin()),
+      std::make_move_iterator(groupMap.end()));
+
+  // Computes the total number of references in all groups.
+  auto refsCount = [](const TensorGroupsInfo& info) {
+    size_t refs = 0;
+    for (auto const& group : info) {
+      refs += group->referenceIds().size();
+    }
+    return refs;
+  };
+
+  // Sort by the total number of references, then by name. Because names are
+  // guaranteed to be unique, the order is total.
+  std::sort(
+      groupLists.begin(),
+      groupLists.end(),
+      [refsCount](const TensorGroupList& l1, const TensorGroupList& l2) {
+        auto r1 = refsCount(l1.second);
+        auto r2 = refsCount(l2.second);
+        return r1 == r2 ? l1.first.get_name() < l2.first.get_name() : r1 < r2;
+      });
+  return groupLists;
+}
+
+/* Sorts the given vector of tensor groups in place following the number of
+ * references in the group in decreasing order. This prioritizes groups with
+ * more references as they are more likely to benefit from promotion.
+ */
+static void sortTensorGroups(TensorGroupsInfo& tensorGroups) {
+  std::sort(
+      tensorGroups.begin(),
+      tensorGroups.end(),
+      [](const std::unique_ptr<TensorReferenceGroup>& group1,
+         const std::unique_ptr<TensorReferenceGroup>& group2) {
+        return group1->referenceIds().size() > group2->referenceIds().size();
+      });
+}
+
 /*
  * Promote to shared memory in "scop" below "node". Use at most
  * "remainingMemory" bytes, and update the variable to reflect the amount of
@@ -474,49 +519,14 @@ void promoteToSharedBelow(
   auto partialSched = partialSchedule(root, node);
   auto mapping = collectMappingsTo<mapping::BlockId>(scop);
-  auto groupMap = TensorReferenceGroup::accessedWithin(
-      partialSched.intersect_domain(mapping), scop.body);
+  auto groupLists = sortTensorGroupMap(TensorReferenceGroup::accessedWithin(
+      partialSched.intersect_domain(mapping), scop.body));
   // Pure affine schedule without (mapping) filters.
   auto partialSchedMupa = partialScheduleMupa(root, node);

-  // Prepare groups for sorting, to have specified order necessary for
-  // reproducibility and tests.
-  using TensorGroupList = std::pair<isl::id, TensorGroupsInfo>;
-  std::vector<TensorGroupList> groupLists(
-      std::make_move_iterator(groupMap.begin()),
-      std::make_move_iterator(groupMap.end()));
-
-  // Computes the total number of references in all groups.
-  auto refsCount = [](const TensorGroupsInfo& info) {
-    size_t refs = 0;
-    for (auto const& group : info) {
-      refs += group->referenceIds().size();
-    }
-    return refs;
-  };
-
-  // Sort by the total number of references, then by name. Because names are
-  // guarenteed to be unique, the order is total.
-  std::sort(
-      groupLists.begin(),
-      groupLists.end(),
-      [refsCount](const TensorGroupList& l1, const TensorGroupList& l2) {
-        auto r1 = refsCount(l1.second);
-        auto r2 = refsCount(l2.second);
-        return r1 == r2 ? l1.first.get_name() < l2.first.get_name() : r1 < r2;
-      });

   for (auto& tensorGroups : groupLists) {
     auto tensorId = tensorGroups.first;
-    // Sort the reference groups to prioritize groups with more references as
-    // they are more likely to benefit from promotion.
-    std::sort(
-        tensorGroups.second.begin(),
-        tensorGroups.second.end(),
-        [refsCount](
-            const std::unique_ptr<TensorReferenceGroup>& group1,
-            const std::unique_ptr<TensorReferenceGroup>& group2) {
-          return group1->referenceIds().size() > group2->referenceIds().size();
-        });
+    sortTensorGroups(tensorGroups.second);

     for (auto& group : tensorGroups.second) {
       auto sizes = group->approximationSizes();
@@ -620,8 +630,17 @@ void promoteToSharedAtDepth(
  * of "mscop". Throw if promotion would violate the well-formedness of the
  * schedule tree, in particular in cases of promotion immediately below
  * a set/sequence node or immediately above a thread-specific marker node.
+ * Promote at most "maxElements" elements per thread and return the difference
+ * between "maxElements" and the number of actually promoted elements. Note
+ * that this function does not differentiate types and sizes of the promoted
+ * elements because register allocation cannot be controlled at the CUDA level
+ * anyway. Instead, the "maxElements" value controls how much register
+ * promotion is performed overall.
  */
-void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
+size_t promoteToRegistersBelow(
+    MappedScop& mscop,
+    detail::ScheduleTree* scope,
+    size_t maxElements) {
   // Cannot promote below a sequence or a set node. Promotion may insert an
   // extension node, but sequence/set must be followed by filters.
   if (scope->as<detail::ScheduleTreeSequence>() ||
@@ -646,8 +665,8 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
   auto mapping =
       collectMappingsTo<mapping::ThreadId>(scop).intersect(blockMapping);
   auto schedule = partialSchedule(scop.scheduleRoot(), scope);
-  auto groupMap = TensorReferenceGroup::accessedWithin(
-      schedule.intersect_domain(mapping), scop.body);
+  auto groupLists = sortTensorGroupMap(TensorReferenceGroup::accessedWithin(
+      schedule.intersect_domain(mapping), scop.body));

   auto threadSchedule = mscop.threadMappingSchedule(mscop.schedule());
   auto blockSchedule = mscop.blockMappingSchedule(mscop.schedule());
@@ -663,10 +682,9 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
   // identical dimensions without affecting the result of the checks.
   partialSchedMupa = partialSchedMupa.flat_range_product(blockSchedule);

-  for (auto& tensorGroups : groupMap) {
+  for (auto& tensorGroups : groupLists) {
     auto tensorId = tensorGroups.first;
-
-    // TODO: sorting of groups and counting the number of promoted elements
+    sortTensorGroups(tensorGroups.second);

     for (auto& group : tensorGroups.second) {
       auto sizes = group->approximationSizes();
@@ -674,6 +692,12 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
       if (sizes.size() == 0) {
        continue;
      }
+      // Do not promote if the group requires more registers than remaining.
+      auto nElements = std::accumulate(
+          sizes.begin(), sizes.end(), 1u, std::multiplies<size_t>());
+      if (nElements > maxElements) {
+        continue;
+      }
       if (!isPromotableToRegistersBelow(
               *group, root, scope, partialSchedMupa, threadSchedule)) {
         continue;
       }
@@ -693,13 +717,14 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
           std::move(group),
           scope,
           partialSched);
+      maxElements -= nElements;
     }
   }

   // Return immediately if nothing was promoted.
   if (scope->numChildren() == 0 ||
       !matchOne(extension(sequence(any())), scope->child({0}))) {
-    return;
+    return maxElements;
   }

   // If promoting above thread mapping, insert synchronizations.
@@ -715,15 +740,19 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
   if (functional::Filter(isMappingTo<mapping::ThreadId>, ancestors).empty()) {
     scop.insertSyncsAroundSeqChildren(scope->child({0, 0}));
   }
+  return maxElements;
 }

 /*
  * Promote to registers below "depth" schedule dimensions. Split bands if
  * necessary to create promotion scopes. Do not promote if it would require
  * splitting the band mapped to threads as we assume only one band can be
- * mapped.
+ * mapped. Use at most "maxElements" per thread in all promoted subtrees.
  */
-void promoteToRegistersAtDepth(MappedScop& mscop, size_t depth) {
+void promoteToRegistersAtDepth(
+    MappedScop& mscop,
+    size_t depth,
+    size_t maxElements) {
   using namespace detail;

   auto root = mscop.scop().scheduleRoot();
@@ -757,7 +786,7 @@ void promoteToRegistersAtDepth(MappedScop& mscop, size_t depth) {
   auto scopes = functional::Map(findScope, bands);

   for (auto scope : scopes) {
-    promoteToRegistersBelow(mscop, scope);
+    maxElements = promoteToRegistersBelow(mscop, scope, maxElements);
   }
 }

diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.h b/tc/core/polyhedral/cuda/memory_promotion_heuristic.h
index 2dc264949..fcc6dfdb6 100644
--- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.h
+++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.h
@@ -41,9 +41,15 @@ void promoteToSharedAtDepth(
     std::size_t sharedMemorySize,
     bool unrollCopies);

-void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope);
+size_t promoteToRegistersBelow(
+    MappedScop& mscop,
+    detail::ScheduleTree* scope,
+    std::size_t maxElements = SIZE_MAX);

-void promoteToRegistersAtDepth(MappedScop& scop, std::size_t depth);
+void promoteToRegistersAtDepth(
+    MappedScop& scop,
+    std::size_t depth,
+    std::size_t maxElements = SIZE_MAX);

 } // namespace cuda
 } // namespace polyhedral
diff --git a/tc/proto/mapping_options.proto b/tc/proto/mapping_options.proto
index 8beaf46dc..52301f5a2 100644
--- a/tc/proto/mapping_options.proto
+++ b/tc/proto/mapping_options.proto
@@ -74,6 +74,9 @@ message CudaMappingOptionsProto {
   optional uint32 private_depth = 9;
   // Depth of promotion to shared memory, ignored if use_shared_memory is false.
   optional uint32 shared_depth = 10;
+  // Maximum number of elements to promote to registers per thread. If not
+  // provided, the number of 32-bit registers per thread will be used.
+  optional uint64 max_private_elements = 11;
 }

 message CpuMappingOptionsProto {
diff --git a/tensor_comprehensions/pybinds/tclib.cc b/tensor_comprehensions/pybinds/tclib.cc
index a18fb4ca7..3062c21e4 100644
--- a/tensor_comprehensions/pybinds/tclib.cc
+++ b/tensor_comprehensions/pybinds/tclib.cc
@@ -672,6 +672,11 @@ PYBIND11_MODULE(tclib, m) {
           "usePrivateMemory",
           &tc::CudaMappingOptions::usePrivateMemory,
           "Create thread-local copies of data in private memory")
+      .def(
+          "maxPrivateElements",
+          &tc::CudaMappingOptions::maxPrivateElements,
+          "The maximum number of elements per thread for which thread-local "
+          "copies are created")
       .def(
           "unrollCopyShared",
           &tc::CudaMappingOptions::unrollCopyShared,
diff --git a/test/test_cuda_mapper_memory_promotion.cc b/test/test_cuda_mapper_memory_promotion.cc
index 0fb7405b8..51a2fa057 100644
--- a/test/test_cuda_mapper_memory_promotion.cc
+++ b/test/test_cuda_mapper_memory_promotion.cc
@@ -539,7 +539,8 @@ TEST_F(MatMulBias, RegisterPromotion) {
                             .tile(32, 32, 32)
                             .privateDepth(5)
                             .useSharedMemory(false)
-                            .usePrivateMemory(true);
+                            .usePrivateMemory(true)
+                            .maxPrivateElements(100);
   auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);

   auto declPos = code.find("float _O_0");
@@ -567,7 +568,8 @@ TEST_F(MatMulBias, RegisterPromotionSharedPreference) {
                             .tile(32, 32, 32)
                             .maxSharedMemory(32768)
                             .useSharedMemory(true)
-                            .usePrivateMemory(true);
+                            .usePrivateMemory(true)
+                            .maxPrivateElements(100);

   auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);

@@ -587,7 +589,7 @@ TEST_F(MatMulBias, RegistersAtRoot) {
                             .usePrivateMemory(false);

   auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
-  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot());
+  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 4);
   auto code = emitCode(mscop);

   // Expecting 4 elements because we map the loop i in O[i][j] to 8 threads
@@ -595,6 +597,27 @@
   expectFourOElementsPromoted(code);
 }

+TEST_F(MatMulBias, RegistersAtRootNotEnoughAvailable) {
+  // Disable automatic promotion to registers because we are going to call it
+  // manually. Require sufficient unrolling to actually hit registers.
+  auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions()
+                            .unroll(512)
+                            .useSharedMemory(false)
+                            .usePrivateMemory(false);
+
+  auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
+  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 3);
+  auto code = emitCode(mscop);
+
+  // Not expecting O to be promoted because 4 elements must be promoted and
+  // only 3 were indicated as available in promoteToRegistersBelow.
+  auto oDeclPos = code.find("float _O_0;");
+  EXPECT_TRUE(oDeclPos == std::string::npos)
+      << "not expected O to be promoted to registers";
+
+  expectNoABCPromotion(code);
+}
+
 TEST_F(MatMulBias, RegistersAtRootNotEnoughUnroll) {
   // Disable automatic promotion to registers because we are going to call it
   // manually. Require no unrolling so as to make promotion to registers
@@ -605,7 +628,7 @@
                             .usePrivateMemory(false);

   auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
-  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot());
+  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 100);
   auto code = emitCode(mscop);

   auto oDeclPos = code.find("float _O_0;");
@@ -631,7 +654,7 @@ TEST_F(MatMulBias, RegistersBelowFirstBand) {
       mscop->scop().scheduleRoot(), ScheduleTreeType::Band);
   ASSERT_GT(nodes.size(), 0u);
   auto node = nodes[0];
-  promoteToRegistersBelow(*mscop, node);
+  promoteToRegistersBelow(*mscop, node, 100);
   auto code = emitCode(mscop);

   // Expecting 4 elements because we map the loop i in O[i][j] to 8 threads
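
Usage note (editor's illustrative sketch, not part of the patch): with usePrivateMemory(true), the mapper now promotes at most maxPrivateElements elements per thread to registers; when the option is unset, it falls back to the device's registers per block divided by the number of threads per block (queryRegistersPerBlock() / nThreadsPerBlock). A minimal C++ sketch of setting the option through the builder API added above follows; the cap of 64 is an arbitrary placeholder value.

#include "tc/core/cuda/cuda_mapping_options.h"

// Cap register promotion at 64 elements per thread (placeholder value);
// leaving maxPrivateElements unset lets the mapper derive the cap from
// the device's regsPerBlock divided by the block size.
auto options = tc::CudaMappingOptions::makeNaiveMappingOptions()
                   .useSharedMemory(false)
                   .usePrivateMemory(true)
                   .privateDepth(5)
                   .maxPrivateElements(64);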