This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 74d3e85

cuda::MappedScop: introduce maxPrivateElements mapping option
This mapping option controls the maximum number of elements per thread that are promoted to private memory (ideally registers, although this cannot be guaranteed at the CUDA level). The value is optional in the protocol buffers. When it is not provided, query the number of registers per block from the CUDA device properties and divide it by the number of threads in the block to obtain the per-thread limit.

Note that using all registers in a single block will likely limit the occupancy of SMs, potentially degrading performance. This effect is the primary motivation for introducing the limit: it lets the caller require the mapper to use fewer registers, potentially increasing occupancy. Since register allocation is performed by the downstream compiler, this option is only a recommendation and is expressed in terms of (untyped) elements rather than actual registers. At the CUDA level, it would be impossible to account for all the registers required by the main computation (that is, the registers needed to hold data loaded from memory during operations), which also contribute to the register pressure of the kernel.

Although limiting the number of promoted elements to the number of registers available per thread may seem too constraining for occupancy, it is strictly better than the current approach, where we may promote even more elements, which then get spilled to slow local memory.
1 parent c200a4e commit 74d3e85
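For context, a minimal caller-side sketch of how the new option composes with the existing CudaMappingOptions builder API; the helper function name and the particular values are illustrative only, not part of this commit:

#include "tc/core/cuda/cuda_mapping_options.h"

// Hypothetical example: cap register promotion at 100 elements per thread.
tc::CudaMappingOptions makeOptionsWithRegisterBudget() {
  return tc::CudaMappingOptions::makeNaiveMappingOptions()
      .useSharedMemory(false)
      .usePrivateMemory(true)    // enable promotion to private memory
      .privateDepth(5)           // schedule depth at which promotion happens
      .maxPrivateElements(100);  // new option: per-thread element budget
}

If maxPrivateElements() is never called, the mapper derives the budget from the number of registers per block reported by the device divided by the number of threads per block, as shown in the mapped_scop.cc change below.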

File tree

8 files changed: +54 -7 lines changed


tc/core/cuda/cuda_mapping_options.cc

Lines changed: 5 additions & 0 deletions
@@ -299,6 +299,11 @@ CudaMappingOptions& CudaMappingOptions::sharedDepth(uint32_t depth) {
   return *this;
 }
 
+CudaMappingOptions& CudaMappingOptions::maxPrivateElements(uint64_t nElements) {
+  ownedProto_.set_max_private_elements(nElements);
+  return *this;
+}
+
 CudaMappingOptions& CudaMappingOptions::mapToThreads(
     const std::string& commaSeparatedSizes) {
   auto sizes = parseCommaSeparatedIntegers<uint64_t>(commaSeparatedSizes);

tc/core/cuda/cuda_mapping_options.h

Lines changed: 1 addition & 0 deletions
@@ -197,6 +197,7 @@ class CudaMappingOptions {
   CudaMappingOptions& useReadOnlyCache(bool b);
   CudaMappingOptions& privateDepth(uint32_t depth);
   CudaMappingOptions& sharedDepth(uint32_t depth);
+  CudaMappingOptions& maxPrivateElements(uint64_t nElements);
   ///@}
 
   /// Static constructors for predefined strategies.

tc/core/cuda/cuda_mapping_options_cpp_printer.cc

Lines changed: 4 additions & 0 deletions
@@ -40,6 +40,10 @@ CudaMappingOptionsCppPrinter& operator<<(
   }
   prn.printValueOption("privateDepth", cudaOptions.proto().private_depth());
   prn.printValueOption("sharedDepth", cudaOptions.proto().shared_depth());
+  if (cudaOptions.proto().has_max_private_elements()) {
+    prn.printValueOption(
+        "maxPrivateElements", cudaOptions.proto().max_private_elements());
+  }
   prn.endStmt();
   return prn;
 }

tc/core/polyhedral/cuda/mapped_scop.cc

Lines changed: 8 additions & 1 deletion
@@ -1086,7 +1086,14 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
 
   // 9. Promote to registers below the loops mapped to threads.
   if (cudaOptions.proto().use_private_memory()) {
-    promoteToRegistersAtDepth(*mappedScop, cudaOptions.proto().private_depth());
+    auto blockSizes = cudaOptions.block.extractVector();
+    auto nThreadsPerBlock = std::accumulate(
+        blockSizes.begin(), blockSizes.end(), 1, std::multiplies<size_t>());
+    auto nElementsPerThread = cudaOptions.proto().has_max_private_elements()
+        ? cudaOptions.proto().max_private_elements()
+        : queryRegistersPerBlock() / nThreadsPerBlock;
+    promoteToRegistersAtDepth(
+        *mappedScop, cudaOptions.proto().private_depth(), nElementsPerThread);
   }
 
   LOG_IF(INFO, FLAGS_debug_tc_mapper)
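As a worked example of the fallback arithmetic above, a small standalone sketch; the 65536 registers-per-block figure and the 32x16 block are assumptions standing in for whatever queryRegistersPerBlock() and the mapping options report on a given device:

#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // Assumed device limit; in the mapper this value comes from
  // queryRegistersPerBlock() (CUDA exposes it as cudaDeviceProp::regsPerBlock).
  const size_t registersPerBlock = 65536;

  // Assume the kernel is mapped to a 32x16 thread block.
  std::vector<size_t> blockSizes = {32, 16};
  auto nThreadsPerBlock = std::accumulate(
      blockSizes.begin(), blockSizes.end(), size_t(1), std::multiplies<size_t>());

  // Default per-thread element budget when max_private_elements is unset:
  // 65536 / 512 = 128 elements per thread.
  std::cout << registersPerBlock / nThreadsPerBlock << std::endl;
  return 0;
}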

tc/core/polyhedral/cuda/memory_promotion_heuristic.cc

Lines changed: 0 additions & 1 deletion
@@ -685,7 +685,6 @@ size_t promoteToRegistersBelow(
   for (auto& tensorGroups : groupLists) {
     auto tensorId = tensorGroups.first;
     sortTensorGroups(tensorGroups.second);
-    // TODO: counting the number of promoted elements
 
     for (auto& group : tensorGroups.second) {
       auto sizes = group->approximationSizes();

tc/proto/mapping_options.proto

Lines changed: 3 additions & 0 deletions
@@ -74,6 +74,9 @@ message CudaMappingOptionsProto {
   optional uint32 private_depth = 9;
   // Depth of promotion to shared memory, ignored if use_shared_memory is false.
   optional uint32 shared_depth = 10;
+  // Maximum number of elements to promote to registers per thread. If not
+  // provided, the number of 32-bit registers per thread will be used.
+  optional uint64 max_private_elements = 11;
 }
 
 message CpuMappingOptionsProto {

tensor_comprehensions/pybinds/tclib.cc

Lines changed: 5 additions & 0 deletions
@@ -672,6 +672,11 @@ PYBIND11_MODULE(tclib, m) {
           "usePrivateMemory",
           &tc::CudaMappingOptions::usePrivateMemory,
           "Create thread-local copies of data in private memory")
+      .def(
+          "maxPrivateElements",
+          &tc::CudaMappingOptions::maxPrivateElements,
+          "The maximum number of elements per thread for which thread-local "
+          "copies are created")
       .def(
           "unrollCopyShared",
           &tc::CudaMappingOptions::unrollCopyShared,

test/test_cuda_mapper_memory_promotion.cc

Lines changed: 28 additions & 5 deletions
@@ -539,7 +539,8 @@ TEST_F(MatMulBias, RegisterPromotion) {
           .tile(32, 32, 32)
           .privateDepth(5)
           .useSharedMemory(false)
-          .usePrivateMemory(true);
+          .usePrivateMemory(true)
+          .maxPrivateElements(100);
 
   auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
   auto declPos = code.find("float _O_0");
@@ -567,7 +568,8 @@ TEST_F(MatMulBias, RegisterPromotionSharedPreference) {
           .tile(32, 32, 32)
           .maxSharedMemory(32768)
           .useSharedMemory(true)
-          .usePrivateMemory(true);
+          .usePrivateMemory(true)
+          .maxPrivateElements(100);
 
   auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
 
@@ -587,14 +589,35 @@ TEST_F(MatMulBias, RegistersAtRoot) {
           .usePrivateMemory(false);
 
   auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
-  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot());
+  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 4);
   auto code = emitCode(mscop);
 
   // Expecting 4 elements because we map the loop i in O[i][j] to 8 threads
   // after tiling by 32.
   expectFourOElementsPromoted(code);
 }
 
+TEST_F(MatMulBias, RegistersAtRootNotEnoughAvailable) {
+  // Disable automatic promotion to registers because we are going to call it
+  // manually. Require sufficient unrolling to actually hit registers.
+  auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions()
+                            .unroll(512)
+                            .useSharedMemory(false)
+                            .usePrivateMemory(false);
+
+  auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
+  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 3);
+  auto code = emitCode(mscop);
+
+  // Not expecting O to be promoted because 4 elements must be promoted and
+  // only 3 were indicated as available in promoteToRegistersBelow.
+  auto oDeclPos = code.find("float _O_0;");
+  EXPECT_TRUE(oDeclPos == std::string::npos)
+      << "not expected O to be promoted to registers";
+
+  expectNoABCPromotion(code);
+}
+
 TEST_F(MatMulBias, RegistersAtRootNotEnoughUnroll) {
   // Disable automatic promotion to registers because we are going to call it
   // manually. Require no unrolling so as to make promotion to registers
@@ -605,7 +628,7 @@ TEST_F(MatMulBias, RegistersAtRootNotEnoughUnroll) {
           .usePrivateMemory(false);
 
   auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
-  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot());
+  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 100);
   auto code = emitCode(mscop);
   auto oDeclPos = code.find("float _O_0;");
 
@@ -631,7 +654,7 @@ TEST_F(MatMulBias, RegistersBelowFirstBand) {
       mscop->scop().scheduleRoot(), ScheduleTreeType::Band);
   ASSERT_GT(nodes.size(), 0u);
   auto node = nodes[0];
-  promoteToRegistersBelow(*mscop, node);
+  promoteToRegistersBelow(*mscop, node, 100);
   auto code = emitCode(mscop);
 
   // Expecting 4 elements because we map the loop i in O[i][j] to 8 threads
