diff --git a/tc/core/cuda/cuda.cc b/tc/core/cuda/cuda.cc
index 108e058cd..57baa2ed7 100644
--- a/tc/core/cuda/cuda.cc
+++ b/tc/core/cuda/cuda.cc
@@ -30,7 +30,8 @@ DEFINE_bool(use_nvprof, false, "Start / stop nvprof");

 namespace {

-std::tuple<std::vector<std::string>, std::vector<size_t>> init() {
+std::tuple<std::vector<std::string>, std::vector<size_t>, std::vector<size_t>>
+init() {
   int deviceCount = 0;
   auto err_id = cudaGetDeviceCount(&deviceCount);
   if (err_id == 35 or err_id == 30) {
@@ -44,14 +45,16 @@ std::tuple<std::vector<std::string>, std::vector<size_t>> init() {
   }
   std::vector<std::string> gpuNames;
   std::vector<size_t> sharedMemSizes;
+  std::vector<size_t> registersPerBlock;
   gpuNames.reserve(deviceCount);
   for (int i = 0; i < deviceCount; ++i) {
     cudaDeviceProp deviceProp;
     TC_CUDA_RUNTIMEAPI_ENFORCE(cudaGetDeviceProperties(&deviceProp, i));
     gpuNames.emplace_back(deviceProp.name);
     sharedMemSizes.emplace_back(deviceProp.sharedMemPerBlock);
+    registersPerBlock.emplace_back(deviceProp.regsPerBlock);
   }
-  return std::make_tuple(gpuNames, sharedMemSizes);
+  return std::make_tuple(gpuNames, sharedMemSizes, registersPerBlock);
 }

 } // namespace
@@ -61,8 +64,8 @@ CudaGPUInfo& CudaGPUInfo::GPUInfo() {
   static thread_local bool inited = false;
   if (!inited) {
     auto infos = init();
-    pInfo = std::unique_ptr<CudaGPUInfo>(
-        new CudaGPUInfo(std::get<0>(infos), std::get<1>(infos)));
+    pInfo = std::unique_ptr<CudaGPUInfo>(new CudaGPUInfo(
+        std::get<0>(infos), std::get<1>(infos), std::get<2>(infos)));
     inited = true;
   }
   return *pInfo;
@@ -102,4 +105,11 @@ size_t CudaGPUInfo::SharedMemorySize() const {
   }
   return sharedMemSizes_.at(CurrentGPUId());
 }
+
+size_t CudaGPUInfo::RegistersPerBlock() const {
+  if (NumberGPUs() == 0) {
+    return 0; // no registers if no GPUs
+  }
+  return registersPerBlock_.at(CurrentGPUId());
+}
 } // namespace tc
diff --git a/tc/core/cuda/cuda.h b/tc/core/cuda/cuda.h
index a9fe1383a..fa5e68b98 100644
--- a/tc/core/cuda/cuda.h
+++ b/tc/core/cuda/cuda.h
@@ -98,8 +98,11 @@ struct WithCudaDevice {
 class CudaGPUInfo {
   CudaGPUInfo(
       const std::vector<std::string>& gpuNames,
-      const std::vector<size_t>& sharedMemSizes)
-      : gpuNames_(gpuNames), sharedMemSizes_(sharedMemSizes) {}
+      const std::vector<size_t>& sharedMemSizes,
+      const std::vector<size_t>& registersPerBlock)
+      : gpuNames_(gpuNames),
+        sharedMemSizes_(sharedMemSizes),
+        registersPerBlock_(registersPerBlock) {}

  public:
   static CudaGPUInfo& GPUInfo();
@@ -112,9 +115,11 @@ class CudaGPUInfo {
   std::string GetGPUName(int id = -1) const;
   std::string getCudaDeviceStr() const;
   size_t SharedMemorySize() const;
+  size_t RegistersPerBlock() const;

   std::vector<std::string> gpuNames_;
   std::vector<size_t> sharedMemSizes_;
+  std::vector<size_t> registersPerBlock_;
 };

 struct CudaProfiler {
diff --git a/tc/core/cuda/cuda_mapping_options.cc b/tc/core/cuda/cuda_mapping_options.cc
index 09d7edf8c..ba911aa90 100644
--- a/tc/core/cuda/cuda_mapping_options.cc
+++ b/tc/core/cuda/cuda_mapping_options.cc
@@ -299,6 +299,11 @@ CudaMappingOptions& CudaMappingOptions::sharedDepth(uint32_t depth) {
   return *this;
 }

+CudaMappingOptions& CudaMappingOptions::maxPrivateElements(uint64_t nElements) {
+  ownedProto_.set_max_private_elements(nElements);
+  return *this;
+}
+
 CudaMappingOptions& CudaMappingOptions::mapToThreads(
     const std::string& commaSeparatedSizes) {
   auto sizes = parseCommaSeparatedIntegers<uint64_t>(commaSeparatedSizes);
diff --git a/tc/core/cuda/cuda_mapping_options.h b/tc/core/cuda/cuda_mapping_options.h
index aa8530307..ab6ce5e11 100644
--- a/tc/core/cuda/cuda_mapping_options.h
+++ b/tc/core/cuda/cuda_mapping_options.h
@@ -197,6 +197,7 @@ class CudaMappingOptions {
   CudaMappingOptions& useReadOnlyCache(bool b);
   CudaMappingOptions& privateDepth(uint32_t depth);
   CudaMappingOptions& sharedDepth(uint32_t depth);
+  CudaMappingOptions& maxPrivateElements(uint64_t nElements);
   ///@}

   /// Static constructors for predefined strategies.
diff --git a/tc/core/cuda/cuda_mapping_options_cpp_printer.cc b/tc/core/cuda/cuda_mapping_options_cpp_printer.cc
index 9ffa95bcc..a223fcb80 100644
--- a/tc/core/cuda/cuda_mapping_options_cpp_printer.cc
+++ b/tc/core/cuda/cuda_mapping_options_cpp_printer.cc
@@ -40,6 +40,10 @@ CudaMappingOptionsCppPrinter& operator<<(
   }
   prn.printValueOption("privateDepth", cudaOptions.proto().private_depth());
   prn.printValueOption("sharedDepth", cudaOptions.proto().shared_depth());
+  if (cudaOptions.proto().has_max_private_elements()) {
+    prn.printValueOption(
+        "maxPrivateElements", cudaOptions.proto().max_private_elements());
+  }
   prn.endStmt();
   return prn;
 }
diff --git a/tc/core/gpu.h b/tc/core/gpu.h
index 6846304fd..06c8d680d 100644
--- a/tc/core/gpu.h
+++ b/tc/core/gpu.h
@@ -36,4 +36,15 @@ inline size_t querySharedMemorySize() {
 #endif
 }

+/// Get the maximum number of registers per block provided by the GPU device
+/// active in the current thread. The call is forwarded to the GPU driver.
+/// If the thread has no associated GPU, return 0.
+inline size_t queryRegistersPerBlock() {
+#if TC_WITH_CUDA && !defined(NO_CUDA_SDK)
+  return CudaGPUInfo::GPUInfo().RegistersPerBlock();
+#else
+  return 0;
+#endif
+}
+
 } // namespace tc
diff --git a/tc/core/polyhedral/cuda/mapped_scop.cc b/tc/core/polyhedral/cuda/mapped_scop.cc
index b0b129cbc..faa7d9361 100644
--- a/tc/core/polyhedral/cuda/mapped_scop.cc
+++ b/tc/core/polyhedral/cuda/mapped_scop.cc
@@ -1086,7 +1086,14 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(

   // 9. Promote to registers below the loops mapped to threads.
   if (cudaOptions.proto().use_private_memory()) {
-    promoteToRegistersAtDepth(*mappedScop, cudaOptions.proto().private_depth());
+    auto blockSizes = cudaOptions.block.extractVector();
+    auto nThreadsPerBlock = std::accumulate(
+        blockSizes.begin(), blockSizes.end(), 1, std::multiplies<size_t>());
+    auto nElementsPerThread = cudaOptions.proto().has_max_private_elements()
+        ? cudaOptions.proto().max_private_elements()
+        : queryRegistersPerBlock() / nThreadsPerBlock;
+    promoteToRegistersAtDepth(
+        *mappedScop, cudaOptions.proto().private_depth(), nElementsPerThread);
   }

   LOG_IF(INFO, FLAGS_debug_tc_mapper)
diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
index b1bca6923..f73246666 100644
--- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
+++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.cc
@@ -446,6 +446,51 @@ bool isInThreadMappedScope(
   return false;
 }

+static std::vector<std::pair<isl::id, TensorGroupsInfo>> sortTensorGroupMap(
+    TensorGroups&& groupMap) {
+  // Prepare groups for sorting, to have specified order necessary for
+  // reproducibility and tests.
+  using TensorGroupList = std::pair<isl::id, TensorGroupsInfo>;
+  std::vector<TensorGroupList> groupLists(
+      std::make_move_iterator(groupMap.begin()),
+      std::make_move_iterator(groupMap.end()));
+
+  // Computes the total number of references in all groups.
+  auto refsCount = [](const TensorGroupsInfo& info) {
+    size_t refs = 0;
+    for (auto const& group : info) {
+      refs += group->referenceIds().size();
+    }
+    return refs;
+  };
+
+  // Sort by the total number of references, then by name. Because names are
+  // guaranteed to be unique, the order is total.
+  std::sort(
+      groupLists.begin(),
+      groupLists.end(),
+      [refsCount](const TensorGroupList& l1, const TensorGroupList& l2) {
+        auto r1 = refsCount(l1.second);
+        auto r2 = refsCount(l2.second);
+        return r1 == r2 ? l1.first.get_name() < l2.first.get_name() : r1 < r2;
+      });
+  return groupLists;
+}
+
+/* Sorts the given vector of tensor groups in place following the number of
+ * references in the group in decreasing order. This prioritizes groups with
+ * more references as they are more likely to benefit from promotion.
+ */
+static void sortTensorGroups(TensorGroupsInfo& tensorGroups) {
+  std::sort(
+      tensorGroups.begin(),
+      tensorGroups.end(),
+      [](const std::unique_ptr<TensorReferenceGroup>& group1,
+         const std::unique_ptr<TensorReferenceGroup>& group2) {
+        return group1->referenceIds().size() > group2->referenceIds().size();
+      });
+}
+
 /*
  * Promote to shared memory in "scop" below "node". Use at most
  * "remainingMemory" bytes, and update the variable to reflect the amount of
@@ -474,49 +519,14 @@ void promoteToSharedBelow(
   auto partialSched = partialSchedule(root, node);
   auto mapping = collectMappingsTo<mapping::BlockId>(scop);
-  auto groupMap = TensorReferenceGroup::accessedWithin(
-      partialSched.intersect_domain(mapping), scop.body);
+  auto groupLists = sortTensorGroupMap(TensorReferenceGroup::accessedWithin(
+      partialSched.intersect_domain(mapping), scop.body));
   // Pure affine schedule without (mapping) filters.
   auto partialSchedMupa = partialScheduleMupa(root, node);

-  // Prepare groups for sorting, to have specified order necessary for
-  // reproducibility and tests.
-  using TensorGroupList = std::pair<isl::id, TensorGroupsInfo>;
-  std::vector<TensorGroupList> groupLists(
-      std::make_move_iterator(groupMap.begin()),
-      std::make_move_iterator(groupMap.end()));
-
-  // Computes the total number of references in all groups.
-  auto refsCount = [](const TensorGroupsInfo& info) {
-    size_t refs = 0;
-    for (auto const& group : info) {
-      refs += group->referenceIds().size();
-    }
-    return refs;
-  };
-
-  // Sort by the total number of references, then by name. Because names are
-  // guarenteed to be unique, the order is total.
-  std::sort(
-      groupLists.begin(),
-      groupLists.end(),
-      [refsCount](const TensorGroupList& l1, const TensorGroupList& l2) {
-        auto r1 = refsCount(l1.second);
-        auto r2 = refsCount(l2.second);
-        return r1 == r2 ? l1.first.get_name() < l2.first.get_name() : r1 < r2;
-      });

   for (auto& tensorGroups : groupLists) {
     auto tensorId = tensorGroups.first;
-    // Sort the reference groups to prioritize groups with more references as
-    // they are more likely to benefit from promotion.
-    std::sort(
-        tensorGroups.second.begin(),
-        tensorGroups.second.end(),
-        [refsCount](
-            const std::unique_ptr<TensorReferenceGroup>& group1,
-            const std::unique_ptr<TensorReferenceGroup>& group2) {
-          return group1->referenceIds().size() > group2->referenceIds().size();
-        });
+    sortTensorGroups(tensorGroups.second);

     for (auto& group : tensorGroups.second) {
       auto sizes = group->approximationSizes();
@@ -620,8 +630,17 @@ void promoteToSharedAtDepth(
  * of "mscop". Throw if promotion would violate the well-formedness of the
  * schedule tree, in particular in cases of promotion immediately below
  * a set/sequence node or immediately above a thread-specific marker node.
+ * Promote at most "maxElements" elements per thread and return the difference
+ * between "maxElements" and the number of actually promoted elements. Note
+ * that this function does not differentiate types and sizes of the promoted
+ * elements because register allocation cannot be controlled at the CUDA level
+ * anyway. Instead, the "maxElements" value controls how much register
+ * promotion is performed overall.
  */
-void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
+size_t promoteToRegistersBelow(
+    MappedScop& mscop,
+    detail::ScheduleTree* scope,
+    size_t maxElements) {
   // Cannot promote below a sequence or a set node. Promotion may insert an
   // extension node, but sequence/set must be followed by filters.
   if (scope->as<detail::ScheduleTreeSequence>() ||
@@ -646,8 +665,8 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
   auto mapping =
       collectMappingsTo<mapping::ThreadId>(scop).intersect(blockMapping);
   auto schedule = partialSchedule(scop.scheduleRoot(), scope);
-  auto groupMap = TensorReferenceGroup::accessedWithin(
-      schedule.intersect_domain(mapping), scop.body);
+  auto groupLists = sortTensorGroupMap(TensorReferenceGroup::accessedWithin(
+      schedule.intersect_domain(mapping), scop.body));

   auto threadSchedule = mscop.threadMappingSchedule(mscop.schedule());
   auto blockSchedule = mscop.blockMappingSchedule(mscop.schedule());
@@ -663,10 +682,9 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
   // identical dimensions without affecting the result of the checks.
   partialSchedMupa = partialSchedMupa.flat_range_product(blockSchedule);

-  for (auto& tensorGroups : groupMap) {
+  for (auto& tensorGroups : groupLists) {
     auto tensorId = tensorGroups.first;
-
-    // TODO: sorting of groups and counting the number of promoted elements
+    sortTensorGroups(tensorGroups.second);

     for (auto& group : tensorGroups.second) {
       auto sizes = group->approximationSizes();
@@ -674,6 +692,12 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
       if (sizes.size() == 0) {
        continue;
      }
+      // Do not promote if the group requires more registers than remaining.
+      auto nElements = std::accumulate(
+          sizes.begin(), sizes.end(), 1u, std::multiplies<size_t>());
+      if (nElements > maxElements) {
+        continue;
+      }
       if (!isPromotableToRegistersBelow(
               *group, root, scope, partialSchedMupa, threadSchedule)) {
         continue;
       }
@@ -693,13 +717,14 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
           std::move(group),
           scope,
           partialSched);
+      maxElements -= nElements;
     }
   }

   // Return immediately if nothing was promoted.
   if (scope->numChildren() == 0 ||
       !matchOne(extension(sequence(any())), scope->child({0}))) {
-    return;
+    return maxElements;
   }

   // If promoting above thread mapping, insert synchronizations.
@@ -715,15 +740,19 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
   if (functional::Filter(isMappingTo<mapping::ThreadId>, ancestors).empty()) {
     scop.insertSyncsAroundSeqChildren(scope->child({0, 0}));
   }
+  return maxElements;
 }

 /*
  * Promote to registers below "depth" schedule dimensions. Split bands if
  * necessary to create promotion scopes. Do not promote if it would require
  * splitting the band mapped to threads as we assume only one band can be
- * mapped.
+ * mapped. Use at most "maxElements" per thread in all promoted subtrees.
  */
-void promoteToRegistersAtDepth(MappedScop& mscop, size_t depth) {
+void promoteToRegistersAtDepth(
+    MappedScop& mscop,
+    size_t depth,
+    size_t maxElements) {
   using namespace detail;

   auto root = mscop.scop().scheduleRoot();
@@ -757,7 +786,7 @@ void promoteToRegistersAtDepth(MappedScop& mscop, size_t depth) {
   auto scopes = functional::Map(findScope, bands);

   for (auto scope : scopes) {
-    promoteToRegistersBelow(mscop, scope);
+    maxElements = promoteToRegistersBelow(mscop, scope, maxElements);
   }
 }

diff --git a/tc/core/polyhedral/cuda/memory_promotion_heuristic.h b/tc/core/polyhedral/cuda/memory_promotion_heuristic.h
index 2dc264949..fcc6dfdb6 100644
--- a/tc/core/polyhedral/cuda/memory_promotion_heuristic.h
+++ b/tc/core/polyhedral/cuda/memory_promotion_heuristic.h
@@ -41,9 +41,15 @@ void promoteToSharedAtDepth(
     std::size_t sharedMemorySize,
     bool unrollCopies);

-void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope);
+size_t promoteToRegistersBelow(
+    MappedScop& mscop,
+    detail::ScheduleTree* scope,
+    std::size_t maxElements = SIZE_MAX);

-void promoteToRegistersAtDepth(MappedScop& scop, std::size_t depth);
+void promoteToRegistersAtDepth(
+    MappedScop& scop,
+    std::size_t depth,
+    std::size_t maxElements = SIZE_MAX);

 } // namespace cuda
 } // namespace polyhedral
diff --git a/tc/proto/mapping_options.proto b/tc/proto/mapping_options.proto
index 8beaf46dc..52301f5a2 100644
--- a/tc/proto/mapping_options.proto
+++ b/tc/proto/mapping_options.proto
@@ -74,6 +74,9 @@ message CudaMappingOptionsProto {
   optional uint32 private_depth = 9;
   // Depth of promotion to shared memory, ignored if use_shared_memory is false.
   optional uint32 shared_depth = 10;
+  // Maximum number of elements to promote to registers per thread. If not
+  // provided, the number of 32-bit registers per thread will be used.
+  optional uint64 max_private_elements = 11;
 }

 message CpuMappingOptionsProto {
diff --git a/tensor_comprehensions/pybinds/tclib.cc b/tensor_comprehensions/pybinds/tclib.cc
index a18fb4ca7..3062c21e4 100644
--- a/tensor_comprehensions/pybinds/tclib.cc
+++ b/tensor_comprehensions/pybinds/tclib.cc
@@ -672,6 +672,11 @@ PYBIND11_MODULE(tclib, m) {
           "usePrivateMemory",
           &tc::CudaMappingOptions::usePrivateMemory,
           "Create thread-local copies of data in private memory")
+      .def(
+          "maxPrivateElements",
+          &tc::CudaMappingOptions::maxPrivateElements,
+          "The maximum number of elements per thread for which thread-local "
+          "copies are created")
       .def(
           "unrollCopyShared",
           &tc::CudaMappingOptions::unrollCopyShared,
diff --git a/test/test_cuda_mapper_memory_promotion.cc b/test/test_cuda_mapper_memory_promotion.cc
index 0fb7405b8..51a2fa057 100644
--- a/test/test_cuda_mapper_memory_promotion.cc
+++ b/test/test_cuda_mapper_memory_promotion.cc
@@ -539,7 +539,8 @@ TEST_F(MatMulBias, RegisterPromotion) {
                             .tile(32, 32, 32)
                             .privateDepth(5)
                             .useSharedMemory(false)
-                            .usePrivateMemory(true);
+                            .usePrivateMemory(true)
+                            .maxPrivateElements(100);
   auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);

   auto declPos = code.find("float _O_0");
@@ -567,7 +568,8 @@ TEST_F(MatMulBias, RegisterPromotionSharedPreference) {
                             .tile(32, 32, 32)
                             .maxSharedMemory(32768)
                             .useSharedMemory(true)
-                            .usePrivateMemory(true);
+                            .usePrivateMemory(true)
+                            .maxPrivateElements(100);

   auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);

@@ -587,7 +589,7 @@ TEST_F(MatMulBias, RegistersAtRoot) {
                             .usePrivateMemory(false);

   auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
-  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot());
+  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 4);
   auto code = emitCode(mscop);

   // Expecting 4 elements because we map the loop i in O[i][j] to 8 threads
@@ -595,6 +597,27 @@
   expectFourOElementsPromoted(code);
 }

+TEST_F(MatMulBias, RegistersAtRootNotEnoughAvailable) {
+  // Disable automatic promotion to registers because we are going to call it
+  // manually. Require sufficient unrolling to actually hit registers.
+  auto mappingOptions = CudaMappingOptions::makeNaiveMappingOptions()
+                            .unroll(512)
+                            .useSharedMemory(false)
+                            .usePrivateMemory(false);
+
+  auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
+  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 3);
+  auto code = emitCode(mscop);
+
+  // Not expecting O to be promoted because 4 elements must be promoted and
+  // only 3 were indicated as available in promoteToRegistersBelow.
+  auto oDeclPos = code.find("float _O_0;");
+  EXPECT_TRUE(oDeclPos == std::string::npos)
+      << "not expected O to be promoted to registers";
+
+  expectNoABCPromotion(code);
+}
+
 TEST_F(MatMulBias, RegistersAtRootNotEnoughUnroll) {
   // Disable automatic promotion to registers because we are going to call it
   // manually. Require no unrolling so as to make promotion to registers
@@ -605,7 +628,7 @@
                             .usePrivateMemory(false);

   auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
-  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot());
+  promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot(), 100);
   auto code = emitCode(mscop);

   auto oDeclPos = code.find("float _O_0;");
@@ -631,7 +654,7 @@ TEST_F(MatMulBias, RegistersBelowFirstBand) {
       mscop->scop().scheduleRoot(), ScheduleTreeType::Band);
   ASSERT_GT(nodes.size(), 0u);
   auto node = nodes[0];
-  promoteToRegistersBelow(*mscop, node);
+  promoteToRegistersBelow(*mscop, node, 100);
   auto code = emitCode(mscop);

   // Expecting 4 elements because we map the loop i in O[i][j] to 8 threads
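
Usage note (editor's illustrative sketch, not part of the patch): with usePrivateMemory(true), the mapper now promotes at most maxPrivateElements elements per thread to registers; when the option is unset, it falls back to the device's registers per block divided by the number of threads per block (queryRegistersPerBlock() / nThreadsPerBlock). A minimal C++ sketch of setting the option through the builder API added above follows; the cap of 64 is an arbitrary placeholder value.

#include "tc/core/cuda/cuda_mapping_options.h"

// Cap register promotion at 64 elements per thread (placeholder value);
// leaving maxPrivateElements unset lets the mapper derive the cap from
// the device's regsPerBlock divided by the block size.
auto options = tc::CudaMappingOptions::makeNaiveMappingOptions()
                   .useSharedMemory(false)
                   .usePrivateMemory(true)
                   .privateDepth(5)
                   .maxPrivateElements(64);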