Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 9f7300a

Browse files
authored
Merge pull request #521 from facebookresearch/registers-at-depth
Tunable register promotion depth
2 parents d384ed3 + 1ad8e87 commit 9f7300a

16 files changed

+211
-39
lines changed

tc/autotuner/autotuner-inl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,8 @@ void setupTuningParameters(
372372
configuration.gridParams.setRange(range, "g");
373373
configuration.unrollFactor =
374374
RangeParameter(powers2(FLAGS_tuner_max_unroll_size), "unroll");
375+
configuration.privateDepth =
376+
RangeParameter({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, "pdepth");
375377
}
376378
} // namespace
377379

tc/autotuner/parameters.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,7 @@ void TuningConfiguration::applyToParameters(
239239
unrollCopyShared.apply(f);
240240
useReadOnlyCache.apply(f);
241241
matchLibraryCalls.apply(f);
242+
privateDepth.apply(f);
242243
}
243244

244245
bool TuningConfiguration::isValid() const {
@@ -273,6 +274,7 @@ std::vector<ParameterView> TuningConfiguration::collectParameters() {
273274
params.emplace_back(unrollCopyShared);
274275
params.emplace_back(useReadOnlyCache);
275276
params.emplace_back(matchLibraryCalls);
277+
params.emplace_back(privateDepth);
276278

277279
return params;
278280
}
@@ -303,6 +305,7 @@ void TuningConfiguration::fromCudaMappingOptions(
303305
usePrivateMemory.selectValue(options.proto().use_private_memory());
304306
unrollCopyShared.selectValue(options.proto().unroll_copy_shared());
305307
useReadOnlyCache.selectValue(options.proto().use_readonly_cache());
308+
privateDepth.selectFromValue(options.proto().private_depth());
306309
}
307310

308311
void TuningConfiguration::fromCpuMappingOptions(
@@ -331,6 +334,7 @@ void TuningConfiguration::applyToCudaMappingOptions(
331334
options.usePrivateMemory(usePrivateMemory.value());
332335
options.unrollCopyShared(unrollCopyShared.value());
333336
options.useReadOnlyCache(useReadOnlyCache.value());
337+
options.privateDepth(privateDepth.value());
334338
}
335339

336340
void TuningConfiguration::applyToCpuMappingOptions(

tc/autotuner/parameters.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,7 @@ class TuningConfiguration {
190190
BoolParameter unrollCopyShared;
191191
BoolParameter useReadOnlyCache;
192192
BoolParameter matchLibraryCalls;
193+
RangeParameter privateDepth;
193194

194195
private:
195196
std::vector<std::function<bool(const TuningConfiguration&)>> validators_;
@@ -227,6 +228,7 @@ class TuningParameterFixer {
227228
llvm::Optional<bool> unrollCopyShared;
228229
llvm::Optional<bool> useReadOnlyCache;
229230
llvm::Optional<bool> matchLibraryCalls;
231+
llvm::Optional<uint32_t> privateDepth;
230232

231233
friend class TuningConfiguration;
232234
};

tc/core/cuda/cuda_mapping_options.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,11 @@ CudaMappingOptions& CudaMappingOptions::useReadOnlyCache(bool b) {
289289
return *this;
290290
}
291291

292+
CudaMappingOptions& CudaMappingOptions::privateDepth(uint32_t depth) {
293+
ownedProto_.set_private_depth(depth);
294+
return *this;
295+
}
296+
292297
CudaMappingOptions& CudaMappingOptions::mapToThreads(
293298
const std::string& commaSeparatedSizes) {
294299
auto sizes = parseCommaSeparatedIntegers<uint64_t>(commaSeparatedSizes);

tc/core/cuda/cuda_mapping_options.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ class CudaMappingOptions {
195195
CudaMappingOptions& maxSharedMemory(uint64_t size);
196196
CudaMappingOptions& unrollCopyShared(bool b);
197197
CudaMappingOptions& useReadOnlyCache(bool b);
198+
CudaMappingOptions& privateDepth(uint32_t depth);
198199
///@}
199200

200201
/// Static constructors for predefined strategies.

tc/core/cuda/cuda_mapping_options_cpp_printer.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ CudaMappingOptionsCppPrinter& operator<<(
3838
prn.printValueOption(
3939
"maxSharedMemory", cudaOptions.proto().max_shared_memory());
4040
}
41+
prn.printValueOption("privateDepth", cudaOptions.proto().private_depth());
4142
prn.endStmt();
4243
return prn;
4344
}

tc/core/polyhedral/cuda/mapped_scop.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1068,7 +1068,7 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
10681068

10691069
// 9. Promote to registers below the loops mapped to threads.
10701070
if (cudaOptions.proto().use_private_memory()) {
1071-
promoteToRegistersBelowThreads(*mappedScop, -1ull);
1071+
promoteToRegistersAtDepth(*mappedScop, cudaOptions.proto().private_depth());
10721072
}
10731073

10741074
LOG_IF(INFO, FLAGS_debug_tc_mapper)

tc/core/polyhedral/cuda/memory_promotion_heuristic.cc

Lines changed: 76 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -437,14 +437,20 @@ bool isPromotableToRegistersBelow(
437437
}
438438

439439
/*
440-
* Starting from the root, find bands where depth is reached. Using
440+
* Starting from the root, find bands where depth is reached. If zero depth is
441+
* requested, insert a zero-dimensional band node below the root (or the
442+
* context node if present) and return it. Otherwise, use
441443
* DFSPreorder to make sure order is specified and consistent for tests.
442444
*/
443445
std::vector<detail::ScheduleTree*> bandsContainingScheduleDepth(
444446
detail::ScheduleTree* root,
445447
size_t depth) {
446448
using namespace tc::polyhedral::detail;
447449

450+
if (depth == 0) {
451+
return {insertTopLevelEmptyBand(root)};
452+
}
453+
448454
auto bands =
449455
ScheduleTree::collectDFSPreorder(root, detail::ScheduleTreeType::Band);
450456
std::function<bool(ScheduleTree * st)> containsDepth = [&](ScheduleTree* st) {
@@ -602,6 +608,15 @@ void promoteToSharedGreedy(
602608
scop.insertSyncsAroundCopies(bandNode);
603609
}
604610
}
611+
612+
/*
613+
* Check if "tree" is a band node mapped to threads. In particular, check that
614+
* "tree" is a band and a thread-specific node appears as its only child.
615+
*/
616+
inline bool isThreadMappedBand(const detail::ScheduleTree* tree) {
617+
return matchOne(band(threadSpecific(any())), tree) ||
618+
matchOne(band(threadSpecific()), tree);
619+
}
605620
} // namespace
606621

607622
void promoteGreedilyAtDepth(
@@ -698,16 +713,69 @@ void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope) {
698713
partialSched);
699714
}
700715
}
716+
717+
// Return immediately if nothing was promoted.
718+
if (scope->numChildren() == 0 ||
719+
!matchOne(extension(sequence(any())), scope->child({0}))) {
720+
return;
721+
}
722+
723+
// If promoting above thread mapping, insert synchronizations.
724+
// It is possible that promoted array elements are accessed by different
725+
// threads outside the current scope (either in different iterations of the
726+
// scope loops, or in sibling subtrees). For now, always insert
727+
// synchronizations, similarly to copies to shared memory.
728+
//
729+
// TODO: The exact check for sync insertion requires the dependences between
730+
// the elements in the scope and those before/after the scope and a check if
731+
// the dependent instances belong to the same thread.
732+
auto ancestors = scope->ancestors(root);
733+
if (functional::Filter(isMappingTo<mapping::ThreadId>, ancestors).empty()) {
734+
scop.insertSyncsAroundSeqChildren(scope->child({0, 0}));
735+
}
701736
}
702737

703-
// Promote at the positions of the thread specific markers.
704-
void promoteToRegistersBelowThreads(MappedScop& mscop, size_t nRegisters) {
705-
auto& scop = mscop.scop();
706-
auto root = scop.scheduleRoot();
707-
auto markers = findThreadSpecificMarkers(root);
738+
/*
739+
* Promote to registers below "depth" schedule dimensions. Split bands if
740+
* necessary to create promotion scopes. Do not promote if it would require
741+
* splitting the band mapped to threads as we assume only one band can be
742+
 * mapped to threads.
743+
*/
744+
void promoteToRegistersAtDepth(MappedScop& mscop, size_t depth) {
745+
using namespace detail;
708746

709-
for (auto marker : markers) {
710-
promoteToRegistersBelow(mscop, marker);
747+
auto root = mscop.scop().scheduleRoot();
748+
749+
// 1. Collect all bands with a member located at the given depth in the
750+
// overall schedule. Make sure this is the last member of the band by
751+
// splitting off the subsequent members into a different band. Ignore bands
752+
// mapped to threads if splitting is required as it would break the invariant
753+
// of a single band being mapped to threads in a subtree.
754+
// TODO: allow splitting the thread-mapped bands; for example, tile them
755+
// explicitly with block size, use the point loops for thread mapping
756+
// but ignore them in depth computation.
757+
auto bands = bandsContainingScheduleDepth(root, depth);
758+
bands = functional::Filter(
759+
[root, depth](ScheduleTree* tree) {
760+
auto band = tree->elemAs<ScheduleTreeElemBand>();
761+
return !isThreadMappedBand(tree) ||
762+
tree->scheduleDepth(root) + band->nMember() == depth;
763+
},
764+
bands);
765+
bands = bandsSplitAfterDepth(bands, root, depth);
766+
767+
// 2. We don't want copies inserted between thread-mapped bands and the
768+
// thread-specific marker, but rather below that marker. If any of the bands
769+
// are mapped to threads, take their first children as promotion scope
770+
// instead of the band itself.
771+
std::function<ScheduleTree*(ScheduleTree*)> findScope =
772+
[](ScheduleTree* tree) {
773+
return isThreadMappedBand(tree) ? tree->child({0}) : tree;
774+
};
775+
auto scopes = functional::Map(findScope, bands);
776+
777+
for (auto scope : scopes) {
778+
promoteToRegistersBelow(mscop, scope);
711779
}
712780
}
713781

tc/core/polyhedral/cuda/memory_promotion_heuristic.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ void promoteGreedilyAtDepth(
4444

4545
void promoteToRegistersBelow(MappedScop& mscop, detail::ScheduleTree* scope);
4646

47-
void promoteToRegistersBelowThreads(MappedScop& scop, std::size_t nRegisters);
47+
void promoteToRegistersAtDepth(MappedScop& scop, std::size_t depth);
48+
4849
} // namespace polyhedral
4950
} // namespace tc

tc/core/polyhedral/functional.h

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,18 @@ I Reduce(std::function<I(I&&, I&&)> red, std::vector<I>&& vec) {
100100
return res;
101101
}
102102

103+
namespace {
104+
// Allow combinations:
105+
// LHS RHS
106+
// const T and T
107+
// const T and const T
108+
template <typename LHS, typename RHS>
109+
struct is_same_allow_rhs_non_const {
110+
const static bool value =
111+
std::is_same<LHS, typename std::add_const<RHS>::type>::value;
112+
};
113+
} // namespace
114+
103115
/*
104116
* Return a copy of the vector that contains only those elements for which
105117
* function "f" returns true.
@@ -108,8 +120,12 @@ I Reduce(std::function<I(I&&, I&&)> red, std::vector<I>&& vec) {
108120
* functor) with signature <bool(T)>.
109121
*
110122
* Template argument deduction takes care of vector<const T> cases
111-
* automatically. Note that the signature of "f" must use the same type, that
112-
* is "const T".
123+
* automatically. Additionally, accept vectors of (pointers to) non-const
124+
* elements and a function with (a pointer to) a const argument.
125+
* Note that the function type is a template argument guarded by static asserts
126+
* rather than an std::function because template argument deduction fails on
127+
* lambdas in the latter case; lambdas are convertible to std::function but are
128+
* not an instance of std::function.
113129
*/
114130
template <typename Func, typename T>
115131
std::vector<T> Filter(Func f, const std::vector<T>& input) {
@@ -119,9 +135,25 @@ std::vector<T> Filter(Func f, const std::vector<T>& input) {
119135
static_assert(
120136
function_traits<Func>::n_args == 1,
121137
"Filtering function must take one argument");
138+
139+
// Allow combinations:
140+
// Function arg Vector element
141+
// T and T
142+
// const T and T
143+
// const T and const T
144+
// T* and T*
145+
// const T* and T*
146+
// const T* and const T*
147+
using arg0 = typename function_traits<Func>::template arg<0>::type;
148+
constexpr bool sameType = std::is_same<arg0, T>::value;
149+
constexpr bool sameTypeRightMbConst =
150+
is_same_allow_rhs_non_const<arg0, T>::value;
151+
constexpr bool ptrToSameTypeRightMbConst = std::is_pointer<arg0>::value &&
152+
std::is_pointer<T>::value &&
153+
is_same_allow_rhs_non_const<typename std::remove_pointer<arg0>::type,
154+
typename std::remove_pointer<T>::type>::value;
122155
static_assert(
123-
std::is_same<typename function_traits<Func>::template arg<0>::type, T>::
124-
value,
156+
sameType || sameTypeRightMbConst || ptrToSameTypeRightMbConst,
125157
"The argument of the filtering function must have the same type "
126158
"as the element type of the collection being filtered");
127159

0 commit comments

Comments
 (0)