This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit a145a43

Merge pull request #537 from facebookresearch/shared-promotion-anywhere
Tunable shared promotion depth
2 parents d02a9e9 + c91026a commit a145a43

11 files changed: +190 −198 lines

tc/autotuner/autotuner-inl.h

Lines changed: 2 additions & 0 deletions
@@ -374,6 +374,8 @@ void setupTuningParameters(
       RangeParameter(powers2(FLAGS_tuner_max_unroll_size), "unroll");
   configuration.privateDepth =
       RangeParameter({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, "pdepth");
+  configuration.sharedDepth =
+      RangeParameter({0, 1, 2, 3, 4, 5, 6, 7}, "sdepth");
 }
 } // namespace
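
For orientation, a minimal sketch (not part of this commit) of the new tunable on its own, using only the RangeParameter constructor and the selectFromValue()/value() accessors that appear elsewhere in this diff; the tc::autotune namespace and the exact value type are assumptions:

// Hedged sketch: "sdepth" as registered above, seeded from an existing
// shared_depth value and read back the way applyToCudaMappingOptions does.
tc::autotune::RangeParameter sdepth({0, 1, 2, 3, 4, 5, 6, 7}, "sdepth");
sdepth.selectFromValue(3);      // assumed: select the entry matching value 3
auto chosen = sdepth.value();   // the depth the tuner would write back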

tc/autotuner/parameters.cc

Lines changed: 4 additions & 0 deletions
@@ -240,6 +240,7 @@ void TuningConfiguration::applyToParameters(
   useReadOnlyCache.apply(f);
   matchLibraryCalls.apply(f);
   privateDepth.apply(f);
+  sharedDepth.apply(f);
 }

 bool TuningConfiguration::isValid() const {

@@ -275,6 +276,7 @@ std::vector<ParameterView> TuningConfiguration::collectParameters() {
   params.emplace_back(useReadOnlyCache);
   params.emplace_back(matchLibraryCalls);
   params.emplace_back(privateDepth);
+  params.emplace_back(sharedDepth);

   return params;
 }

@@ -306,6 +308,7 @@ void TuningConfiguration::fromCudaMappingOptions(
   unrollCopyShared.selectValue(options.proto().unroll_copy_shared());
   useReadOnlyCache.selectValue(options.proto().use_readonly_cache());
   privateDepth.selectFromValue(options.proto().private_depth());
+  sharedDepth.selectFromValue(options.proto().shared_depth());
 }

 void TuningConfiguration::fromCpuMappingOptions(

@@ -335,6 +338,7 @@ void TuningConfiguration::applyToCudaMappingOptions(
   options.unrollCopyShared(unrollCopyShared.value());
   options.useReadOnlyCache(useReadOnlyCache.value());
   options.privateDepth(privateDepth.value());
+  options.sharedDepth(sharedDepth.value());
 }

 void TuningConfiguration::applyToCpuMappingOptions(
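
Taken together, these four hooks let the tuner treat the shared promotion depth like any other parameter. A hedged sketch of the round trip, with names taken from this diff and the surrounding setup assumed:

// Hedged sketch, not part of this commit: seed the tuning configuration from
// existing mapping options, let the search mutate it (including "sdepth"),
// then write the chosen shared promotion depth back into the options.
tc::autotune::TuningConfiguration conf;
conf.fromCudaMappingOptions(options);     // reads options.proto().shared_depth()
// ... search over conf, sharedDepth included via collectParameters() ...
conf.applyToCudaMappingOptions(options);  // calls options.sharedDepth(...)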

tc/autotuner/parameters.h

Lines changed: 2 additions & 0 deletions
@@ -191,6 +191,7 @@ class TuningConfiguration {
   BoolParameter useReadOnlyCache;
   BoolParameter matchLibraryCalls;
   RangeParameter privateDepth;
+  RangeParameter sharedDepth;

  private:
   std::vector<std::function<bool(const TuningConfiguration&)>> validators_;

@@ -229,6 +230,7 @@ class TuningParameterFixer {
   llvm::Optional<bool> useReadOnlyCache;
   llvm::Optional<bool> matchLibraryCalls;
   llvm::Optional<uint32_t> privateDepth;
+  llvm::Optional<uint32_t> sharedDepth;

   friend class TuningConfiguration;
 };

tc/core/cuda/cuda_mapping_options.cc

Lines changed: 5 additions & 0 deletions
@@ -294,6 +294,11 @@ CudaMappingOptions& CudaMappingOptions::privateDepth(uint32_t depth) {
   return *this;
 }

+CudaMappingOptions& CudaMappingOptions::sharedDepth(uint32_t depth) {
+  ownedProto_.set_shared_depth(depth);
+  return *this;
+}
+
 CudaMappingOptions& CudaMappingOptions::mapToThreads(
     const std::string& commaSeparatedSizes) {
   auto sizes = parseCommaSeparatedIntegers<uint64_t>(commaSeparatedSizes);
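
With the new setter in place, the depth can also be pinned by hand rather than tuned. A minimal usage sketch, assuming the existing makeNaiveMappingOptions() and useSharedMemory() builders; only sharedDepth() is added by this commit:

// Hedged usage sketch: request shared-memory promotion at a fixed schedule depth.
auto options = tc::CudaMappingOptions::makeNaiveMappingOptions()
                   .useSharedMemory(true)
                   .sharedDepth(2);  // promote to shared memory at depth 2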

tc/core/cuda/cuda_mapping_options.h

Lines changed: 1 addition & 0 deletions
@@ -196,6 +196,7 @@ class CudaMappingOptions {
   CudaMappingOptions& unrollCopyShared(bool b);
   CudaMappingOptions& useReadOnlyCache(bool b);
   CudaMappingOptions& privateDepth(uint32_t depth);
+  CudaMappingOptions& sharedDepth(uint32_t depth);
   ///@}

   /// Static constructors for predefined strategies.

tc/core/cuda/cuda_mapping_options_cpp_printer.cc

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ CudaMappingOptionsCppPrinter& operator<<(
         "maxSharedMemory", cudaOptions.proto().max_shared_memory());
   }
   prn.printValueOption("privateDepth", cudaOptions.proto().private_depth());
+  prn.printValueOption("sharedDepth", cudaOptions.proto().shared_depth());
   prn.endStmt();
   return prn;
 }

tc/core/polyhedral/cuda/mapped_scop.cc

Lines changed: 11 additions & 15 deletions
@@ -1049,8 +1049,9 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
   LOG_IF(INFO, FLAGS_debug_tc_mapper) << "After mapping to blocks:" << std::endl
                                       << *mappedScop->schedule();

-  // 8. Promote to shared memory below the loops mapped to blocks.
-  // This may split the outer band, so find the new outer band after promotion.
+  // 8. Promote to shared memory.
+  // If shared promotion depth is specified in the mapping options, use the
+  // specified value. Otherwise, promote below the loops mapped to blocks.
   if (cudaOptions.proto().use_shared_memory()) {
     size_t sharedMemorySize = cudaOptions.proto().has_max_shared_memory()
         ? cudaOptions.proto().max_shared_memory()

@@ -1069,29 +1070,24 @@
       sharedMemorySize -= reductionMemoryRequirement;
     }

-    auto band = outerBand->as<ScheduleTreeBand>();
-    LOG_IF(WARNING, FLAGS_debug_tc_mapper && band->nMember() == 0)
-        << "Aborting memory promotion because outer band has 0 members (NYI)";
-    if (band->nMember() > 0 && sharedMemorySize > 0) {
+    if (sharedMemorySize > 0) {
       LOG_IF(
           WARNING,
           cudaOptions.proto().unroll_copy_shared() &&
               !generic.proto.has_unroll())
           << "requested to unroll copies to shared memory without providing the unroll size";

-      promoteGreedilyAtDepth(
+      auto depth = cudaOptions.proto().has_shared_depth()
+          ? cudaOptions.proto().shared_depth()
+          : std::min(
+                outerBand->as<ScheduleTreeBand>()->nOuterCoincident(),
+                mappedScop->numBlocks.view.size());
+      promoteToSharedAtDepth(
           *mappedScop,
-          std::min(band->nOuterCoincident(), mappedScop->numBlocks.view.size()),
+          depth,
           sharedMemorySize,
           cudaOptions.proto().unroll_copy_shared() &&
               generic.proto.has_unroll());
-
-      auto bands = ScheduleTree::collectDFSPreorder(
-          scop->scheduleRoot(), ScheduleTreeType::Band);
-      if (bands.size() == 0) { // Sanity check.
-        throw NoBandsException("no bands after promotion");
-      }
-      outerBand = bands[0];
     }
   }
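
For readability, the new depth choice above boils down to the following restatement (names from the diff, types and surrounding context approximated):

// An explicit shared_depth from the mapping options wins; otherwise fall back
// to the old default of promoting just below the loops mapped to blocks.
auto depth = cudaOptions.proto().has_shared_depth()
    ? cudaOptions.proto().shared_depth()
    : std::min(outerBand->as<ScheduleTreeBand>()->nOuterCoincident(),
               mappedScop->numBlocks.view.size());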
