Skip to content
This repository was archived by the owner on Apr 28, 2023. It is now read-only.

Commit 30a1634

Browse files
ftynse and Sven Verdoolaege
authored and committed
Add support for __ldg
CUDA supports a faster read-only cache for data that is never written. This commit adds support in cuda/codegen to generate such accesses. This is achieved by simply inspecting the schedule tree for all read-only tensor references and saving their isl::id in a set that is passed in the context. The corresponding CudaMappingOptions are also added to control triggering of the option. A simple test is also added.
1 parent 9f9e74c commit 30a1634

File tree

14 files changed

+138
-16
lines changed

14 files changed

+138
-16
lines changed

docs/source/mapping_options.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ The following options are currently available:
7575

7676
* :code:`.unrollCopyShared(<boolean>)`: Also unroll the copies to and from shared memory introduced by the :code:`TC` mapper. If :code:`unroll` value is not provided, has no effect.
7777

78+
* :code:`.useReadOnlyCache(<boolean>)`: Emit loads through the read-only cache (:code:`__ldg`) when appropriate.
79+
7880
* :code:`.matchLibraryCalls(<boolean>)`: Replace computation patterns with calls to highly optimized libraries (such as CUB, CUTLASS) when possible.
7981

8082
* :code:`.fixParametersBeforeScheduling(<boolean>)`: Perform automatic loop scheduling taking into account specific tensor sizes. May produce faster kernels but significantly increases compilation time. Note that the *mapping* will be performed for specific tensor sizes anyway.

tc/autotuner/parameters.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,7 @@ void TuningConfiguration::applyToParameters(
235235
useSharedMemory.apply(f);
236236
usePrivateMemory.apply(f);
237237
unrollCopyShared.apply(f);
238+
useReadOnlyCache.apply(f);
238239
matchLibraryCalls.apply(f);
239240
}
240241

@@ -268,6 +269,7 @@ std::vector<ParameterView> TuningConfiguration::collectParameters() {
268269
params.emplace_back(useSharedMemory);
269270
params.emplace_back(usePrivateMemory);
270271
params.emplace_back(unrollCopyShared);
272+
params.emplace_back(useReadOnlyCache);
271273
params.emplace_back(matchLibraryCalls);
272274

273275
return params;
@@ -298,6 +300,7 @@ void TuningConfiguration::fromCudaMappingOptions(
298300
useSharedMemory.selectValue(options.proto().use_shared_memory());
299301
usePrivateMemory.selectValue(options.proto().use_private_memory());
300302
unrollCopyShared.selectValue(options.proto().unroll_copy_shared());
303+
useReadOnlyCache.selectValue(options.proto().use_readonly_cache());
301304
}
302305

303306
void TuningConfiguration::fromCpuMappingOptions(
@@ -325,6 +328,7 @@ void TuningConfiguration::applyToCudaMappingOptions(
325328
options.useSharedMemory(useSharedMemory.value());
326329
options.usePrivateMemory(usePrivateMemory.value());
327330
options.unrollCopyShared(unrollCopyShared.value());
331+
options.useReadOnlyCache(useReadOnlyCache.value());
328332
}
329333

330334
void TuningConfiguration::applyToCpuMappingOptions(
@@ -338,6 +342,7 @@ TuningConfiguration::TuningConfiguration()
338342
useSharedMemory("use shared memory"),
339343
usePrivateMemory("use private memory"),
340344
unrollCopyShared("unroll copy shared"),
345+
useReadOnlyCache("use readonly cache (i.e. emit __ldg loads)"),
341346
matchLibraryCalls("match library calls") {
342347
addValidator([](const TuningConfiguration& conf) {
343348
auto b0v = conf.blockParams.dims.at(0).value();
@@ -419,6 +424,7 @@ void TuningConfiguration::fixParameters(
419424
maybeFixScalar(fixedParams.useSharedMemory, useSharedMemory);
420425
maybeFixScalar(fixedParams.usePrivateMemory, usePrivateMemory);
421426
maybeFixScalar(fixedParams.unrollCopyShared, unrollCopyShared);
427+
maybeFixScalar(fixedParams.useReadOnlyCache, useReadOnlyCache);
422428
maybeFixScalar(fixedParams.matchLibraryCalls, matchLibraryCalls);
423429
}
424430

@@ -568,6 +574,11 @@ TuningParameterFixer& TuningParameterFixer::fixUnrollCopyShared(bool val) {
568574
return *this;
569575
}
570576

577+
TuningParameterFixer& TuningParameterFixer::fixUseReadOnlyCache(bool val) {
578+
useReadOnlyCache = val;
579+
return *this;
580+
}
581+
571582
TuningParameterFixer& TuningParameterFixer::fixMatchLibraryCalls(bool val) {
572583
matchLibraryCalls = val;
573584
return *this;

tc/autotuner/parameters.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ class TuningConfiguration {
188188
BoolParameter useSharedMemory;
189189
BoolParameter usePrivateMemory;
190190
BoolParameter unrollCopyShared;
191+
BoolParameter useReadOnlyCache;
191192
BoolParameter matchLibraryCalls;
192193

193194
private:
@@ -209,6 +210,7 @@ class TuningParameterFixer {
209210
TuningParameterFixer& fixUseSharedMemory(bool val);
210211
TuningParameterFixer& fixUsePrivateMemory(bool val);
211212
TuningParameterFixer& fixUnrollCopyShared(bool val);
213+
TuningParameterFixer& fixUseReadOnlyCache(bool val);
212214
TuningParameterFixer& fixMatchLibraryCalls(bool val);
213215

214216
private:
@@ -223,6 +225,7 @@ class TuningParameterFixer {
223225
llvm::Optional<bool> useSharedMemory;
224226
llvm::Optional<bool> usePrivateMemory;
225227
llvm::Optional<bool> unrollCopyShared;
228+
llvm::Optional<bool> useReadOnlyCache;
226229
llvm::Optional<bool> matchLibraryCalls;
227230

228231
friend class TuningConfiguration;

tc/core/cuda/cuda_mapping_options.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,11 @@ CudaMappingOptions& CudaMappingOptions::unrollCopyShared(bool b) {
311311
return *this;
312312
}
313313

314+
CudaMappingOptions& CudaMappingOptions::useReadOnlyCache(bool b) {
315+
ownedProto_.set_use_readonly_cache(b);
316+
return *this;
317+
}
318+
314319
CudaMappingOptions& CudaMappingOptions::mapToThreads(
315320
const std::string& commaSeparatedSizes) {
316321
auto sizes = parseCommaSeparatedIntegers<uint64_t>(commaSeparatedSizes);
@@ -341,7 +346,8 @@ CudaMappingOptions CudaMappingOptions::makeUnmappedMappingOptions() {
341346
mo.genericMappingOptions(MappingOptions::makeUnmappedMappingOptions())
342347
.useSharedMemory(false)
343348
.usePrivateMemory(false)
344-
.unrollCopyShared(false);
349+
.unrollCopyShared(false)
350+
.useReadOnlyCache(false);
345351
return mo;
346352
}
347353

tc/core/cuda/cuda_mapping_options.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@ class CudaMappingOptions {
194194
CudaMappingOptions& usePrivateMemory(bool b);
195195
CudaMappingOptions& maxSharedMemory(uint64_t size);
196196
CudaMappingOptions& unrollCopyShared(bool b);
197+
CudaMappingOptions& useReadOnlyCache(bool b);
197198
///@}
198199

199200
/// Static constructors for predefined strategies.

tc/core/cuda/cuda_mapping_options_cpp_printer.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ CudaMappingOptionsCppPrinter& operator<<(
3232
"usePrivateMemory", cudaOptions.proto().use_private_memory());
3333
prn.printBooleanOption(
3434
"unrollCopyShared", cudaOptions.proto().unroll_copy_shared());
35+
prn.printBooleanOption(
36+
"useReadOnlyCache", cudaOptions.proto().use_readonly_cache());
3537
if (cudaOptions.proto().has_max_shared_memory()) {
3638
prn.printValueOption(
3739
"maxSharedMemory", cudaOptions.proto().max_shared_memory());

tc/core/polyhedral/cuda/codegen.cc

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,8 +365,32 @@ void emitReductionInit(
365365
}
366366

367367
namespace {
368+
// RAII-style wrapper around CodegenStatementContext that wraps the output
369+
// streamed to context.ss into an "__ldg()" intrinsic, if the tensor
370+
// with the given identifier is known to be read-only.
371+
struct LdgWrapper {
372+
public:
373+
LdgWrapper(const CodegenStatementContext& context, isl::id id)
374+
: readOnly_(context.readOnlySet.count(id) > 0), out_(context.ss) {
375+
if (readOnly_) {
376+
out_ << "__ldg(&";
377+
}
378+
}
379+
380+
~LdgWrapper() {
381+
if (readOnly_) {
382+
out_ << ")";
383+
}
384+
}
385+
386+
private:
387+
bool readOnly_;
388+
std::ostream& out_;
389+
};
390+
368391
template <typename AFF>
369392
void emitAccess(AFF access, const CodegenStatementContext& context) {
393+
LdgWrapper ldgWrapper(context, access.get_tuple_id(isl::dim_type::out));
370394
context.ss << context.build().access_from(access).to_C_str();
371395
}
372396
} // namespace
@@ -584,6 +608,8 @@ void emitMappedTensorAccess(
584608

585609
// Not promoted, emitting just the mapped subscript.
586610
if (!promotionInfo.groupId) {
611+
auto ctx = context.scop().domain().get_ctx();
612+
LdgWrapper ldgWrapper(context, isl::id(ctx, name));
587613
context.ss << name;
588614
for (auto e : subscripts) {
589615
context.ss << "[";
@@ -681,6 +707,29 @@ size_t& nAstNodes() {
681707
return n;
682708
}
683709

710+
namespace {
711+
// Collect ids of tensors that are only read in the Scop.
712+
std::unordered_set<isl::id, isl::IslIdIslHash> gatherReadOnlySet(
713+
const MappedScop& mscop) {
714+
std::unordered_set<isl::id, isl::IslIdIslHash> readOnlySet;
715+
716+
if (!mscop.useReadOnlyCache) {
717+
return readOnlySet;
718+
}
719+
720+
const auto& scop = mscop.scop();
721+
722+
auto read = scop.reads.universe().range();
723+
auto written = scop.writes.universe().range();
724+
auto readOnly = read.subtract(written);
725+
for (auto s : readOnly.get_set_list()) {
726+
readOnlySet.emplace(s.get_tuple_id());
727+
}
728+
729+
return readOnlySet;
730+
}
731+
} // namespace
732+
684733
string emitCudaKernel(
685734
const std::string& specializedName,
686735
const MappedScop& mscop) {
@@ -745,7 +794,9 @@ string emitCudaKernel(
745794
auto root = mscop.schedule();
746795
astBuild = astBuild.set_iterators(Codegen::makeLoopIterators(root));
747796
auto astNode = astBuild.node_from(schedule);
748-
AstPrinter(CodegenContext(ss, mscop, nodeInfoMap)).emit(astNode);
797+
798+
AstPrinter(CodegenContext(ss, mscop, nodeInfoMap, gatherReadOnlySet(mscop)))
799+
.emit(astNode);
749800
ss << "}" << endl;
750801

751802
return ss.str();

tc/core/polyhedral/cuda/codegen.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,14 @@ struct CodegenContext {
8383
CodegenContext(
8484
std::stringstream& ss_,
8585
const MappedScop& s,
86-
const NodeInfoMapType& i)
87-
: ss(ss_), mappedScop(s), nodeInfoMap(i) {}
86+
const NodeInfoMapType& i,
87+
const std::unordered_set<isl::id, isl::IslIdIslHash>& ros)
88+
: ss(ss_), mappedScop(s), nodeInfoMap(i), readOnlySet(ros) {}
8889
CodegenContext(const CodegenContext& c)
89-
: ss(c.ss), mappedScop(c.mappedScop), nodeInfoMap(c.nodeInfoMap) {}
90+
: ss(c.ss),
91+
mappedScop(c.mappedScop),
92+
nodeInfoMap(c.nodeInfoMap),
93+
readOnlySet(c.readOnlySet) {}
9094

9195
const Scop& scop() const {
9296
return mappedScop.scop();
@@ -95,6 +99,7 @@ struct CodegenContext {
9599
std::stringstream& ss;
96100
const MappedScop& mappedScop;
97101
const NodeInfoMapType& nodeInfoMap;
102+
const std::unordered_set<isl::id, isl::IslIdIslHash>& readOnlySet;
98103
};
99104

100105
struct CodegenStatementContext : CodegenContext {

tc/core/polyhedral/cuda/mapped_scop.cc

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -908,7 +908,11 @@ std::unique_ptr<MappedScop> makeSpecializedMappedScop(
908908
tc::Block block = mappedScop.numThreads;
909909
std::tie(grid, block) = tightenLaunchBounds(*scop, grid, block);
910910
auto res = MappedScop::makeMappedScop(
911-
std::move(scop), grid, block, mappedScop.unroll);
911+
std::move(scop),
912+
grid,
913+
block,
914+
mappedScop.unroll,
915+
mappedScop.useReadOnlyCache);
912916
res->insertMappingContext();
913917

914918
LOG_IF(INFO, FLAGS_debug_tc_mapper)
@@ -985,7 +989,8 @@ std::unique_ptr<MappedScop> MappedScop::makeWithOuterBlockInnerThreadStrategy(
985989
std::move(scopUPtr),
986990
::tc::Grid(cudaOptions.grid),
987991
::tc::Block(cudaOptions.block),
988-
generic.proto.unroll()));
992+
generic.proto.unroll(),
993+
cudaOptions.proto().use_readonly_cache()));
989994
auto& scop = mappedScop->scop_;
990995

991996
// 1a. Optionally specialize before scheduling...

tc/core/polyhedral/cuda/mapped_scop.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,25 +61,28 @@ class MappedScop {
6161
std::unique_ptr<Scop>&& scop,
6262
::tc::Grid grid,
6363
::tc::Block block,
64-
uint64_t unroll_)
64+
uint64_t unroll_,
65+
bool useReadOnlyCache_)
6566
: scop_(std::move(scop)),
6667
numBlocks(grid),
6768
numThreads(block),
68-
unroll(unroll_) {}
69+
unroll(unroll_),
70+
useReadOnlyCache(useReadOnlyCache_) {}
6971

7072
public:
7173
static inline std::unique_ptr<MappedScop> makeOneBlockOneThread(
7274
std::unique_ptr<Scop>&& scop) {
7375
return std::unique_ptr<MappedScop>(new MappedScop(
74-
std::move(scop), ::tc::Grid{1, 1, 1}, ::tc::Block{1, 1, 1}, 1));
76+
std::move(scop), ::tc::Grid{1, 1, 1}, ::tc::Block{1, 1, 1}, 1, false));
7577
}
7678
static inline std::unique_ptr<MappedScop> makeMappedScop(
7779
std::unique_ptr<Scop>&& scop,
7880
::tc::Grid grid,
7981
::tc::Block block,
80-
uint64_t unroll) {
82+
uint64_t unroll,
83+
bool useReadOnlyCache) {
8184
return std::unique_ptr<MappedScop>(
82-
new MappedScop(std::move(scop), grid, block, unroll));
85+
new MappedScop(std::move(scop), grid, block, unroll, useReadOnlyCache));
8386
}
8487

8588
// Apply the hand-written OuterBlockInnerThread mapping strategy.
@@ -206,6 +209,7 @@ class MappedScop {
206209
const ::tc::Grid numBlocks;
207210
const ::tc::Block numThreads;
208211
const uint64_t unroll;
212+
const bool useReadOnlyCache;
209213

210214
private:
211215
// Information about a detected reduction that can potentially

0 commit comments

Comments
 (0)