Merge pull request #259 from facebookresearch/cache_keys

ftynse · web-flow · commit 5e21c1add42e · 2018-04-05T09:52:40.000+02:00
Use canonicalized TC as part of cache keys
diff --git a/include/tc/autotuner/utils/utils.h b/include/tc/autotuner/utils/utils.h
@@ -49,6 +49,11 @@ std::vector<CudaMappingOptions> restoreCandidates(
     const std::vector<const DLTensor*>& inputs,
     const std::vector<const DLTensor*>& outputs);
 
+std::vector<CudaMappingOptions> restoreCandidates( // overload that receives the TC as a lang::TreeRef
+    const lang::TreeRef& tc,
+    const std::vector<const DLTensor*>& inputs,
+    const std::vector<const DLTensor*>& outputs);
+
 llvm::Optional<CudaMappingOptions> getBestOptions(
     const std::string& id,
     const std::vector<const DLTensor*>& inputs,
diff --git a/include/tc/core/tc_executor.h b/include/tc/core/tc_executor.h
@@ -124,6 +124,7 @@ class TcExecutor {
 
   tc2halide::HalideComponents halideComponents_;
   lang::TreeRef tcTree_;
+  std::string cacheKeyId;
 };
 
 // templating to match both const and non-const DLTensor pointers
diff --git a/src/autotuner/genetic_autotuner.cc b/src/autotuner/genetic_autotuner.cc
@@ -80,7 +80,8 @@ std::vector<CudaMappingOptions> GeneticAutotuner::load(
   ExecutionEngine<CudaTcExecutor> ee;
   ee.define(tc_);
   auto outputs = ee.inferOutputTensorInfo(tcName, inputs);
-  return tc::autotune::restoreCandidates(tcName, inputs, outputs);
+  return tc::autotune::restoreCandidates(
+      tcNameMap_.at(tcName), inputs, outputs);
 }
 
 namespace {
diff --git a/src/autotuner/utils/utils.cc b/src/autotuner/utils/utils.cc
@@ -20,6 +20,10 @@
 #include "tc/autotuner/utils/utils.h"
 #include "tc/core/cuda/cuda_compilation_cache.h"
 #include "tc/core/utils/math.h"
+#include "tc/lang/canonicalize.h"
+#include "tc/lang/parser.h"
+#include "tc/lang/sema.h"
+#include "tc/lang/tree.h"
 
 namespace tc {
 namespace autotune {
@@ -70,11 +74,20 @@ std::vector<OptionsWithMedianTime> getOptionsAndMedianRuntimes(
   return c;
 }
 
+namespace {
+std::string canonicalTC(const lang::TreeRef& tc) { // returns the text produced by streaming lang::canonicalize(tc)
+  std::stringstream ss;
+  ss << lang::canonicalize(tc); // no Sema check happens here; the tree is canonicalized exactly as received
+  return ss.str();
+}
+} // namespace
+
 std::vector<CudaMappingOptions> restoreCandidates(
-    const std::string& id,
+    const lang::TreeRef& tc,
     const std::vector<const DLTensor*>& inputs,
     const std::vector<const DLTensor*>& outputs) {
-  auto candidates = getOptionsAndMedianRuntimes(id, inputs, outputs);
+  auto candidates = getOptionsAndMedianRuntimes(
+      canonicalTC(lang::Sema().checkFunction(tc)), inputs, outputs);
   LOG_IF(INFO, candidates.size() < FLAGS_tuner_gen_restore_number)
       << "Requested " << FLAGS_tuner_gen_restore_number
       << " candidates but there are only " << candidates.size() << " in cache.";
@@ -96,6 +109,13 @@ std::vector<CudaMappingOptions> restoreCandidates(
   return res;
 }
 
+std::vector<CudaMappingOptions> restoreCandidates( // string overload: `tc` is handed to lang::Parser, so it must be parseable TC text
+    const std::string& tc,
+    const std::vector<const DLTensor*>& inputs,
+    const std::vector<const DLTensor*>& outputs) {
+  return restoreCandidates(lang::Parser(tc).parseFunction(), inputs, outputs); // parse one function, then delegate to the lang::TreeRef overload
+}
+
 llvm::Optional<CudaMappingOptions> getBestOptions(
     const std::string& id,
     const std::vector<const DLTensor*>& inputs,
diff --git a/src/core/cuda/cuda_tc_executor.cc b/src/core/cuda/cuda_tc_executor.cc
@@ -56,8 +56,7 @@ void CudaTcExecutor::compile(const tc::CudaMappingOptions& options) {
   auto cachedOp = [&]() -> std::unique_ptr<CudaCache::RetrievalResult> {
     if (ManualCudaCache::cacheEnabled()) {
       auto rr = ManualCudaCache::getCache()->retrieveKernel(
-          // TODO:replace this with pretty printed TC
-          executionInfo_.kernelName,
+          cacheKeyId,
           extractRawPtrs(executionInfo_.inputsInfo),
           extractRawPtrs(executionInfo_.outputsInfo));
       if (rr) {
@@ -72,7 +71,7 @@ void CudaTcExecutor::compile(const tc::CudaMappingOptions& options) {
         << "options string is empty, are you trying compile "
         << "a dummy CudaTcExecutor?";
     return CudaCache::getCache()->retrieveKernel(
-        executionInfo_.kernelName, // TODO:replace this with pretty printed TC
+        cacheKeyId,
         options,
         extractRawPtrs(executionInfo_.inputsInfo),
         extractRawPtrs(executionInfo_.outputsInfo));
@@ -94,7 +93,7 @@ void CudaTcExecutor::compile(const tc::CudaMappingOptions& options) {
       LOG_IF(INFO, FLAGS_debug_tc_mapper) << "original grid: " << grid;
       LOG_IF(INFO, FLAGS_debug_tc_mapper) << "original block: " << block;
       CudaCache::getCache()->cacheKernel(
-          executionInfo_.kernelName, // TODO:replace this with pretty printed TC
+          cacheKeyId,
           options,
           extractRawPtrs(executionInfo_.inputsInfo),
           extractRawPtrs(executionInfo_.outputsInfo),
@@ -213,8 +212,7 @@ Duration CudaTcExecutor::run(
       profile);
   if (profile and OptionsCache::cacheEnabled()) {
     OptionsCache::getCache()->recordRuntime(
-        // TODO:replace this with pretty printed TC
-        executionInfo_.kernelName,
+        cacheKeyId,
         CudaMappingOptions(executionInfo_.options),
         inputs,
         constPtrs(outputs),
diff --git a/src/core/tc_executor.cc b/src/core/tc_executor.cc
@@ -15,9 +15,11 @@
  */
 #include "tc/core/tc_executor.h"
 
+#include <sstream>
 #include <string>
 
 #include "tc/core/utils/dlpack.h"
+#include "tc/lang/canonicalize.h"
 #include "tc/lang/parser.h"
 #include "tc/lang/sema.h"
 
@@ -30,6 +32,13 @@ int toTypeToken(DLDataType dtype) {
   return lang::TypeInfo(lang::TypeInfo::Code(dtype.code), dtype.bits)
       .toScalarToken();
 }
+
+std::string canonicalizedTc(const lang::TreeRef tcDefinition) { // NOTE(review): TreeRef is taken by const value; a const reference may avoid a copy — confirm
+  std::stringstream ss;
+  ss << canonicalize(lang::Sema().checkFunction(tcDefinition)); // run Sema's checkFunction first, then stream the canonicalized result
+  return ss.str(); // the TcExecutor constructor stores this text in cacheKeyId
+}
+
 } // namespace
 
 TcExecutor::TcExecutor(
@@ -49,6 +58,7 @@ TcExecutor::TcExecutor(
   // TODO: check if this is wrong, packed tensors may  have 0 strides stored
   executionInfo_.outputsInfo =
       tc::inferOutputTensorInfo(halideComponents_, inputsInfo);
+  cacheKeyId = canonicalizedTc(tcDefinition);
 }
 
 TcExecutor::~TcExecutor() {}
diff --git a/test/test_autotuner_utility.cc b/test/test_autotuner_utility.cc
@@ -43,7 +43,7 @@ TEST(DivisorsAndPowers, Default) {
 }
 
 std::vector<CudaMappingOptions> restoreCandidates(
-    const std::string& kernelName,
+    const std::string& tc,
     std::vector<at::Tensor>& inputs,
     std::vector<at::Tensor>& outputs) {
   auto inputsPair = toConstDlpackTensors(inputs);
@@ -54,13 +54,23 @@ std::vector<CudaMappingOptions> restoreCandidates(
   });
 
   return tc::autotune::restoreCandidates(
-      kernelName, inputsPair.first, outputsPair.first);
+      tc, inputsPair.first, outputsPair.first);
 }
 
 TEST(RestoreCandidates, NoCache) {
   std::vector<at::Tensor> inputs{at::CUDA(at::kFloat).rand({10, 16}),
                                  at::CUDA(at::kFloat).rand({16, 20})};
-  ASSERT_THROW(restoreCandidates("bla", inputs, inputs), std::runtime_error);
+  static constexpr auto tc = R"(
+      def tc2(float(M,N) A, float(N,K) B) -> (output) {
+        output(m, k) +=! A(m, nn) * B(nn, k) + 1
+      })";
+  ASSERT_THROW(restoreCandidates(tc, inputs, inputs), std::runtime_error);
+}
+
+TEST(RestoreCandidates, NotATCid) { // checks that restoreCandidates("bla", ...) throws lang::ErrorReport
+  std::vector<at::Tensor> inputs{at::CUDA(at::kFloat).rand({10, 16}),
+                                 at::CUDA(at::kFloat).rand({16, 20})};
+  ASSERT_THROW(restoreCandidates("bla", inputs, inputs), lang::ErrorReport); // `inputs` is passed for both the inputs and the outputs argument
 }
 
 static constexpr auto tc_ = R"(
@@ -89,7 +99,7 @@ TEST(RestoreCandidates, NoRuntimeRecorded) {
   atCompl.run("matmul", inputs, outputs_, handle);
 
   FLAGS_tuner_gen_restore_number = 1;
-  ASSERT_EQ(restoreCandidates("matmul", inputs, outputs_).size(), 0);
+  ASSERT_EQ(restoreCandidates(tc_, inputs, outputs_).size(), 0);
 }
 
 TEST(RestoreCandidates, Hit) {
@@ -110,11 +120,11 @@ TEST(RestoreCandidates, Hit) {
   atCompl.run("matmul", inputs, outputs_, handle, true);
 
   FLAGS_tuner_gen_restore_number = 2;
-  auto restored = restoreCandidates("matmul", inputs, outputs_);
+  auto restored = restoreCandidates(tc_, inputs, outputs_);
   ASSERT_EQ(restored.size(), 2);
 
   FLAGS_tuner_gen_restore_number = 1;
-  restored = restoreCandidates("matmul", inputs, outputs_);
+  restored = restoreCandidates(tc_, inputs, outputs_);
   ASSERT_EQ(restored.size(), 1);
 }
 
diff --git a/test/test_tc_mapper.cc b/test/test_tc_mapper.cc
@@ -24,6 +24,9 @@
 #include "tc/core/cuda/cuda_compilation_cache.h"
 #include "tc/core/cuda/cuda_tc_executor.h"
 #include "tc/core/scope_guard.h"
+#include "tc/lang/canonicalize.h"
+#include "tc/lang/sema.h"
+#include "tc/lang/tree.h"
 
 #include "test_harness_aten_cuda.h"
 
@@ -59,7 +62,12 @@ struct TcMapperTest : public ::testing::Test {
       tc::deleteDlmTensors(outputDLTensorsPair.second);
     });
     auto cached = tc::CudaCache::getCache()->retrieveKernel(
-        name,
+        [&]() {
+          std::stringstream ss;
+          ss << lang::canonicalize(
+              lang::Sema().checkFunction(lang::Parser(tc).parseFunction()));
+          return ss.str();
+        }(),
         mappingOptions,
         inputDLTensorsPair.first,
         outputDLTensorsPair.first);

Original file line number	Diff line number	Diff line change
`@@ -80,7 +80,8 @@ std::vector<CudaMappingOptions> GeneticAutotuner::load(`
`80`	`80`	`ExecutionEngine<CudaTcExecutor> ee;`
`81`	`81`	`ee.define(tc_);`
`82`	`82`	`auto outputs = ee.inferOutputTensorInfo(tcName, inputs);`
`83`		`- return tc::autotune::restoreCandidates(tcName, inputs, outputs);`
	`83`	`+ return tc::autotune::restoreCandidates(`
	`84`	`+ tcNameMap_.at(tcName), inputs, outputs);`
`84`	`85`	`}`
`85`	`86`
`86`	`87`	`namespace {`