facebookresearch
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 1 addition & 1 deletion
diff --git a/tc/aten/aten_compiler-inl.h b/tc/aten/aten_compiler-inl.h
Lines changed: 2 additions & 2 deletions
diff --git a/tc/autotuner/genetic_autotuner.cc b/tc/autotuner/genetic_autotuner.cc
Lines changed: 3 additions & 3 deletions
diff --git a/tc/autotuner/genetic_search.cc b/tc/autotuner/genetic_search.cc
Lines changed: 6 additions & 6 deletions
diff --git a/tc/autotuner/genetic_tuning_harness.cc b/tc/autotuner/genetic_tuning_harness.cc
Lines changed: 17 additions & 16 deletions
diff --git a/tc/autotuner/genetic_tuning_harness.h b/tc/autotuner/genetic_tuning_harness.h
Lines changed: 4 additions & 3 deletions
diff --git a/tc/autotuner/parameters.cc b/tc/autotuner/parameters.cc
Lines changed: 1 addition & 1 deletion
diff --git a/tc/benchmarks/benchmark_fixture.h b/tc/benchmarks/benchmark_fixture.h
Lines changed: 8 additions & 8 deletions
diff --git a/tc/c2/tc_op.h b/tc/c2/tc_op.h
Lines changed: 1 addition & 1 deletion
diff --git a/tc/core/cuda/cuda_mapping_options-inl.h b/tc/core/cuda/cuda_mapping_options-inl.h
Lines changed: 9 additions & 9 deletions
@@ -176,7 +176,7 @@ if (WITH_CAFFE2)
   include_directories(third-party/caffe2/third_party/eigen)
 
   find_path(CAFFE2_INCLUDE_DIR NAMES caffe2)
-  include_directories(${CAFFE2_INCLUDE_DIR})
+  include_directories(SYSTEM ${CAFFE2_INCLUDE_DIR})
   find_library(CAFFE2_CPU_LIBRARIES NAMES caffe2 PATHS ${CMAKE_INSTALL_PREFIX} PATH_SUFFIXES lib lib64)
   message(STATUS "Found Caffe2_CPU: ${CAFFE2_CPU_LIBRARIES}")
   if (WITH_CUDA)
 
@@ -49,7 +49,7 @@ void prepareOutputs(
     throw lang::ErrorReport(func) << "expected " << tensorInfo.size()
                                   << " outputs but found " << outputs.size();
   }
-  for (int i = 0; i < tensorInfo.size(); ++i) {
+  for (size_t i = 0; i < tensorInfo.size(); ++i) {
     auto info = tensorInfo[i];
     auto stype = at::toScalarType(info->dtype);
     if (outputs.size() < tensorInfo.size()) {
@@ -121,7 +121,7 @@ void ATenCompilationUnit<ExecutorType>::uncheckedRun(
   constexpr auto kReservedSize = 8;
   std::vector<const void*> I(kReservedSize, nullptr);
   std::vector<void*> O(kReservedSize, nullptr);
-  int i;
+  size_t i;
   for (i = 0; i < inputs.size(); ++i) {
     if (i < kReservedSize) {
       I[i] = inputs[i].data_ptr();
 
@@ -98,11 +98,11 @@ llvm::Optional<CudaMappingOptions> GeneticAutotuner::tune(
     CudaMappingOptions baseMapping,
     std::vector<CudaMappingOptions> startingPoints,
     const TuningParameterFixer& fixedParams) {
-  CHECK_EQ(1, tcNameMap_.count(tcName)) << "Error looking up " << tcName;
+  CHECK_EQ(1u, tcNameMap_.count(tcName)) << "Error looking up " << tcName;
   enableOrLoadCache(cacheFileName);
 
   if (FLAGS_tuner_gen_restore_from_proto && !(cacheFileName.empty())) {
-    CHECK_GT(inputs.size(), 0);
+    CHECK_GT(inputs.size(), 0u);
 
     auto restoredCandidates = load(
         cacheFileName,
@@ -185,7 +185,7 @@ llvm::Optional<CudaMappingOptions> GeneticAutotuner::tune(
   ee.define(tc_);
   auto outputPtrs = ee.inferOutputTensorInfo(tcName, inputs.begin()->second);
 
-  CHECK_GT(inputs.size(), 0);
+  CHECK_GT(inputs.size(), 0u);
   return tc::autotune::getBestOptions(
       canonicalTc(tcNameMap_.at(tcName)), inputs.begin()->second, outputPtrs);
 }
 
@@ -49,7 +49,7 @@ template <typename RNG>
 void mutate(
     CandidateConfiguration& candidate,
     double rate,
-    int iterations,
+    size_t iterations,
     RNG& rng) {
   auto shouldMutate = [&]() -> bool {
     return std::discrete_distribution<int>{static_cast<double>(100 - rate),
@@ -179,7 +179,7 @@ GeneticSearch::GeneticSearch(
   }
   if (kMaxPopulationSize - population.size() > 0) {
     auto oldSize = population.size();
-    for (int i = oldSize; i < kMaxPopulationSize; ++i) {
+    for (size_t i = oldSize; i < kMaxPopulationSize; ++i) {
       population.emplace_back(
           make_unique<CandidateConfiguration>(*population.front()));
     }
@@ -202,7 +202,7 @@ GeneticSearch::GeneticSearch(
       rng{std::random_device{}()} {
   restoreRngState(rng);
   VALIDATE();
-  for (int i = 0; i < kMaxPopulationSize; ++i) {
+  for (size_t i = 0; i < kMaxPopulationSize; ++i) {
     population.emplace_back(make_unique<CandidateConfiguration>(conf));
   }
   randomizePopulation(population.begin(), population.end(), rng);
@@ -320,18 +320,18 @@ void GeneticSearch::updateParameters() {
            "when autotuning a TC operating on small tensors. The next "
            "generation will be randomly initialized.";
     population.resize(0);
-    for (int i = 0; i < kMaxPopulationSize; ++i) {
+    for (size_t i = 0; i < kMaxPopulationSize; ++i) {
       population.emplace_back(
           make_unique<CandidateConfiguration>(lastBestConf));
     }
     // Don't lose the first one which was the best from before
-    CHECK_LT(0, population.size());
+    CHECK_LT(0u, population.size());
     randomizePopulation(population.begin() + 1, population.end(), rng);
     return;
   }
 
   breed();
-  for (int i = kNumberElites; i < population.size(); ++i) {
+  for (size_t i = kNumberElites; i < population.size(); ++i) {
     mutate(*population[i], kMutationRate, kMutateIterations, rng);
   }
 }
 
@@ -146,7 +146,7 @@ std::vector<size_t> inputDivisorsAndPowers2(
 }
 
 size_t largestDim(const std::vector<const DLTensor*>& inputs) {
-  CHECK_GE(inputs.size(), 0);
+  CHECK_GE(inputs.size(), 0u);
   auto maxElement = std::max_element(
       inputs.begin(), inputs.end(), [](const DLTensor* a, const DLTensor* b) {
         return a->ndim < b->ndim;
@@ -157,7 +157,7 @@ size_t largestDim(const std::vector<const DLTensor*>& inputs) {
 } // namespace
 
 void GeneticTunerHarness::setupTuningParameters() {
-  CHECK_GT(kInputs_.size(), 0);
+  CHECK_GT(kInputs_.size(), 0u);
   auto range = inputDivisorsAndPowers2(kInputs_.begin()->second);
   auto rangeUpTo64 = filterHigherThan(range, 64);
 
@@ -208,6 +208,9 @@ std::vector<size_t> parseGpus() {
     LOG(GSTREAM) << line;                                \
   }
 
+constexpr size_t GeneticTunerHarness::kEarlyPruneFactor;
+constexpr size_t GeneticTunerHarness::kCatastrophicPerfFactor;
+
 // This function is run on a single pre-determined GPU, in a single thread
 // It takes the input/output DLTensor objects that reside on that GPU
 //
@@ -222,7 +225,7 @@ bool GeneticTunerHarness::warmupOrPrune(
     const std::vector<DLTensor*>& outputs,
     const std::vector<const DLTensor*>& inputs,
     size_t handle,
-    size_t bestTimeSoFar) {
+    Duration bestTimeSoFar) {
   // Pruning based on number of threads: if you don't hit at least k warps
   // (default k = 8; 256 total threads, controlled by
   // FLAGS_tuner_min_launch_total_threads) then it's likely the kernel is not
@@ -276,10 +279,8 @@ bool GeneticTunerHarness::warmupOrPrune(
   }
 
   // 1.b.
-  constexpr size_t kCatastrophicPerfFactor = 100;
-  if (bestTimeSoFar < std::numeric_limits<size_t>::max() and
-      prof >= std::chrono::microseconds(
-                  (kCatastrophicPerfFactor * bestTimeSoFar))) {
+  if (bestTimeSoFar < Duration::max() and
+      prof >= kCatastrophicPerfFactor * bestTimeSoFar) {
     return true;
   }
 
@@ -291,8 +292,8 @@ bool GeneticTunerHarness::warmupOrPrune(
   // 2. After reasonable warmup, look at the performance and prune with
   // kEarlyPruneFactor
   prof = engine.run(handle, inputs, outputs, true);
-  if (bestTimeSoFar < std::numeric_limits<size_t>::max() and
-      prof >= std::chrono::microseconds((kEarlyPruneFactor * bestTimeSoFar))) {
+  if (bestTimeSoFar < Duration::max() and
+      prof >= kEarlyPruneFactor * bestTimeSoFar) {
     return true;
   }
 
@@ -346,9 +347,9 @@ void GeneticTunerHarness::doGpuWork(
     ExecutorType& engine,
     Printer& printer) {
   WithDevice wd(gpu);
-  CHECK_EQ(1, kInputs_.count(gpu));
+  CHECK_EQ(1u, kInputs_.count(gpu));
   auto& inputs = kInputs_.at(gpu);
-  CHECK_EQ(1, outputs_.count(gpu));
+  CHECK_EQ(1u, outputs_.count(gpu));
   auto& outputs = outputs_.at(gpu);
 
   while (true) {
@@ -394,7 +395,7 @@ void GeneticTunerHarness::doGpuWork(
 
     std::vector<Duration> runtimes;
     try {
-      size_t bestTimeSoFar;
+      Duration bestTimeSoFar;
       {
         std::lock_guard<std::mutex> lock(bestTimeMtx_);
         bestTimeSoFar = bestTime_;
@@ -451,8 +452,8 @@ void GeneticTunerHarness::doGpuWork(
     // Save best time under lock
     {
       std::lock_guard<std::mutex> lock(bestTimeMtx_);
-      if (prof_us < bestTime_) {
-        bestTime_ = prof_us;
+      if (prof < bestTime_) {
+        bestTime_ = prof;
         bestCudaMappingOptions_ = options;
       }
     }
@@ -484,7 +485,7 @@ void GeneticTunerHarness::runOneGeneration(size_t generation) {
     currentCompilationJob_.store(0);
     numEvaluations_.store(0);
     readyToEvaluate_.resize(0);
-    for (int i = 0; i < kMaxPopulationSize; ++i) {
+    for (size_t i = 0; i < kMaxPopulationSize; ++i) {
       readyToEvaluate_.emplace_back();
       readyToEvaluate_[i].store(false);
     }
@@ -509,7 +510,7 @@ void GeneticTunerHarness::runOneGeneration(size_t generation) {
         cpuCompilationThread.join();
       }
     });
-    for (int i = 0; i < FLAGS_tuner_threads; ++i) {
+    for (size_t i = 0; i < FLAGS_tuner_threads; ++i) {
       cpuCompilationThreads.emplace_back(
           [this, &engine]() { this->doCompile(engine); });
     }
 
@@ -63,7 +63,7 @@ class GeneticTunerHarness {
       const std::vector<DLTensor*>& outputs,
       const std::vector<const DLTensor*>& inputs,
       size_t handle,
-      size_t bestTimeSoFar);
+      Duration bestTimeSoFar);
 
   /// Helper function to delegate compiling on the cpu to different threads
   template <typename ExecutorType>
@@ -85,7 +85,8 @@ class GeneticTunerHarness {
  public:
   static constexpr int kReducedWarmupIterations = 2;
   static constexpr int kReducedBenchmarkIterations = 10;
-  static constexpr int kEarlyPruneFactor = 5;
+  static constexpr size_t kEarlyPruneFactor = 5;
+  static constexpr size_t kCatastrophicPerfFactor = 100;
 
   const size_t kMaxPopulationSize;
   const uint8_t kCrossOverRate;
@@ -96,7 +97,7 @@ class GeneticTunerHarness {
 
  private:
   std::mutex bestTimeMtx_;
-  size_t bestTime_ = std::numeric_limits<size_t>::max();
+  Duration bestTime_ = Duration::max();
   CudaMappingOptions bestCudaMappingOptions_;
 
   const lang::TreeRef kTc_;
 
@@ -97,7 +97,7 @@ RangeParameter& RangeParameter::operator=(const RangeParameter& other) {
 }
 
 void BoolParameter::selectOption(size_t idx) {
-  CHECK_LE(idx, 1);
+  CHECK_LE(idx, 1u);
   selectValue(idx);
 }
 
 
@@ -121,14 +121,14 @@ struct Benchmark : public ::testing::Test {
     auto handle = atCompl.compile(name, inputs, mappingOptions);
     atCompl.run(name, inputs, outputs, handle);
     EXPECT_TRUE(checkFun(inputs, outputs));
-    for (int i = 1; i < tc::FLAGS_benchmark_warmup; ++i) {
+    for (size_t i = 1; i < tc::FLAGS_benchmark_warmup; ++i) {
       atCompl.run(name, inputs, outputs, handle);
     }
     std::vector<tc::Duration> kernelTimes;
     kernelTimes.reserve(tc::FLAGS_benchmark_iterations);
     std::vector<tc::Duration> totalTimes;
     totalTimes.reserve(tc::FLAGS_benchmark_iterations);
-    for (int i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
+    for (size_t i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
       kernelTimes.push_back(atCompl.run(name, inputs, outputs, handle, true));
       TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
       auto time(std::chrono::system_clock::now());
@@ -201,12 +201,12 @@ struct Benchmark : public ::testing::Test {
   template <typename InitFunction, typename InplaceFunction>
   void Reference(InitFunction init, InplaceFunction compute) {
     auto res = init();
-    for (int i = 1; i < tc::FLAGS_benchmark_warmup; ++i) {
+    for (size_t i = 1; i < tc::FLAGS_benchmark_warmup; ++i) {
       compute(res);
     }
     std::vector<tc::Duration> times;
     times.reserve(tc::FLAGS_benchmark_iterations);
-    for (int i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
+    for (size_t i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
       auto time(std::chrono::system_clock::now());
       compute(res);
       TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
@@ -278,14 +278,14 @@ struct Benchmark : public ::testing::Test {
     std::vector<at::Tensor> outputs;
     atCompl.run(name, inputs, outputs, handle);
     EXPECT_TRUE(checkFun(inputs, outputs));
-    for (int i = 1; i < tc::FLAGS_benchmark_warmup; ++i) {
+    for (size_t i = 1; i < tc::FLAGS_benchmark_warmup; ++i) {
       atCompl.run(name, inputs, outputs, handle);
     }
     std::vector<tc::Duration> kernelTimes;
     kernelTimes.reserve(tc::FLAGS_benchmark_iterations);
     std::vector<tc::Duration> totalTimes;
     totalTimes.reserve(tc::FLAGS_benchmark_iterations);
-    for (int i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
+    for (size_t i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
       kernelTimes.push_back(atCompl.run(name, inputs, outputs, handle, true));
       TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
       auto time(std::chrono::system_clock::now());
@@ -389,14 +389,14 @@ struct Benchmark : public ::testing::Test {
       std::vector<at::Tensor> outputs;
       atCompl.run(kernelName, inputs, outputs, handle);
       EXPECT_TRUE(checkFun(inputs, outputs));
-      for (int i = 1; i < tc::FLAGS_benchmark_warmup; ++i) {
+      for (size_t i = 1; i < tc::FLAGS_benchmark_warmup; ++i) {
         atCompl.run(kernelName, inputs, outputs, handle);
       }
       std::vector<tc::Duration> kernelTimes;
       kernelTimes.reserve(tc::FLAGS_benchmark_iterations);
       std::vector<tc::Duration> totalTimes;
       totalTimes.reserve(tc::FLAGS_benchmark_iterations);
-      for (int i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
+      for (size_t i = 0; i < tc::FLAGS_benchmark_iterations; ++i) {
         kernelTimes.push_back(
             atCompl.run(kernelName, inputs, outputs, handle, true));
         TC_CUDA_RUNTIMEAPI_ENFORCE(cudaDeviceSynchronize());
 
@@ -85,7 +85,7 @@ class TcOp : public Operator<Context> {
   virtual void setupDefaultGradCudaMappingOptions() {}
 
   void prepareOutputs(const std::vector<const DLTensor*> tensorInfo) {
-    for (int i = 0; i < tensorInfo.size(); ++i) {
+    for (size_t i = 0; i < tensorInfo.size(); ++i) {
       auto info = tensorInfo[i];
       std::vector<int64_t> shape(info->shape, info->shape + info->ndim);
       Output(i)->Resize(shape);
 
@@ -20,8 +20,8 @@ namespace tc {
 // CudaDimView & CudaDim
 //
 CudaDim::CudaDim(std::vector<uint64_t> il) : ownedProto_(), view(ownedProto_) {
-  CHECK_GT(il.size(), 0) << "list of values in CudaDimView must be non-empty";
-  CHECK_LE(il.size(), 3) << "at most 3 values allowed in CudaDimView";
+  CHECK_GT(il.size(), 0u) << "list of values in CudaDimView must be non-empty";
+  CHECK_LE(il.size(), 3u) << "at most 3 values allowed in CudaDimView";
 
   switch (il.size()) {
     case 3:
@@ -80,13 +80,13 @@ std::array<uint64_t, 3> CudaDimView::extractDefaultedArray() const {
                               CudaDimView::defaultDim,
                               CudaDimView::defaultDim};
   auto v = extractVector();
-  CHECK_LE(v.size(), 3);
+  CHECK_LE(v.size(), 3u);
   std::copy(v.begin(), v.end(), arr.begin());
   return arr;
 }
 
 ValueAccessor<uint64_t> CudaDimView::operator[](size_t i) {
-  CHECK_LT(i, 3) << "index overflow";
+  CHECK_LT(i, 3u) << "index overflow";
   if (i == 0) {
     return ValueAccessor<uint64_t>(
         [this](uint64_t u) { this->proto.set_x(u); },
@@ -109,7 +109,7 @@ ValueAccessor<uint64_t> CudaDimView::operator[](size_t i) {
 }
 
 uint64_t CudaDimView::operator[](size_t i) const {
-  CHECK_LT(i, 3) << "index overflow";
+  CHECK_LT(i, 3u) << "index overflow";
   if (i == 0) {
     return proto.x();
   } else if (i == 1) {
@@ -192,8 +192,8 @@ CudaMappingOptions::mapToThreads(uint64_t x, uint64_t y, uint64_t z) {
 
 CudaMappingOptions& CudaMappingOptions::mapToThreads(
     const std::vector<uint64_t>& threads) {
-  CHECK_GT(threads.size(), 0) << "expected at least one thread size";
-  CHECK_LE(threads.size(), 3) << "expected at most three thread sizes";
+  CHECK_GT(threads.size(), 0u) << "expected at least one thread size";
+  CHECK_LE(threads.size(), 3u) << "expected at most three thread sizes";
 
   uint64_t x = threads[0];
   uint64_t y = threads.size() > 1 ? threads[1] : CudaDimView::defaultDim;
@@ -216,8 +216,8 @@ CudaMappingOptions::mapToBlocks(uint64_t x, uint64_t y, uint64_t z) {
 
 CudaMappingOptions& CudaMappingOptions::mapToBlocks(
     const std::vector<uint64_t>& blocks) {
-  CHECK_GT(blocks.size(), 0) << "expected at least one thread size";
-  CHECK_LE(blocks.size(), 3) << "expected at most three thread sizes";
+  CHECK_GT(blocks.size(), 0u) << "expected at least one thread size";
+  CHECK_LE(blocks.size(), 3u) << "expected at most three thread sizes";
 
   uint64_t x = blocks[0];
   uint64_t y = blocks.size() > 1 ? blocks[1] : CudaDimView::defaultDim;
Original file line number	Diff line number	Diff line change
`@@ -97,7 +97,7 @@ RangeParameter& RangeParameter::operator=(const RangeParameter& other) {`
`97`	`97`	`}`
`98`	`98`
`99`	`99`	`void BoolParameter::selectOption(size_t idx) {`
`100`		`- CHECK_LE(idx, 1);`
	`100`	`+ CHECK_LE(idx, 1u);`
`101`	`101`	`selectValue(idx);`
`102`	`102`	`}`
`103`	`103`