
Commit 1d9d6e3 (parent c6730e3)

CudaGPUInfo: record the number of registers per block

This will be used to compute the default number of elements to promote to private storage.
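The sizing heuristic the message alludes to is not part of this diff. As a rough illustration only, a promotion-size computation based on the new value might look like the sketch below; the function name, the threadsPerBlock parameter, and the one-register-per-element default are all hypothetical and do not come from this commit.

#include <algorithm>
#include <cstddef>

// Hypothetical sketch: cap the number of elements promoted to private
// (register) storage by each thread's share of the block's register file.
size_t defaultPrivateElements(
    size_t regsPerBlock, // value recorded by this commit
    size_t threadsPerBlock, // block size chosen elsewhere (assumed known)
    size_t regsPerElement = 1) { // e.g. one 32-bit register per scalar (assumption)
  if (threadsPerBlock == 0 || regsPerElement == 0) {
    return 0;
  }
  return std::max<size_t>(1, regsPerBlock / (threadsPerBlock * regsPerElement));
}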

2 files changed: 21 additions, 6 deletions

tc/core/cuda/cuda.cc

Lines changed: 14 additions & 4 deletions
@@ -30,7 +30,8 @@ DEFINE_bool(use_nvprof, false, "Start / stop nvprof");
 
 namespace {
 
-std::tuple<std::vector<std::string>, std::vector<size_t>> init() {
+std::tuple<std::vector<std::string>, std::vector<size_t>, std::vector<size_t>>
+init() {
   int deviceCount = 0;
   auto err_id = cudaGetDeviceCount(&deviceCount);
   if (err_id == 35 or err_id == 30) {
@@ -44,14 +45,16 @@ std::tuple<std::vector<std::string>, std::vector<size_t>> init() {
   }
   std::vector<std::string> gpuNames;
   std::vector<size_t> sharedMemSizes;
+  std::vector<size_t> registersPerBlock;
   gpuNames.reserve(deviceCount);
   for (int i = 0; i < deviceCount; ++i) {
     cudaDeviceProp deviceProp;
     TC_CUDA_RUNTIMEAPI_ENFORCE(cudaGetDeviceProperties(&deviceProp, i));
     gpuNames.emplace_back(deviceProp.name);
     sharedMemSizes.emplace_back(deviceProp.sharedMemPerBlock);
+    registersPerBlock.emplace_back(deviceProp.regsPerBlock);
   }
-  return std::make_tuple(gpuNames, sharedMemSizes);
+  return std::make_tuple(gpuNames, sharedMemSizes, registersPerBlock);
 }
 
 } // namespace
@@ -61,8 +64,8 @@ CudaGPUInfo& CudaGPUInfo::GPUInfo() {
   static thread_local bool inited = false;
   if (!inited) {
     auto infos = init();
-    pInfo = std::unique_ptr<CudaGPUInfo>(
-        new CudaGPUInfo(std::get<0>(infos), std::get<1>(infos)));
+    pInfo = std::unique_ptr<CudaGPUInfo>(new CudaGPUInfo(
+        std::get<0>(infos), std::get<1>(infos), std::get<2>(infos)));
     inited = true;
   }
   return *pInfo;
@@ -102,4 +105,11 @@ size_t CudaGPUInfo::SharedMemorySize() const {
   }
   return sharedMemSizes_.at(CurrentGPUId());
 }
+
+size_t CudaGPUInfo::RegistersPerBlock() const {
+  if (NumberGPUs() == 0) {
+    return 0; // no registers if no GPUs
+  }
+  return registersPerBlock_.at(CurrentGPUId());
+}
 } // namespace tc
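For reference, the values recorded above come straight from the CUDA runtime; a minimal standalone query, mirroring what init() does but with simplified error handling instead of TC_CUDA_RUNTIMEAPI_ENFORCE, could look like this:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int deviceCount = 0;
  if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
    return 1; // no usable CUDA driver/runtime
  }
  for (int i = 0; i < deviceCount; ++i) {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) {
      continue;
    }
    // Same fields the commit records: name, shared memory, registers per block.
    std::printf(
        "%s: %zu bytes of shared memory, %d registers per block\n",
        prop.name,
        prop.sharedMemPerBlock,
        prop.regsPerBlock);
  }
  return 0;
}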

tc/core/cuda/cuda.h

Lines changed: 7 additions & 2 deletions
@@ -98,8 +98,11 @@ struct WithCudaDevice {
 class CudaGPUInfo {
   CudaGPUInfo(
       const std::vector<std::string>& gpuNames,
-      const std::vector<size_t>& sharedMemSizes)
-      : gpuNames_(gpuNames), sharedMemSizes_(sharedMemSizes) {}
+      const std::vector<size_t>& sharedMemSizes,
+      const std::vector<size_t>& registersPerBlock)
+      : gpuNames_(gpuNames),
+        sharedMemSizes_(sharedMemSizes),
+        registersPerBlock_(registersPerBlock) {}
 
  public:
  static CudaGPUInfo& GPUInfo();
@@ -112,9 +115,11 @@ class CudaGPUInfo {
  std::string GetGPUName(int id = -1) const;
  std::string getCudaDeviceStr() const;
  size_t SharedMemorySize() const;
+  size_t RegistersPerBlock() const;
 
  std::vector<std::string> gpuNames_;
  std::vector<size_t> sharedMemSizes_;
+  std::vector<size_t> registersPerBlock_;
 };
 
 struct CudaProfiler {
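With the header change above, a call site can read the limit for the current device through the existing singleton; per the cuda.cc diff, the accessor returns 0 when no GPU is visible. A sketch of such a call site:

#include "tc/core/cuda/cuda.h"

// Registers available to a block on the current device (0 if no GPU).
size_t regsPerBlock = tc::CudaGPUInfo::GPUInfo().RegistersPerBlock();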
