
Commit 1d9d6e3 (parent c6730e3)

CudaGPUInfo: record the number of registers per block

This will be used to compute the default number of elements to promote to private storage.
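The sizing heuristic the message alludes to is not part of this diff. As a rough illustration only, a promotion-size computation based on the new value might look like the sketch below; the function name, the threadsPerBlock parameter, and the one-register-per-element default are all hypothetical and do not come from this commit.

#include <algorithm>
#include <cstddef>

// Hypothetical sketch: cap the number of elements promoted to private
// (register) storage by each thread's share of the block's register file.
size_t defaultPrivateElements(
    size_t regsPerBlock, // value recorded by this commit
    size_t threadsPerBlock, // block size chosen elsewhere (assumed known)
    size_t regsPerElement = 1) { // e.g. one 32-bit register per scalar (assumption)
  if (threadsPerBlock == 0 || regsPerElement == 0) {
    return 0;
  }
  return std::max<size_t>(1, regsPerBlock / (threadsPerBlock * regsPerElement));
}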

2 files changed: 21 additions, 6 deletions

tc/core/cuda/cuda.cc

Lines changed: 14 additions & 4 deletions
@@ -30,7 +30,8 @@ DEFINE_bool(use_nvprof, false, "Start / stop nvprof");
 
 namespace {
 
-std::tuple<std::vector<std::string>, std::vector<size_t>> init() {
+std::tuple<std::vector<std::string>, std::vector<size_t>, std::vector<size_t>>
+init() {
   int deviceCount = 0;
   auto err_id = cudaGetDeviceCount(&deviceCount);
   if (err_id == 35 or err_id == 30) {
@@ -44,14 +45,16 @@ std::tuple<std::vector<std::string>, std::vector<size_t>> init() {
   }
   std::vector<std::string> gpuNames;
   std::vector<size_t> sharedMemSizes;
+  std::vector<size_t> registersPerBlock;
   gpuNames.reserve(deviceCount);
   for (int i = 0; i < deviceCount; ++i) {
     cudaDeviceProp deviceProp;
     TC_CUDA_RUNTIMEAPI_ENFORCE(cudaGetDeviceProperties(&deviceProp, i));
     gpuNames.emplace_back(deviceProp.name);
     sharedMemSizes.emplace_back(deviceProp.sharedMemPerBlock);
+    registersPerBlock.emplace_back(deviceProp.regsPerBlock);
   }
-  return std::make_tuple(gpuNames, sharedMemSizes);
+  return std::make_tuple(gpuNames, sharedMemSizes, registersPerBlock);
 }
 
 } // namespace
@@ -61,8 +64,8 @@ CudaGPUInfo& CudaGPUInfo::GPUInfo() {
   static thread_local bool inited = false;
   if (!inited) {
     auto infos = init();
-    pInfo = std::unique_ptr<CudaGPUInfo>(
-        new CudaGPUInfo(std::get<0>(infos), std::get<1>(infos)));
+    pInfo = std::unique_ptr<CudaGPUInfo>(new CudaGPUInfo(
+        std::get<0>(infos), std::get<1>(infos), std::get<2>(infos)));
     inited = true;
   }
   return *pInfo;
@@ -102,4 +105,11 @@ size_t CudaGPUInfo::SharedMemorySize() const {
   }
   return sharedMemSizes_.at(CurrentGPUId());
 }
+
+size_t CudaGPUInfo::RegistersPerBlock() const {
+  if (NumberGPUs() == 0) {
+    return 0; // no registers if no GPUs
+  }
+  return registersPerBlock_.at(CurrentGPUId());
+}
 } // namespace tc
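For reference, the values recorded above come straight from the CUDA runtime; a minimal standalone query, mirroring what init() does but with simplified error handling instead of TC_CUDA_RUNTIMEAPI_ENFORCE, could look like this:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int deviceCount = 0;
  if (cudaGetDeviceCount(&deviceCount) != cudaSuccess) {
    return 1; // no usable CUDA driver/runtime
  }
  for (int i = 0; i < deviceCount; ++i) {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) {
      continue;
    }
    // Same fields the commit records: name, shared memory, registers per block.
    std::printf(
        "%s: %zu bytes of shared memory, %d registers per block\n",
        prop.name,
        prop.sharedMemPerBlock,
        prop.regsPerBlock);
  }
  return 0;
}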

tc/core/cuda/cuda.h

Lines changed: 7 additions & 2 deletions
@@ -98,8 +98,11 @@ struct WithCudaDevice {
 class CudaGPUInfo {
   CudaGPUInfo(
       const std::vector<std::string>& gpuNames,
-      const std::vector<size_t>& sharedMemSizes)
-      : gpuNames_(gpuNames), sharedMemSizes_(sharedMemSizes) {}
+      const std::vector<size_t>& sharedMemSizes,
+      const std::vector<size_t>& registersPerBlock)
+      : gpuNames_(gpuNames),
+        sharedMemSizes_(sharedMemSizes),
+        registersPerBlock_(registersPerBlock) {}
 
  public:
  static CudaGPUInfo& GPUInfo();
@@ -112,9 +115,11 @@ class CudaGPUInfo {
  std::string GetGPUName(int id = -1) const;
  std::string getCudaDeviceStr() const;
  size_t SharedMemorySize() const;
+  size_t RegistersPerBlock() const;
 
  std::vector<std::string> gpuNames_;
  std::vector<size_t> sharedMemSizes_;
+  std::vector<size_t> registersPerBlock_;
 };
 
 struct CudaProfiler {
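With the header change above, a call site can read the limit for the current device through the existing singleton; per the cuda.cc diff, the accessor returns 0 when no GPU is visible. A sketch of such a call site:

#include "tc/core/cuda/cuda.h"

// Registers available to a block on the current device (0 if no GPU).
size_t regsPerBlock = tc::CudaGPUInfo::GPUInfo().RegistersPerBlock();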
