From f6a78dcc1d2e6938f6357376630298d3cfc874f3 Mon Sep 17 00:00:00 2001 From: Theodoros Theodoridis Date: Tue, 19 Jun 2018 14:49:33 +0200 Subject: [PATCH] [Cuda Codegen] Emit launch bounds Cuda functions can be annotated with launch bounds, that is the maximum number of threads per block (the minimum blocks per multiprocessor can also be specified). This information is used by nvrtc/nvcc during register allocation (and probably other phases as well). --- tc/core/polyhedral/cuda/codegen.cc | 14 +++++++++++--- test/test_cuda_mapper.cc | 4 ++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tc/core/polyhedral/cuda/codegen.cc b/tc/core/polyhedral/cuda/codegen.cc index 230623eff..452b1460a 100644 --- a/tc/core/polyhedral/cuda/codegen.cc +++ b/tc/core/polyhedral/cuda/codegen.cc @@ -153,9 +153,17 @@ void emitArgs(stringstream& ss, const Scop& scop) { void emitKernelSignature( stringstream& ss, const std::string& specializedName, - const Scop& scop) { + const Scop& scop, + const Block& block) { TC_CHECK_NE(specializedName, "") << "name not provided"; - ss << "__global__ void " << specializedName << "("; + auto b0 = block.view[0]; + b0 = b0 == 0 ? 1 : b0; + auto b1 = block.view[1]; + b1 = b1 == 0 ? 1 : b1; + auto b2 = block.view[2]; + b2 = b2 == 0 ? 
1 : b2; + ss << "__global__ __launch_bounds__(" << b0 * b1 * b2 << ") void " + << specializedName << "("; emitArgs(ss, scop); ss << ") {" << endl; } @@ -753,7 +761,7 @@ string emitCudaKernel( } stringstream ss; - emitKernelSignature(ss, specializedName, scop); + emitKernelSignature(ss, specializedName, scop, mscop.numThreads); emitThreadIdInit(ss, mscop); emitTensorViews(ss, scop.halide.outputs, paramValues); emitTensorViews(ss, scop.halide.inputs, paramValues); diff --git a/test/test_cuda_mapper.cc b/test/test_cuda_mapper.cc index 1e3d6ec69..4a0c9971b 100644 --- a/test/test_cuda_mapper.cc +++ b/test/test_cuda_mapper.cc @@ -451,7 +451,7 @@ def fun(float(N, N) A) -> (O) auto res = std::get<0>(mscop->codegen(specializedName)); string expected( - R"RES(__global__ void kernel_anon(int32 N, float32* pO, const float32* pA) { + R"RES(__global__ __launch_bounds__(1) void kernel_anon(int32 N, float32* pO, const float32* pA) { int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; float32 (*O)[N] = reinterpret_cast(pO); @@ -480,7 +480,7 @@ def fun(float(N, N) A, float(N, N) B, float(N) C) -> (O) auto res = std::get<0>(mscop->codegen(specializedName)); string expected = - R"RES(__global__ void kernel_anon(int32 N, float32* pO, const float32* pA, const float32* pB, const float32* pC) { + R"RES(__global__ __launch_bounds__(1) void kernel_anon(int32 N, float32* pO, const float32* pA, const float32* pB, const float32* pC) { int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z; int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z; float32 (*O)[512] = reinterpret_cast(pO);