
Commit 6a38863

nicolasvasilache authored and ftynse committed
Make Halide output standard types
Halide has its own way of pretty-printing types. In the case of `bool` (i.e. `uint1`), this conflicts with the builtin CUDA types, and it would likewise conflict for `(u)int2` and `(u)int4`. This commit makes our `Halide::IRPrinter` print `bool` and the other scalar types in a CUDA-compatible way. As a consequence, the typedefs in `cuda_libraries.h` can be removed.
1 parent 012e970 commit 6a38863
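For context, a minimal sketch of the clash (the struct shapes below approximate CUDA's builtin vector types from vector_types.h; the typedef is a hypothetical workaround, not code from this repository):

// CUDA headers already declare small vector types under these names, roughly:
struct uint1 { unsigned int x; };
struct int2 { int x, y; };
struct int4 { int x, y, z, w; };
// Halide's IRPrinter spells bool as "uint1", so emitted code such as
//   uint1 flag;
// picks up the struct above instead of a scalar, and a workaround like
//   typedef unsigned char uint1;  // hypothetical
// does not compile either, since it redefines the CUDA type.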

6 files changed: +103 -72 lines changed

tc/core/cuda/cuda_libraries.h

Lines changed: 0 additions & 13 deletions
@@ -34,19 +34,6 @@ constexpr auto types = R"C(
 // Can't include system dependencies with NVRTC
 // Can't include cuda_fp16.h with NVRTC due to transitive system dependencies
 // #include <cuda_fp16.h>
-
-// Halide type handling
-typedef char int8;
-typedef short int16;
-typedef int int32;
-typedef long int64;
-typedef unsigned char uint8;
-typedef unsigned short uint16;
-typedef unsigned int uint32;
-typedef unsigned long uint64;
-// typedef half float16;
-typedef float float32;
-typedef double float64;
 )C";
 
 constexpr auto defines = R"C(

tc/core/polyhedral/cuda/codegen.cc

Lines changed: 54 additions & 8 deletions
@@ -39,6 +39,37 @@ namespace polyhedral {
 
 namespace {
 
+static std::string halideTypeString(const Halide::Type& t) {
+  if (t.is_bool()) {
+    return "bool";
+  } else if (t.is_int() && t.bits() == 8) {
+    return "char";
+  } else if (t.is_int() && t.bits() == 16) {
+    return "short";
+  } else if (t.is_int() && t.bits() == 32) {
+    return "int";
+  } else if (t.is_int() && t.bits() == 64) {
+    return "long";
+  } else if (t.is_uint() && t.bits() == 8) {
+    return "unsigned char";
+  } else if (t.is_uint() && t.bits() == 16) {
+    return "unsigned short";
+  } else if (t.is_uint() && t.bits() == 32) {
+    return "unsigned int";
+  } else if (t.is_uint() && t.bits() == 64) {
+    return "unsigned long";
+  } else if (t.is_float() && t.bits() == 16) {
+    return "half";
+  } else if (t.is_float() && t.bits() == 32) {
+    return "float";
+  } else if (t.is_float() && t.bits() == 64) {
+    return "double";
+  }
+  std::stringstream ss;
+  ss << t;
+  return ss.str();
+}
+
 struct WS {
   static thread_local int n;
   WS() {
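As a quick sanity sketch of the mapping (hypothetical checks using Halide's public type constructors; not part of this commit):

#include <cassert>
// halideTypeString maps Halide scalar types to builtin C/CUDA spellings:
assert(halideTypeString(Halide::Bool()) == "bool");      // Halide prints uint1
assert(halideTypeString(Halide::Int(64)) == "long");     // Halide prints int64
assert(halideTypeString(Halide::Float(32)) == "float");  // Halide prints float32
// Anything unmatched (e.g. vector types) falls through to Halide's
// own printer via `ss << t`.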
@@ -102,7 +133,7 @@ vector<string> emitParams(const Scop& scop) {
   // Halide params. One of these two vectors will be empty.
   for (auto p : scop.halide.params) {
     stringstream ss;
-    ss << p.type() << " " << p.name();
+    ss << halideTypeString(p.type()) << " " << p.name();
     res.push_back(ss.str());
   }
   return res;
@@ -113,7 +144,7 @@ string emitTypedTensorName(
     Halide::OutputImageParam t,
     bool constInput = false) {
   stringstream ss;
-  ss << (constInput ? "const " : "") << t.type() << "* "
+  ss << (constInput ? "const " : "") << halideTypeString(t.type()) << "* "
      << makePointerName(t.name());
   return ss.str();
 }
@@ -195,11 +226,11 @@ void emitTensorView(
     ssViewType << "[" << extent << "]";
   }
   ss << ws.tab();
-  ss << (constInput ? "const " : "") << p.type() << " (*" << p.name() << ")"
-     << ssViewType.str();
+  ss << (constInput ? "const " : "") << halideTypeString(p.type()) << " (*"
+     << p.name() << ")" << ssViewType.str();
   ss << " = ";
-  ss << "reinterpret_cast<" << (constInput ? "const " : "") << p.type()
-     << " (*)" << ssViewType.str() << ">";
+  ss << "reinterpret_cast<" << (constInput ? "const " : "")
+     << halideTypeString(p.type()) << " (*)" << ssViewType.str() << ">";
   ss << "(" << makePointerName(p.name()) << ")";
   ss << ";";
   ss << endl;
@@ -604,6 +635,21 @@ void emitHalideExpr(
       IRPrinter::visit(op);
     }
   }
+  void visit(const Halide::Internal::IntImm* op) {
+    context.ss << "(" << halideTypeString(op->type) << ")" << op->value;
+  }
+  void visit(const Halide::Internal::UIntImm* op) {
+    context.ss << "(" << halideTypeString(op->type) << ")" << op->value;
+  }
+  void visit(const Halide::Internal::FloatImm* op) {
+    context.ss << "(" << halideTypeString(op->type) << ")" << op->value;
+  }
+  void visit(const Halide::Internal::Cast* op) {
+    context.ss << "(" << halideTypeString(op->type) << ")";
+    context.ss << "(";
+    op->value.accept(this);
+    context.ss << ")";
+  }
   // TODO: handle casts
   const CodegenStatementContext& context;
   const map<string, string>& substitutions;
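These overrides change how scalar immediates and casts are rendered; the updated test expectations further down show the effect. Roughly, for a float accumulator initialization and a cast of a loop index:

// Halide's default printing (old):
//   O[c0][c1] = 0.000000f;
//   O[c0][c1] = A[c0][c1] + float32(c0);
// With the visit() overrides above (new):
//   O[c0][c1] = (float)0.000000;
//   O[c0][c1] = A[c0][c1] + (float)(c0);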
@@ -720,7 +766,7 @@ void emitTmpDecl(stringstream& ss, const Scop& scop) {
     auto updateId = kvp.second;
     auto provide =
         scop.halide.statements.at(updateId).as<Halide::Internal::Provide>();
-    ss << provide->values[0].type() << " "
+    ss << halideTypeString(provide->values[0].type()) << " "
        << makeReductionTmpName(updateId, scop) << ";" << endl;
   }
 }
@@ -745,7 +791,7 @@ void emitPromotedArrayViewsHalide(stringstream& ss, const Scop& scop) {
     if (p.second.kind == Scop::PromotedDecl::Kind::SharedMem) {
       ss << "__shared__ ";
     }
-    ss << t << " " << viewName;
+    ss << halideTypeString(t) << " " << viewName;
     for (auto s : p.second.sizes) {
       ss << "[" << s << "]";
     }

test/cuda/test_basic_gpu.cc

Lines changed: 7 additions & 9 deletions
@@ -99,13 +99,15 @@ void loadUnload(const std::string& ptx) {
 
 TEST(BasicGpuTest, Nvrtc) {
   TC_CUDA_RUNTIMEAPI_ENFORCE(cudaFree(0));
-  auto PTX = jitCompile(R"CUDA(
+  auto PTX = jitCompile(
+      R"CUDA(
 extern "C" {
 __global__ void foo(int N)
 {
   assert(N == 1);
 }
-})CUDA", {"-G"});
+})CUDA",
+      {"-G"});
 
   std::string ptx(PTX.data());
   loadUnload(ptx);
@@ -153,15 +155,12 @@ namespace {
 // Mark the function argument as __restrict__ depending on the flag.
 std::string makeFuncWithOptionalRestrict(bool useRestrict) {
   std::stringstream ss;
-  ss << R"CUDA(typedef float float32;
-extern "C" {
-)CUDA";
   ss
-      << (useRestrict ? "__global__ void func(float32* __restrict__ pO2) {"
-                      : "__global__ void func(float32* pO2) {");
+      << (useRestrict ? "__global__ void func(float* __restrict__ pO2) {"
+                      : "__global__ void func(float* pO2) {");
   ss << R"CUDA(int b0 = blockIdx.x;
 int t0 = threadIdx.x;
-float32 (*O2)[2] = reinterpret_cast<float32 (*)[2]>(pO2);
+float (*O2)[2] = reinterpret_cast<float (*)[2]>(pO2);
 O2[b0][t0] = 0.000000f; // S1
 __syncthreads();
 if (t0 == 0) {
@@ -171,7 +170,6 @@ extern "C" {
 }
 __syncthreads();
 O2[b0][t0] = fmax(O2[b0][t0], 0); // S3
-}
 })CUDA";
   return ss.str();
 }

test/cuda/test_tc_mapper.cc

Lines changed: 2 additions & 2 deletions
@@ -327,8 +327,8 @@ def tensoraddstrided(float(N, M) I0_view, float(N, M) I1_view) -> (O) {
   auto res = Check(TC, name, options, inputs, checkFun);
   // This test should be modified when strided tensors are handled
   std::string expected =
-      "const float32 (*I0_view)[64] = "
-      "reinterpret_cast<const float32 (*)[64]>(pI0_view)";
+      "const float (*I0_view)[64] = "
+      "reinterpret_cast<const float (*)[64]>(pI0_view)";
   ASSERT_NE(std::string::npos, res.second.find(expected))
       << "In resulting code:\n"
      << res.second << "\nfound unexpected: " << expected;

test/test_cuda_mapper.cc

Lines changed: 24 additions & 24 deletions
@@ -365,9 +365,9 @@ def fun(float(N, M) A, float(N, M) B) -> (C) {
   std::string expected(
       R"RES(int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
 int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
-float32 (*C)[M] = reinterpret_cast<float32 (*)[M]>(pC);
-const float32 (*A)[M] = reinterpret_cast<const float32 (*)[M]>(pA);
-const float32 (*B)[M] = reinterpret_cast<const float32 (*)[M]>(pB);
+float (*C)[M] = reinterpret_cast<float (*)[M]>(pC);
+const float (*A)[M] = reinterpret_cast<const float (*)[M]>(pA);
+const float (*B)[M] = reinterpret_cast<const float (*)[M]>(pB);
 for (int c1 = 16 * b1; c1 < M; c1 += 4096) {
   if (M >= t0 + c1 + 1) {
     C[(t1 + 16 * b0)][(t0 + c1)] = (A[(t1 + 16 * b0)][(t0 + c1)] + B[(t1 + 16 * b0)][(t0 + c1)]);
@@ -400,16 +400,16 @@ def fun(float(N, N, N, N) A, float(N, N) B, float(N, N) C, float(N, N) D)
   std::string expected(
       R"RES(int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
 int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
-float32 (*O1)[N] = reinterpret_cast<float32 (*)[N]>(pO1);
-float32 (*O2)[N] = reinterpret_cast<float32 (*)[N]>(pO2);
-float32 (*O3)[N] = reinterpret_cast<float32 (*)[N]>(pO3);
-const float32 (*A)[N][N][N] = reinterpret_cast<const float32 (*)[N][N][N]>(pA);
-const float32 (*B)[N] = reinterpret_cast<const float32 (*)[N]>(pB);
-const float32 (*C)[N] = reinterpret_cast<const float32 (*)[N]>(pC);
-const float32 (*D)[N] = reinterpret_cast<const float32 (*)[N]>(pD);
+float (*O1)[N] = reinterpret_cast<float (*)[N]>(pO1);
+float (*O2)[N] = reinterpret_cast<float (*)[N]>(pO2);
+float (*O3)[N] = reinterpret_cast<float (*)[N]>(pO3);
+const float (*A)[N][N][N] = reinterpret_cast<const float (*)[N][N][N]>(pA);
+const float (*B)[N] = reinterpret_cast<const float (*)[N]>(pB);
+const float (*C)[N] = reinterpret_cast<const float (*)[N]>(pC);
+const float (*D)[N] = reinterpret_cast<const float (*)[N]>(pD);
 for (int c0 = 0; c0 < N; c0 += 1) {
   for (int c1 = 0; c1 < N; c1 += 1) {
-    O1[c0][c1] = 0.000000f;
+    O1[c0][c1] = (float)0.000000;
   }
 }
 for (int c0 = 0; c0 < N; c0 += 1) {
@@ -449,14 +449,14 @@ def fun(float(N, N) A) -> (O)
   auto res = std::get<0>(mscop->codegen(specializedName));
 
   string expected(
-      R"RES(__global__ void kernel_anon(int32 N, float32* pO, const float32* pA) {
+      R"RES(__global__ void kernel_anon(int N, float* pO, const float* pA) {
 int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
 int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
-float32 (*O)[N] = reinterpret_cast<float32 (*)[N]>(pO);
-const float32 (*A)[N] = reinterpret_cast<const float32 (*)[N]>(pA);
+float (*O)[N] = reinterpret_cast<float (*)[N]>(pO);
+const float (*A)[N] = reinterpret_cast<const float (*)[N]>(pA);
 for (int c0 = 0; c0 < N; c0 += 1) {
   for (int c1 = 0; c1 < N; c1 += 1) {
-    O[c0][c1] = (((A[c0][c1] + float32(c0)) + float32(c1)) + float32(N));
+    O[c0][c1] = (((A[c0][c1] + (float)(c0)) + (float)(c1)) + (float)(N));
   }
 }
 }
@@ -478,13 +478,13 @@ def fun(float(N, N) A, float(N, N) B, float(N) C) -> (O)
   auto res = std::get<0>(mscop->codegen(specializedName));
 
   string expected =
-      R"RES(__global__ void kernel_anon(int32 N, float32* pO, const float32* pA, const float32* pB, const float32* pC) {
+      R"RES(__global__ void kernel_anon(int N, float* pO, const float* pA, const float* pB, const float* pC) {
 int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
 int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
-float32 (*O)[512] = reinterpret_cast<float32 (*)[512]>(pO);
-const float32 (*A)[512] = reinterpret_cast<const float32 (*)[512]>(pA);
-const float32 (*B)[512] = reinterpret_cast<const float32 (*)[512]>(pB);
-const float32 (*C) = reinterpret_cast<const float32 (*)>(pC);
+float (*O)[512] = reinterpret_cast<float (*)[512]>(pO);
+const float (*A)[512] = reinterpret_cast<const float (*)[512]>(pA);
+const float (*B)[512] = reinterpret_cast<const float (*)[512]>(pB);
+const float (*C) = reinterpret_cast<const float (*)>(pC);
 for (int c0 = 0; c0 <= 511; c0 += 1) {
   for (int c1 = 0; c1 <= 511; c1 += 1) {
     O[c0][c1] = (nextafter(C[c0], exp(A[c0][c1])) + log(B[c1][c0]));
@@ -499,13 +499,13 @@ def fun(float(N, N) A, float(N, N) B, float(N) C) -> (O)
 constexpr auto kExpectedMatmul_64_64_64 =
     R"CUDA(int b0 = blockIdx.x; int b1 = blockIdx.y; int b2 = blockIdx.z;
 int t0 = threadIdx.x; int t1 = threadIdx.y; int t2 = threadIdx.z;
-float32 (*O)[64] = reinterpret_cast<float32 (*)[64]>(pO);
-const float32 (*A)[64] = reinterpret_cast<const float32 (*)[64]>(pA);
-const float32 (*B)[64] = reinterpret_cast<const float32 (*)[64]>(pB);
+float (*O)[64] = reinterpret_cast<float (*)[64]>(pO);
+const float (*A)[64] = reinterpret_cast<const float (*)[64]>(pA);
+const float (*B)[64] = reinterpret_cast<const float (*)[64]>(pB);
 for (int c0 = 0; c0 <= 63; c0 += 16) {
   for (int c1 = 0; c1 <= 63; c1 += 16) {
     for (int c2 = t1; c2 <= 15; c2 += 8) {
-      O[(c0 + c2)][(t0 + c1)] = 0.000000f;
+      O[(c0 + c2)][(t0 + c1)] = (float)0.000000;
       for (int c4 = 0; c4 <= 63; c4 += 1) {
         O[(c0 + c2)][(t0 + c1)] = (O[(c0 + c2)][(t0 + c1)] + (A[(c0 + c2)][c4]*B[c4][(t0 + c1)]));
       }

test/test_cuda_mapper_memory_promotion.cc

Lines changed: 16 additions & 16 deletions
@@ -113,9 +113,9 @@ def fun(float(N,M,K,L) A, float(N,M,K,L) B) -> (C) {
 };
 
 TEST_F(Sum4D, CodeOuterBand) {
-  auto declarations = {"__shared__ float32 _A_0[16][16][16][16];",
-                       "__shared__ float32 _B_0[16][16][16][16];",
-                       "__shared__ float32 _C_0[16][16][16][16];"};
+  auto declarations = {"__shared__ float _A_0[16][16][16][16];",
+                       "__shared__ float _B_0[16][16][16][16];",
+                       "__shared__ float _C_0[16][16][16][16];"};
 
   auto copyA =
       "_A_0[c4][c5][c6][c7] = A[16 * b0 + c4][16 * b1 + c5][c2 + c6][c3 + c7];";
@@ -161,9 +161,9 @@ TEST_F(Sum4D, CodeOuterBand) {
  * promoteEverythingAt does not call mapCopiesToThreads.
  */
 TEST_F(Sum4D, CodeAboveThreadMapping) {
-  auto declarations = {"__shared__ float32 _A_0[16][16][16][16];",
-                       "__shared__ float32 _B_0[16][16][16][16];",
-                       "__shared__ float32 _C_0[16][16][16][16];"};
+  auto declarations = {"__shared__ float _A_0[16][16][16][16];",
+                       "__shared__ float _B_0[16][16][16][16];",
+                       "__shared__ float _C_0[16][16][16][16];"};
   auto copyA =
       "_A_0[c4][c5][c6][c7] = A[16 * b0 + c4][16 * b1 + c5][c2 + c6][c3 + c7]";
   auto copyB =
@@ -204,9 +204,9 @@ TEST_F(Sum4D, CodeAboveThreadMapping) {
 }
 
 TEST_F(Sum4D, CodeInnerBand) {
-  auto declarations = {"__shared__ float32 _C_0[1][1][1][1];",
-                       "__shared__ float32 _A_0[1][1][1][1];",
-                       "__shared__ float32 _B_0[1][1][1][1];"};
+  auto declarations = {"__shared__ float _C_0[1][1][1][1];",
+                       "__shared__ float _A_0[1][1][1][1];",
+                       "__shared__ float _B_0[1][1][1][1];"};
   auto copyA =
       "_A_0[0][0][0][0] = A[16 * b0 + c4][16 * b1 + c5][c2 + c6][t0 + c3];";
   auto copyB =
@@ -473,9 +473,9 @@ def fun(float(N,K) A, float(K,M) B, float(N,M) C) -> (O) {
 }
 
 void expectNoABCPromotion(const std::string& code) {
-  auto aDeclPos = code.find(" float32 _A_0");
-  auto bDeclPos = code.find(" float32 _B_0");
-  auto cDeclPos = code.find(" float32 _C_0");
+  auto aDeclPos = code.find(" float _A_0");
+  auto bDeclPos = code.find(" float _B_0");
+  auto cDeclPos = code.find(" float _C_0");
   EXPECT_TRUE(aDeclPos == std::string::npos)
       << "tensor A promoted to register but has elements accessed "
       << "by multiple threads";
@@ -487,7 +487,7 @@ def fun(float(N,K) A, float(K,M) B, float(N,M) C) -> (O) {
 }
 
 void expectFourOElementsPromoted(const std::string& code) {
-  auto oDeclPos = code.find("float32 _O_0[4][1];");
+  auto oDeclPos = code.find("float _O_0[4][1];");
   EXPECT_TRUE(oDeclPos != std::string::npos)
       << "expected O to be promoted to registers";
 
@@ -541,7 +541,7 @@ TEST_F(MatMulBias, RegisterPromotion) {
       .usePrivateMemory(true);
 
   auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
-  auto declPos = code.find("float32 _O_0");
+  auto declPos = code.find("float _O_0");
   auto copyToPos =
       code.find("_O_0[0][0] = O[32 * b0 + c3][t0 + 32 * b1]", declPos + 1);
   auto copyFromPos =
@@ -570,7 +570,7 @@ TEST_F(MatMulBias, RegisterPromotionSharedPreference) {
 
   auto code = emitCode({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
 
-  auto declPos = code.find("float32 _O_0[1][1]");
+  auto declPos = code.find("float _O_0[1][1]");
   EXPECT_TRUE(declPos == std::string::npos)
       << "not expected promotion to register because promoted to shared";
 
@@ -606,7 +606,7 @@ TEST_F(MatMulBias, RegistersAtRootNotEnoughUnroll) {
   auto mscop = prepare({{"N", 42}, {"M", 56}, {"K", 37}}, mappingOptions);
   promoteToRegistersBelow(*mscop, mscop->scop().scheduleRoot());
   auto code = emitCode(mscop);
-  auto oDeclPos = code.find("float32 _O_0;");
+  auto oDeclPos = code.find("float _O_0;");
 
   EXPECT_TRUE(oDeclPos == std::string::npos)
       << "not expected O to be promoted to registers";
