
Commit 7e2df7c

Merge pull request #512 from nicolasvasilache/pr/types

Types support and min_distance.py function example

2 parents: af9b3fb + c20658a
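
In short, this commit wires fixed-width scalar types through TC end to end: new `float16`/`float32`/`float64` tokens in the lexer, Halide translation for them, CUDA-side typedefs for the integer types, and an example plus a test exercising `uint8`. A minimal sketch of the new capability, mirroring the Python example and C++ test in the diffs below (it assumes a CUDA device and the Python bindings from this build):

import tensor_comprehensions as tc
import torch

# uint8 tensors can now be typed directly in a TC signature; B(0)
# accumulates all N bytes of A via the bang (+=!) reduction.
lang = """
def test_type(uint8(N) A) -> (B) { B(k) +=! A(i) where k in 0:1 }
"""
test_type = tc.define(lang, name="test_type")
B = test_type(torch.ones(100).byte().cuda())  # sums the 100 bytes into B(0)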

File tree

8 files changed: +298 additions, −12 deletions


.jenkins/build.sh

Lines changed: 4 additions & 0 deletions

@@ -69,6 +69,10 @@ WITH_CAFFE2=ON CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda CLANG_PREFIX=$(${CONDA_PREF
 python setup.py install
 ./test_python/run_test.sh
 
+for f in $(find ./python/examples -name "*.py"); do
+  python $f
+done
+
 FILTER_OUT="benchmark_MLP_model benchmark_kronecker" ./test.sh
 # 2LUT can OOM on smaller Maxwells on our CI machines
 ./build/tc/benchmarks/benchmark_MLP_model --gtest_filter=-*2LUT*

python/examples/min_distance.py

Lines changed: 190 additions & 0 deletions

@@ -0,0 +1,190 @@ (new file)

# Copyright (c) 2017-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
import tensor_comprehensions as tc
from tensor_comprehensions.tc import set_logtostderr
from tensor_comprehensions.tc import set_debug_tc_mapper
from tensor_comprehensions.tc import set_debug_cuda

import numpy as np
import torch

#
# Example submitted by @mdouze, originally related to uint8 type support
#

debug = False
set_logtostderr(debug)
set_debug_tc_mapper(debug)
set_debug_cuda(debug)

N = 1000
M = 32

codes = np.random.randint(1 << 32, size=(N, M // 4)).astype('uint32')
codes = codes.view('uint8')
luts = np.random.randn(M, 256).astype('float32')

codes_t = torch.from_numpy(codes).cuda()
luts_t = torch.from_numpy(luts).cuda()

lang = """
# mindis as a single kernel will require grid synchronization to run efficiently
def mindis(float(M, 256) L, uint8(N, M) Codes) -> (S, v, min_idx) {
    S(n) +=! L(r_m, int32(Codes(n, r_m)))
    v min=! S(r_n)
    min_idx min=! (S(r_n) == v) ? r_n : N
}

# Even when splitting in 3 kernels, global device reduction will be needed to
# run efficiently
# don't try to run it with large sizes for now
def reduce_codes(float(M, 256) L, uint8(N, M) Codes) -> (S) {
    S(n) +=! L(r_m, int32(Codes(n, r_m)))
}
def min_2d(float(N) S) -> (v) {
    v min=! S(r_n)
}
def argmin_2d(float(N) S, float v) -> (min_idx) {
    min_idx min=! (S(r_n) == v) ? r_n : N
}
"""

mindis = tc.define(lang, name="mindis")
S, v, min_idx = mindis(luts_t, codes_t)
print("minval: {} minidx: {}".format(v, min_idx))

reduce_codes = tc.define(lang, name="reduce_codes")
min_2d = tc.define(lang, name="min_2d")
argmin_2d = tc.define(lang, name="argmin_2d")

S = reduce_codes(luts_t, codes_t)
v = min_2d(S)
min_idx = argmin_2d(S, v)

print("minval: {} minidx: {}".format(v, min_idx))

################################################################################
# Each reduction is probably easier to optimize with a 2-staged TC where we
# artificially increase parallelism and finish the reduction in a second kernel.
# Properly choosing D such that N = D * (N / D) should result in a good version
# with 5 kernels total.
################################################################################
N = 10 ** 5  # bump to 10**7 when ready for primetime
D = 1000
assert N % D == 0, "D={} must divide N={}".format(D, N)
M = 32

lang = """
def reduce_codes(float(M, 256) L, uint8(N, M) Codes) -> (S) {
    S(n) +=! L(r_m, int32(Codes(n, r_m)))
}
def min_2d(float(D, NBYD) S) -> (V) {
    V(d) min=! S(d, r_nbyd)
}
def min_1d(float(D) V) -> (v) {
    v min=! V(r_d)
}
def argmin_2d(float(D, NBYD) S, float v) -> (MinIdx) {
    MinIdx(d) min=! (S(d, r_nbyd) == v) ? d * NBYD + r_nbyd : N
}
def argmin_1d(float(N) S, int32(D) MinIdx) -> (min_idx) {
    min_idx min=! (MinIdx(r_d) < N) ? r_d : N
}
"""

codes = np.random.randint(1 << 32, size=(N, M // 4)).astype('uint32')
codes = codes.view('uint8')
luts = np.random.randn(M, 256).astype('float32')

codes_t = torch.from_numpy(codes).cuda()
luts_t = torch.from_numpy(luts).cuda()

reduce_codes = tc.define(lang, name="reduce_codes")
min_2d = tc.define(lang, name="min_2d")
min_1d = tc.define(lang, name="min_1d")
argmin_2d = tc.define(lang, name="argmin_2d")
argmin_1d = tc.define(lang, name="argmin_1d")

S = reduce_codes(luts_t, codes_t)
V = min_2d(S.view((D, N // D)))
v = min_1d(V)
MinIdx = argmin_2d(S.view((D, N // D)), v)
min_idx = argmin_1d(S, MinIdx)
print("minval: {} minidx: {}".format(v, min_idx))

################################################################################
# The longer-form version has an extra k dimension we could use for parallelism.
# Unfortunately it is a small dimension (16) so it won't saturate Pascal/Volta.
# So we may want to split in 5 to run efficiently.
################################################################################
N = 10 ** 7
D = 1000
assert N % D == 0, "D={} must divide N={}".format(D, N)
M = 32
K = 16
codes = np.random.randint(1 << 32, size=(N, M // 4)).astype('uint32')
codes = codes.view('uint8')
luts = np.random.randn(K, M, 256).astype('float32')

codes_t = torch.from_numpy(codes).cuda()
luts_t = torch.from_numpy(luts).cuda()

lang = """
def mindis(float(K, M, 256) L, uint8(N, M) Codes) -> (S, V, MinIdx) {
    S(k, n) +=! L(k, r_m, int32(Codes(n, r_m)))
    V(k) min=! S(k, r_n)
    MinIdx(k) min=! (S(k, r_n) == V(k)) ? r_n : N
}
"""

debug = False
set_logtostderr(debug)
set_debug_tc_mapper(debug)
set_debug_cuda(debug)

mindis = tc.define(lang, name="mindis")
S, V, MinIdx = mindis(luts_t, codes_t)
print("minvals: {}\nminidxs: {}".format(V, MinIdx))

lang = """
def reduce_codes(float(K, M, 256) L, uint8(N, M) Codes) -> (S) {
    S(k, n) +=! L(k, r_m, int32(Codes(n, r_m)))
}
def min_2d(float(K, D, NBYD) S) -> (V2) {
    V2(k, d) min=! S(k, d, r_nbyd)
}
def min_1d(float(K, D) V2) -> (V) {
    V(k) min=! V2(k, r_d)
}
def argmin_2d(float(K, D, NBYD) S, float(K) V) -> (MinIdx2) {
    MinIdx2(k, d) min=! (S(k, d, r_nbyd) == V(k)) ? d * NBYD + r_nbyd : N
}
def argmin_1d(float(K, N) S, int32(K, D) MinIdx2) -> (MinIdx) {
    MinIdx(k) min=! (MinIdx2(k, r_d) < N) ? r_d : N
}
"""

reduce_codes = tc.define(lang, name="reduce_codes")
min_2d = tc.define(lang, name="min_2d")
min_1d = tc.define(lang, name="min_1d")
argmin_2d = tc.define(lang, name="argmin_2d")
argmin_1d = tc.define(lang, name="argmin_1d")

S = reduce_codes(luts_t, codes_t)
V2 = min_2d(S.view((K, D, N // D)))
V = min_1d(V2)
MinIdx2 = argmin_2d(S.view((K, D, N // D)), V)
MinIdx = argmin_1d(S, MinIdx2)
print("minvals: {} minidxs: {}".format(V, MinIdx))

tc/core/libraries.h

Lines changed: 11 additions & 0 deletions

@@ -31,9 +31,20 @@ namespace code {
 namespace c {
 
 constexpr auto types = R"C(
+// Can't include system dependencies with NVRTC
+// Can't include cuda_fp16.h with NVRTC due to transitive system dependencies
+// #include <cuda_fp16.h>
+
 // Halide type handling
+typedef char int8;
+typedef short int16;
 typedef int int32;
 typedef long int64;
+typedef unsigned char uint8;
+typedef unsigned short uint16;
+typedef unsigned int uint32;
+typedef unsigned long uint64;
+// typedef half float16;
 typedef float float32;
 typedef double float64;
)C";

tc/core/tc2halide.cc

Lines changed: 7 additions & 0 deletions

@@ -53,10 +53,17 @@ Type translateScalarType(int tcType) {
       return Int(32);
     case lang::TK_INT64:
       return Int(64);
+    case lang::TK_FLOAT16:
+      return Float(16);
+    case lang::TK_FLOAT32:
+      return Float(32);
+    case lang::TK_FLOAT64:
+      return Float(64);
     case lang::TK_FLOAT:
       return Float(32);
     case lang::TK_DOUBLE:
       return Float(64);
+
     default:
       LOG(FATAL) << "Unhandled TC scalar type: " << tcType << '\n';
       return Type();
tc/lang/lexer.h

Lines changed: 19 additions & 12 deletions

@@ -41,8 +41,6 @@ namespace lang {
   _(TK_MIN, "min", "min") \
   _(TK_MAX, "max", "max") \
   _(TK_WHERE, "where", "where") \
-  _(TK_FLOAT, "float", "float") \
-  _(TK_DOUBLE, "double", "double") \
   _(TK_DEF, "def", "def") \
   _(TK_ARROW, "arrow", "->") \
   _(TK_EQUIVALENT, "equivalent", "<=>") \
@@ -67,15 +65,21 @@ namespace lang {
   _(TK_TIMES_EQ_B, "times_eq_b", "*=!") \
   _(TK_MIN_EQ_B, "min_eq_b", "min=!") \
   _(TK_MAX_EQ_B, "max_eq_b", "max=!") \
-  _(TK_INT8, "int8", "int8") \
-  _(TK_INT16, "int16", "int16") \
-  _(TK_INT32, "int32", "int32") \
-  _(TK_INT64, "int64", "int64") \
+  \
+  _(TK_BOOL, "bool", "bool") \
   _(TK_UINT8, "uint8", "uint8") \
   _(TK_UINT16, "uint16", "uint16") \
   _(TK_UINT32, "uint32", "uint32") \
   _(TK_UINT64, "uint64", "uint64") \
-  _(TK_BOOL, "bool", "bool") \
+  _(TK_INT8, "int8", "int8") \
+  _(TK_INT16, "int16", "int16") \
+  _(TK_INT32, "int32", "int32") \
+  _(TK_INT64, "int64", "int64") \
+  _(TK_FLOAT16, "float16", "float16") \
+  _(TK_FLOAT32, "float32", "float32") \
+  _(TK_FLOAT64, "float64", "float64") \
+  _(TK_FLOAT, "float", "float") \
+  _(TK_DOUBLE, "double", "double") \
   _(TK_CAST, "cast", "") \
   _(TK_IN, "in", "in") \
   _(TK_GE, "ge", ">=") \
@@ -271,15 +275,18 @@ struct SharedParserData {
   }
   bool isScalarType(int kind) {
     switch (kind) {
-      case TK_INT8:
-      case TK_INT16:
-      case TK_INT32:
-      case TK_INT64:
+      case TK_BOOL:
       case TK_UINT8:
       case TK_UINT16:
       case TK_UINT32:
       case TK_UINT64:
-      case TK_BOOL:
+      case TK_INT8:
+      case TK_INT16:
+      case TK_INT32:
+      case TK_INT64:
+      case TK_FLOAT16:
+      case TK_FLOAT32:
+      case TK_FLOAT64:
       case TK_FLOAT:
       case TK_DOUBLE:
         return true;
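
The reordering above is presumably not cosmetic: if keyword candidates are tried in list order with prefix-style matching, `float` must come after `float16`/`float32`/`float64`, or the shorter keyword wins and leaves a stray `16` behind. A toy illustration of that pitfall (this is not the TC lexer, just the matching constraint the list order has to respect):

# Toy illustration: first-match-wins keyword lookup over an ordered list.
KEYWORDS = ["float16", "float32", "float64", "float", "double"]

def match_keyword(src):
    for kw in KEYWORDS:  # order matters: longer names must precede "float"
        if src.startswith(kw):
            return kw
    return None

print(match_keyword("float16(N) A"))  # "float16"; with "float" first it would stop early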

tc/lang/sema.h

Lines changed: 13 additions & 0 deletions

@@ -46,13 +46,23 @@ struct TypeInfo {
       TYPE_INFO_OPTION(TK_INT16, Int, 16)
       TYPE_INFO_OPTION(TK_INT32, Int, 32)
       TYPE_INFO_OPTION(TK_INT64, Int, 64)
+      TYPE_INFO_OPTION(TK_FLOAT16, Float, 16)
+      TYPE_INFO_OPTION(TK_FLOAT32, Float, 32)
+      TYPE_INFO_OPTION(TK_FLOAT64, Float, 64)
       TYPE_INFO_OPTION(TK_FLOAT, Float, 32)
       TYPE_INFO_OPTION(TK_DOUBLE, Float, 64)
+
 #undef TYPE_INFO_OPTION
       default:
         throw ErrorReport(scalar_type)
             << "Unhandled TC scalar type: " << scalar_type;
     }
+
+    if (code_ == Code::Float && bits_ == 16) {
+      throw ErrorReport(scalar_type)
+          << "Half precision floating point not supported "
+          << "until we can make NVRTC include system headers";
+    }
   }
   int toScalarToken() const {
     switch (code()) {
@@ -82,12 +92,15 @@ struct TypeInfo {
       }
       case Float:
         switch (bits()) {
+          case 16:
+            return TK_FLOAT16;
          case 32:
            return TK_FLOAT;
          case 64:
            return TK_DOUBLE;
        }
    }
+
    throw std::runtime_error("Unknown type info?");
  }
  Code code() const {
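
Net effect: `float16` now lexes, parses, and carries a Float(16) TypeInfo, but the TypeInfo constructor rejects it immediately, so users get a clear error instead of an opaque NVRTC failure. A hedged sketch of what a caller would observe from Python (assuming the ErrorReport surfaces as an ordinary exception when the TC is defined or first compiled; the exact exception type is not shown in this diff):

import tensor_comprehensions as tc

lang = """
def copy16(float16(N) A) -> (B) { B(i) = A(i) }
"""
try:
    copy16 = tc.define(lang, name="copy16")
except Exception as e:
    # Expected message, per sema.h above: "Half precision floating point
    # not supported until we can make NVRTC include system headers"
    print(e)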

test/cuda/test_compile_and_run.cc

Lines changed: 31 additions & 0 deletions

@@ -275,6 +275,37 @@ def cast(float(M,N) A, int32 four) -> (int32(M,N) output) {
   TC_CHECK_EQ(r, 0);
 }
 
+TEST_F(CompilationTest, Types) {
+  struct TypeMatch {
+    std::string s;
+    at::ScalarType a;
+  };
+  for (auto type :
+       {// TypeMatch{"bool", at::ScalarType::Bool}, // no aten version
+        TypeMatch{"uint8", at::ScalarType::Byte},
+        // TypeMatch{"uint16", at::ScalarType::Short}, // no aten version
+        // TypeMatch{"uint32", at::ScalarType::Int}, // no aten version
+        // TypeMatch{"uint64", at::ScalarType::Long}, // no aten version
+        TypeMatch{"int8", at::ScalarType::Char},
+        TypeMatch{"int16", at::ScalarType::Short},
+        TypeMatch{"int32", at::ScalarType::Int},
+        TypeMatch{"int64", at::ScalarType::Long},
+        // NVRTC include transitive dependencies issue
+        // TypeMatch{"float16", at::ScalarType::Half},
+        TypeMatch{"float32", at::ScalarType::Float},
+        TypeMatch{"float64", at::ScalarType::Double},
+        TypeMatch{"float", at::ScalarType::Float},
+        TypeMatch{"double", at::ScalarType::Double}}) {
+    std::string tc = std::string("def test_type(") + std::string(type.s) +
+        std::string("(N) A) -> (B) { B(k) +=! A(i) where k in 0:1 }");
+    std::vector<at::Tensor> outputs = Check(
+        tc,
+        "test_type",
+        tc::CudaMappingOptions::makeNaiveMappingOptions(),
+        {at::CUDA(type.a).ones({100})});
+  }
+}
+
 int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   ::gflags::ParseCommandLineFlags(&argc, &argv, true);