Add utilities needed for all the different tuning experiments

Jules Pondard · Jules Pondard · commit 9763f96e07e6 · 2018-07-26T04:39:33.000-07:00
Also introduce a new folder that will hold all the files related
to experimenting with new ways of tuning options.
diff --git a/python/experimental/options_search/utils.py b/python/experimental/options_search/utils.py
@@ -0,0 +1,208 @@
+import time
+import torch
+import torch.optim as optim
+import torch.nn as nn
+import torch.utils.data
+import torch.nn.functional as F
+import tensor_comprehensions as tc
+import numpy as np
+
+# Length of the option vector handled by this module, and the number of
+# sizes (N, C, H, W, O, kH, kW) describing a convolution example.
+NB_HYPERPARAMS, INIT_INPUT_SZ = 26, 7
+# Whether the maxSharedMemory option takes part in the search. The helpers
+# below read this name, so it must exist even before
+# get_convolution_example() overwrites it.
+USE_MAX_SHARED_MEMORY = 0
+# Original (misspelled) name, kept so existing references keep working.
+USE_MAX_SHARED_MENORY = USE_MAX_SHARED_MEMORY
+
+def getrand(l):
+    """Return one randomly chosen element of l as a native Python scalar."""
+    picked = np.random.choice(l)
+    # .item() unwraps the numpy scalar returned by np.random.choice.
+    return picked.item()
+
+def get_convolution_example(size_type="default", inp_sz_list=(), use_max_shared_memory=False):
+    """Build the convolution TC together with random CUDA inputs for it.
+
+    Side effects: sets the module globals INIT_INPUT_SZ and
+    USE_MAX_SHARED_MEMORY, then calls computeCat() on the new inputs and
+    set_tc() on the TC source and name, so the other helpers of this module
+    operate on this example. The chosen sizes are also printed.
+
+    Args:
+        size_type: "default" for the fixed sizes below, "random" to draw
+            every size from a short list of candidates, or "input" to take
+            the sizes from inp_sz_list.
+        inp_sz_list: the seven sizes (N, C, H, W, O, kH, kW); only read when
+            size_type is "input".
+        use_max_shared_memory: stored in USE_MAX_SHARED_MEMORY, which the
+            other helpers read to decide whether maxSharedMemory is tuned.
+
+    Returns:
+        (tc_code, tc_name, init_input, init_input_sz), where init_input is
+        the tensor pair (I, W1) and init_input_sz is a float tensor holding
+        the seven sizes.
+
+    Raises:
+        ValueError: size_type is "input" but inp_sz_list does not hold
+            exactly seven values.
+        SystemExit: size_type is not one of the supported values.
+    """
+    global INIT_INPUT_SZ, USE_MAX_SHARED_MEMORY
+
+    USE_MAX_SHARED_MEMORY = use_max_shared_memory
+    INIT_INPUT_SZ = 7
+
+    tc_name = "convolution"
+    tc_code = """
+        def convolution(float(N,C,H,W) I, float(M,C,KH,KW) W1) -> (O) {
+            O(n, m, h, w) +=! I(n, r_c, h + r_kh, w + r_kw) * W1(m, r_c, r_kh, r_kw)
+        }
+    """
+
+    if size_type == "input":
+        sizes = tuple(inp_sz_list)
+        # Fail with an explicit message instead of an opaque unpacking error.
+        if len(sizes) != INIT_INPUT_SZ:
+            raise ValueError(
+                "inp_sz_list must hold %d sizes (N, C, H, W, O, kH, kW), got %d"
+                % (INIT_INPUT_SZ, len(sizes)))
+        N, C, H, W, O, kH, kW = sizes
+    elif size_type == "default":
+        N, C, H, W, O, kH, kW = 16, 4, 56, 56, 16, 1, 1 #8, 2, 28, 28, 8, 1, 1
+    elif size_type == "random":
+        # Drawn in the order N, C, H, W, O, kH, kW.
+        N = getrand([8, 16, 32, 64])
+        C = getrand([2, 4, 8, 16])
+        H = getrand([28, 56, 112])
+        W = getrand([28, 56, 112])
+        O = getrand([8, 16, 32])
+        kH = getrand([1, 2, 4])
+        kW = getrand([1, 2, 4])
+    else:
+        # Same exception type as the former exit() call, but it does not rely
+        # on the site-provided exit() builtin and reports a failure status.
+        raise SystemExit("Unknown size type: %r" % (size_type,))
+
+    I = torch.randn(N, C, H, W, device='cuda')
+    W1 = torch.randn(O, C, kH, kW, device='cuda')
+    init_input = (I, W1)
+    init_input_sz = np.array([N,C,H,W,O, kH, kW])
+    print(init_input_sz)
+    init_input_sz = torch.from_numpy(init_input_sz).float()
+
+    # Register the example with the module-level state used by evalTime().
+    computeCat(init_input)
+    set_tc(tc_code, tc_name)
+
+    return (tc_code, tc_name, init_input, init_input_sz)
+
+def print_opt(options):
+    """Print an option vector as a plain Python list.
+
+    options must provide a tolist() method.
+    """
+    as_list = options.tolist()
+    print(as_list)
+
+def set_tc(tc_code_arg, tc_name_arg):
+    """Record the TC source and its entry-point name in the module globals.
+
+    evalTime() reads the tc_code and tc_name globals written here.
+    """
+    global tc_code, tc_name
+    tc_code, tc_name = tc_code_arg, tc_name_arg
+
+def catVec_to_optVec(catVec):
+    """Translate a vector of category indices into concrete option values.
+
+    Entry i of catVec selects a value inside cat_val[i], the list of
+    candidates for hyperparameter i built by computeCat().
+
+    Returns:
+        A list with NB_HYPERPARAMS selected values.
+    """
+    chosen = []
+    for slot in range(NB_HYPERPARAMS):
+        candidates = cat_val[slot]
+        chosen.append(candidates[catVec[slot]])
+    return chosen
+
+def evalTime(opt, iters=50, warmup=10, estimator="mean", naive=False, prune=-1, curr_best=-1):
+    """Compile the current TC with the given options and time its kernel.
+
+    Reads the module globals tc_code and tc_name (written by set_tc) as well
+    as inp and cat_val (written by computeCat), so both must have been set.
+
+    Args:
+        opt: vector of category indices, one per hyperparameter; decoded with
+            catVec_to_optVec() and optionsFromVector(). Ignored when naive
+            is true.
+        iters: number of profiled runs handed to the estimator.
+        warmup: number of profiled runs discarded before measuring.
+        estimator: "mean", "median" or "p25" (25th percentile).
+        naive: if true, use tc.MappingOptions("naive") instead of opt.
+        prune: -1 disables pruning; otherwise the evaluation stops early when
+            a measurement exceeds prune * curr_best.
+        curr_best: reference time for pruning; only read when prune != -1.
+
+    Returns:
+        The estimate of the times reported by profile_kernel, the early
+        measurement when the evaluation was pruned, or 30000 when the
+        compilation or the first run failed or the estimator is unknown.
+    """
+    infty = 30000
+    if naive:
+        # The naive mapping does not depend on opt, so do not decode it.
+        options = tc.MappingOptions("naive")
+    else:
+        options = optionsFromVector(catVec_to_optVec(opt))
+
+    try:
+        tc_prog = tc.compile(tc_code, tc_name, options, *inp)
+        first_ft = tc_prog.executor.profile_kernel(inp)
+    except Exception:
+        # Options that fail to compile or run are scored as the worst
+        # possible time. KeyboardInterrupt and SystemExit do not derive from
+        # Exception, so they still propagate to the caller.
+        return infty
+
+    # Coarse pruning on the very first run, with a fixed factor of 100.
+    if prune != -1 and first_ft > 100*curr_best:
+        return first_ft
+
+    for _ in range(warmup):
+        tc_prog.executor.profile_kernel(inp)
+
+    first_t = tc_prog.executor.profile_kernel(inp)
+
+    # Finer pruning once the kernel is warmed up.
+    if prune != -1 and first_t > prune*curr_best:
+        return first_t
+
+    tc_time_list = [tc_prog.executor.profile_kernel(inp) for _ in range(iters)]
+
+    if estimator == "mean":
+        return np.mean(tc_time_list)
+    if estimator == "median":
+        return np.median(tc_time_list)
+    if estimator == "p25":
+        return np.percentile(tc_time_list, 25)
+    print("Unknown estimator")
+    return infty
+
+def getRawVectorFromTcOpt(tc_opt):
+    """Flatten a mapping-options dictionary into an integer option vector.
+
+    The vector has NB_HYPERPARAMS entries and uses the slot layout that
+    optionsFromVector() reads back. Values are stored as given in tc_opt,
+    except the two fusion strategies, which are stored as 0 (Max),
+    1 (Preserve3Coincident) or 2 (Min). Slots that are not written stay 0.
+
+    Args:
+        tc_opt: mapping indexed by option name, e.g. "tile" or "unroll".
+
+    Returns:
+        A numpy integer array of length NB_HYPERPARAMS.
+    """
+    strategy_code = {"Max":0, "Preserve3Coincident":1, "Min":2}
+    raw = np.zeros(NB_HYPERPARAMS, dtype=int)
+
+    raw[0] = strategy_code[tc_opt["outerScheduleFusionStrategy"]]
+    raw[1] = strategy_code[tc_opt["intraTileScheduleFusionStrategy"]]
+    raw[2] = tc_opt["fixParametersBeforeScheduling"]
+
+    # Slot 3 holds the number of tile sizes, slots 4-9 the sizes themselves.
+    tile_sizes = tc_opt["tile"]
+    raw[3] = len(tile_sizes)
+    assert raw[3] < 7, "Too many tilings"
+    raw[4:4 + raw[3]] = tile_sizes
+
+    raw[10] = tc_opt["unroll"]
+    #raw[11] = tc_opt["tileImperfectlyNested"] #todo: pybind
+    raw[11] = tc_opt["matchLibraryCalls"]
+
+    # Slot 12 holds the number of block sizes, the sizes start at slot 13.
+    block_sizes = tc_opt["mapToBlocks"]
+    raw[12] = len(block_sizes)
+    raw[13:13 + raw[12]] = block_sizes
+
+    # Slot 16 holds the number of thread sizes, the sizes start at slot 17.
+    thread_sizes = tc_opt["mapToThreads"]
+    raw[16] = len(thread_sizes)
+    raw[17:17 + raw[16]] = thread_sizes
+
+    raw[20] = tc_opt["useSharedMemory"]
+    raw[21] = tc_opt["usePrivateMemory"]
+    raw[22] = tc_opt["unrollCopyShared"]
+    # Slot 23 is only filled when the shared-memory limit is being tuned.
+    if USE_MAX_SHARED_MEMORY and "maxSharedMemory" in tc_opt:
+        raw[23] = tc_opt["maxSharedMemory"]
+    raw[24] = tc_opt["useReadOnlyCache"]
+    raw[25] = tc_opt["privateDepth"]
+    return raw
+
+def optionsFromVector(vect):
+    """Build a tc.MappingOptions object from an option vector.
+
+    Starts from the "naive" mapping options and overrides each option with
+    the value stored in its slot of vect. Slots 3, 12 and 16 give how many
+    of the following slots are used for tile, mapToBlocks and mapToThreads.
+    maxSharedMemory (slot 23) is only applied when USE_MAX_SHARED_MEMORY is
+    set.
+
+    Args:
+        vect: indexable sequence of NB_HYPERPARAMS option values.
+
+    Returns:
+        The configured tc.MappingOptions instance.
+    """
+    fusion_names = ("Max", "Preserve3Coincident", "Min")
+    mapping = tc.MappingOptions("naive")
+
+    mapping.outerScheduleFusionStrategy(fusion_names[vect[0]])
+    mapping.intraTileScheduleFusionStrategy(fusion_names[vect[1]])
+    mapping.fixParametersBeforeScheduling(vect[2])
+
+    tile_count = vect[3]
+    mapping.tile(list(vect[4:4 + tile_count]))
+    mapping.unroll(vect[10])
+    mapping.matchLibraryCalls(vect[11])
+
+    block_count = vect[12]
+    mapping.mapToBlocks(list(vect[13:13 + block_count]))
+    thread_count = vect[16]
+    mapping.mapToThreads(list(vect[17:17 + thread_count]))
+
+    mapping.useSharedMemory(vect[20])
+    mapping.usePrivateMemory(vect[21])
+    mapping.unrollCopyShared(vect[22])
+    if USE_MAX_SHARED_MEMORY:
+        mapping.maxSharedMemory(vect[23])
+    mapping.useReadOnlyCache(vect[24])
+    mapping.privateDepth(vect[25])
+    return mapping
+
+def computeDivs(sz):
+    """Return sz divided by 1, 2, 4, ... (rounded up) while the power is <= sz.
+
+    Example: computeDivs(7) gives [7, 4, 2].
+    """
+    quotients = []
+    for exponent in range(sz):
+        power = 1 << exponent
+        if power > sz:
+            break
+        # Ceiling division of sz by the current power of two.
+        quotients.append(-(-sz // power))
+    return quotients
+
+def getAllDivs(inp, maxp2=8):
+    """Collect candidate sizes derived from the shapes of the inputs.
+
+    Args:
+        inp: iterable of objects exposing a .shape attribute.
+        maxp2: the powers of two 2**0 .. 2**maxp2 are always included.
+
+    Returns:
+        A sorted list without duplicates that merges those powers of two
+        with computeDivs(sz) for every dimension sz of every input.
+    """
+    candidates = {2 ** exponent for exponent in range(maxp2 + 1)}
+    for elem in inp:
+        for sz in elem.shape:
+            candidates.update(computeDivs(sz))
+    return sorted(candidates)
+
+def computeCat(inp_arg):
+    """Build the table of candidate values for every hyperparameter.
+
+    Writes three module globals: inp (the inputs given here), cat_val (one
+    list of candidate values per slot of the option vector) and cat_sz (a
+    numpy integer array with the number of candidates of each slot). The
+    slot order is the one read by optionsFromVector().
+
+    Args:
+        inp_arg: the inputs of the TC; their shapes feed getAllDivs().
+    """
+    global cat_sz, cat_val, inp
+    inp = inp_arg
+    cat_sz = np.zeros(NB_HYPERPARAMS).astype(int)
+    cat_val = []
+
+    divs = getAllDivs(inp)
+    if USE_MAX_SHARED_MEMORY:
+        # NOTE(review): getAllDivs() iterates over elem.shape, and the array
+        # built here has shape (1,), so the candidates do not appear to
+        # depend on the shared memory size itself -- confirm the intent.
+        shared_mem_choices = getAllDivs([np.array([tc.tclib.shared_memory_size()])])
+    else:
+        shared_mem_choices = [0]
+
+    cat_val.extend([
+        [0, 1, 2],                          # 0: outer fusion strategy
+        [0, 1, 2],                          # 1: intra-tile fusion strategy
+        [0, 1],                             # 2: fixParametersBeforeScheduling
+        list(range(1, 7)),                  # 3: number of tile sizes
+    ])
+    # 4-9: tile sizes, each slot gets its own list with 0 as extra choice
+    cat_val.extend(divs + [0] for _ in range(6))
+    cat_val.extend([
+        [2 ** exponent for exponent in range(8)],  # 10: unroll
+        [0, 1],                             # 11: matchLibraryCalls
+        list(range(1, 4)),                  # 12: number of block sizes
+    ])
+    # 13-15: blocks, maximum 2^31-1 for the first value and 65535 for the
+    # second and third
+    cat_val.extend([divs] * 3)
+    cat_val.append(list(range(1, 4)))       # 16: number of thread sizes
+    # 17-19: threads, maximum 1024 for the first and second value, 32 for
+    # the third, product below 1024
+    cat_val.extend([divs] * 3)
+    cat_val.extend([
+        [0, 1],                             # 20: useSharedMemory
+        [0, 1],                             # 21: usePrivateMemory
+        [0, 1],                             # 22: unrollCopyShared
+        shared_mem_choices,                 # 23: maxSharedMemory
+        [0, 1],                             # 24: useReadOnlyCache
+        list(range(6)),                     # 25: privateDepth
+    ])
+
+    cat_sz[:] = [len(cat_val[slot]) for slot in range(NB_HYPERPARAMS)]