|
| 1 | +/** |
| 2 | + * Copyright (c) 2017-present, Facebook, Inc. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | +#pragma once |
| 17 | + |
| 18 | +#include "tc/aten/aten.h" |
| 19 | +#include "tc/core/cuda/cuda_mapping_options.h" |
| 20 | + |
| 21 | +namespace tc { |
// Name of the TC function defined in TC_WAVENET below; it matches the
// `def wavenet1(...)` declaration inside that string.
| 22 | +constexpr static auto TC_WAVENET1_NAME = "wavenet1"; |
// Tensor Comprehensions source for the `wavenet1` function.
//
// Inputs (as declared in the signature): Data, FilterWeight, FilterBias,
// GateWeight, GateBias, ResWeight, ResBias, SkipWeight, SkipBias, Dilation.
// Outputs: FilterOut, GateOut, NonLin, Res, Skip.
//
// What the body computes:
//  - FilterOut / GateOut: initialised with the matching bias, then accumulate
//    Data(b, r_residual_c, rf) * Weight(dilation_c, r_residual_c, 1) plus, when
//    rf - DILATION_FACTOR >= 0, Data(b, r_residual_c, rf - DILATION_FACTOR) *
//    Weight(dilation_c, r_residual_c, 0); otherwise that second term is
//    float(0). r_residual_c appears only on the right-hand side, i.e. it is the
//    reduction index of the `+=`.
//  - NonLin: tanh(FilterOut) multiplied by 1 / (1 + exp(-GateOut)).
//  - Res: Data + ResBias, then accumulates NonLin * ResWeight over
//    r_dilation_c.
//  - Skip: `+=!` accumulation of NonLin * SkipWeight over r_dilation_c, then
//    SkipBias is added.
//
// The Dilation input is never read in the body; its only visible role is that
// its declared shape float(DILATION_FACTOR) introduces the DILATION_FACTOR
// size used in the index arithmetic.
// NOTE(review): presumably callers pass a dummy tensor whose length encodes
// the dilation factor — confirm against the call sites.
| 23 | +constexpr static auto TC_WAVENET = R"TC( |
| 24 | +# Original data is float(B, C, RECEPTIVE_FIELD) and undergoes a \ |
| 25 | +# Conv1d to become float(B, RESIDUAL_C, RECEPTIVE_FIELD) |
| 26 | +
|
| 27 | +def wavenet1( |
| 28 | + float(B, RESIDUAL_C, RECEPTIVE_FIELD) Data, |
| 29 | + float(DILATION_C, RESIDUAL_C, 2) FilterWeight, |
| 30 | + float(DILATION_C) FilterBias, |
| 31 | + float(DILATION_C, RESIDUAL_C, 2) GateWeight, |
| 32 | + float(DILATION_C) GateBias, |
| 33 | + float(RESIDUAL_C, DILATION_C) ResWeight, |
| 34 | + float(RESIDUAL_C) ResBias, |
| 35 | + float(SKIP_C, DILATION_C) SkipWeight, |
| 36 | + float(SKIP_C) SkipBias, |
| 37 | + float(DILATION_FACTOR) Dilation) |
| 38 | + -> (FilterOut, GateOut, NonLin, Res, Skip) |
| 39 | +{ |
| 40 | + FilterOut(b, dilation_c, rf) = FilterBias(dilation_c) |
| 41 | + where b in 0:B, dilation_c in 0:DILATION_C, rf in 0:RECEPTIVE_FIELD |
| 42 | + FilterOut(b, dilation_c, rf) += Data(b, r_residual_c, rf) * FilterWeight(dilation_c, r_residual_c, 1) + |
| 43 | + ( |
| 44 | + (rf - DILATION_FACTOR >= 0) ? |
| 45 | + Data(b, r_residual_c, rf - DILATION_FACTOR) * FilterWeight(dilation_c, r_residual_c, 0) : |
| 46 | + float(0) |
| 47 | + ) |
| 48 | + where rf in 0:RECEPTIVE_FIELD |
| 49 | +
|
| 50 | + GateOut(b, dilation_c, rf) = GateBias(dilation_c) |
| 51 | + where b in 0:B, dilation_c in 0:DILATION_C, rf in 0:RECEPTIVE_FIELD |
| 52 | + GateOut(b, dilation_c, rf) += Data(b, r_residual_c, rf) * GateWeight(dilation_c, r_residual_c, 1) + |
| 53 | + ( |
| 54 | + (rf - DILATION_FACTOR >= 0) ? |
| 55 | + Data(b, r_residual_c, rf - DILATION_FACTOR) * GateWeight(dilation_c, r_residual_c, 0) : |
| 56 | + float(0) |
| 57 | + ) |
| 58 | + where rf in 0:RECEPTIVE_FIELD |
| 59 | +
|
| 60 | + NonLin(b, dilation_c, rf) = tanh(FilterOut(b, dilation_c, rf)) |
| 61 | + where rf in 0:RECEPTIVE_FIELD |
| 62 | + NonLin(b, dilation_c, rf) *= 1 / (1 + exp( -GateOut(b, dilation_c, rf))) |
| 63 | + where rf in 0:RECEPTIVE_FIELD |
| 64 | +
|
| 65 | + Res(b, residual_c, rf) = Data(b, residual_c, rf) + ResBias(residual_c) |
| 66 | + Res(b, residual_c, rf) += NonLin(b, r_dilation_c, rf) * ResWeight(residual_c, r_dilation_c) |
| 67 | +
|
| 68 | + Skip(b, skip, rf) +=! NonLin(b, r_dilation_c, rf) * SkipWeight(skip, r_dilation_c) |
| 69 | + where rf in 0:RECEPTIVE_FIELD |
| 70 | + Skip(b, skip, rf) = Skip(b, skip, rf) + SkipBias(skip) |
| 71 | + where rf in 0:RECEPTIVE_FIELD |
| 72 | +} |
| 73 | + )TC"; |
| 74 | + |
| 75 | +auto options_WaveNet1_P100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1 = |
| 76 | + tc::CudaMappingOptions::makeNaiveMappingOptions() |
| 77 | + .outerScheduleFusionStrategy(tc::FusionStrategy::Max) |
| 78 | + .outerScheduleAllowSkewing(false) |
| 79 | + .outerSchedulePositiveOrthant(true) |
| 80 | + .intraTileScheduleFusionStrategy(tc::FusionStrategy::Min) |
| 81 | + .intraTileScheduleAllowSkewing(false) |
| 82 | + .intraTileSchedulePositiveOrthant(true) |
| 83 | + .fixParametersBeforeScheduling(true) |
| 84 | + .tile(63) |
| 85 | + .unroll(32) |
| 86 | + .tileImperfectlyNested(false) |
| 87 | + .matchLibraryCalls(false) |
| 88 | + .mapToThreads(32, 4, 1) |
| 89 | + .mapToBlocks(256, 4, 63) |
| 90 | + .useSharedMemory(true) |
| 91 | + .usePrivateMemory(true) |
| 92 | + .unrollCopyShared(false) |
| 93 | + .useReadOnlyCache(false); |
| 94 | + |
| 95 | +auto options_WaveNet1_P100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32 = |
| 96 | + tc::CudaMappingOptions::makeNaiveMappingOptions() |
| 97 | + .outerScheduleFusionStrategy(tc::FusionStrategy::Max) |
| 98 | + .outerScheduleAllowSkewing(false) |
| 99 | + .outerSchedulePositiveOrthant(true) |
| 100 | + .intraTileScheduleFusionStrategy( |
| 101 | + tc::FusionStrategy::Preserve3Coincident) |
| 102 | + .intraTileScheduleAllowSkewing(false) |
| 103 | + .intraTileSchedulePositiveOrthant(true) |
| 104 | + .fixParametersBeforeScheduling(true) |
| 105 | + .tile(128, 4096, 1000, 64) |
| 106 | + .unroll(1) |
| 107 | + .tileImperfectlyNested(false) |
| 108 | + .matchLibraryCalls(true) |
| 109 | + .mapToThreads(128) |
| 110 | + .mapToBlocks(63) |
| 111 | + .useSharedMemory(true) |
| 112 | + .usePrivateMemory(true) |
| 113 | + .unrollCopyShared(false) |
| 114 | + .useReadOnlyCache(false); |
| 115 | + |
| 116 | +auto options_WaveNet1_V100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_1 = |
| 117 | + tc::CudaMappingOptions::makeNaiveMappingOptions() |
| 118 | + .outerScheduleFusionStrategy(tc::FusionStrategy::Max) |
| 119 | + .outerScheduleAllowSkewing(false) |
| 120 | + .outerSchedulePositiveOrthant(true) |
| 121 | + .intraTileScheduleFusionStrategy( |
| 122 | + tc::FusionStrategy::Preserve3Coincident) |
| 123 | + .intraTileScheduleAllowSkewing(false) |
| 124 | + .intraTileSchedulePositiveOrthant(true) |
| 125 | + .fixParametersBeforeScheduling(false) |
| 126 | + .tile(1000, 128, 500) |
| 127 | + .unroll(2) |
| 128 | + .tileImperfectlyNested(false) |
| 129 | + .matchLibraryCalls(false) |
| 130 | + .mapToThreads(256) |
| 131 | + .mapToBlocks(4000, 128) |
| 132 | + .useSharedMemory(true) |
| 133 | + .usePrivateMemory(true) |
| 134 | + .unrollCopyShared(true) |
| 135 | + .useReadOnlyCache(false); |
| 136 | + |
| 137 | +auto options_WaveNet1_V100_autotuned_B_1_RES_32_DIL_32_SKIP_256_REC_4000_F_32 = |
| 138 | + tc::CudaMappingOptions::makeNaiveMappingOptions() |
| 139 | + .outerScheduleFusionStrategy(tc::FusionStrategy::Max) |
| 140 | + .outerScheduleAllowSkewing(false) |
| 141 | + .outerSchedulePositiveOrthant(true) |
| 142 | + .intraTileScheduleFusionStrategy(tc::FusionStrategy::Min) |
| 143 | + .intraTileScheduleAllowSkewing(false) |
| 144 | + .intraTileSchedulePositiveOrthant(true) |
| 145 | + .fixParametersBeforeScheduling(true) |
| 146 | + .tile(8, 125, 512, 500) |
| 147 | + .unroll(32) |
| 148 | + .tileImperfectlyNested(false) |
| 149 | + .matchLibraryCalls(false) |
| 150 | + .mapToThreads(16, 16) |
| 151 | + .mapToBlocks(4000, 2048, 4096) |
| 152 | + .useSharedMemory(true) |
| 153 | + .usePrivateMemory(true) |
| 154 | + .unrollCopyShared(true) |
| 155 | + .useReadOnlyCache(false); |
| 156 | + |
| 157 | +} // namespace tc |
0 commit comments