|
| 1 | +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. |
| 2 | +
|
| 3 | +Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +you may not use this file except in compliance with the License. |
| 5 | +You may obtain a copy of the License at |
| 6 | +
|
| 7 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +
|
| 9 | +Unless required by applicable law or agreed to in writing, software |
| 10 | +distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +See the License for the specific language governing permissions and |
| 13 | +limitations under the License. |
| 14 | +==============================================================================*/ |
| 15 | + |
#include "tensorflow/lite/micro/kernels/streaming_conv.h"

#include <cstring>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h"
#include "tensorflow/lite/kernels/internal/reference/conv.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/micro/kernels/kernel_util.h"
#include "tensorflow/lite/micro/micro_log.h"
| 26 | + |
| 27 | +namespace tflite { |
| 28 | +namespace { |
| 29 | + |
// Shifts one new input column into the persistent streaming buffer.
//
// The input arrives as a single column of shape (ih x 1 x ic); the
// persistent buffer `pbuf` holds a sliding window of shape (ih x kw x ic).
// Each call drops the oldest column, moves the remaining kw-1 columns one
// slot toward the front, and appends the new column at the end of each row.
//
//   pbuf : persistent window buffer, ih * kw * ic int16 elements (in/out).
//   pinp : newest input column, ih * ic int16 elements (read-only).
//   ih   : input height (rows of the window).
//   ic   : input channel count.
//   kw   : kernel width (columns kept in the window).
void updateStreamingConvBuffer(int16_t* pbuf, const int16_t* pinp, int ih,
                               int ic, int kw) {
  /* Step 1: slide the kw-1 newer columns of each row toward the front,
   * discarding the oldest column.  memmove because src/dst overlap. */
  const int striplength = (kw - 1) * ic;
  int16_t* row = pbuf;
  for (int i = 0; i < ih; ++i) {
    memmove(row, row + ic, striplength * sizeof(int16_t));
    row += kw * ic;
  }

  /* Step 2: copy the new input column into the last slot of each row.
   * memcpy is safe: `pinp` is a distinct tensor and cannot overlap. */
  int16_t* dst = pbuf + striplength;
  const int16_t* src = pinp;
  for (int i = 0; i < ih; ++i) {
    memcpy(dst, src, ic * sizeof(int16_t));
    dst += kw * ic;
    src += ic;
  }
}
| 59 | + |
// Per-channel quantized convolution over a streaming input window.
//
// Unlike the stock reference kernel, the spatial window along the width
// axis is maintained in `input_state` (shape 1 x input_height x
// filter_width x input_depth): each call first pushes the newest input
// column into that buffer, then convolves over the buffer instead of the
// raw input.  `AccumScalar` is the accumulator type (int32_t or int64_t,
// matching the bias tensor).
//
//   params            : stride/dilation/padding and activation clamp range.
//   output_multiplier : per-output-channel quantized multipliers.
//   output_shift      : per-output-channel shifts.
//   input_data        : int16 input, one new column per row (NHWC).
//   filter_data       : int8 filter weights (OHWI).
//   bias_data         : optional per-channel bias, may be null.
//   output_data       : int16 output (NHWC).
//   input_state       : persistent sliding-window buffer (in/out).
template <typename AccumScalar>
inline void StreamingConvPerChannel(
    const ConvParams& params, const int32_t* output_multiplier,
    const int32_t* output_shift, const RuntimeShape& input_shape,
    const int16_t* input_data, const RuntimeShape& filter_shape,
    const int8_t* filter_data, const RuntimeShape& bias_shape,
    const AccumScalar* bias_data, const RuntimeShape& output_shape,
    int16_t* output_data, int16_t* input_state) {
  // Get parameters.
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;

  // Set min and max value of the output.
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;

  // Consistency check.
  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int input_depth = input_shape.Dims(3);
  const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
  if (bias_data) {
    TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);
  }

  // Check dimensions of the tensors.
  const int input_height = input_shape.Dims(1);
  int input_width = input_shape.Dims(2);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int filter_input_depth = filter_shape.Dims(3);

  /* Update streaming conv buffer with input data */
  // The convolution below reads from the state buffer, whose width is
  // filter_width, so input_width is re-aimed at the buffer width (the
  // bounds check on in_x further down uses this value).
  input_width = filter_width;
  const int32_t dims_shape[4] = {1, input_height, filter_width, input_depth};
  RuntimeShape input_state_shape(4, dims_shape);

  // Grouped-convolution bookkeeping: each group of output filters sees
  // only its slice of the input channels.
  const int groups = input_depth / filter_input_depth;
  TFLITE_DCHECK_EQ(input_depth % filter_input_depth, 0);
  const int filters_per_group = output_depth / groups;
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  for (int batch = 0; batch < batches; ++batch) {
    // Push this batch's newest column into the sliding window.
    // NOTE(review): all batches share one input_state buffer, so the
    // window mixes columns across batches when batches > 1 — presumably
    // the streaming use case is batch == 1; confirm with the Prepare code.
    updateStreamingConvBuffer(input_state,
                              &input_data[Offset(input_shape, batch, 0, 0, 0)],
                              input_height, input_depth, filter_width);
    for (int out_y = 0; out_y < output_height; ++out_y) {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      for (int out_x = 0; out_x < output_width; ++out_x) {
        const int in_x_origin = (out_x * stride_width) - pad_width;
        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
          auto group = out_channel / filters_per_group;
          AccumScalar acc = 0;
          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
            const int in_y = in_y_origin + dilation_height_factor * filter_y;
            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
              const int in_x = in_x_origin + dilation_width_factor * filter_x;

              // Zero padding by omitting the areas outside the image.
              const bool is_point_inside_image =
                  (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
                  (in_y < input_height);

              if (!is_point_inside_image) {
                continue;
              }

              for (int in_channel = 0; in_channel < filter_input_depth;
                   ++in_channel) {
                // Activations come from the persistent window buffer,
                // not directly from input_data.
                int32_t input_val = input_state[Offset(
                    input_state_shape, 0, in_y, in_x,
                    in_channel + group * filter_input_depth)];
                int32_t filter_val = filter_data[Offset(
                    filter_shape, out_channel, filter_y, filter_x, in_channel)];
                // Accumulate with 64 bits accumulator.
                // int64_t += int8_t * int16_t so the highest value we can
                // get from each accumulation is [-127, 127] * ([-32768,
                // 32767] -
                // [-32768, 32767]), which is [-8322945, 8322945].
                // log2(8322945) = 22.99.
                acc += filter_val * input_val;
              }
            }
          }
          if (bias_data) {
            acc += bias_data[out_channel];
          }
          // Requantize to the output scale, then clamp to the fused
          // activation range.
          int32_t scaled_acc = MultiplyByQuantizedMultiplier(
              acc, output_multiplier[out_channel], output_shift[out_channel]);
          scaled_acc = std::max(scaled_acc, output_activation_min);
          scaled_acc = std::min(scaled_acc, output_activation_max);
          output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
              static_cast<int16_t>(scaled_acc);
        }
      }
    }
  }
}
| 165 | + |
| 166 | +TfLiteStatus StreamingConvEval(TfLiteContext* context, TfLiteNode* node) { |
| 167 | + const TfLiteEvalTensor* input = |
| 168 | + tflite::micro::GetEvalInput(context, node, kConvInputTensor); |
| 169 | + const TfLiteEvalTensor* filter = |
| 170 | + tflite::micro::GetEvalInput(context, node, kConvWeightsTensor); |
| 171 | + const TfLiteEvalTensor* bias = |
| 172 | + (NumInputs(node) == 3) |
| 173 | + ? tflite::micro::GetEvalInput(context, node, kConvBiasTensor) |
| 174 | + : nullptr; |
| 175 | + TfLiteEvalTensor* output = |
| 176 | + tflite::micro::GetEvalOutput(context, node, kConvOutputTensor); |
| 177 | + |
| 178 | + TFLITE_DCHECK(node->builtin_data != nullptr); |
| 179 | + const auto& params = |
| 180 | + *(reinterpret_cast<TfLiteConvParams*>(node->builtin_data)); |
| 181 | + TFLITE_DCHECK(node->user_data != nullptr); |
| 182 | + const auto& sdata = |
| 183 | + *(static_cast<const OpDataStreamingConv*>(node->user_data)); |
| 184 | + const auto& data = sdata.op_data; |
| 185 | + |
| 186 | + switch (input->type) { // Already know in/out types are same. |
| 187 | + case kTfLiteInt16: { |
| 188 | + if (bias == nullptr || bias->type == kTfLiteInt32) { |
| 189 | + StreamingConvPerChannel( |
| 190 | + StreamingConvParamsQuantized(params, data), |
| 191 | + data.per_channel_output_multiplier, data.per_channel_output_shift, |
| 192 | + tflite::micro::GetTensorShape(input), |
| 193 | + tflite::micro::GetTensorData<int16_t>(input), |
| 194 | + tflite::micro::GetTensorShape(filter), |
| 195 | + tflite::micro::GetTensorData<int8_t>(filter), |
| 196 | + tflite::micro::GetTensorShape(bias), |
| 197 | + tflite::micro::GetOptionalTensorData<std::int32_t>(bias), |
| 198 | + tflite::micro::GetTensorShape(output), |
| 199 | + tflite::micro::GetTensorData<int16_t>(output), |
| 200 | + (int16_t*)sdata.input_state); |
| 201 | + } else if (bias->type == kTfLiteInt64) { |
| 202 | + StreamingConvPerChannel( |
| 203 | + StreamingConvParamsQuantized(params, data), |
| 204 | + data.per_channel_output_multiplier, data.per_channel_output_shift, |
| 205 | + tflite::micro::GetTensorShape(input), |
| 206 | + tflite::micro::GetTensorData<int16_t>(input), |
| 207 | + tflite::micro::GetTensorShape(filter), |
| 208 | + tflite::micro::GetTensorData<int8_t>(filter), |
| 209 | + tflite::micro::GetTensorShape(bias), |
| 210 | + tflite::micro::GetOptionalTensorData<std::int64_t>(bias), |
| 211 | + tflite::micro::GetTensorShape(output), |
| 212 | + tflite::micro::GetTensorData<int16_t>(output), |
| 213 | + (int16_t*)sdata.input_state); |
| 214 | + } else { |
| 215 | + MicroPrintf("Bias type %s (%d) not supported.", |
| 216 | + TfLiteTypeGetName(bias->type), bias->type); |
| 217 | + return kTfLiteError; |
| 218 | + } |
| 219 | + break; |
| 220 | + } |
| 221 | + default: |
| 222 | + MicroPrintf("Type %s (%d) not supported.", TfLiteTypeGetName(input->type), |
| 223 | + input->type); |
| 224 | + return kTfLiteError; |
| 225 | + } |
| 226 | + return kTfLiteOk; |
| 227 | +} |
| 228 | + |
| 229 | +} // namespace |
| 230 | + |
| 231 | +TFLMRegistration Register_STREAMING_CONV_2D() { |
| 232 | + return tflite::micro::RegisterOp(StreamingConvInit, StreamingConvPrepare, |
| 233 | + StreamingConvEval); |
| 234 | +} |
| 235 | + |
| 236 | +} // namespace tflite |