From ac4bec3066bfb829193c2a047c6fdc2dd53d4660 Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Tue, 8 Jul 2025 11:03:03 -0700 Subject: [PATCH 1/3] Support for DECODE operator @tensorflow/micro Add initial support for DECODE operator. Add reference implementation. Add LUT decompression support. Update op resolvers. Update Makefiles and Bazel BUILD files. Add kernel unit test. bug=fixes #3131 --- python/tflite_micro/python_ops_resolver.cc | 3 +- tensorflow/lite/micro/kernels/BUILD | 20 + tensorflow/lite/micro/kernels/Makefile.inc | 1 + tensorflow/lite/micro/kernels/decode.cc | 148 ++++ tensorflow/lite/micro/kernels/decode_state.cc | 36 + tensorflow/lite/micro/kernels/decode_state.h | 87 +++ .../lite/micro/kernels/decode_state_lut.cc | 630 ++++++++++++++++++ .../lite/micro/kernels/decode_state_lut.h | 92 +++ tensorflow/lite/micro/kernels/decode_test.cc | 333 +++++++++ tensorflow/lite/micro/kernels/micro_ops.h | 1 + .../lite/micro/micro_mutable_op_resolver.h | 7 +- .../micro/tools/benchmarking/op_resolver.h | 3 +- tensorflow/lite/micro/tools/make/Makefile | 3 + 13 files changed, 1361 insertions(+), 3 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/decode.cc create mode 100644 tensorflow/lite/micro/kernels/decode_state.cc create mode 100644 tensorflow/lite/micro/kernels/decode_state.h create mode 100644 tensorflow/lite/micro/kernels/decode_state_lut.cc create mode 100644 tensorflow/lite/micro/kernels/decode_state_lut.h create mode 100644 tensorflow/lite/micro/kernels/decode_test.cc diff --git a/python/tflite_micro/python_ops_resolver.cc b/python/tflite_micro/python_ops_resolver.cc index f5d6e636c16..34fc82956bc 100644 --- a/python/tflite_micro/python_ops_resolver.cc +++ b/python/tflite_micro/python_ops_resolver.cc @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -40,6 +40,7 @@ PythonOpsResolver::PythonOpsResolver() { AddConv2D(); AddCos(); AddCumSum(); + AddDecode(); AddDelay(); AddDepthToSpace(); AddDepthwiseConv2D(); diff --git a/tensorflow/lite/micro/kernels/BUILD b/tensorflow/lite/micro/kernels/BUILD index 7b5ddc7b306..71cb5cd3fb0 100644 --- a/tensorflow/lite/micro/kernels/BUILD +++ b/tensorflow/lite/micro/kernels/BUILD @@ -236,6 +236,9 @@ tflm_kernel_cc_library( "conv.cc", "conv_common.cc", "cumsum.cc", + "decode.cc", + "decode_state.cc", + "decode_state_lut.cc", "depth_to_space.cc", "depthwise_conv.cc", "depthwise_conv_common.cc", @@ -326,6 +329,8 @@ tflm_kernel_cc_library( "batch_matmul.h", "circular_buffer.h", "conv.h", + "decode_state.h", + "decode_state_lut.h", "depthwise_conv.h", "dequantize.h", "ethosu.h", @@ -642,6 +647,21 @@ tflm_cc_test( ], ) +tflm_cc_test( + name = "decode_test", + srcs = [ + "decode_test.cc", + ], + deps = [ + ":kernel_runner", + "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:debug_log", + "//tensorflow/lite/micro:op_resolvers", + "//tensorflow/lite/micro:test_helpers", + "//tensorflow/lite/micro/testing:micro_test", + ], +) + tflm_cc_test( name = "decompress_test", srcs = [ diff --git a/tensorflow/lite/micro/kernels/Makefile.inc b/tensorflow/lite/micro/kernels/Makefile.inc index f4456242fef..49c033b84e4 100644 --- a/tensorflow/lite/micro/kernels/Makefile.inc +++ b/tensorflow/lite/micro/kernels/Makefile.inc @@ -123,6 +123,7 @@ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/ceil_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/comparisons_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/concatenation_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/cumsum_test.cc \ +$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depth_to_space_test.cc \ 
$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depthwise_conv_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/dequantize_test.cc \ diff --git a/tensorflow/lite/micro/kernels/decode.cc b/tensorflow/lite/micro/kernels/decode.cc new file mode 100644 index 00000000000..6c1478bb7f7 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode.cc @@ -0,0 +1,148 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/decode_state.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_context.h" +#include "tensorflow/lite/micro/micro_log.h" + +namespace tflite { +namespace { + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + const size_t num_inputs = NumInputs(node); + const size_t num_outputs = NumOutputs(node); + TF_LITE_ENSURE(context, num_outputs > 0); + TF_LITE_ENSURE_EQ(context, num_inputs, num_outputs * 2); + + MicroContext* const micro_context = GetMicroContext(context); + + node->user_data = micro_context->AllocatePersistentBuffer( + num_outputs * sizeof(DecodeState*)); + TF_LITE_ENSURE(context, node->user_data != nullptr); + DecodeState** const dsp_arr = + reinterpret_cast(node->user_data); + 
+ TfLiteTensor* input = nullptr; + TfLiteTensor* ancillary = nullptr; + TfLiteTensor* output = nullptr; + TfLiteStatus status = kTfLiteOk; + + for (size_t i = 0; i < num_inputs; i += 2) { + input = micro_context->AllocateTempInputTensor(node, i); + if (input == nullptr) { + MicroPrintf("failed to allocate input tensor %u", i); + status = kTfLiteError; + break; + } + ancillary = micro_context->AllocateTempInputTensor(node, i + 1); + if (ancillary == nullptr) { + MicroPrintf("failed to allocate ancillary tensor %u", i + 1); + status = kTfLiteError; + break; + } + output = micro_context->AllocateTempOutputTensor(node, i / 2); + if (output == nullptr) { + MicroPrintf("failed to allocate output tensor %u", i / 2); + status = kTfLiteError; + break; + } + + if (DecodeState::Version(*ancillary) != 1) { + MicroPrintf("version %u != 1", DecodeState::Version(*ancillary)); + status = kTfLiteError; + break; + } + + DecodeState* dsp = nullptr; + switch (DecodeState::Type(*ancillary)) { + case DecodeState::kDcmTypeLUT: + dsp = DecodeState::CreateDecodeStateLUT( + context, micro_context->GetAlternateProfiler()); + break; + case DecodeState::kDcmTypeCustom: + MicroPrintf("Custom decode type not yet supported"); + break; + default: + MicroPrintf("unsupported decode type %u", + DecodeState::Type(*ancillary)); + break; + } + + if (dsp != nullptr) { + status = dsp->Setup(*input, *ancillary, *output); + if (status != kTfLiteOk) { + break; + } + dsp_arr[i / 2] = dsp; + } else { + MicroPrintf("failed to allocate DecodeState[%u]", i / 2); + break; + } + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(ancillary); + micro_context->DeallocateTempTfLiteTensor(output); + input = nullptr; + ancillary = nullptr; + output = nullptr; + } + + if (input != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input); + } + if (ancillary != nullptr) { + micro_context->DeallocateTempTfLiteTensor(ancillary); + } + if (output != nullptr) { + 
micro_context->DeallocateTempTfLiteTensor(output); + } + + return status; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const size_t num_inputs = NumInputs(node); + DecodeState** const dsp_arr = + reinterpret_cast(node->user_data); + + for (size_t i = 0; i < num_inputs; i += 2) { + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, i); + TF_LITE_ENSURE(context, input != nullptr); + const TfLiteEvalTensor* ancillary = + tflite::micro::GetEvalInput(context, node, i + 1); + TF_LITE_ENSURE(context, ancillary != nullptr); + const TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, i / 2); + TF_LITE_ENSURE(context, output != nullptr); + + TfLiteStatus status = dsp_arr[i / 2]->Decode(*input, *ancillary, *output); + TF_LITE_ENSURE(context, status == kTfLiteOk); + } + + return kTfLiteOk; +} + +} // namespace + +TFLMRegistration Register_DECODE() { + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/decode_state.cc b/tensorflow/lite/micro/kernels/decode_state.cc new file mode 100644 index 00000000000..87bb6a506d3 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode_state.cc @@ -0,0 +1,36 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/decode_state.h" + +#include "tensorflow/lite/micro/kernels/decode_state_lut.h" +#include "tensorflow/lite/micro/micro_context.h" + +namespace tflite { + +DecodeState* DecodeState::CreateDecodeStateLUT( + const TfLiteContext* context, MicroProfilerInterface* profiler) { + MicroContext* const micro_context = GetMicroContext(context); + void* buffer = + micro_context->AllocatePersistentBuffer(sizeof(DecodeStateLUT)); + if (buffer == nullptr) { + return nullptr; + } + DecodeState* dsp = new (buffer) DecodeStateLUT(context, profiler); + + return dsp; +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/decode_state.h b/tensorflow/lite/micro/kernels/decode_state.h new file mode 100644 index 00000000000..80594fd2c26 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode_state.h @@ -0,0 +1,87 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_H_ +#define TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_H_ + +#include + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/c/c_api_types.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/compatibility.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_profiler_interface.h" + +namespace tflite { + +struct DecodeState { + DecodeState() = delete; + + DecodeState(const TfLiteContext* context, MicroProfilerInterface* profiler) + : context_(context), micro_profiler_(profiler) {} + + virtual TfLiteStatus Setup(const TfLiteTensor& input, + const TfLiteTensor& ancillary, + const TfLiteTensor& output) = 0; + virtual TfLiteStatus Decode(const TfLiteEvalTensor& input, + const TfLiteEvalTensor& ancillary, + const TfLiteEvalTensor& output) = 0; + + static DecodeState* CreateDecodeStateLUT(const TfLiteContext* context, + MicroProfilerInterface* profiler); + + static uint8_t Type(const TfLiteTensor& ancillary) { + return GetTensorData(&ancillary)[kDcmDecodeTypeOffset]; + } + + static uint8_t Type(const TfLiteEvalTensor& ancillary) { + return micro::GetTensorData(&ancillary)[kDcmDecodeTypeOffset]; + } + + static uint8_t Version(const TfLiteTensor& ancillary) { + return GetTensorData(&ancillary)[kDcmVersionOffset]; + } + + static uint8_t Version(const TfLiteEvalTensor& ancillary) { + return micro::GetTensorData(&ancillary)[kDcmVersionOffset]; + } + + protected: + virtual ~DecodeState() = default; + + // Decode Common Metadata constants + public: + static constexpr uint8_t kDcmTypeLUT = 0; + static constexpr uint8_t kDcmTypeCustom = 127; + + static constexpr size_t kDcmSizeInBytes = 16; + + private: + static constexpr size_t kDcmDecodeTypeOffset = 0; + static constexpr size_t kDcmVersionOffset = 1; + + // DecodeState vars + protected: + 
const TfLiteContext* context_; + MicroProfilerInterface* micro_profiler_; + + private: + TF_LITE_REMOVE_VIRTUAL_DELETE +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_H_ diff --git a/tensorflow/lite/micro/kernels/decode_state_lut.cc b/tensorflow/lite/micro/kernels/decode_state_lut.cc new file mode 100644 index 00000000000..477c21d80a7 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode_state_lut.cc @@ -0,0 +1,630 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/decode_state_lut.h" + +#include +#include + +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_log.h" +#include "tensorflow/lite/micro/micro_profiler.h" + +namespace tflite { + +TfLiteStatus DecodeStateLUT::Setup(const TfLiteTensor& input, + const TfLiteTensor& ancillary, + const TfLiteTensor& output) { + const uint8_t* const ancillary_data = GetTensorData(&ancillary); + if (ancillary_data[kDcmVersionOffset] != 1) { + MicroPrintf("unsupported version %u", ancillary_data[kDcmVersionOffset]); + return kTfLiteError; + } + + // resolve num_channels_ and use_alternate_axis_ + if (output.quantization.type == kTfLiteAffineQuantization && + output.quantization.params != nullptr) { + const TfLiteAffineQuantization* quantization = + reinterpret_cast(output.quantization.params); + num_channels_ = quantization->scale->size; + if ((quantization->quantized_dimension == output.dims->size - 1) && + num_channels_ > 1) { + use_alternate_axis_ = true; + } else if (quantization->quantized_dimension != 0) { + MicroPrintf("unsupported quantization axis %u", + quantization->quantized_dimension); + return kTfLiteError; + } + } + + compressed_indices_ = GetTensorData(&input); + count_indices_ = NumElements(&output); + elements_per_channel_ = + use_alternate_axis_ ? 
1 : count_indices_ / num_channels_; + value_table_ = &ancillary_data[kDcmSizeInBytes]; + value_table_channel_stride_ = ancillary_data[kDcmValueTableStrideOffset]; + compressed_bit_width_ = + ancillary_data[kDcmParamsOffset] & kDcmParamsBitWidthMask; + + return kTfLiteOk; +} + +TfLiteStatus DecodeStateLUT::Decode(const TfLiteEvalTensor& input, + const TfLiteEvalTensor& ancillary, + const TfLiteEvalTensor& output) { + void* const buffer = const_cast(micro::GetTensorData(&output)); + TFLITE_DCHECK(buffer != nullptr); + + switch (output.type) { + case kTfLiteBool: + DecompressToBuffer(buffer); + break; + case kTfLiteFloat32: + DecompressToBuffer(buffer); + break; + case kTfLiteInt8: + DecompressToBuffer(buffer); + break; + case kTfLiteInt16: + DecompressToBuffer(buffer); + break; + case kTfLiteInt32: + DecompressToBuffer(buffer); + break; + case kTfLiteInt64: + DecompressToBuffer(buffer); + break; + default: + MicroPrintf("unsupported tensor type %s", TfLiteTypeGetName(output.type)); + return kTfLiteError; + } + + return kTfLiteOk; +} + +template +T* DecodeStateLUT::DecompressToBuffer(void* buffer) { + TFLITE_DCHECK(compressed_bit_width_ <= kMaxBitWidth); + TFLITE_DCHECK(compressed_bit_width_ > 0); + + if (std::is_same::value && compressed_bit_width_ == 4 && + !use_alternate_axis_) { + DecompressToBufferWidth4_16(static_cast(buffer)); + } else if (std::is_same::value && compressed_bit_width_ == 3 && + !use_alternate_axis_) { + DecompressToBufferWidth3_32(static_cast(buffer)); + } else if (std::is_same::value && compressed_bit_width_ == 2 && + !use_alternate_axis_) { + DecompressToBufferWidth2_16(static_cast(buffer)); + } else { + DecompressToBufferWidthAny(static_cast(buffer)); + } + + return static_cast(buffer); +} + +template bool* DecodeStateLUT::DecompressToBuffer(void*); +template float* DecodeStateLUT::DecompressToBuffer(void*); +template int8_t* DecodeStateLUT::DecompressToBuffer(void*); +template int16_t* DecodeStateLUT::DecompressToBuffer(void*); +template 
int32_t* DecodeStateLUT::DecompressToBuffer(void*); +template int64_t* DecodeStateLUT::DecompressToBuffer(void*); + +void DecodeStateLUT::DecompressToBufferWidth4_16(int8_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + const size_t stride = value_table_channel_stride_; + const uint8_t* value_table = static_cast(value_table_); + const size_t max_count = elements_per_channel_; + size_t current_offset = 0; + + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t count = max_count; + + // process elements at start of channel up to next uint64_t alignment of + // compressed_indices_ + while (count > 0 && (current_offset & 0x0F)) { + const size_t index = GetNextTableIndexWidth4(current_offset++); + *buffer++ = value_table[index]; + count -= 1; + } + + // process elements in current channel in groups of 16 + if (count >= 16) { + const uint64_t* indices = reinterpret_cast( + &compressed_indices_[current_offset >> 1]); + + while (count >= 16) { + count -= 16; + uint64_t index = *indices++; + uint64_t value, value2; + + value = static_cast(value_table[(index >> 4) & 0x0F]); + value |= static_cast(value_table[index & 0x0F]) << 8; + value |= static_cast(value_table[(index >> 12) & 0x0F]) << 16; + value |= static_cast(value_table[(index >> 8) & 0x0F]) << 24; + value |= static_cast(value_table[(index >> 20) & 0x0F]) << 32; + value |= static_cast(value_table[(index >> 16) & 0x0F]) << 40; + value |= static_cast(value_table[(index >> 28) & 0x0F]) << 48; + value |= static_cast(value_table[(index >> 24) & 0x0F]) << 56; + + *reinterpret_cast(buffer) = value; + + value2 = static_cast(value_table[(index >> 36) & 0x0F]); + value2 |= static_cast(value_table[(index >> 32) & 0x0F]) << 8; + value2 |= static_cast(value_table[(index >> 44) & 0x0F]) + << 16; + value2 |= static_cast(value_table[(index >> 40) & 0x0F]) + << 24; + value2 |= static_cast(value_table[(index >> 52) & 0x0F]) + << 32; + value2 |= static_cast(value_table[(index >> 48) 
& 0x0F]) + << 40; + value2 |= static_cast(value_table[(index >> 60) & 0x0F]) + << 48; + value2 |= static_cast(value_table[(index >> 56) & 0x0F]) + << 56; + + *reinterpret_cast(buffer + 8) = value2; + + buffer += 16; + } + + current_offset = + (reinterpret_cast(indices) - compressed_indices_) + << 1; + } + + // process remaining elements in current channel + while (count > 0) { + count -= 1; + const size_t index = GetNextTableIndexWidth4(current_offset++); + *buffer++ = value_table[index]; + } + + value_table += stride; + } +} + +void DecodeStateLUT::DecompressToBufferWidth2_16(int8_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + const size_t stride = value_table_channel_stride_; + const uint8_t* value_table = static_cast(value_table_); + const size_t max_count = elements_per_channel_; + size_t current_offset = 0; + + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t count = max_count; + + // process elements at start of channel up to next uint32_t alignment of + // compressed_indices_ + while (count > 0 && (current_offset & 0x0F)) { + const size_t index = GetNextTableIndexWidth2(current_offset++); + *buffer++ = value_table[index]; + count -= 1; + } + + // process elements in current channel in groups of 16 + if (count >= 16) { + const uint32_t* indices = reinterpret_cast( + &compressed_indices_[current_offset >> 2]); + + while (count >= 16) { + count -= 16; + uint32_t index = *indices++; + uint64_t value, value2; + + value = static_cast(value_table[(index >> 6) & 0x03]); + value |= static_cast(value_table[(index >> 4) & 0x03]) << 8; + value |= static_cast(value_table[(index >> 2) & 0x03]) << 16; + value |= static_cast(value_table[index & 0x03]) << 24; + value |= static_cast(value_table[(index >> 14) & 0x03]) << 32; + value |= static_cast(value_table[(index >> 12) & 0x03]) << 40; + value |= static_cast(value_table[(index >> 10) & 0x03]) << 48; + value |= static_cast(value_table[(index >> 8) & 0x03]) << 56; + + 
*reinterpret_cast(buffer) = value; + + value2 = static_cast(value_table[(index >> 22) & 0x03]); + value2 |= static_cast(value_table[(index >> 20) & 0x03]) << 8; + value2 |= static_cast(value_table[(index >> 18) & 0x03]) + << 16; + value2 |= static_cast(value_table[(index >> 16) & 0x03]) + << 24; + value2 |= static_cast(value_table[(index >> 30) & 0x03]) + << 32; + value2 |= static_cast(value_table[(index >> 28) & 0x03]) + << 40; + value2 |= static_cast(value_table[(index >> 26) & 0x03]) + << 48; + value2 |= static_cast(value_table[(index >> 24) & 0x03]) + << 56; + + *reinterpret_cast(buffer + 8) = value2; + + buffer += 16; + } + + current_offset = + (reinterpret_cast(indices) - compressed_indices_) + << 2; + } + + // process remaining elements in current channel + while (count > 0) { + count -= 1; + const size_t index = GetNextTableIndexWidth2(current_offset++); + *buffer++ = value_table[index]; + } + + value_table += stride; + } +} + +void DecodeStateLUT::DecompressToBufferWidth3_32(int8_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + const size_t stride = value_table_channel_stride_; + const uint8_t* value_table = static_cast(value_table_); + const size_t max_count = elements_per_channel_; + size_t current_offset = 0; + + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t count = max_count; + + // process elements at start of channel up to next uint32_t alignment of + // compressed_indices_ + while (count > 0 && (current_offset & 0x1F)) { + const size_t index = GetNextTableIndexWidth3(current_offset++); + *buffer++ = value_table[index]; + count -= 1; + } + + // process elements in current channel in groups of 32 + if (count >= 32) { + const uint32_t* indices = reinterpret_cast( + &compressed_indices_[(current_offset >> 5) * 12]); + + while (count >= 32) { + count -= 32; + uint32_t index0 = *indices++; + uint32_t index1 = *indices++; + uint32_t index2 = *indices++; + uint64_t value, value2; + + value = 
static_cast(value_table[(index0 >> 5) & 0x07]); + value |= static_cast(value_table[(index0 >> 2) & 0x07]) << 8; + value |= + static_cast( + value_table[((index0 << 1) & 0b110) | ((index0 >> 15) & 0b1)]) + << 16; + value |= static_cast(value_table[(index0 >> 12) & 0x07]) + << 24; + value |= static_cast(value_table[(index0 >> 9) & 0x07]) << 32; + value |= + static_cast( + value_table[((index0 >> 6) & 0b100) | ((index0 >> 22) & 0b11)]) + << 40; + value |= static_cast(value_table[(index0 >> 19) & 0x07]) + << 48; + value |= static_cast(value_table[(index0 >> 16) & 0x07]) + << 56; + + *reinterpret_cast(buffer) = value; + + value2 = static_cast(value_table[(index0 >> 29) & 0x07]); + value2 |= static_cast(value_table[(index0 >> 26) & 0x07]) + << 8; + value2 |= + static_cast( + value_table[((index0 >> 23) & 0b110) | ((index1 >> 7) & 0b1)]) + << 16; + value2 |= static_cast(value_table[(index1 >> 4) & 0x07]) + << 24; + value2 |= static_cast(value_table[(index1 >> 1) & 0x07]) + << 32; + value2 |= + static_cast( + value_table[((index1 << 2) & 0b100) | ((index1 >> 14) & 0b11)]) + << 40; + value2 |= static_cast(value_table[(index1 >> 11) & 0x07]) + << 48; + value2 |= static_cast(value_table[(index1 >> 8) & 0x07]) + << 56; + + *reinterpret_cast(buffer + 8) = value2; + + value = static_cast(value_table[(index1 >> 21) & 0x07]); + value |= static_cast(value_table[(index1 >> 18) & 0x07]) << 8; + value |= + static_cast( + value_table[((index1 >> 15) & 0b110) | ((index1 >> 31) & 0b1)]) + << 16; + value |= static_cast(value_table[(index1 >> 28) & 0x07]) + << 24; + value |= static_cast(value_table[(index1 >> 25) & 0x07]) + << 32; + value |= + static_cast( + value_table[((index1 >> 22) & 0b100) | ((index2 >> 6) & 0b11)]) + << 40; + value |= static_cast(value_table[(index2 >> 3) & 0x07]) << 48; + value |= static_cast(value_table[(index2 >> 0) & 0x07]) << 56; + + *reinterpret_cast(buffer + 16) = value; + + value2 = static_cast(value_table[(index2 >> 13) & 0x07]); + value2 |= 
static_cast(value_table[(index2 >> 10) & 0x07]) + << 8; + value2 |= + static_cast( + value_table[((index2 >> 7) & 0b110) | ((index2 >> 23) & 0b1)]) + << 16; + value2 |= static_cast(value_table[(index2 >> 20) & 0x07]) + << 24; + value2 |= static_cast(value_table[(index2 >> 17) & 0x07]) + << 32; + value2 |= + static_cast( + value_table[((index2 >> 14) & 0b100) | ((index2 >> 30) & 0b11)]) + << 40; + value2 |= static_cast(value_table[(index2 >> 27) & 0x07]) + << 48; + value2 |= static_cast(value_table[(index2 >> 24) & 0x07]) + << 56; + + *reinterpret_cast(buffer + 24) = value2; + + buffer += 32; + current_offset += 32; + } + } + + // process remaining elements in current channel + while (count > 0) { + count -= 1; + const size_t index = GetNextTableIndexWidth3(current_offset++); + *buffer++ = value_table[index]; + } + + value_table += stride; + } +} + +// TODO(ddavis-2015): templating GetNextTableIndexWidth makes this method +// more than 2x faster, but with a large code size increase +template +void DecodeStateLUT::DecompressToBufferWidthAny(T* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + if (use_alternate_axis_) { + const size_t stride = value_table_channel_stride_; + size_t current_offset = 0; + size_t count = count_indices_; + + while (count > 0) { + const T* value_table = static_cast(value_table_); + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t index; + switch (compressed_bit_width_) { + case 1: + index = GetNextTableIndexWidth1(current_offset); + break; + case 2: + index = GetNextTableIndexWidth2(current_offset); + break; + case 3: + index = GetNextTableIndexWidth3(current_offset); + break; + case 4: + index = GetNextTableIndexWidth4(current_offset); + break; + case 5: + index = GetNextTableIndexWidth5(current_offset); + break; + case 6: + index = GetNextTableIndexWidth6(current_offset); + break; + case 7: + index = GetNextTableIndexWidth7(current_offset); + break; + } + current_offset++; + *buffer++ = 
value_table[index]; + value_table += stride; + } + count -= num_channels_; + } + } else { + const size_t stride = value_table_channel_stride_; + const T* value_table = static_cast(value_table_); + const size_t max_count = elements_per_channel_; + size_t current_offset = 0; + + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t count = max_count; + + while (count-- > 0) { + size_t index; + switch (compressed_bit_width_) { + case 1: + index = GetNextTableIndexWidth1(current_offset); + break; + case 2: + index = GetNextTableIndexWidth2(current_offset); + break; + case 3: + index = GetNextTableIndexWidth3(current_offset); + break; + case 4: + index = GetNextTableIndexWidth4(current_offset); + break; + case 5: + index = GetNextTableIndexWidth5(current_offset); + break; + case 6: + index = GetNextTableIndexWidth6(current_offset); + break; + case 7: + index = GetNextTableIndexWidth7(current_offset); + break; + } + current_offset++; + *buffer++ = value_table[index]; + } + value_table += stride; + } + } +} + +template void DecodeStateLUT::DecompressToBufferWidthAny(bool*); +template void DecodeStateLUT::DecompressToBufferWidthAny(float*); +template void DecodeStateLUT::DecompressToBufferWidthAny(int8_t*); +template void DecodeStateLUT::DecompressToBufferWidthAny(int16_t*); +template void DecodeStateLUT::DecompressToBufferWidthAny(int32_t*); +template void DecodeStateLUT::DecompressToBufferWidthAny(int64_t*); + +inline size_t DecodeStateLUT::GetNextTableIndexWidth7( + const size_t current_offset) { + const size_t current_byte_index = (current_offset >> 3) * 7; + const uint8_t* indices = &compressed_indices_[current_byte_index]; + switch (current_offset & 0b111) { + case 0: + return indices[0] >> 1; + case 1: + return ((indices[0] & 0b1) << 6) | (indices[1] >> 2); + case 2: + return ((indices[1] & 0b11) << 5) | (indices[2] >> 3); + case 3: + return ((indices[2] & 0b111) << 4) | (indices[3] >> 4); + case 4: + return ((indices[3] & 0x0F) << 3) | (indices[4] 
>> 5); + case 5: + return ((indices[4] & 0x1F) << 2) | (indices[5] >> 6); + case 6: + return ((indices[5] & 0x3F) << 1) | (indices[6] >> 7); + case 7: + return indices[6] & 0x7F; + } + // NOTREACHED + return 0; +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth6( + const size_t current_offset) { + const size_t current_byte_index = (current_offset >> 2) * 3; + const uint8_t* indices = &compressed_indices_[current_byte_index]; + switch (current_offset & 0b11) { + case 0: + return indices[0] >> 2; + case 1: + return ((indices[0] & 0b11) << 4) | (indices[1] >> 4); + case 2: + return ((indices[1] & 0x0F) << 2) | (indices[2] >> 6); + case 3: + return indices[2] & 0x3F; + } + // NOTREACHED + return 0; +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth5( + const size_t current_offset) { + const size_t current_byte_index = (current_offset >> 3) * 5; + const uint8_t* indices = &compressed_indices_[current_byte_index]; + switch (current_offset & 0b111) { + case 0: + return indices[0] >> 3; + case 1: + return ((indices[0] & 0b111) << 2) | (indices[1] >> 6); + case 2: + return (indices[1] >> 1) & 0x1F; + case 3: + return ((indices[1] & 0b1) << 4) | (indices[2] >> 4); + case 4: + return ((indices[2] & 0x0F) << 1) | (indices[3] >> 7); + case 5: + return (indices[3] >> 2) & 0x1F; + case 6: + return ((indices[3] & 0b11) << 3) | (indices[4] >> 5); + case 7: + return indices[4] & 0x1F; + } + // NOTREACHED + return 0; +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth4( + const size_t current_offset) { + if (current_offset & 1) { + return compressed_indices_[current_offset >> 1] & 0x0F; + } else { + return compressed_indices_[current_offset >> 1] >> 4; + } +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth3( + const size_t current_offset) { + const size_t current_byte_index = (current_offset >> 3) * 3; + const uint8_t* indices = &compressed_indices_[current_byte_index]; + switch (current_offset & 0b111) { + case 0: + return indices[0] >> 5; + case 1: + 
return (indices[0] >> 2) & 0b111; + case 2: + return ((indices[0] & 0b11) << 1) | (indices[1] >> 7); + case 3: + return (indices[1] >> 4) & 0b111; + case 4: + return (indices[1] >> 1) & 0b111; + case 5: + return ((indices[1] & 0b1) << 2) | (indices[2] >> 6); + case 6: + return (indices[2] >> 3) & 0b111; + case 7: + return indices[2] & 0b111; + } + // NOTREACHED + return 0; +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth2( + const size_t current_offset) { + if (current_offset & 0b10) { + if (current_offset & 1) { + return compressed_indices_[current_offset >> 2] & 0x03; + } else { + return (compressed_indices_[current_offset >> 2] >> 2) & 0x03; + } + } else { + if (current_offset & 1) { + return (compressed_indices_[current_offset >> 2] >> 4) & 0x03; + } else { + return (compressed_indices_[current_offset >> 2] >> 6) & 0x03; + } + } +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth1( + const size_t current_offset) { + const size_t shift = ~current_offset & 0b111; + return (compressed_indices_[current_offset >> 3] >> shift) & 0b1; +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/decode_state_lut.h b/tensorflow/lite/micro/kernels/decode_state_lut.h new file mode 100644 index 00000000000..dbb64683960 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode_state_lut.h @@ -0,0 +1,92 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_LUT_H_
+#define TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_LUT_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/micro/compatibility.h"
+#include "tensorflow/lite/micro/kernels/decode_state.h"
+
+namespace tflite {
+
+struct DecodeStateLUT : public DecodeState {
+  DecodeStateLUT() = delete;
+
+  DecodeStateLUT(const TfLiteContext* context, MicroProfilerInterface* profiler)
+      : DecodeState(context, profiler) {}
+
+  virtual TfLiteStatus Setup(const TfLiteTensor& input,
+                             const TfLiteTensor& ancillary,
+                             const TfLiteTensor& output) override;
+  virtual TfLiteStatus Decode(const TfLiteEvalTensor& input,
+                              const TfLiteEvalTensor& ancillary,
+                              const TfLiteEvalTensor& output) override;
+
+ protected:
+  // LUT compression constants
+  static constexpr size_t kMaxBitWidth = 7;
+  static constexpr size_t kMaxValueTableChannelStride = 128;
+
+ private:
+  // LUT Decode Common Metadata constants
+  static constexpr size_t kDcmVersionOffset = 4;
+  static constexpr size_t kDcmParamsOffset = 5;
+  static constexpr uint8_t kDcmParamsBitWidthMask = 0x07;
+  static constexpr size_t kDcmValueTableStrideOffset = 6;
+
+ protected:
+  virtual ~DecodeStateLUT() = default;
+
+  template <typename T>
+  T* DecompressToBuffer(void* buffer);
+
+  // optimized C++ for INT8, use_alt_axis == false
+  void DecompressToBufferWidth4_16(int8_t* buffer);
+  void DecompressToBufferWidth3_32(int8_t* buffer);
+  void DecompressToBufferWidth2_16(int8_t* buffer);
+
+  // generic C++ for any bit width and value table type
+  template <typename T>
+  void DecompressToBufferWidthAny(T* buffer);
+
+  // Optimized C++ table index fetch
+  inline size_t GetNextTableIndexWidth7(const size_t current_offset);
+  inline size_t GetNextTableIndexWidth6(const size_t current_offset);
+  inline size_t GetNextTableIndexWidth5(const size_t current_offset);
+  inline size_t GetNextTableIndexWidth4(const size_t current_offset);
inline size_t GetNextTableIndexWidth3(const size_t current_offset); + inline size_t GetNextTableIndexWidth2(const size_t current_offset); + inline size_t GetNextTableIndexWidth1(const size_t current_offset); + + protected: + const uint8_t* compressed_indices_ = nullptr; + size_t count_indices_ = 0; + size_t num_channels_ = 1; + size_t elements_per_channel_ = 0; // computed from use_alternate_axis_ + const void* value_table_ = nullptr; // Pointer into FlatBuffer values + uint8_t value_table_channel_stride_ = 0; // elements per channel + uint8_t compressed_bit_width_ = 0; // 1 to 7 bits + bool use_alternate_axis_ = false; // shape channel axis: + // false = first, true = last + + private: + TF_LITE_REMOVE_VIRTUAL_DELETE +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_LUT_H_ diff --git a/tensorflow/lite/micro/kernels/decode_test.cc b/tensorflow/lite/micro/kernels/decode_test.cc new file mode 100644 index 00000000000..3008736e535 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode_test.cc @@ -0,0 +1,333 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "tensorflow/lite/core/c/common.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/decode_state.h" +#include "tensorflow/lite/micro/kernels/kernel_runner.h" +#include "tensorflow/lite/micro/test_helpers.h" +#include "tensorflow/lite/micro/testing/micro_test.h" + +namespace tflite { +namespace testing { +namespace { + +struct TensorInDatum { + const void* const data; + const TfLiteIntArray& dims; +}; + +struct TensorOutDatum { + void* const data; + const TfLiteIntArray& dims; + const TfLiteType type; + const TfLiteFloatArray& scales; + const TfLiteIntArray& zero_points; + const int quantized_dimension; + + // initialized by CreatePerChannelQuantizedTensor + const TfLiteAffineQuantization affine_quantization; +}; + +template +struct AncillaryLUT { + AncillaryLUT(const uint8_t (&dcm)[tflite::DecodeState::kDcmSizeInBytes], + const T (&values)[N]) { + std::copy(std::begin(dcm), std::end(dcm), std::begin(dcm_)); + std::copy(std::begin(values), std::end(values), std::begin(value_table_)); + } + + private: + uint8_t dcm_[tflite::DecodeState::kDcmSizeInBytes]; + T value_table_[N > 0 ? 
N : 1]; // assure not zero length +}; + +constexpr int kBitWidthLUT = 2; + +constexpr int8_t kAncillaryDataLUT0[] = {1, 2, 3, 4}; +constexpr int16_t kAncillaryDataLUT1[] = {5, 6, 7, 8}; + +constexpr uint8_t kDcmLUT0[tflite::DecodeState::kDcmSizeInBytes] = { + tflite::DecodeState::kDcmTypeLUT, // type: LUT + 1, // DCM version: 1 + 0, // reserved + 0, // reserved + 1, // LUT version: 1 + kBitWidthLUT, // Parameters: bit-width 2 + std::size(kAncillaryDataLUT0), // channel stride +}; + +constexpr uint8_t kDcmLUT1[tflite::DecodeState::kDcmSizeInBytes] = { + tflite::DecodeState::kDcmTypeLUT, // type: LUT + 1, // DCM version: 1 + 0, // reserved + 0, // reserved + 1, // LUT version: 1 + kBitWidthLUT, // Parameters: bit-width 2 + std::size(kAncillaryDataLUT1), // channel stride +}; + +// Align the tensor data the same as a Buffer in the TfLite schema +alignas(16) const + AncillaryLUT kAncillaryLUT0 = { + {kDcmLUT0}, {kAncillaryDataLUT0}}; +alignas(16) const + AncillaryLUT kAncillaryLUT1 = { + {kDcmLUT1}, {kAncillaryDataLUT1}}; +alignas(16) const uint8_t kEncodedLUT[] = {0x1B, 0xE4}; + +// Tensor shapes as TfLiteIntArray +constexpr int kOutputShapeLUT[] = {3, 1, 2, 4}; +constexpr int kEncodedShapeLUT[] = {1, sizeof(kEncodedLUT)}; +constexpr int kAncillaryShapeLUT0[] = {1, sizeof(kAncillaryLUT0)}; +constexpr int kAncillaryShapeLUT1[] = {1, sizeof(kAncillaryLUT1)}; + +constexpr int8_t kExpectLUT0[] = {1, 2, 3, 4, 4, 3, 2, 1}; +constexpr int16_t kExpectLUT1[] = {5, 6, 7, 8, 8, 7, 6, 5}; + +template +TfLiteStatus CheckOutput(const TfLiteTensor& output, + const void* const expected) { + const T* const expected_data = reinterpret_cast(expected); + const T* const output_data = tflite::GetTensorData(&output); + + constexpr float kTolerance = 1e-5; + const size_t kOutputCount = tflite::NumElements(&output); + for (size_t i = 0; i < kOutputCount; i++) { + TF_LITE_MICRO_EXPECT_NEAR(expected_data[i], output_data[i], kTolerance); + TF_LITE_MICRO_CHECK_FAIL(); + } + + return kTfLiteOk; +} 
+ +template +TfLiteStatus ExecuteDecodeTest( + TfLiteTensor* tensors, const TFLMRegistration& registration, + const std::initializer_list& expected) { + int kInputArrayData[kNumInputs + 1] = {kNumInputs}; + for (size_t i = 0; i < kNumInputs; i++) { + kInputArrayData[i + 1] = i; + } + TfLiteIntArray* inputs_array = IntArrayFromInts(kInputArrayData); + + int kOutputArrayData[kNumOutputs + 1] = {kNumOutputs}; + for (size_t i = 0; i < kNumOutputs; i++) { + kOutputArrayData[i + 1] = i + kNumInputs; + } + TfLiteIntArray* outputs_array = IntArrayFromInts(kOutputArrayData); + + micro::KernelRunner runner(registration, tensors, kNumInputs + kNumOutputs, + inputs_array, outputs_array, nullptr); + + if (runner.InitAndPrepare() != kTfLiteOk || runner.Invoke() != kTfLiteOk) { + return kTfLiteError; + } + + const TfLiteTensor* const output_tensors = &tensors[kNumInputs]; + TfLiteStatus status = kTfLiteError; + for (size_t i = 0; i < kNumOutputs; i++) { + switch (output_tensors[i].type) { + case kTfLiteInt8: + status = CheckOutput(output_tensors[i], expected.begin()[i]); + break; + case kTfLiteInt16: + status = CheckOutput(output_tensors[i], expected.begin()[i]); + break; + default: + TF_LITE_MICRO_FAIL("unsupported tensor type in test"); + break; + } + } + + return status; +} + +template +void TestDecode(const std::initializer_list& encodes, + const std::initializer_list& ancillaries, + const std::initializer_list& outputs, + const std::initializer_list& expected, + const TFLMRegistration& registration, + const TfLiteStatus expected_status = kTfLiteOk) { + TfLiteTensor tensors[kNumInputs + kNumOutputs] = {}; + + for (size_t i = 0; i < kNumInputs; i += 2) { + const TensorInDatum& tid_encode = *encodes.begin()[i / 2]; + tensors[i] = CreateTensor(tid_encode.data, + const_cast(&tid_encode.dims), + false, kTfLiteUInt8); + const TensorInDatum& tid_ancillary = *ancillaries.begin()[i / 2]; + tensors[i + 1] = CreateTensor( + tid_ancillary.data, const_cast(&tid_ancillary.dims), + false, 
kTfLiteUInt8); + } + for (size_t i = 0; i < kNumOutputs; i++) { + const TensorOutDatum& tod = *outputs.begin()[i]; + if (tod.scales.size == 0) { + tensors[i + kNumInputs] = CreateTensor( + tod.data, const_cast(&tod.dims), false, tod.type); + } else { + tensors[i + kNumInputs] = CreatePerChannelQuantizedTensor( + tod.data, const_cast(&tod.dims), + const_cast(&tod.scales), + const_cast(&tod.zero_points), + const_cast(&tod.affine_quantization), + tod.quantized_dimension, false, tod.type); + } + } + + TfLiteStatus s = ExecuteDecodeTest( + tensors, registration, expected); + TF_LITE_MICRO_EXPECT_EQ(s, expected_status); +} + +} // namespace +} // namespace testing +} // namespace tflite + +TF_LITE_MICRO_TESTS_BEGIN + +TF_LITE_MICRO_TEST(DecodeSingleTensor) { + // Align the tensor data the same as a Buffer in the TfLite schema + alignas(16) int8_t output_data[std::size(tflite::testing::kExpectLUT0)] = {}; + + const TfLiteIntArray* const encoded_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kEncodedShapeLUT); + static const tflite::testing::TensorInDatum tid_encode = { + tflite::testing::kEncodedLUT, + *encoded_dims, + }; + static constexpr std::initializer_list + encodes = { + &tid_encode, + }; + + const TfLiteIntArray* const ancillary_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT0); + static const tflite::testing::TensorInDatum tid_ancillary = { + &tflite::testing::kAncillaryLUT0, + *ancillary_dims, + }; + static constexpr std::initializer_list + ancillaries = {&tid_ancillary}; + + const TfLiteIntArray* const output_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kOutputShapeLUT); + constexpr float output_scales_data[] = {0}; + const TfLiteFloatArray* const output_scales = + tflite::testing::FloatArrayFromFloats(output_scales_data); + constexpr int output_zero_points_data[] = {0}; + const TfLiteIntArray* const output_zero_points = + tflite::testing::IntArrayFromInts(output_zero_points_data); + static const 
tflite::testing::TensorOutDatum tod = { + output_data, + *output_dims, + kTfLiteInt8, + *output_scales, + *output_zero_points, + 0, + {}, + }; + static constexpr std::initializer_list + outputs = {&tod}; + + const std::initializer_list expected = { + tflite::testing::kExpectLUT0, + }; + + tflite::testing::TestDecode( + encodes, ancillaries, outputs, expected, tflite::Register_DECODE()); +} + +TF_LITE_MICRO_TEST(DecodeTwoTensors) { + // Align the tensor data the same as a Buffer in the TfLite schema + alignas(16) int8_t output_data0[std::size(tflite::testing::kExpectLUT0)] = {}; + alignas(16) + int16_t output_data1[std::size(tflite::testing::kExpectLUT1)] = {}; + + const TfLiteIntArray* const encoded_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kEncodedShapeLUT); + static const tflite::testing::TensorInDatum tid_encode0 = { + tflite::testing::kEncodedLUT, + *encoded_dims, + }; + static const tflite::testing::TensorInDatum tid_encode1 = { + tflite::testing::kEncodedLUT, + *encoded_dims, + }; + static constexpr std::initializer_list + encodes = {&tid_encode0, &tid_encode1}; + + const TfLiteIntArray* const ancillary_dims0 = + tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT0); + static const tflite::testing::TensorInDatum tid_ancillary0 = { + &tflite::testing::kAncillaryLUT0, + *ancillary_dims0, + }; + const TfLiteIntArray* const ancillary_dims1 = + tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT1); + static const tflite::testing::TensorInDatum tid_ancillary1 = { + &tflite::testing::kAncillaryLUT1, + *ancillary_dims1, + }; + static constexpr std::initializer_list + ancillaries = {&tid_ancillary0, &tid_ancillary1}; + + const TfLiteIntArray* const output_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kOutputShapeLUT); + constexpr float output_scales_data[] = {1, 1.0f}; + const TfLiteFloatArray* const output_scales = + tflite::testing::FloatArrayFromFloats(output_scales_data); + constexpr int 
output_zero_points_data[] = {1, 0}; + const TfLiteIntArray* const output_zero_points = + tflite::testing::IntArrayFromInts(output_zero_points_data); + static const tflite::testing::TensorOutDatum tod0 = { + output_data0, + *output_dims, + kTfLiteInt8, + *output_scales, + *output_zero_points, + 0, + {}, + }; + static const tflite::testing::TensorOutDatum tod1 = { + output_data1, + *output_dims, + kTfLiteInt16, + *output_scales, + *output_zero_points, + 0, + {}, + }; + static constexpr std::initializer_list + outputs = {&tod0, &tod1}; + + const std::initializer_list expected = { + tflite::testing::kExpectLUT0, + tflite::testing::kExpectLUT1, + }; + + tflite::testing::TestDecode( + encodes, ancillaries, outputs, expected, tflite::Register_DECODE()); +} + +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/micro_ops.h b/tensorflow/lite/micro/kernels/micro_ops.h index 2e33a6730bd..8b76ca2cc17 100644 --- a/tensorflow/lite/micro/kernels/micro_ops.h +++ b/tensorflow/lite/micro/kernels/micro_ops.h @@ -53,6 +53,7 @@ TFLMRegistration Register_CONCATENATION(); TFLMRegistration Register_CONV_2D(); TFLMRegistration Register_COS(); TFLMRegistration Register_CUMSUM(); +TFLMRegistration Register_DECODE(); TFLMRegistration Register_DEPTH_TO_SPACE(); TFLMRegistration Register_DEPTHWISE_CONV_2D(); TFLMRegistration Register_DEQUANTIZE(); diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index f3f2080f0aa..6a638d93b97 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -1,4 +1,4 @@ -/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -206,6 +206,11 @@ class MicroMutableOpResolver : public MicroOpResolver {
                                      ParseCumsum);
   }
 
+  TfLiteStatus AddDecode() {
+    const TFLMRegistration& registration = tflite::Register_DECODE();
+    return AddCustom("TFLM_DECODE", &registration);
+  }
+
   TfLiteStatus AddDelay() {
     // TODO(b/286250473): change back name to "Delay" and remove namespace
     return AddCustom("SignalDelay", tflite::tflm_signal::Register_DELAY());
diff --git a/tensorflow/lite/micro/tools/benchmarking/op_resolver.h b/tensorflow/lite/micro/tools/benchmarking/op_resolver.h
index 9b98849c472..651429b76ec 100644
--- a/tensorflow/lite/micro/tools/benchmarking/op_resolver.h
+++ b/tensorflow/lite/micro/tools/benchmarking/op_resolver.h
@@ -1,4 +1,4 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -45,6 +45,7 @@ inline TfLiteStatus CreateOpResolver(TflmOpResolver& op_resolver) {
   TF_LITE_ENSURE_STATUS(op_resolver.AddConv2D());
   TF_LITE_ENSURE_STATUS(op_resolver.AddCos());
   TF_LITE_ENSURE_STATUS(op_resolver.AddCumSum());
+  TF_LITE_ENSURE_STATUS(op_resolver.AddDecode());
   TF_LITE_ENSURE_STATUS(op_resolver.AddDelay());
   TF_LITE_ENSURE_STATUS(op_resolver.AddDepthToSpace());
   TF_LITE_ENSURE_STATUS(op_resolver.AddDepthwiseConv2D());
diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile
index 0bf5532badf..a43abf7f7f7 100644
--- a/tensorflow/lite/micro/tools/make/Makefile
+++ b/tensorflow/lite/micro/tools/make/Makefile
@@ -386,6 +386,9 @@ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/concatenation.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/conv.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/conv_common.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/cumsum.cc \
+$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode.cc \
+$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode_state.cc \ +$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode_state_lut.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decompress.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decompress_common.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depth_to_space.cc \ From 15ac156290878ced26340859f19b94ecec0f1885 Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Tue, 8 Jul 2025 12:01:19 -0700 Subject: [PATCH 2/3] update copyright --- tensorflow/lite/micro/kernels/micro_ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/kernels/micro_ops.h b/tensorflow/lite/micro/kernels/micro_ops.h index 8b76ca2cc17..b715c735017 100644 --- a/tensorflow/lite/micro/kernels/micro_ops.h +++ b/tensorflow/lite/micro/kernels/micro_ops.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From 6f96b2983a150c728ea00837e9b9953c9479634f Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Tue, 8 Jul 2025 17:05:24 -0700 Subject: [PATCH 3/3] Don't use constructors with global objects (bluepill will not call them). Cleanup unit test. 
--- tensorflow/lite/micro/kernels/decode_test.cc | 121 ++++++++++--------- 1 file changed, 66 insertions(+), 55 deletions(-) diff --git a/tensorflow/lite/micro/kernels/decode_test.cc b/tensorflow/lite/micro/kernels/decode_test.cc index 3008736e535..69ee7f61a5f 100644 --- a/tensorflow/lite/micro/kernels/decode_test.cc +++ b/tensorflow/lite/micro/kernels/decode_test.cc @@ -47,6 +47,7 @@ struct TensorOutDatum { template struct AncillaryLUT { + AncillaryLUT() = delete; AncillaryLUT(const uint8_t (&dcm)[tflite::DecodeState::kDcmSizeInBytes], const T (&values)[N]) { std::copy(std::begin(dcm), std::end(dcm), std::begin(dcm_)); @@ -84,19 +85,11 @@ constexpr uint8_t kDcmLUT1[tflite::DecodeState::kDcmSizeInBytes] = { }; // Align the tensor data the same as a Buffer in the TfLite schema -alignas(16) const - AncillaryLUT kAncillaryLUT0 = { - {kDcmLUT0}, {kAncillaryDataLUT0}}; -alignas(16) const - AncillaryLUT kAncillaryLUT1 = { - {kDcmLUT1}, {kAncillaryDataLUT1}}; alignas(16) const uint8_t kEncodedLUT[] = {0x1B, 0xE4}; // Tensor shapes as TfLiteIntArray constexpr int kOutputShapeLUT[] = {3, 1, 2, 4}; constexpr int kEncodedShapeLUT[] = {1, sizeof(kEncodedLUT)}; -constexpr int kAncillaryShapeLUT0[] = {1, sizeof(kAncillaryLUT0)}; -constexpr int kAncillaryShapeLUT1[] = {1, sizeof(kAncillaryLUT1)}; constexpr int8_t kExpectLUT0[] = {1, 2, 3, 4, 4, 3, 2, 1}; constexpr int16_t kExpectLUT1[] = {5, 6, 7, 8, 8, 7, 6, 5}; @@ -204,39 +197,55 @@ void TestDecode(const std::initializer_list& encodes, TF_LITE_MICRO_TESTS_BEGIN +using tflite::testing::AncillaryLUT; +using tflite::testing::kAncillaryDataLUT0; +using tflite::testing::kAncillaryDataLUT1; +using tflite::testing::kDcmLUT0; +using tflite::testing::kDcmLUT1; +using tflite::testing::kEncodedLUT; +using tflite::testing::kEncodedShapeLUT; +using tflite::testing::kExpectLUT0; +using tflite::testing::kExpectLUT1; +using tflite::testing::kOutputShapeLUT; +using tflite::testing::TensorInDatum; +using tflite::testing::TensorOutDatum; + 
TF_LITE_MICRO_TEST(DecodeSingleTensor) { // Align the tensor data the same as a Buffer in the TfLite schema - alignas(16) int8_t output_data[std::size(tflite::testing::kExpectLUT0)] = {}; + alignas(16) int8_t output_data[std::size(kExpectLUT0)] = {}; + alignas(16) const AncillaryLUT + kAncillaryLUT = {{kDcmLUT0}, {kAncillaryDataLUT0}}; + + constexpr int kAncillaryShapeLUT[] = {1, sizeof(kAncillaryLUT)}; const TfLiteIntArray* const encoded_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kEncodedShapeLUT); - static const tflite::testing::TensorInDatum tid_encode = { - tflite::testing::kEncodedLUT, + tflite::testing::IntArrayFromInts(kEncodedShapeLUT); + static const TensorInDatum tid_encode = { + kEncodedLUT, *encoded_dims, }; - static constexpr std::initializer_list - encodes = { - &tid_encode, - }; + static constexpr std::initializer_list encodes = { + &tid_encode, + }; const TfLiteIntArray* const ancillary_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT0); - static const tflite::testing::TensorInDatum tid_ancillary = { - &tflite::testing::kAncillaryLUT0, + tflite::testing::IntArrayFromInts(kAncillaryShapeLUT); + static const TensorInDatum tid_ancillary = { + &kAncillaryLUT, *ancillary_dims, }; - static constexpr std::initializer_list - ancillaries = {&tid_ancillary}; + static constexpr std::initializer_list ancillaries = { + &tid_ancillary}; const TfLiteIntArray* const output_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kOutputShapeLUT); + tflite::testing::IntArrayFromInts(kOutputShapeLUT); constexpr float output_scales_data[] = {0}; const TfLiteFloatArray* const output_scales = tflite::testing::FloatArrayFromFloats(output_scales_data); constexpr int output_zero_points_data[] = {0}; const TfLiteIntArray* const output_zero_points = tflite::testing::IntArrayFromInts(output_zero_points_data); - static const tflite::testing::TensorOutDatum tod = { + static const TensorOutDatum tod = { output_data, *output_dims, 
kTfLiteInt8, @@ -245,12 +254,10 @@ TF_LITE_MICRO_TEST(DecodeSingleTensor) { 0, {}, }; - static constexpr std::initializer_list - outputs = {&tod}; + static constexpr std::initializer_list outputs = { + &tod}; - const std::initializer_list expected = { - tflite::testing::kExpectLUT0, - }; + const std::initializer_list expected = {kExpectLUT0}; tflite::testing::TestDecode( @@ -259,47 +266,53 @@ TF_LITE_MICRO_TEST(DecodeSingleTensor) { TF_LITE_MICRO_TEST(DecodeTwoTensors) { // Align the tensor data the same as a Buffer in the TfLite schema - alignas(16) int8_t output_data0[std::size(tflite::testing::kExpectLUT0)] = {}; - alignas(16) - int16_t output_data1[std::size(tflite::testing::kExpectLUT1)] = {}; + alignas(16) int8_t output_data0[std::size(kExpectLUT0)] = {}; + alignas(16) int16_t output_data1[std::size(kExpectLUT1)] = {}; + alignas(16) const AncillaryLUT + kAncillaryLUT0 = {{kDcmLUT0}, {kAncillaryDataLUT0}}; + alignas(16) const AncillaryLUT + kAncillaryLUT1 = {{kDcmLUT1}, {kAncillaryDataLUT1}}; + + constexpr int kAncillaryShapeLUT0[] = {1, sizeof(kAncillaryLUT0)}; + constexpr int kAncillaryShapeLUT1[] = {1, sizeof(kAncillaryLUT1)}; const TfLiteIntArray* const encoded_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kEncodedShapeLUT); - static const tflite::testing::TensorInDatum tid_encode0 = { - tflite::testing::kEncodedLUT, + tflite::testing::IntArrayFromInts(kEncodedShapeLUT); + static const TensorInDatum tid_encode0 = { + kEncodedLUT, *encoded_dims, }; - static const tflite::testing::TensorInDatum tid_encode1 = { - tflite::testing::kEncodedLUT, + static const TensorInDatum tid_encode1 = { + kEncodedLUT, *encoded_dims, }; - static constexpr std::initializer_list - encodes = {&tid_encode0, &tid_encode1}; + static constexpr std::initializer_list encodes = { + &tid_encode0, &tid_encode1}; const TfLiteIntArray* const ancillary_dims0 = - tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT0); - static const 
tflite::testing::TensorInDatum tid_ancillary0 = { - &tflite::testing::kAncillaryLUT0, + tflite::testing::IntArrayFromInts(kAncillaryShapeLUT0); + static const TensorInDatum tid_ancillary0 = { + &kAncillaryLUT0, *ancillary_dims0, }; const TfLiteIntArray* const ancillary_dims1 = - tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT1); - static const tflite::testing::TensorInDatum tid_ancillary1 = { - &tflite::testing::kAncillaryLUT1, + tflite::testing::IntArrayFromInts(kAncillaryShapeLUT1); + static const TensorInDatum tid_ancillary1 = { + &kAncillaryLUT1, *ancillary_dims1, }; - static constexpr std::initializer_list - ancillaries = {&tid_ancillary0, &tid_ancillary1}; + static constexpr std::initializer_list ancillaries = { + &tid_ancillary0, &tid_ancillary1}; const TfLiteIntArray* const output_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kOutputShapeLUT); + tflite::testing::IntArrayFromInts(kOutputShapeLUT); constexpr float output_scales_data[] = {1, 1.0f}; const TfLiteFloatArray* const output_scales = tflite::testing::FloatArrayFromFloats(output_scales_data); constexpr int output_zero_points_data[] = {1, 0}; const TfLiteIntArray* const output_zero_points = tflite::testing::IntArrayFromInts(output_zero_points_data); - static const tflite::testing::TensorOutDatum tod0 = { + static const TensorOutDatum tod0 = { output_data0, *output_dims, kTfLiteInt8, @@ -308,7 +321,7 @@ TF_LITE_MICRO_TEST(DecodeTwoTensors) { 0, {}, }; - static const tflite::testing::TensorOutDatum tod1 = { + static const TensorOutDatum tod1 = { output_data1, *output_dims, kTfLiteInt16, @@ -317,13 +330,11 @@ TF_LITE_MICRO_TEST(DecodeTwoTensors) { 0, {}, }; - static constexpr std::initializer_list - outputs = {&tod0, &tod1}; + static constexpr std::initializer_list outputs = { + &tod0, &tod1}; - const std::initializer_list expected = { - tflite::testing::kExpectLUT0, - tflite::testing::kExpectLUT1, - }; + const std::initializer_list expected = {kExpectLUT0, + 
kExpectLUT1}; tflite::testing::TestDecode(