From ac4bec3066bfb829193c2a047c6fdc2dd53d4660 Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Tue, 8 Jul 2025 11:03:03 -0700 Subject: [PATCH 1/3] Support for DECODE operator @tensorflow/micro Add initial support for DECODE operator. Add reference implementation. Add LUT decompression support. Update op resolvers. Update Makefiles and Bazel BUILD files. Add kernel unit test. bug=fixes #3131 --- python/tflite_micro/python_ops_resolver.cc | 3 +- tensorflow/lite/micro/kernels/BUILD | 20 + tensorflow/lite/micro/kernels/Makefile.inc | 1 + tensorflow/lite/micro/kernels/decode.cc | 148 ++++ tensorflow/lite/micro/kernels/decode_state.cc | 36 + tensorflow/lite/micro/kernels/decode_state.h | 87 +++ .../lite/micro/kernels/decode_state_lut.cc | 630 ++++++++++++++++++ .../lite/micro/kernels/decode_state_lut.h | 92 +++ tensorflow/lite/micro/kernels/decode_test.cc | 333 +++++++++ tensorflow/lite/micro/kernels/micro_ops.h | 1 + .../lite/micro/micro_mutable_op_resolver.h | 7 +- .../micro/tools/benchmarking/op_resolver.h | 3 +- tensorflow/lite/micro/tools/make/Makefile | 3 + 13 files changed, 1361 insertions(+), 3 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/decode.cc create mode 100644 tensorflow/lite/micro/kernels/decode_state.cc create mode 100644 tensorflow/lite/micro/kernels/decode_state.h create mode 100644 tensorflow/lite/micro/kernels/decode_state_lut.cc create mode 100644 tensorflow/lite/micro/kernels/decode_state_lut.h create mode 100644 tensorflow/lite/micro/kernels/decode_test.cc diff --git a/python/tflite_micro/python_ops_resolver.cc b/python/tflite_micro/python_ops_resolver.cc index f5d6e636c16..34fc82956bc 100644 --- a/python/tflite_micro/python_ops_resolver.cc +++ b/python/tflite_micro/python_ops_resolver.cc @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -40,6 +40,7 @@ PythonOpsResolver::PythonOpsResolver() { AddConv2D(); AddCos(); AddCumSum(); + AddDecode(); AddDelay(); AddDepthToSpace(); AddDepthwiseConv2D(); diff --git a/tensorflow/lite/micro/kernels/BUILD b/tensorflow/lite/micro/kernels/BUILD index 7b5ddc7b306..71cb5cd3fb0 100644 --- a/tensorflow/lite/micro/kernels/BUILD +++ b/tensorflow/lite/micro/kernels/BUILD @@ -236,6 +236,9 @@ tflm_kernel_cc_library( "conv.cc", "conv_common.cc", "cumsum.cc", + "decode.cc", + "decode_state.cc", + "decode_state_lut.cc", "depth_to_space.cc", "depthwise_conv.cc", "depthwise_conv_common.cc", @@ -326,6 +329,8 @@ tflm_kernel_cc_library( "batch_matmul.h", "circular_buffer.h", "conv.h", + "decode_state.h", + "decode_state_lut.h", "depthwise_conv.h", "dequantize.h", "ethosu.h", @@ -642,6 +647,21 @@ tflm_cc_test( ], ) +tflm_cc_test( + name = "decode_test", + srcs = [ + "decode_test.cc", + ], + deps = [ + ":kernel_runner", + "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:debug_log", + "//tensorflow/lite/micro:op_resolvers", + "//tensorflow/lite/micro:test_helpers", + "//tensorflow/lite/micro/testing:micro_test", + ], +) + tflm_cc_test( name = "decompress_test", srcs = [ diff --git a/tensorflow/lite/micro/kernels/Makefile.inc b/tensorflow/lite/micro/kernels/Makefile.inc index f4456242fef..49c033b84e4 100644 --- a/tensorflow/lite/micro/kernels/Makefile.inc +++ b/tensorflow/lite/micro/kernels/Makefile.inc @@ -123,6 +123,7 @@ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/ceil_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/comparisons_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/concatenation_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/cumsum_test.cc \ +$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depth_to_space_test.cc \ 
$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depthwise_conv_test.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/dequantize_test.cc \ diff --git a/tensorflow/lite/micro/kernels/decode.cc b/tensorflow/lite/micro/kernels/decode.cc new file mode 100644 index 00000000000..6c1478bb7f7 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode.cc @@ -0,0 +1,148 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/decode_state.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_context.h" +#include "tensorflow/lite/micro/micro_log.h" + +namespace tflite { +namespace { + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + const size_t num_inputs = NumInputs(node); + const size_t num_outputs = NumOutputs(node); + TF_LITE_ENSURE(context, num_outputs > 0); + TF_LITE_ENSURE_EQ(context, num_inputs, num_outputs * 2); + + MicroContext* const micro_context = GetMicroContext(context); + + node->user_data = micro_context->AllocatePersistentBuffer( + num_outputs * sizeof(DecodeState*)); + TF_LITE_ENSURE(context, node->user_data != nullptr); + DecodeState** const dsp_arr = + reinterpret_cast(node->user_data); + 
+ TfLiteTensor* input = nullptr; + TfLiteTensor* ancillary = nullptr; + TfLiteTensor* output = nullptr; + TfLiteStatus status = kTfLiteOk; + + for (size_t i = 0; i < num_inputs; i += 2) { + input = micro_context->AllocateTempInputTensor(node, i); + if (input == nullptr) { + MicroPrintf("failed to allocate input tensor %u", i); + status = kTfLiteError; + break; + } + ancillary = micro_context->AllocateTempInputTensor(node, i + 1); + if (ancillary == nullptr) { + MicroPrintf("failed to allocate ancillary tensor %u", i + 1); + status = kTfLiteError; + break; + } + output = micro_context->AllocateTempOutputTensor(node, i / 2); + if (output == nullptr) { + MicroPrintf("failed to allocate output tensor %u", i / 2); + status = kTfLiteError; + break; + } + + if (DecodeState::Version(*ancillary) != 1) { + MicroPrintf("version %u != 1", DecodeState::Version(*ancillary)); + status = kTfLiteError; + break; + } + + DecodeState* dsp = nullptr; + switch (DecodeState::Type(*ancillary)) { + case DecodeState::kDcmTypeLUT: + dsp = DecodeState::CreateDecodeStateLUT( + context, micro_context->GetAlternateProfiler()); + break; + case DecodeState::kDcmTypeCustom: + MicroPrintf("Custom decode type not yet supported"); + break; + default: + MicroPrintf("unsupported decode type %u", + DecodeState::Type(*ancillary)); + break; + } + + if (dsp != nullptr) { + status = dsp->Setup(*input, *ancillary, *output); + if (status != kTfLiteOk) { + break; + } + dsp_arr[i / 2] = dsp; + } else { + MicroPrintf("failed to allocate DecodeState[%u]", i / 2); + break; + } + + micro_context->DeallocateTempTfLiteTensor(input); + micro_context->DeallocateTempTfLiteTensor(ancillary); + micro_context->DeallocateTempTfLiteTensor(output); + input = nullptr; + ancillary = nullptr; + output = nullptr; + } + + if (input != nullptr) { + micro_context->DeallocateTempTfLiteTensor(input); + } + if (ancillary != nullptr) { + micro_context->DeallocateTempTfLiteTensor(ancillary); + } + if (output != nullptr) { + 
micro_context->DeallocateTempTfLiteTensor(output); + } + + return status; +} + +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + const size_t num_inputs = NumInputs(node); + DecodeState** const dsp_arr = + reinterpret_cast(node->user_data); + + for (size_t i = 0; i < num_inputs; i += 2) { + const TfLiteEvalTensor* input = + tflite::micro::GetEvalInput(context, node, i); + TF_LITE_ENSURE(context, input != nullptr); + const TfLiteEvalTensor* ancillary = + tflite::micro::GetEvalInput(context, node, i + 1); + TF_LITE_ENSURE(context, ancillary != nullptr); + const TfLiteEvalTensor* output = + tflite::micro::GetEvalOutput(context, node, i / 2); + TF_LITE_ENSURE(context, output != nullptr); + + TfLiteStatus status = dsp_arr[i / 2]->Decode(*input, *ancillary, *output); + TF_LITE_ENSURE(context, status == kTfLiteOk); + } + + return kTfLiteOk; +} + +} // namespace + +TFLMRegistration Register_DECODE() { + return tflite::micro::RegisterOp(nullptr, Prepare, Eval); +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/decode_state.cc b/tensorflow/lite/micro/kernels/decode_state.cc new file mode 100644 index 00000000000..87bb6a506d3 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode_state.cc @@ -0,0 +1,36 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/decode_state.h" + +#include "tensorflow/lite/micro/kernels/decode_state_lut.h" +#include "tensorflow/lite/micro/micro_context.h" + +namespace tflite { + +DecodeState* DecodeState::CreateDecodeStateLUT( + const TfLiteContext* context, MicroProfilerInterface* profiler) { + MicroContext* const micro_context = GetMicroContext(context); + void* buffer = + micro_context->AllocatePersistentBuffer(sizeof(DecodeStateLUT)); + if (buffer == nullptr) { + return nullptr; + } + DecodeState* dsp = new (buffer) DecodeStateLUT(context, profiler); + + return dsp; +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/decode_state.h b/tensorflow/lite/micro/kernels/decode_state.h new file mode 100644 index 00000000000..80594fd2c26 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode_state.h @@ -0,0 +1,87 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_H_ +#define TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_H_ + +#include + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/c/c_api_types.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/compatibility.h" +#include "tensorflow/lite/micro/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_profiler_interface.h" + +namespace tflite { + +struct DecodeState { + DecodeState() = delete; + + DecodeState(const TfLiteContext* context, MicroProfilerInterface* profiler) + : context_(context), micro_profiler_(profiler) {} + + virtual TfLiteStatus Setup(const TfLiteTensor& input, + const TfLiteTensor& ancillary, + const TfLiteTensor& output) = 0; + virtual TfLiteStatus Decode(const TfLiteEvalTensor& input, + const TfLiteEvalTensor& ancillary, + const TfLiteEvalTensor& output) = 0; + + static DecodeState* CreateDecodeStateLUT(const TfLiteContext* context, + MicroProfilerInterface* profiler); + + static uint8_t Type(const TfLiteTensor& ancillary) { + return GetTensorData(&ancillary)[kDcmDecodeTypeOffset]; + } + + static uint8_t Type(const TfLiteEvalTensor& ancillary) { + return micro::GetTensorData(&ancillary)[kDcmDecodeTypeOffset]; + } + + static uint8_t Version(const TfLiteTensor& ancillary) { + return GetTensorData(&ancillary)[kDcmVersionOffset]; + } + + static uint8_t Version(const TfLiteEvalTensor& ancillary) { + return micro::GetTensorData(&ancillary)[kDcmVersionOffset]; + } + + protected: + virtual ~DecodeState() = default; + + // Decode Common Metadata constants + public: + static constexpr uint8_t kDcmTypeLUT = 0; + static constexpr uint8_t kDcmTypeCustom = 127; + + static constexpr size_t kDcmSizeInBytes = 16; + + private: + static constexpr size_t kDcmDecodeTypeOffset = 0; + static constexpr size_t kDcmVersionOffset = 1; + + // DecodeState vars + protected: + 
const TfLiteContext* context_; + MicroProfilerInterface* micro_profiler_; + + private: + TF_LITE_REMOVE_VIRTUAL_DELETE +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_H_ diff --git a/tensorflow/lite/micro/kernels/decode_state_lut.cc b/tensorflow/lite/micro/kernels/decode_state_lut.cc new file mode 100644 index 00000000000..477c21d80a7 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode_state_lut.cc @@ -0,0 +1,630 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/decode_state_lut.h" + +#include +#include + +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_log.h" +#include "tensorflow/lite/micro/micro_profiler.h" + +namespace tflite { + +TfLiteStatus DecodeStateLUT::Setup(const TfLiteTensor& input, + const TfLiteTensor& ancillary, + const TfLiteTensor& output) { + const uint8_t* const ancillary_data = GetTensorData(&ancillary); + if (ancillary_data[kDcmVersionOffset] != 1) { + MicroPrintf("unsupported version %u", ancillary_data[kDcmVersionOffset]); + return kTfLiteError; + } + + // resolve num_channels_ and use_alternate_axis_ + if (output.quantization.type == kTfLiteAffineQuantization && + output.quantization.params != nullptr) { + const TfLiteAffineQuantization* quantization = + reinterpret_cast(output.quantization.params); + num_channels_ = quantization->scale->size; + if ((quantization->quantized_dimension == output.dims->size - 1) && + num_channels_ > 1) { + use_alternate_axis_ = true; + } else if (quantization->quantized_dimension != 0) { + MicroPrintf("unsupported quantization axis %u", + quantization->quantized_dimension); + return kTfLiteError; + } + } + + compressed_indices_ = GetTensorData(&input); + count_indices_ = NumElements(&output); + elements_per_channel_ = + use_alternate_axis_ ? 
1 : count_indices_ / num_channels_; + value_table_ = &ancillary_data[kDcmSizeInBytes]; + value_table_channel_stride_ = ancillary_data[kDcmValueTableStrideOffset]; + compressed_bit_width_ = + ancillary_data[kDcmParamsOffset] & kDcmParamsBitWidthMask; + + return kTfLiteOk; +} + +TfLiteStatus DecodeStateLUT::Decode(const TfLiteEvalTensor& input, + const TfLiteEvalTensor& ancillary, + const TfLiteEvalTensor& output) { + void* const buffer = const_cast(micro::GetTensorData(&output)); + TFLITE_DCHECK(buffer != nullptr); + + switch (output.type) { + case kTfLiteBool: + DecompressToBuffer(buffer); + break; + case kTfLiteFloat32: + DecompressToBuffer(buffer); + break; + case kTfLiteInt8: + DecompressToBuffer(buffer); + break; + case kTfLiteInt16: + DecompressToBuffer(buffer); + break; + case kTfLiteInt32: + DecompressToBuffer(buffer); + break; + case kTfLiteInt64: + DecompressToBuffer(buffer); + break; + default: + MicroPrintf("unsupported tensor type %s", TfLiteTypeGetName(output.type)); + return kTfLiteError; + } + + return kTfLiteOk; +} + +template +T* DecodeStateLUT::DecompressToBuffer(void* buffer) { + TFLITE_DCHECK(compressed_bit_width_ <= kMaxBitWidth); + TFLITE_DCHECK(compressed_bit_width_ > 0); + + if (std::is_same::value && compressed_bit_width_ == 4 && + !use_alternate_axis_) { + DecompressToBufferWidth4_16(static_cast(buffer)); + } else if (std::is_same::value && compressed_bit_width_ == 3 && + !use_alternate_axis_) { + DecompressToBufferWidth3_32(static_cast(buffer)); + } else if (std::is_same::value && compressed_bit_width_ == 2 && + !use_alternate_axis_) { + DecompressToBufferWidth2_16(static_cast(buffer)); + } else { + DecompressToBufferWidthAny(static_cast(buffer)); + } + + return static_cast(buffer); +} + +template bool* DecodeStateLUT::DecompressToBuffer(void*); +template float* DecodeStateLUT::DecompressToBuffer(void*); +template int8_t* DecodeStateLUT::DecompressToBuffer(void*); +template int16_t* DecodeStateLUT::DecompressToBuffer(void*); +template 
int32_t* DecodeStateLUT::DecompressToBuffer(void*); +template int64_t* DecodeStateLUT::DecompressToBuffer(void*); + +void DecodeStateLUT::DecompressToBufferWidth4_16(int8_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + const size_t stride = value_table_channel_stride_; + const uint8_t* value_table = static_cast(value_table_); + const size_t max_count = elements_per_channel_; + size_t current_offset = 0; + + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t count = max_count; + + // process elements at start of channel up to next uint64_t alignment of + // compressed_indices_ + while (count > 0 && (current_offset & 0x0F)) { + const size_t index = GetNextTableIndexWidth4(current_offset++); + *buffer++ = value_table[index]; + count -= 1; + } + + // process elements in current channel in groups of 16 + if (count >= 16) { + const uint64_t* indices = reinterpret_cast( + &compressed_indices_[current_offset >> 1]); + + while (count >= 16) { + count -= 16; + uint64_t index = *indices++; + uint64_t value, value2; + + value = static_cast(value_table[(index >> 4) & 0x0F]); + value |= static_cast(value_table[index & 0x0F]) << 8; + value |= static_cast(value_table[(index >> 12) & 0x0F]) << 16; + value |= static_cast(value_table[(index >> 8) & 0x0F]) << 24; + value |= static_cast(value_table[(index >> 20) & 0x0F]) << 32; + value |= static_cast(value_table[(index >> 16) & 0x0F]) << 40; + value |= static_cast(value_table[(index >> 28) & 0x0F]) << 48; + value |= static_cast(value_table[(index >> 24) & 0x0F]) << 56; + + *reinterpret_cast(buffer) = value; + + value2 = static_cast(value_table[(index >> 36) & 0x0F]); + value2 |= static_cast(value_table[(index >> 32) & 0x0F]) << 8; + value2 |= static_cast(value_table[(index >> 44) & 0x0F]) + << 16; + value2 |= static_cast(value_table[(index >> 40) & 0x0F]) + << 24; + value2 |= static_cast(value_table[(index >> 52) & 0x0F]) + << 32; + value2 |= static_cast(value_table[(index >> 48) 
& 0x0F]) + << 40; + value2 |= static_cast(value_table[(index >> 60) & 0x0F]) + << 48; + value2 |= static_cast(value_table[(index >> 56) & 0x0F]) + << 56; + + *reinterpret_cast(buffer + 8) = value2; + + buffer += 16; + } + + current_offset = + (reinterpret_cast(indices) - compressed_indices_) + << 1; + } + + // process remaining elements in current channel + while (count > 0) { + count -= 1; + const size_t index = GetNextTableIndexWidth4(current_offset++); + *buffer++ = value_table[index]; + } + + value_table += stride; + } +} + +void DecodeStateLUT::DecompressToBufferWidth2_16(int8_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + const size_t stride = value_table_channel_stride_; + const uint8_t* value_table = static_cast(value_table_); + const size_t max_count = elements_per_channel_; + size_t current_offset = 0; + + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t count = max_count; + + // process elements at start of channel up to next uint32_t alignment of + // compressed_indices_ + while (count > 0 && (current_offset & 0x0F)) { + const size_t index = GetNextTableIndexWidth2(current_offset++); + *buffer++ = value_table[index]; + count -= 1; + } + + // process elements in current channel in groups of 16 + if (count >= 16) { + const uint32_t* indices = reinterpret_cast( + &compressed_indices_[current_offset >> 2]); + + while (count >= 16) { + count -= 16; + uint32_t index = *indices++; + uint64_t value, value2; + + value = static_cast(value_table[(index >> 6) & 0x03]); + value |= static_cast(value_table[(index >> 4) & 0x03]) << 8; + value |= static_cast(value_table[(index >> 2) & 0x03]) << 16; + value |= static_cast(value_table[index & 0x03]) << 24; + value |= static_cast(value_table[(index >> 14) & 0x03]) << 32; + value |= static_cast(value_table[(index >> 12) & 0x03]) << 40; + value |= static_cast(value_table[(index >> 10) & 0x03]) << 48; + value |= static_cast(value_table[(index >> 8) & 0x03]) << 56; + + 
*reinterpret_cast(buffer) = value; + + value2 = static_cast(value_table[(index >> 22) & 0x03]); + value2 |= static_cast(value_table[(index >> 20) & 0x03]) << 8; + value2 |= static_cast(value_table[(index >> 18) & 0x03]) + << 16; + value2 |= static_cast(value_table[(index >> 16) & 0x03]) + << 24; + value2 |= static_cast(value_table[(index >> 30) & 0x03]) + << 32; + value2 |= static_cast(value_table[(index >> 28) & 0x03]) + << 40; + value2 |= static_cast(value_table[(index >> 26) & 0x03]) + << 48; + value2 |= static_cast(value_table[(index >> 24) & 0x03]) + << 56; + + *reinterpret_cast(buffer + 8) = value2; + + buffer += 16; + } + + current_offset = + (reinterpret_cast(indices) - compressed_indices_) + << 2; + } + + // process remaining elements in current channel + while (count > 0) { + count -= 1; + const size_t index = GetNextTableIndexWidth2(current_offset++); + *buffer++ = value_table[index]; + } + + value_table += stride; + } +} + +void DecodeStateLUT::DecompressToBufferWidth3_32(int8_t* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + const size_t stride = value_table_channel_stride_; + const uint8_t* value_table = static_cast(value_table_); + const size_t max_count = elements_per_channel_; + size_t current_offset = 0; + + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t count = max_count; + + // process elements at start of channel up to next uint32_t alignment of + // compressed_indices_ + while (count > 0 && (current_offset & 0x1F)) { + const size_t index = GetNextTableIndexWidth3(current_offset++); + *buffer++ = value_table[index]; + count -= 1; + } + + // process elements in current channel in groups of 32 + if (count >= 32) { + const uint32_t* indices = reinterpret_cast( + &compressed_indices_[(current_offset >> 5) * 12]); + + while (count >= 32) { + count -= 32; + uint32_t index0 = *indices++; + uint32_t index1 = *indices++; + uint32_t index2 = *indices++; + uint64_t value, value2; + + value = 
static_cast(value_table[(index0 >> 5) & 0x07]); + value |= static_cast(value_table[(index0 >> 2) & 0x07]) << 8; + value |= + static_cast( + value_table[((index0 << 1) & 0b110) | ((index0 >> 15) & 0b1)]) + << 16; + value |= static_cast(value_table[(index0 >> 12) & 0x07]) + << 24; + value |= static_cast(value_table[(index0 >> 9) & 0x07]) << 32; + value |= + static_cast( + value_table[((index0 >> 6) & 0b100) | ((index0 >> 22) & 0b11)]) + << 40; + value |= static_cast(value_table[(index0 >> 19) & 0x07]) + << 48; + value |= static_cast(value_table[(index0 >> 16) & 0x07]) + << 56; + + *reinterpret_cast(buffer) = value; + + value2 = static_cast(value_table[(index0 >> 29) & 0x07]); + value2 |= static_cast(value_table[(index0 >> 26) & 0x07]) + << 8; + value2 |= + static_cast( + value_table[((index0 >> 23) & 0b110) | ((index1 >> 7) & 0b1)]) + << 16; + value2 |= static_cast(value_table[(index1 >> 4) & 0x07]) + << 24; + value2 |= static_cast(value_table[(index1 >> 1) & 0x07]) + << 32; + value2 |= + static_cast( + value_table[((index1 << 2) & 0b100) | ((index1 >> 14) & 0b11)]) + << 40; + value2 |= static_cast(value_table[(index1 >> 11) & 0x07]) + << 48; + value2 |= static_cast(value_table[(index1 >> 8) & 0x07]) + << 56; + + *reinterpret_cast(buffer + 8) = value2; + + value = static_cast(value_table[(index1 >> 21) & 0x07]); + value |= static_cast(value_table[(index1 >> 18) & 0x07]) << 8; + value |= + static_cast( + value_table[((index1 >> 15) & 0b110) | ((index1 >> 31) & 0b1)]) + << 16; + value |= static_cast(value_table[(index1 >> 28) & 0x07]) + << 24; + value |= static_cast(value_table[(index1 >> 25) & 0x07]) + << 32; + value |= + static_cast( + value_table[((index1 >> 22) & 0b100) | ((index2 >> 6) & 0b11)]) + << 40; + value |= static_cast(value_table[(index2 >> 3) & 0x07]) << 48; + value |= static_cast(value_table[(index2 >> 0) & 0x07]) << 56; + + *reinterpret_cast(buffer + 16) = value; + + value2 = static_cast(value_table[(index2 >> 13) & 0x07]); + value2 |= 
static_cast(value_table[(index2 >> 10) & 0x07]) + << 8; + value2 |= + static_cast( + value_table[((index2 >> 7) & 0b110) | ((index2 >> 23) & 0b1)]) + << 16; + value2 |= static_cast(value_table[(index2 >> 20) & 0x07]) + << 24; + value2 |= static_cast(value_table[(index2 >> 17) & 0x07]) + << 32; + value2 |= + static_cast( + value_table[((index2 >> 14) & 0b100) | ((index2 >> 30) & 0b11)]) + << 40; + value2 |= static_cast(value_table[(index2 >> 27) & 0x07]) + << 48; + value2 |= static_cast(value_table[(index2 >> 24) & 0x07]) + << 56; + + *reinterpret_cast(buffer + 24) = value2; + + buffer += 32; + current_offset += 32; + } + } + + // process remaining elements in current channel + while (count > 0) { + count -= 1; + const size_t index = GetNextTableIndexWidth3(current_offset++); + *buffer++ = value_table[index]; + } + + value_table += stride; + } +} + +// TODO(ddavis-2015): templating GetNextTableIndexWidth makes this method +// more than 2x faster, but with a large code size increase +template +void DecodeStateLUT::DecompressToBufferWidthAny(T* buffer) { + ScopedMicroProfiler scoped_profiler(__func__, micro_profiler_); + + if (use_alternate_axis_) { + const size_t stride = value_table_channel_stride_; + size_t current_offset = 0; + size_t count = count_indices_; + + while (count > 0) { + const T* value_table = static_cast(value_table_); + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t index; + switch (compressed_bit_width_) { + case 1: + index = GetNextTableIndexWidth1(current_offset); + break; + case 2: + index = GetNextTableIndexWidth2(current_offset); + break; + case 3: + index = GetNextTableIndexWidth3(current_offset); + break; + case 4: + index = GetNextTableIndexWidth4(current_offset); + break; + case 5: + index = GetNextTableIndexWidth5(current_offset); + break; + case 6: + index = GetNextTableIndexWidth6(current_offset); + break; + case 7: + index = GetNextTableIndexWidth7(current_offset); + break; + } + current_offset++; + *buffer++ = 
value_table[index]; + value_table += stride; + } + count -= num_channels_; + } + } else { + const size_t stride = value_table_channel_stride_; + const T* value_table = static_cast(value_table_); + const size_t max_count = elements_per_channel_; + size_t current_offset = 0; + + for (size_t channel = 0; channel < num_channels_; channel++) { + size_t count = max_count; + + while (count-- > 0) { + size_t index; + switch (compressed_bit_width_) { + case 1: + index = GetNextTableIndexWidth1(current_offset); + break; + case 2: + index = GetNextTableIndexWidth2(current_offset); + break; + case 3: + index = GetNextTableIndexWidth3(current_offset); + break; + case 4: + index = GetNextTableIndexWidth4(current_offset); + break; + case 5: + index = GetNextTableIndexWidth5(current_offset); + break; + case 6: + index = GetNextTableIndexWidth6(current_offset); + break; + case 7: + index = GetNextTableIndexWidth7(current_offset); + break; + } + current_offset++; + *buffer++ = value_table[index]; + } + value_table += stride; + } + } +} + +template void DecodeStateLUT::DecompressToBufferWidthAny(bool*); +template void DecodeStateLUT::DecompressToBufferWidthAny(float*); +template void DecodeStateLUT::DecompressToBufferWidthAny(int8_t*); +template void DecodeStateLUT::DecompressToBufferWidthAny(int16_t*); +template void DecodeStateLUT::DecompressToBufferWidthAny(int32_t*); +template void DecodeStateLUT::DecompressToBufferWidthAny(int64_t*); + +inline size_t DecodeStateLUT::GetNextTableIndexWidth7( + const size_t current_offset) { + const size_t current_byte_index = (current_offset >> 3) * 7; + const uint8_t* indices = &compressed_indices_[current_byte_index]; + switch (current_offset & 0b111) { + case 0: + return indices[0] >> 1; + case 1: + return ((indices[0] & 0b1) << 6) | (indices[1] >> 2); + case 2: + return ((indices[1] & 0b11) << 5) | (indices[2] >> 3); + case 3: + return ((indices[2] & 0b111) << 4) | (indices[3] >> 4); + case 4: + return ((indices[3] & 0x0F) << 3) | (indices[4] 
>> 5); + case 5: + return ((indices[4] & 0x1F) << 2) | (indices[5] >> 6); + case 6: + return ((indices[5] & 0x3F) << 1) | (indices[6] >> 7); + case 7: + return indices[6] & 0x7F; + } + // NOTREACHED + return 0; +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth6( + const size_t current_offset) { + const size_t current_byte_index = (current_offset >> 2) * 3; + const uint8_t* indices = &compressed_indices_[current_byte_index]; + switch (current_offset & 0b11) { + case 0: + return indices[0] >> 2; + case 1: + return ((indices[0] & 0b11) << 4) | (indices[1] >> 4); + case 2: + return ((indices[1] & 0x0F) << 2) | (indices[2] >> 6); + case 3: + return indices[2] & 0x3F; + } + // NOTREACHED + return 0; +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth5( + const size_t current_offset) { + const size_t current_byte_index = (current_offset >> 3) * 5; + const uint8_t* indices = &compressed_indices_[current_byte_index]; + switch (current_offset & 0b111) { + case 0: + return indices[0] >> 3; + case 1: + return ((indices[0] & 0b111) << 2) | (indices[1] >> 6); + case 2: + return (indices[1] >> 1) & 0x1F; + case 3: + return ((indices[1] & 0b1) << 4) | (indices[2] >> 4); + case 4: + return ((indices[2] & 0x0F) << 1) | (indices[3] >> 7); + case 5: + return (indices[3] >> 2) & 0x1F; + case 6: + return ((indices[3] & 0b11) << 3) | (indices[4] >> 5); + case 7: + return indices[4] & 0x1F; + } + // NOTREACHED + return 0; +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth4( + const size_t current_offset) { + if (current_offset & 1) { + return compressed_indices_[current_offset >> 1] & 0x0F; + } else { + return compressed_indices_[current_offset >> 1] >> 4; + } +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth3( + const size_t current_offset) { + const size_t current_byte_index = (current_offset >> 3) * 3; + const uint8_t* indices = &compressed_indices_[current_byte_index]; + switch (current_offset & 0b111) { + case 0: + return indices[0] >> 5; + case 1: + 
return (indices[0] >> 2) & 0b111; + case 2: + return ((indices[0] & 0b11) << 1) | (indices[1] >> 7); + case 3: + return (indices[1] >> 4) & 0b111; + case 4: + return (indices[1] >> 1) & 0b111; + case 5: + return ((indices[1] & 0b1) << 2) | (indices[2] >> 6); + case 6: + return (indices[2] >> 3) & 0b111; + case 7: + return indices[2] & 0b111; + } + // NOTREACHED + return 0; +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth2( + const size_t current_offset) { + if (current_offset & 0b10) { + if (current_offset & 1) { + return compressed_indices_[current_offset >> 2] & 0x03; + } else { + return (compressed_indices_[current_offset >> 2] >> 2) & 0x03; + } + } else { + if (current_offset & 1) { + return (compressed_indices_[current_offset >> 2] >> 4) & 0x03; + } else { + return (compressed_indices_[current_offset >> 2] >> 6) & 0x03; + } + } +} + +inline size_t DecodeStateLUT::GetNextTableIndexWidth1( + const size_t current_offset) { + const size_t shift = ~current_offset & 0b111; + return (compressed_indices_[current_offset >> 3] >> shift) & 0b1; +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/decode_state_lut.h b/tensorflow/lite/micro/kernels/decode_state_lut.h new file mode 100644 index 00000000000..dbb64683960 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode_state_lut.h @@ -0,0 +1,92 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_LUT_H_
+#define TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_LUT_H_
+
+#include <cstdint>
+
+#include "tensorflow/lite/micro/compatibility.h"
+#include "tensorflow/lite/micro/kernels/decode_state.h"
+
+namespace tflite {
+
+struct DecodeStateLUT : public DecodeState {
+  DecodeStateLUT() = delete;
+
+  DecodeStateLUT(const TfLiteContext* context, MicroProfilerInterface* profiler)
+      : DecodeState(context, profiler) {}
+
+  virtual TfLiteStatus Setup(const TfLiteTensor& input,
+                             const TfLiteTensor& ancillary,
+                             const TfLiteTensor& output) override;
+  virtual TfLiteStatus Decode(const TfLiteEvalTensor& input,
+                              const TfLiteEvalTensor& ancillary,
+                              const TfLiteEvalTensor& output) override;
+
+ protected:
+  // LUT compression constants
+  static constexpr size_t kMaxBitWidth = 7;
+  static constexpr size_t kMaxValueTableChannelStride = 128;
+
+ private:
+  // LUT Decode Common Metadata constants
+  static constexpr size_t kDcmVersionOffset = 4;
+  static constexpr size_t kDcmParamsOffset = 5;
+  static constexpr uint8_t kDcmParamsBitWidthMask = 0x07;
+  static constexpr size_t kDcmValueTableStrideOffset = 6;
+
+ protected:
+  virtual ~DecodeStateLUT() = default;
+
+  template <typename T>
+  T* DecompressToBuffer(void* buffer);
+
+  // optimized C++ for INT8, use_alt_axis == false
+  void DecompressToBufferWidth4_16(int8_t* buffer);
+  void DecompressToBufferWidth3_32(int8_t* buffer);
+  void DecompressToBufferWidth2_16(int8_t* buffer);
+
+  // generic C++ for any bit width and value table type
+  template <typename T>
+  void DecompressToBufferWidthAny(T* buffer);
+
+  // Optimized C++ table index fetch
+  inline size_t GetNextTableIndexWidth7(const size_t current_offset);
+  inline size_t GetNextTableIndexWidth6(const size_t current_offset);
+  inline size_t GetNextTableIndexWidth5(const size_t current_offset);
+  inline size_t GetNextTableIndexWidth4(const size_t current_offset);
inline size_t GetNextTableIndexWidth3(const size_t current_offset); + inline size_t GetNextTableIndexWidth2(const size_t current_offset); + inline size_t GetNextTableIndexWidth1(const size_t current_offset); + + protected: + const uint8_t* compressed_indices_ = nullptr; + size_t count_indices_ = 0; + size_t num_channels_ = 1; + size_t elements_per_channel_ = 0; // computed from use_alternate_axis_ + const void* value_table_ = nullptr; // Pointer into FlatBuffer values + uint8_t value_table_channel_stride_ = 0; // elements per channel + uint8_t compressed_bit_width_ = 0; // 1 to 7 bits + bool use_alternate_axis_ = false; // shape channel axis: + // false = first, true = last + + private: + TF_LITE_REMOVE_VIRTUAL_DELETE +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_MICRO_KERNELS_DECODE_STATE_LUT_H_ diff --git a/tensorflow/lite/micro/kernels/decode_test.cc b/tensorflow/lite/micro/kernels/decode_test.cc new file mode 100644 index 00000000000..3008736e535 --- /dev/null +++ b/tensorflow/lite/micro/kernels/decode_test.cc @@ -0,0 +1,333 @@ +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "tensorflow/lite/core/c/common.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/kernels/decode_state.h" +#include "tensorflow/lite/micro/kernels/kernel_runner.h" +#include "tensorflow/lite/micro/test_helpers.h" +#include "tensorflow/lite/micro/testing/micro_test.h" + +namespace tflite { +namespace testing { +namespace { + +struct TensorInDatum { + const void* const data; + const TfLiteIntArray& dims; +}; + +struct TensorOutDatum { + void* const data; + const TfLiteIntArray& dims; + const TfLiteType type; + const TfLiteFloatArray& scales; + const TfLiteIntArray& zero_points; + const int quantized_dimension; + + // initialized by CreatePerChannelQuantizedTensor + const TfLiteAffineQuantization affine_quantization; +}; + +template +struct AncillaryLUT { + AncillaryLUT(const uint8_t (&dcm)[tflite::DecodeState::kDcmSizeInBytes], + const T (&values)[N]) { + std::copy(std::begin(dcm), std::end(dcm), std::begin(dcm_)); + std::copy(std::begin(values), std::end(values), std::begin(value_table_)); + } + + private: + uint8_t dcm_[tflite::DecodeState::kDcmSizeInBytes]; + T value_table_[N > 0 ? 
N : 1]; // assure not zero length +}; + +constexpr int kBitWidthLUT = 2; + +constexpr int8_t kAncillaryDataLUT0[] = {1, 2, 3, 4}; +constexpr int16_t kAncillaryDataLUT1[] = {5, 6, 7, 8}; + +constexpr uint8_t kDcmLUT0[tflite::DecodeState::kDcmSizeInBytes] = { + tflite::DecodeState::kDcmTypeLUT, // type: LUT + 1, // DCM version: 1 + 0, // reserved + 0, // reserved + 1, // LUT version: 1 + kBitWidthLUT, // Parameters: bit-width 2 + std::size(kAncillaryDataLUT0), // channel stride +}; + +constexpr uint8_t kDcmLUT1[tflite::DecodeState::kDcmSizeInBytes] = { + tflite::DecodeState::kDcmTypeLUT, // type: LUT + 1, // DCM version: 1 + 0, // reserved + 0, // reserved + 1, // LUT version: 1 + kBitWidthLUT, // Parameters: bit-width 2 + std::size(kAncillaryDataLUT1), // channel stride +}; + +// Align the tensor data the same as a Buffer in the TfLite schema +alignas(16) const + AncillaryLUT kAncillaryLUT0 = { + {kDcmLUT0}, {kAncillaryDataLUT0}}; +alignas(16) const + AncillaryLUT kAncillaryLUT1 = { + {kDcmLUT1}, {kAncillaryDataLUT1}}; +alignas(16) const uint8_t kEncodedLUT[] = {0x1B, 0xE4}; + +// Tensor shapes as TfLiteIntArray +constexpr int kOutputShapeLUT[] = {3, 1, 2, 4}; +constexpr int kEncodedShapeLUT[] = {1, sizeof(kEncodedLUT)}; +constexpr int kAncillaryShapeLUT0[] = {1, sizeof(kAncillaryLUT0)}; +constexpr int kAncillaryShapeLUT1[] = {1, sizeof(kAncillaryLUT1)}; + +constexpr int8_t kExpectLUT0[] = {1, 2, 3, 4, 4, 3, 2, 1}; +constexpr int16_t kExpectLUT1[] = {5, 6, 7, 8, 8, 7, 6, 5}; + +template +TfLiteStatus CheckOutput(const TfLiteTensor& output, + const void* const expected) { + const T* const expected_data = reinterpret_cast(expected); + const T* const output_data = tflite::GetTensorData(&output); + + constexpr float kTolerance = 1e-5; + const size_t kOutputCount = tflite::NumElements(&output); + for (size_t i = 0; i < kOutputCount; i++) { + TF_LITE_MICRO_EXPECT_NEAR(expected_data[i], output_data[i], kTolerance); + TF_LITE_MICRO_CHECK_FAIL(); + } + + return kTfLiteOk; +} 
+ +template +TfLiteStatus ExecuteDecodeTest( + TfLiteTensor* tensors, const TFLMRegistration& registration, + const std::initializer_list& expected) { + int kInputArrayData[kNumInputs + 1] = {kNumInputs}; + for (size_t i = 0; i < kNumInputs; i++) { + kInputArrayData[i + 1] = i; + } + TfLiteIntArray* inputs_array = IntArrayFromInts(kInputArrayData); + + int kOutputArrayData[kNumOutputs + 1] = {kNumOutputs}; + for (size_t i = 0; i < kNumOutputs; i++) { + kOutputArrayData[i + 1] = i + kNumInputs; + } + TfLiteIntArray* outputs_array = IntArrayFromInts(kOutputArrayData); + + micro::KernelRunner runner(registration, tensors, kNumInputs + kNumOutputs, + inputs_array, outputs_array, nullptr); + + if (runner.InitAndPrepare() != kTfLiteOk || runner.Invoke() != kTfLiteOk) { + return kTfLiteError; + } + + const TfLiteTensor* const output_tensors = &tensors[kNumInputs]; + TfLiteStatus status = kTfLiteError; + for (size_t i = 0; i < kNumOutputs; i++) { + switch (output_tensors[i].type) { + case kTfLiteInt8: + status = CheckOutput(output_tensors[i], expected.begin()[i]); + break; + case kTfLiteInt16: + status = CheckOutput(output_tensors[i], expected.begin()[i]); + break; + default: + TF_LITE_MICRO_FAIL("unsupported tensor type in test"); + break; + } + } + + return status; +} + +template +void TestDecode(const std::initializer_list& encodes, + const std::initializer_list& ancillaries, + const std::initializer_list& outputs, + const std::initializer_list& expected, + const TFLMRegistration& registration, + const TfLiteStatus expected_status = kTfLiteOk) { + TfLiteTensor tensors[kNumInputs + kNumOutputs] = {}; + + for (size_t i = 0; i < kNumInputs; i += 2) { + const TensorInDatum& tid_encode = *encodes.begin()[i / 2]; + tensors[i] = CreateTensor(tid_encode.data, + const_cast(&tid_encode.dims), + false, kTfLiteUInt8); + const TensorInDatum& tid_ancillary = *ancillaries.begin()[i / 2]; + tensors[i + 1] = CreateTensor( + tid_ancillary.data, const_cast(&tid_ancillary.dims), + false, 
kTfLiteUInt8); + } + for (size_t i = 0; i < kNumOutputs; i++) { + const TensorOutDatum& tod = *outputs.begin()[i]; + if (tod.scales.size == 0) { + tensors[i + kNumInputs] = CreateTensor( + tod.data, const_cast(&tod.dims), false, tod.type); + } else { + tensors[i + kNumInputs] = CreatePerChannelQuantizedTensor( + tod.data, const_cast(&tod.dims), + const_cast(&tod.scales), + const_cast(&tod.zero_points), + const_cast(&tod.affine_quantization), + tod.quantized_dimension, false, tod.type); + } + } + + TfLiteStatus s = ExecuteDecodeTest( + tensors, registration, expected); + TF_LITE_MICRO_EXPECT_EQ(s, expected_status); +} + +} // namespace +} // namespace testing +} // namespace tflite + +TF_LITE_MICRO_TESTS_BEGIN + +TF_LITE_MICRO_TEST(DecodeSingleTensor) { + // Align the tensor data the same as a Buffer in the TfLite schema + alignas(16) int8_t output_data[std::size(tflite::testing::kExpectLUT0)] = {}; + + const TfLiteIntArray* const encoded_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kEncodedShapeLUT); + static const tflite::testing::TensorInDatum tid_encode = { + tflite::testing::kEncodedLUT, + *encoded_dims, + }; + static constexpr std::initializer_list + encodes = { + &tid_encode, + }; + + const TfLiteIntArray* const ancillary_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT0); + static const tflite::testing::TensorInDatum tid_ancillary = { + &tflite::testing::kAncillaryLUT0, + *ancillary_dims, + }; + static constexpr std::initializer_list + ancillaries = {&tid_ancillary}; + + const TfLiteIntArray* const output_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kOutputShapeLUT); + constexpr float output_scales_data[] = {0}; + const TfLiteFloatArray* const output_scales = + tflite::testing::FloatArrayFromFloats(output_scales_data); + constexpr int output_zero_points_data[] = {0}; + const TfLiteIntArray* const output_zero_points = + tflite::testing::IntArrayFromInts(output_zero_points_data); + static const 
tflite::testing::TensorOutDatum tod = { + output_data, + *output_dims, + kTfLiteInt8, + *output_scales, + *output_zero_points, + 0, + {}, + }; + static constexpr std::initializer_list + outputs = {&tod}; + + const std::initializer_list expected = { + tflite::testing::kExpectLUT0, + }; + + tflite::testing::TestDecode( + encodes, ancillaries, outputs, expected, tflite::Register_DECODE()); +} + +TF_LITE_MICRO_TEST(DecodeTwoTensors) { + // Align the tensor data the same as a Buffer in the TfLite schema + alignas(16) int8_t output_data0[std::size(tflite::testing::kExpectLUT0)] = {}; + alignas(16) + int16_t output_data1[std::size(tflite::testing::kExpectLUT1)] = {}; + + const TfLiteIntArray* const encoded_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kEncodedShapeLUT); + static const tflite::testing::TensorInDatum tid_encode0 = { + tflite::testing::kEncodedLUT, + *encoded_dims, + }; + static const tflite::testing::TensorInDatum tid_encode1 = { + tflite::testing::kEncodedLUT, + *encoded_dims, + }; + static constexpr std::initializer_list + encodes = {&tid_encode0, &tid_encode1}; + + const TfLiteIntArray* const ancillary_dims0 = + tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT0); + static const tflite::testing::TensorInDatum tid_ancillary0 = { + &tflite::testing::kAncillaryLUT0, + *ancillary_dims0, + }; + const TfLiteIntArray* const ancillary_dims1 = + tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT1); + static const tflite::testing::TensorInDatum tid_ancillary1 = { + &tflite::testing::kAncillaryLUT1, + *ancillary_dims1, + }; + static constexpr std::initializer_list + ancillaries = {&tid_ancillary0, &tid_ancillary1}; + + const TfLiteIntArray* const output_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kOutputShapeLUT); + constexpr float output_scales_data[] = {1, 1.0f}; + const TfLiteFloatArray* const output_scales = + tflite::testing::FloatArrayFromFloats(output_scales_data); + constexpr int 
output_zero_points_data[] = {1, 0}; + const TfLiteIntArray* const output_zero_points = + tflite::testing::IntArrayFromInts(output_zero_points_data); + static const tflite::testing::TensorOutDatum tod0 = { + output_data0, + *output_dims, + kTfLiteInt8, + *output_scales, + *output_zero_points, + 0, + {}, + }; + static const tflite::testing::TensorOutDatum tod1 = { + output_data1, + *output_dims, + kTfLiteInt16, + *output_scales, + *output_zero_points, + 0, + {}, + }; + static constexpr std::initializer_list + outputs = {&tod0, &tod1}; + + const std::initializer_list expected = { + tflite::testing::kExpectLUT0, + tflite::testing::kExpectLUT1, + }; + + tflite::testing::TestDecode( + encodes, ancillaries, outputs, expected, tflite::Register_DECODE()); +} + +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/micro_ops.h b/tensorflow/lite/micro/kernels/micro_ops.h index 2e33a6730bd..8b76ca2cc17 100644 --- a/tensorflow/lite/micro/kernels/micro_ops.h +++ b/tensorflow/lite/micro/kernels/micro_ops.h @@ -53,6 +53,7 @@ TFLMRegistration Register_CONCATENATION(); TFLMRegistration Register_CONV_2D(); TFLMRegistration Register_COS(); TFLMRegistration Register_CUMSUM(); +TFLMRegistration Register_DECODE(); TFLMRegistration Register_DEPTH_TO_SPACE(); TFLMRegistration Register_DEPTHWISE_CONV_2D(); TFLMRegistration Register_DEQUANTIZE(); diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index f3f2080f0aa..6a638d93b97 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -1,4 +1,4 @@ -/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -206,6 +206,11 @@ class MicroMutableOpResolver : public MicroOpResolver {
                                      ParseCumsum);
   }
 
+  TfLiteStatus AddDecode() {
+    const TFLMRegistration& registration = tflite::Register_DECODE();
+    return AddCustom("TFLM_DECODE", &registration);
+  }
+
   TfLiteStatus AddDelay() {
     // TODO(b/286250473): change back name to "Delay" and remove namespace
     return AddCustom("SignalDelay", tflite::tflm_signal::Register_DELAY());
diff --git a/tensorflow/lite/micro/tools/benchmarking/op_resolver.h b/tensorflow/lite/micro/tools/benchmarking/op_resolver.h
index 9b98849c472..651429b76ec 100644
--- a/tensorflow/lite/micro/tools/benchmarking/op_resolver.h
+++ b/tensorflow/lite/micro/tools/benchmarking/op_resolver.h
@@ -1,4 +1,4 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -45,6 +45,7 @@ inline TfLiteStatus CreateOpResolver(TflmOpResolver& op_resolver) {
   TF_LITE_ENSURE_STATUS(op_resolver.AddConv2D());
   TF_LITE_ENSURE_STATUS(op_resolver.AddCos());
   TF_LITE_ENSURE_STATUS(op_resolver.AddCumSum());
+  TF_LITE_ENSURE_STATUS(op_resolver.AddDecode());
   TF_LITE_ENSURE_STATUS(op_resolver.AddDelay());
   TF_LITE_ENSURE_STATUS(op_resolver.AddDepthToSpace());
   TF_LITE_ENSURE_STATUS(op_resolver.AddDepthwiseConv2D());
diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile
index 0bf5532badf..a43abf7f7f7 100644
--- a/tensorflow/lite/micro/tools/make/Makefile
+++ b/tensorflow/lite/micro/tools/make/Makefile
@@ -386,6 +386,9 @@ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/concatenation.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/conv.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/conv_common.cc \
 $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/cumsum.cc \
+$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode.cc \
+$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode_state.cc \ +$(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decode_state_lut.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decompress.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/decompress_common.cc \ $(TENSORFLOW_ROOT)tensorflow/lite/micro/kernels/depth_to_space.cc \ From 15ac156290878ced26340859f19b94ecec0f1885 Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Tue, 8 Jul 2025 12:01:19 -0700 Subject: [PATCH 2/3] update copyright --- tensorflow/lite/micro/kernels/micro_ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/kernels/micro_ops.h b/tensorflow/lite/micro/kernels/micro_ops.h index 8b76ca2cc17..b715c735017 100644 --- a/tensorflow/lite/micro/kernels/micro_ops.h +++ b/tensorflow/lite/micro/kernels/micro_ops.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2025 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From 6f96b2983a150c728ea00837e9b9953c9479634f Mon Sep 17 00:00:00 2001 From: ddavis-2015 Date: Tue, 8 Jul 2025 17:05:24 -0700 Subject: [PATCH 3/3] Don't use constructors with global objects (bluepill will not call them). Cleanup unit test. 
--- tensorflow/lite/micro/kernels/decode_test.cc | 121 ++++++++++--------- 1 file changed, 66 insertions(+), 55 deletions(-) diff --git a/tensorflow/lite/micro/kernels/decode_test.cc b/tensorflow/lite/micro/kernels/decode_test.cc index 3008736e535..69ee7f61a5f 100644 --- a/tensorflow/lite/micro/kernels/decode_test.cc +++ b/tensorflow/lite/micro/kernels/decode_test.cc @@ -47,6 +47,7 @@ struct TensorOutDatum { template struct AncillaryLUT { + AncillaryLUT() = delete; AncillaryLUT(const uint8_t (&dcm)[tflite::DecodeState::kDcmSizeInBytes], const T (&values)[N]) { std::copy(std::begin(dcm), std::end(dcm), std::begin(dcm_)); @@ -84,19 +85,11 @@ constexpr uint8_t kDcmLUT1[tflite::DecodeState::kDcmSizeInBytes] = { }; // Align the tensor data the same as a Buffer in the TfLite schema -alignas(16) const - AncillaryLUT kAncillaryLUT0 = { - {kDcmLUT0}, {kAncillaryDataLUT0}}; -alignas(16) const - AncillaryLUT kAncillaryLUT1 = { - {kDcmLUT1}, {kAncillaryDataLUT1}}; alignas(16) const uint8_t kEncodedLUT[] = {0x1B, 0xE4}; // Tensor shapes as TfLiteIntArray constexpr int kOutputShapeLUT[] = {3, 1, 2, 4}; constexpr int kEncodedShapeLUT[] = {1, sizeof(kEncodedLUT)}; -constexpr int kAncillaryShapeLUT0[] = {1, sizeof(kAncillaryLUT0)}; -constexpr int kAncillaryShapeLUT1[] = {1, sizeof(kAncillaryLUT1)}; constexpr int8_t kExpectLUT0[] = {1, 2, 3, 4, 4, 3, 2, 1}; constexpr int16_t kExpectLUT1[] = {5, 6, 7, 8, 8, 7, 6, 5}; @@ -204,39 +197,55 @@ void TestDecode(const std::initializer_list& encodes, TF_LITE_MICRO_TESTS_BEGIN +using tflite::testing::AncillaryLUT; +using tflite::testing::kAncillaryDataLUT0; +using tflite::testing::kAncillaryDataLUT1; +using tflite::testing::kDcmLUT0; +using tflite::testing::kDcmLUT1; +using tflite::testing::kEncodedLUT; +using tflite::testing::kEncodedShapeLUT; +using tflite::testing::kExpectLUT0; +using tflite::testing::kExpectLUT1; +using tflite::testing::kOutputShapeLUT; +using tflite::testing::TensorInDatum; +using tflite::testing::TensorOutDatum; + 
TF_LITE_MICRO_TEST(DecodeSingleTensor) { // Align the tensor data the same as a Buffer in the TfLite schema - alignas(16) int8_t output_data[std::size(tflite::testing::kExpectLUT0)] = {}; + alignas(16) int8_t output_data[std::size(kExpectLUT0)] = {}; + alignas(16) const AncillaryLUT + kAncillaryLUT = {{kDcmLUT0}, {kAncillaryDataLUT0}}; + + constexpr int kAncillaryShapeLUT[] = {1, sizeof(kAncillaryLUT)}; const TfLiteIntArray* const encoded_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kEncodedShapeLUT); - static const tflite::testing::TensorInDatum tid_encode = { - tflite::testing::kEncodedLUT, + tflite::testing::IntArrayFromInts(kEncodedShapeLUT); + static const TensorInDatum tid_encode = { + kEncodedLUT, *encoded_dims, }; - static constexpr std::initializer_list - encodes = { - &tid_encode, - }; + static constexpr std::initializer_list encodes = { + &tid_encode, + }; const TfLiteIntArray* const ancillary_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT0); - static const tflite::testing::TensorInDatum tid_ancillary = { - &tflite::testing::kAncillaryLUT0, + tflite::testing::IntArrayFromInts(kAncillaryShapeLUT); + static const TensorInDatum tid_ancillary = { + &kAncillaryLUT, *ancillary_dims, }; - static constexpr std::initializer_list - ancillaries = {&tid_ancillary}; + static constexpr std::initializer_list ancillaries = { + &tid_ancillary}; const TfLiteIntArray* const output_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kOutputShapeLUT); + tflite::testing::IntArrayFromInts(kOutputShapeLUT); constexpr float output_scales_data[] = {0}; const TfLiteFloatArray* const output_scales = tflite::testing::FloatArrayFromFloats(output_scales_data); constexpr int output_zero_points_data[] = {0}; const TfLiteIntArray* const output_zero_points = tflite::testing::IntArrayFromInts(output_zero_points_data); - static const tflite::testing::TensorOutDatum tod = { + static const TensorOutDatum tod = { output_data, *output_dims, 
kTfLiteInt8, @@ -245,12 +254,10 @@ TF_LITE_MICRO_TEST(DecodeSingleTensor) { 0, {}, }; - static constexpr std::initializer_list - outputs = {&tod}; + static constexpr std::initializer_list outputs = { + &tod}; - const std::initializer_list expected = { - tflite::testing::kExpectLUT0, - }; + const std::initializer_list expected = {kExpectLUT0}; tflite::testing::TestDecode( @@ -259,47 +266,53 @@ TF_LITE_MICRO_TEST(DecodeSingleTensor) { TF_LITE_MICRO_TEST(DecodeTwoTensors) { // Align the tensor data the same as a Buffer in the TfLite schema - alignas(16) int8_t output_data0[std::size(tflite::testing::kExpectLUT0)] = {}; - alignas(16) - int16_t output_data1[std::size(tflite::testing::kExpectLUT1)] = {}; + alignas(16) int8_t output_data0[std::size(kExpectLUT0)] = {}; + alignas(16) int16_t output_data1[std::size(kExpectLUT1)] = {}; + alignas(16) const AncillaryLUT + kAncillaryLUT0 = {{kDcmLUT0}, {kAncillaryDataLUT0}}; + alignas(16) const AncillaryLUT + kAncillaryLUT1 = {{kDcmLUT1}, {kAncillaryDataLUT1}}; + + constexpr int kAncillaryShapeLUT0[] = {1, sizeof(kAncillaryLUT0)}; + constexpr int kAncillaryShapeLUT1[] = {1, sizeof(kAncillaryLUT1)}; const TfLiteIntArray* const encoded_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kEncodedShapeLUT); - static const tflite::testing::TensorInDatum tid_encode0 = { - tflite::testing::kEncodedLUT, + tflite::testing::IntArrayFromInts(kEncodedShapeLUT); + static const TensorInDatum tid_encode0 = { + kEncodedLUT, *encoded_dims, }; - static const tflite::testing::TensorInDatum tid_encode1 = { - tflite::testing::kEncodedLUT, + static const TensorInDatum tid_encode1 = { + kEncodedLUT, *encoded_dims, }; - static constexpr std::initializer_list - encodes = {&tid_encode0, &tid_encode1}; + static constexpr std::initializer_list encodes = { + &tid_encode0, &tid_encode1}; const TfLiteIntArray* const ancillary_dims0 = - tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT0); - static const 
tflite::testing::TensorInDatum tid_ancillary0 = { - &tflite::testing::kAncillaryLUT0, + tflite::testing::IntArrayFromInts(kAncillaryShapeLUT0); + static const TensorInDatum tid_ancillary0 = { + &kAncillaryLUT0, *ancillary_dims0, }; const TfLiteIntArray* const ancillary_dims1 = - tflite::testing::IntArrayFromInts(tflite::testing::kAncillaryShapeLUT1); - static const tflite::testing::TensorInDatum tid_ancillary1 = { - &tflite::testing::kAncillaryLUT1, + tflite::testing::IntArrayFromInts(kAncillaryShapeLUT1); + static const TensorInDatum tid_ancillary1 = { + &kAncillaryLUT1, *ancillary_dims1, }; - static constexpr std::initializer_list - ancillaries = {&tid_ancillary0, &tid_ancillary1}; + static constexpr std::initializer_list ancillaries = { + &tid_ancillary0, &tid_ancillary1}; const TfLiteIntArray* const output_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kOutputShapeLUT); + tflite::testing::IntArrayFromInts(kOutputShapeLUT); constexpr float output_scales_data[] = {1, 1.0f}; const TfLiteFloatArray* const output_scales = tflite::testing::FloatArrayFromFloats(output_scales_data); constexpr int output_zero_points_data[] = {1, 0}; const TfLiteIntArray* const output_zero_points = tflite::testing::IntArrayFromInts(output_zero_points_data); - static const tflite::testing::TensorOutDatum tod0 = { + static const TensorOutDatum tod0 = { output_data0, *output_dims, kTfLiteInt8, @@ -308,7 +321,7 @@ TF_LITE_MICRO_TEST(DecodeTwoTensors) { 0, {}, }; - static const tflite::testing::TensorOutDatum tod1 = { + static const TensorOutDatum tod1 = { output_data1, *output_dims, kTfLiteInt16, @@ -317,13 +330,11 @@ TF_LITE_MICRO_TEST(DecodeTwoTensors) { 0, {}, }; - static constexpr std::initializer_list - outputs = {&tod0, &tod1}; + static constexpr std::initializer_list outputs = { + &tod0, &tod1}; - const std::initializer_list expected = { - tflite::testing::kExpectLUT0, - tflite::testing::kExpectLUT1, - }; + const std::initializer_list expected = {kExpectLUT0, + 
kExpectLUT1}; tflite::testing::TestDecode(