Update parser/serialiser to support nested sequences

Conchylicultor · copybara-github · commit efcba76dedf2 · 2019-11-05T19:01:17.000-08:00
PiperOrigin-RevId: 278754939
diff --git a/tensorflow_datasets/core/example_parser.py b/tensorflow_datasets/core/example_parser.py
@@ -63,10 +63,25 @@ def parse_example(self, serialized_example):
       example: A nested `dict` of `tf.Tensor` values. The structure and tensors
         shape/dtype match the  `example_specs` provided at construction.
     """
+    nested_feature_specs = self._build_feature_specs()
+
+    # Because of RaggedTensor specs, feature_specs can be a 2-level nested dict,
+    # so have to wrap `tf.io.parse_single_example` between
+    # `flatten_nest_dict`/`pack_as_nest_dict`.
+    # {
+    #     'video/image': tf.io.FixedLenSequenceFeature(...),
+    #     'video/object/bbox': {
+    #         'ragged_flat_values': tf.io.FixedLenSequenceFeature(...),
+    #         'ragged_row_lengths_0': tf.io.FixedLenSequenceFeature(...),
+    #     },
+    # }
+    flat_feature_specs = utils.flatten_nest_dict(nested_feature_specs)
     example = tf.io.parse_single_example(
         serialized=serialized_example,
-        features=self._build_feature_specs(),
+        features=flat_feature_specs,
     )
+    example = utils.pack_as_nest_dict(example, nested_feature_specs)
+
     example = {
         k: _deserialize_single_field(example_data, tensor_info)
         for k, (example_data, tensor_info)
@@ -79,9 +94,12 @@ def parse_example(self, serialized_example):
 
 def _deserialize_single_field(example_data, tensor_info):
   """Reconstruct the serialized field."""
+  # Ragged tensor case:
+  if tensor_info.sequence_rank > 1:
+    example_data = _dict_to_ragged(example_data, tensor_info)
 
   # Restore shape if possible. TF Example flattened it.
-  if tensor_info.shape.count(None) < 2:
+  elif tensor_info.shape.count(None) < 2:
     shape = [-1 if i is None else i for i in tensor_info.shape]
     example_data = tf.reshape(example_data, shape)
 
@@ -91,6 +109,17 @@ def _deserialize_single_field(example_data, tensor_info):
   return example_data
 
 
+def _dict_to_ragged(example_data, tensor_info):
+  """Reconstruct the ragged tensor from the flat values and row lengths."""
+  return tf.RaggedTensor.from_nested_row_lengths(
+      flat_values=example_data["ragged_flat_values"],
+      nested_row_lengths=[
+          example_data["ragged_row_lengths_{}".format(k)]
+          for k in range(tensor_info.sequence_rank - 1)
+      ],
+  )
+
+
 def _to_tf_example_spec(tensor_info):
   """Convert a `TensorInfo` into a feature proto object."""
   # Convert the dtype
@@ -126,6 +155,24 @@ def _to_tf_example_spec(tensor_info):
         allow_missing=True,
         default_value=tensor_info.default_value,
     )
+  elif tensor_info.sequence_rank > 1:  # RaggedTensor
+    # Decoding here should match encoding from `_add_ragged_fields` in
+    # `example_serializer.py`
+    tf_specs = {
+        "ragged_row_lengths_{}".format(k): tf.io.FixedLenSequenceFeature(  # pylint: disable=g-complex-comprehension
+            shape=(),
+            dtype=tf.int64,
+            allow_missing=True,
+        )
+        for k in range(tensor_info.sequence_rank - 1)
+    }
+    tf_specs["ragged_flat_values"] = tf.io.FixedLenSequenceFeature(
+        shape=tensor_info.shape[tensor_info.sequence_rank:],
+        dtype=dtype,
+        allow_missing=True,
+        default_value=tensor_info.default_value,
+    )
+    return tf_specs
   else:
     raise NotImplementedError(
         "Tensor with a unknown dimension not at the first position not "
diff --git a/tensorflow_datasets/core/example_serializer.py b/tensorflow_datasets/core/example_serializer.py
@@ -19,11 +19,13 @@
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import numpy as np
 import six
 import tensorflow as tf
 
 from tensorflow_datasets.core import utils
+from tensorflow_datasets.core.features import feature as feature_lib
 
 
 class ExampleSerializer(object):
@@ -63,21 +65,39 @@ def _dict_to_tf_example(example_dict, tensor_info_dict=None):
     tensor_info_dict: `dict` of `tfds.feature.TensorInfo` If given, perform
       additional checks on the example dict (check dtype, shape, number of
       fields...)
+
+  Returns:
+    example_proto: `tf.train.Example`, the encoded example proto.
   """
-  def serialize_single_field(k, example_data, tensor_info):
+  def run_with_reraise(fn, k, example_data, tensor_info):
     with utils.try_reraise(
         "Error while serializing feature {} ({}): ".format(k, tensor_info)):
-      return _item_to_tf_feature(example_data, tensor_info)
+      return fn(example_data, tensor_info)
 
   if tensor_info_dict:
-    example_dict = {
-        k: serialize_single_field(k, example_data, tensor_info)
+    # Add the RaggedTensor fields for the nested sequences
+    # Nested sequences are encoded as a dict of `ragged_flat_values` and
+    # `ragged_row_lengths_<k>` fields, so the example dict is flattened again.
+    # Ex:
+    # Input: {'objects/tokens': [[0, 1, 2], [], [3, 4]]}
+    # Output: {
+    #     'objects/tokens/ragged_flat_values': [0, 1, 2, 3, 4],
+    #     'objects/tokens/ragged_row_lengths_0': [3, 0, 2],
+    # }
+    example_dict = utils.flatten_nest_dict({
+        k: run_with_reraise(_add_ragged_fields, k, example_data, tensor_info)
         for k, (example_data, tensor_info)
         in utils.zip_dict(example_dict, tensor_info_dict)
+    })
+    example_dict = {
+        k: run_with_reraise(_item_to_tf_feature, k, item, tensor_info)
+        for k, (item, tensor_info) in example_dict.items()
     }
   else:
+    # TODO(epot): The following code is only executed in tests and could be
+    # cleaned up, as TensorInfo is always passed to _item_to_tf_feature.
     example_dict = {
-        k: serialize_single_field(k, example_data, None)
+        k: run_with_reraise(_item_to_tf_feature, k, example_data, None)
         for k, example_data in example_dict.items()
     }
 
@@ -88,18 +108,31 @@ def _is_string(item):
   """Check if the object contains string or bytes."""
   if isinstance(item, (six.binary_type, six.string_types)):
     return True
-  elif (isinstance(item, (tuple, list)) and
-        all(isinstance(x, (six.binary_type, six.string_types)) for x in item)):
+  elif (isinstance(item, (tuple, list)) and all(_is_string(x) for x in item)):
     return True
   elif (isinstance(item, np.ndarray) and  # binary or unicode
         (item.dtype.kind in ("U", "S") or item.dtype == object)):
     return True
   return False
 
 
+def _item_to_np_array(item, dtype, shape):
+  """Single item to a np.array."""
+  original_item = item
+  item = np.array(item, dtype=dtype.as_numpy_dtype)
+  utils.assert_shape_match(item.shape, shape)
+  if dtype == tf.string and not _is_string(original_item):
+    raise ValueError(
+        "Unsuported value: {}\nCould not convert to bytes list.".format(item))
+  return item
+
+
 def _item_to_tf_feature(item, tensor_info=None):
   """Single item to a tf.train.Feature."""
   v = item
+  # TODO(epot): tensor_info is only None for file_format_adapter tests.
+  # tensor_info could be made required to cleanup some of the following code,
+  # for instance by re-using _item_to_np_array.
   if not tensor_info and isinstance(v, (list, tuple)) and not v:
     raise ValueError(
         "Received an empty list value, so is unable to infer the "
@@ -146,3 +179,150 @@ def _item_to_tf_feature(item, tensor_info=None):
         "This may indicate that one of the FeatureConnectors received an "
         "unsupported value as input.".format(repr(v), repr(type(v)))
     )
+
+
+RaggedExtraction = collections.namedtuple("RaggedExtraction", [
+    "nested_list",
+    "flat_values",
+    "nested_row_lengths",
+    "curr_ragged_rank",
+    "tensor_info",
+])
+
+
+def _add_ragged_fields(example_data, tensor_info):
+  """Optionally convert the ragged data into flat/row_lengths fields.
+
+  Example:
+
+  ```
+  example_data = [
+      [1, 2, 3],
+      [],
+      [4, 5]
+  ]
+  tensor_info = TensorInfo(shape=(None, None,), sequence_rank=2, ...)
+  out = _add_ragged_fields(example_data, tensor_info)
+  out == {
+      'ragged_flat_values': ([1, 2, 3, 4, 5], TensorInfo(shape=(None,), ...)),
+      'ragged_row_lengths_0': ([3, 0, 2], TensorInfo(shape=(None,), ...))
+  }
+  ```
+
+  If `example_data` isn't ragged, `example_data` and `tensor_info` are
+  forwarded as-is.
+
+  Args:
+    example_data: Data to optionally convert to ragged data.
+    tensor_info: TensorInfo associated with the given data.
+
+  Returns:
+    A tuple(example_data, tensor_info) if the tensor isn't ragged, or a dict of
+      tuple(example_data, tensor_info) if the tensor is ragged.
+  """
+  # Step 1: Extract the ragged tensor info
+  if tensor_info.sequence_rank:
+    # If the input is ragged, extract the nested values.
+    # 1-level sequences are converted as numpy and stacked.
+    # If the sequence is empty, a np.empty(shape=(0, ...)) array is returned.
+    example_data, nested_row_lengths = _extract_ragged_attributes(
+        example_data, tensor_info)
+
+  # Step 2: Format the ragged tensor data as dict
+  # No sequence or 1-level sequence, forward the data.
+  # Could eventually handle multi-level sequences with static lengths
+  # in a smarter way.
+  if tensor_info.sequence_rank < 2:
+    return (example_data, tensor_info)
+  # Multiple level sequence:
+  else:
+    tensor_info_length = feature_lib.TensorInfo(shape=(None,), dtype=tf.int64)
+    ragged_attr_dict = {
+        "ragged_row_lengths_{}".format(i): (length, tensor_info_length)
+        for i, length in enumerate(nested_row_lengths)
+    }
+    tensor_info_flat = feature_lib.TensorInfo(
+        shape=(None,) + tensor_info.shape[tensor_info.sequence_rank:],
+        dtype=tensor_info.dtype,
+    )
+    ragged_attr_dict["ragged_flat_values"] = (example_data, tensor_info_flat)
+    return ragged_attr_dict
+
+
+def _extract_ragged_attributes(nested_list, tensor_info):
+  """Extract the values for the tf.RaggedTensor __init__.
+
+  This extracts the ragged tensor attributes which allow reconstructing the
+  ragged tensor with `tf.RaggedTensor.from_nested_row_lengths`.
+
+  Args:
+    nested_list: A nested list containing the ragged tensor values
+    tensor_info: The specs of the ragged tensor
+
+  Returns:
+    flat_values: The flattened values of the ragged tensor. All values from each
+      list will be converted to np.array and stacked together.
+    nested_row_lengths: The row lengths for each ragged dimension (outermost excluded).
+  """
+  assert tensor_info.sequence_rank, "{} is not ragged.".format(tensor_info)
+
+  flat_values = []
+  nested_row_lengths = [[] for _ in range(tensor_info.sequence_rank)]
+  # Recursively append to `flat_values`, `nested_row_lengths`
+  _fill_ragged_attribute(RaggedExtraction(
+      nested_list=nested_list,
+      flat_values=flat_values,
+      nested_row_lengths=nested_row_lengths,
+      curr_ragged_rank=0,
+      tensor_info=tensor_info,
+  ))
+  if not flat_values:  # The full sequence is empty
+    flat_values = np.empty(
+        shape=(0,) + tensor_info.shape[tensor_info.sequence_rank:],
+        dtype=tensor_info.dtype.as_numpy_dtype,
+    )
+  else:  # Otherwise, merge all flat values together, some might be empty
+    flat_values = np.stack(flat_values)
+  return flat_values, nested_row_lengths[1:]
+
+
+def _fill_ragged_attribute(ext):
+  """Recurse the nested_list from the given RaggedExtraction.
+
+  Args:
+    ext: RaggedExtraction tuple containing the input/outputs
+
+  Returns:
+    None, the function instead mutates the `ext.nested_row_lengths` and
+      `ext.flat_values` lists.
+  """
+  # Register the current sequence length.
+  # Could be 0 in case of empty list or an np.empty(shape=(0, ...)).
+  curr_sequence_length = len(ext.nested_list)
+  ext.nested_row_lengths[ext.curr_ragged_rank].append(curr_sequence_length)
+  # Sanity check if sequence is static, but should have been caught before
+  # by `Sequence.encode_example`
+  expected_sequence_length = ext.tensor_info.shape[ext.curr_ragged_rank]
+  if (expected_sequence_length is not None and
+      expected_sequence_length != curr_sequence_length):
+    raise ValueError(
+        "Received length {} do not match the expected one {} from {}.".format(
+            curr_sequence_length, expected_sequence_length, ext.tensor_info))
+
+  if ext.curr_ragged_rank < ext.tensor_info.sequence_rank - 1:
+    # If there are additional Sequence dimensions, recurse 1 level deeper.
+    for sub_list in ext.nested_list:
+      _fill_ragged_attribute(ext._replace(
+          nested_list=sub_list,
+          curr_ragged_rank=ext.curr_ragged_rank + 1,
+      ))
+  else:
+    # Otherwise, we reached the max level deep, so add the current items
+    for item in ext.nested_list:
+      item = _item_to_np_array(  # Normalize the item
+          item,
+          dtype=ext.tensor_info.dtype,
+          # We only check the non-ragged shape
+          shape=ext.tensor_info.shape[ext.tensor_info.sequence_rank:],
+      )
+      ext.flat_values.append(item)
diff --git a/tensorflow_datasets/core/example_serializer_test.py b/tensorflow_datasets/core/example_serializer_test.py