tensorflow
diff --git a/tensorflow_datasets/core/dataset_builder_test.py b/tensorflow_datasets/core/dataset_builder_test.py
Lines changed: 86 additions & 0 deletions
diff --git a/tensorflow_datasets/core/features/feature.py b/tensorflow_datasets/core/features/feature.py
Lines changed: 24 additions & 0 deletions
diff --git a/tensorflow_datasets/core/features/sequence_feature.py b/tensorflow_datasets/core/features/sequence_feature.py
Lines changed: 9 additions & 11 deletions
@@ -489,5 +489,91 @@ def test_supervised_keys(self):
 
 
 
+
+class NestedSequenceBuilder(dataset_builder.GeneratorBasedBuilder):
+  """Dataset containing nested sequences."""
+
+  VERSION = utils.Version("0.0.1")
+
+  def _info(self):
+    return dataset_info.DatasetInfo(
+        builder=self,
+        features=features.FeaturesDict({
+            "frames": features.Sequence({
+                "coordinates": features.Sequence(
+                    features.Tensor(shape=(2,), dtype=tf.int32)
+                ),
+            }),
+        }),
+    )
+
+  def _split_generators(self, dl_manager):
+    # The generator yields 3 examples, all of which are written to a single
+    # train split.
+    del dl_manager
+    return [
+        splits_lib.SplitGenerator(
+            name=splits_lib.Split.TRAIN,
+            gen_kwargs={},
+        ),
+    ]
+
+  def _generate_examples(self):
+    ex0 = [
+        [[0, 1], [2, 3], [4, 5]],
+        [],
+        [[6, 7]]
+    ]
+    ex1 = []
+    ex2 = [
+        [[10, 11]],
+        [[12, 13], [14, 15]],
+    ]
+    for i, ex in enumerate([ex0, ex1, ex2]):
+      yield i, {"frames": {"coordinates": ex}}
+
+
+class NestedSequenceBuilderTest(testing.TestCase):
+  """Test of the NestedSequenceBuilder."""
+
+  @testing.run_in_graph_and_eager_modes()
+  def test_nested_sequence(self):
+    with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
+      ds_train, ds_info = registered.load(
+          name="nested_sequence_builder",
+          data_dir=tmp_dir,
+          split="train",
+          with_info=True,
+          shuffle_files=False)
+      ex0, ex1, ex2 = [
+          ex["frames"]["coordinates"]
+          for ex in dataset_utils.as_numpy(ds_train)
+      ]
+      self.assertAllEqual(ex0, tf.ragged.constant([
+          [[0, 1], [2, 3], [4, 5]],
+          [],
+          [[6, 7]],
+      ], inner_shape=(2,)))
+      self.assertAllEqual(ex1, tf.ragged.constant([], ragged_rank=1))
+      self.assertAllEqual(ex2, tf.ragged.constant([
+          [[10, 11]],
+          [[12, 13], [14, 15]],
+      ], inner_shape=(2,)))
+
+      self.assertEqual(
+          ds_info.features.dtype,
+          {"frames": {"coordinates": tf.int32}},
+      )
+      self.assertEqual(
+          ds_info.features.shape,
+          {"frames": {"coordinates": (None, None, 2)}},
+      )
+      nested_tensor_info = ds_info.features.get_tensor_info()
+      self.assertEqual(
+          nested_tensor_info["frames"]["coordinates"].sequence_rank,
+          2,
+      )
+
+
 if __name__ == "__main__":
   testing.test_main()
@@ -331,6 +331,25 @@ def decode_batch_example(self, tfexample_data):
         name='sequence_decode',
     )
 
+  def decode_ragged_example(self, tfexample_data):
+    """Decode nested features from a tf.RaggedTensor.
+
+    This function is used to decode features wrapped in nested
+    `tfds.features.Sequence()`.
+    By default, this function applies `decode_batch_example` to the flat
+    values of the ragged tensor. For optimization, features can override
+    this method to apply a custom batch decoding.
+
+    Args:
+      tfexample_data: `tf.RaggedTensor` inputs containing the nested encoded
+        examples.
+
+    Returns:
+      tensor_data: The decoded `tf.RaggedTensor` or dictionary of tensors,
+        output of the tf.data.Dataset object
+    """
+    return tf.ragged.map_flat_values(self.decode_batch_example, tfexample_data)
+
   def _flatten(self, x):
     """Flatten the input dict into a list of values.
 
@@ -509,6 +528,11 @@ def decode_batch_example(self, example_data):
     # Overwrite the `tf.map_fn`, decoding is a no-op
     return self.decode_example(example_data)
 
+  def decode_ragged_example(self, example_data):
+    """See base class for details."""
+    # Override the `tf.ragged.map_flat_values`, decoding is a no-op
+    return self.decode_example(example_data)
+
   def encode_example(self, example_data):
     """See base class for details."""
     np_dtype = np.dtype(self.dtype.as_numpy_dtype)
 
@@ -144,15 +144,20 @@ def _build_empty_np(serialized_info):
         for sequence_elem in sequence_elements
     ]
 
-    # Then merge the elements back together
+    # Then convert back list[nested dict] => nested dict[list]
     def _stack_nested(sequence_elements):
+      """Recursively stack the tensors from the same dict field."""
       if isinstance(sequence_elements[0], dict):
         return {
             # Stack along the first dimension
             k: _stack_nested(sub_sequence)
             for k, sub_sequence in utils.zip_dict(*sequence_elements)
         }
-      return stack_arrays(*sequence_elements)
+      # Note: As each field can be a nested ragged list, we don't check here
+      # that all elements from the list have matching dtype/shape.
+      # Checking is done in `example_serializer` when elements
+      # are converted to numpy arrays and stacked together.
+      return list(sequence_elements)
 
     return _stack_nested(sequence_elements)
 
@@ -203,14 +208,7 @@ def __repr__(self):
     return '{}({})'.format(type(self).__name__, inner_feature_repr)
 
 
-def stack_arrays(*elems):
-  if isinstance(elems[0], np.ndarray):
-    return np.stack(elems)
-  else:
-    return [e for e in elems]
-
-
-def np_to_list(elem):
+def _np_to_list(elem):
   """Returns list from list, tuple or ndarray."""
   if isinstance(elem, list):
     return elem
@@ -227,7 +225,7 @@ def np_to_list(elem):
 def _transpose_dict_list(dict_list):
   """Transpose a nested dict[list] into a list[nested dict]."""
   # 1. Unstack numpy arrays into list
-  dict_list = utils.map_nested(np_to_list, dict_list, dict_only=True)
+  dict_list = utils.map_nested(_np_to_list, dict_list, dict_only=True)
 
   # 2. Extract the sequence length (and ensure the length is constant for all
   # elements)