tensorflow
diff --git a/‎tensorflow_datasets/core/dataset_builder.py
Lines changed: 34 additions & 10 deletions b/‎tensorflow_datasets/core/dataset_builder.py
Lines changed: 34 additions & 10 deletions
diff --git a/‎tensorflow_datasets/core/decode/__init__.py
Lines changed: 28 additions & 0 deletions b/‎tensorflow_datasets/core/decode/__init__.py
Lines changed: 28 additions & 0 deletions
diff --git a/‎tensorflow_datasets/core/decode/base.py
Lines changed: 183 additions & 0 deletions b/‎tensorflow_datasets/core/decode/base.py
Lines changed: 183 additions & 0 deletions
@@ -312,8 +312,10 @@ def as_dataset(self,
                  split=None,
                  batch_size=None,
                  shuffle_files=None,
+                 decoders=None,
                  as_supervised=False,
                  in_memory=None):
+    # pylint: disable=line-too-long
     """Constructs a `tf.data.Dataset`.
 
     Callers must pass arguments as keyword arguments.
@@ -330,6 +332,9 @@ def as_dataset(self,
         `tf.data.Dataset`.
       shuffle_files: `bool`, whether to shuffle the input files.
         Defaults to `True` if `split == tfds.Split.TRAIN` and `False` otherwise.
+      decoders: Nested dict of `Decoder` objects which allow customizing the
+        decoding. The structure should match the feature structure, but only
+        customized feature keys need to be present.
       as_supervised: `bool`, if `True`, the returned `tf.data.Dataset`
         will have a 2-tuple structure `(input, label)` according to
         `builder.info.supervised_keys`. If `False`, the default,
@@ -347,6 +352,7 @@ def as_dataset(self,
       If `batch_size` is -1, will return feature dictionaries containing
       the entire dataset in `tf.Tensor`s instead of a `tf.data.Dataset`.
     """
+    # pylint: enable=line-too-long
     logging.info("Constructing tf.data.Dataset for split %s, from %s",
                  split, self._data_dir)
     if not tf.io.gfile.exists(self._data_dir):
@@ -365,14 +371,21 @@ def as_dataset(self,
         self._build_single_dataset,
         shuffle_files=shuffle_files,
         batch_size=batch_size,
+        decoders=decoders,
         as_supervised=as_supervised,
         in_memory=in_memory,
     )
     datasets = utils.map_nested(build_single_dataset, split, map_tuple=True)
     return datasets
 
-  def _build_single_dataset(self, split, shuffle_files, batch_size,
-                            as_supervised, in_memory):
+  def _build_single_dataset(
+      self,
+      split,
+      shuffle_files,
+      batch_size,
+      decoders,
+      as_supervised,
+      in_memory):
     """as_dataset for a single split."""
     if isinstance(split, six.string_types):
       split = splits_lib.Split(split)
@@ -424,13 +437,15 @@ def _build_single_dataset(self, split, shuffle_files, batch_size,
       # If using in_memory, escape all device contexts so we can load the data
       # with a local Session.
       with tf.device(None):
-        dataset = self._as_dataset(split=split, shuffle_files=shuffle_files)
+        dataset = self._as_dataset(
+            split=split, shuffle_files=shuffle_files, decoders=decoders)
         # Use padded_batch so that features with unknown shape are supported.
         dataset = dataset.padded_batch(full_bs, dataset.output_shapes)
         dataset = tf.data.Dataset.from_tensor_slices(
             next(dataset_utils.as_numpy(dataset)))
     else:
-      dataset = self._as_dataset(split=split, shuffle_files=shuffle_files)
+      dataset = self._as_dataset(
+          split=split, shuffle_files=shuffle_files, decoders=decoders)
 
     if batch_size:
       # Use padded_batch so that features with unknown shape are supported.
@@ -567,16 +582,18 @@ def _download_and_prepare(self, dl_manager, download_config=None):
     raise NotImplementedError
 
   @abc.abstractmethod
-  def _as_dataset(self, split, shuffle_files=None):
+  def _as_dataset(self, split, decoders=None, shuffle_files=None):
     """Constructs a `tf.data.Dataset`.
 
     This is the internal implementation to overwrite called when user calls
     `as_dataset`. It should read the pre-processed datasets files and generate
     the `tf.data.Dataset` object.
 
     Args:
-      split (`tfds.Split`): which subset of the data to read.
-      shuffle_files (bool): whether to shuffle the input files. Optional,
+      split: `tfds.Split`, which subset of the data to read.
+      decoders: Nested structure of `Decoder` objects to customize the dataset
+        decoding.
+      shuffle_files: `bool`, whether to shuffle the input files. Optional,
         defaults to `True` if `split == tfds.Split.TRAIN` and `False` otherwise.
 
     Returns:
@@ -759,7 +776,12 @@ def _download_and_prepare(self, dl_manager, **prepare_split_kwargs):
     # Update the info object with the splits.
     self.info.update_splits_if_different(split_dict)
 
-  def _as_dataset(self, split=splits_lib.Split.TRAIN, shuffle_files=False):
+  def _as_dataset(
+      self,
+      split=splits_lib.Split.TRAIN,
+      decoders=None,
+      shuffle_files=False):
+
     if self.version.implements(utils.Experiment.S3):
       dataset = self._tfrecords_reader.read(
           self.name, split, self.info.splits.values(), shuffle_files)
@@ -780,9 +802,11 @@ def _as_dataset(self, split=splits_lib.Split.TRAIN, shuffle_files=False):
           dataset_from_file_fn=self._file_format_adapter.dataset_from_filename,
           shuffle_files=shuffle_files,
       )
+
+    decode_fn = functools.partial(
+        self.info.features.decode_example, decoders=decoders)
     dataset = dataset.map(
-        self.info.features.decode_example,
-        num_parallel_calls=tf.data.experimental.AUTOTUNE)
+        decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
     return dataset
 
   def _slice_split_info_to_instruction_dicts(self, list_sliced_split_info):
 
@@ -0,0 +1,28 @@
+# coding=utf-8
+# Copyright 2019 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Decoder public API.
+
+"""
+
+from tensorflow_datasets.core.decode.base import Decoder
+from tensorflow_datasets.core.decode.base import make_decoder
+from tensorflow_datasets.core.decode.base import SkipDecoding
+
+__all__ = [
+    'Decoder',
+    'make_decoder',
+    'SkipDecoding',
+]
@@ -0,0 +1,183 @@
+# coding=utf-8
+# Copyright 2019 The TensorFlow Datasets Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Base decoders.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+import functools
+
+import six
+from tensorflow_datasets.core import api_utils
+from tensorflow_datasets.core.utils import py_utils
+
+
+@six.add_metaclass(abc.ABCMeta)
+class Decoder(object):
+  """Base decoder object.
+
+  `tfds.decode.Decoder` allows for overriding the default decoding by
+  implementing a subclass, or skipping it entirely with
+  `tfds.decode.SkipDecoding`.
+
+  Instead of subclassing, you can also create a `Decoder` from a function
+  with the `tfds.decode.make_decoder` decorator.
+
+  All decoders must derive from this base class. The implementation can
+  access the `self.feature` property which will correspond to the
+  `FeatureConnector` to which this decoder is applied.
+
+  To implement a decoder, the main method to override is `decode_example`,
+  which takes the serialized feature as input and returns the decoded feature.
+
+  If `decode_example` changes the output dtype, you must also override
+  the `dtype` property. This enables compatibility with
+  `tfds.features.Sequence`.
+  """
+
+  def __init__(self):
+    self.feature = None
+
+  @api_utils.disallow_positional_args
+  def setup(self, feature):
+    """Transformation constructor.
+
+    The initialization of the decoder object is deferred because the object
+    only knows the builder/features on which it is used after it has been
+    constructed, so the initialization is done in this function.
+
+    Args:
+      feature: `tfds.features.FeatureConnector`, the feature to which this
+        transformation is applied.
+
+    """
+    self.feature = feature
+
+  @property
+  def dtype(self):
+    """Returns the `dtype` after decoding."""
+    tensor_info = self.feature.get_tensor_info()
+    return py_utils.map_nested(lambda t: t.dtype, tensor_info)
+
+  @abc.abstractmethod
+  def decode_example(self, serialized_example):
+    """Decode the example feature field (eg: image).
+
+    Args:
+      serialized_example: `tf.Tensor` as decoded, the dtype/shape should be
+        identical to `feature.get_serialized_info()`
+
+    Returns:
+      example: Decoded example.
+    """
+    raise NotImplementedError('Abstract class')
+
+
+class SkipDecoding(Decoder):
+  """Transformation which skips the decoding entirely.
+
+  Example of usage:
+
+  ```python
+  ds = tfds.load(
+      'imagenet2012',
+      split='train',
+      decoders={
+          'image': tfds.decode.SkipDecoding(),
+      }
+  )
+
+  for ex in ds.take(1):
+    assert ex['image'].dtype == tf.string
+  ```
+  """
+
+  @property
+  def dtype(self):
+    tensor_info = self.feature.get_serialized_info()
+    return py_utils.map_nested(lambda t: t.dtype, tensor_info)
+
+  def decode_example(self, serialized_example):
+    """Forward the serialized feature field."""
+    return serialized_example
+
+
+class DecoderFn(Decoder):
+  """Decoder created by the `tfds.decode.make_decoder` decorator."""
+
+  def __init__(self, fn, output_dtype, *args, **kwargs):
+    super(DecoderFn, self).__init__()
+    self._fn = fn
+    self._output_dtype = output_dtype
+    self._args = args
+    self._kwargs = kwargs
+
+  @property
+  def dtype(self):
+    if self._output_dtype is None:
+      return super(DecoderFn, self).dtype
+    else:
+      return self._output_dtype
+
+  def decode_example(self, serialized_example):
+    """Decode the example using the function."""
+    return self._fn(
+        serialized_example, self.feature, *self._args, **self._kwargs)
+
+
+def make_decoder(output_dtype=None):
+  """Decorator to create a decoder.
+
+  The decorated function should have the signature `(example, feature, *args,
+  **kwargs) -> decoded_example`.
+
+   * `example`: Serialized example before decoding
+   * `feature`: `FeatureConnector` associated with the example
+   * `*args, **kwargs`: Optional additional arguments forwarded to the function
+
+  Example:
+
+  ```
+  @tfds.decode.make_decoder(output_dtype=tf.string)
+  def no_op_decoder(example, feature):
+    \"\"\"Decoder simply decoding feature normally.\"\"\"
+    return feature.decode_example(example)
+
+  tfds.load('mnist', split='train', decoders={
+      'image': no_op_decoder(),
+  })
+  ```
+
+  Args:
+    output_dtype: The output dtype after decoding. Required only if the decoded
+      example has a different type than the `FeatureConnector.dtype` and is
+      used to decode features inside sequences (ex: videos)
+
+  Returns:
+    The decoder object
+  """  # pylint: disable=g-docstring-has-escape
+
+  def decorator(fn):
+
+    @functools.wraps(fn)
+    def decorated(*args, **kwargs):
+      return DecoderFn(fn, output_dtype, *args, **kwargs)
+    return decorated
+
+  return decorator