Skip to content

Commit c2dfce8

Browse files
pierrot0 authored and copybara-github committed
TFDS versioning: make it possible to mark a dataset version as readable but not preparable.
PiperOrigin-RevId: 281269279
1 parent 42f5bf8 commit c2dfce8

File tree

5 files changed

+73
-17
lines changed

5 files changed

+73
-17
lines changed

docs/datasets_versioning.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@ class Imagenet2012(tfds.core.GeneratorBasedBuilder):
5555
VERSION = tfds.core.Version('2.0.1', 'Encoding fix. No changes from user POV')
5656
SUPPORTED_VERSIONS = [
5757
tfds.core.Version('3.0.0', 'S3: tensorflow.org/datasets/splits'),
58-
tfds.core.Version('2.0.1', 'Encoding fix. No changes from user POV'),
5958
tfds.core.Version('1.0.0'),
59+
tfds.core.Version('0.0.9', tfds_version_to_prepare="v1.0.0"),
6060
]
6161
```
6262

@@ -70,6 +70,12 @@ Supported versions with a higher number than the canonical version number are
7070
considered experimental and might be broken. They will however eventually be
7171
made canonical.
7272

73+
A version can specify `tfds_version_to_prepare`. This means this dataset version
74+
can only be used with the current version of TFDS code if it has already been
75+
prepared by an older version of the code; it cannot be prepared with the current code. The
76+
value of `tfds_version_to_prepare` specifies the last known version of TFDS
77+
which can be used to download and prepare the dataset at this version.
78+
7379
## Loading a specific version
7480

7581
When loading a dataset or a `DatasetBuilder`, you can specify the version to

tensorflow_datasets/core/dataset_builder.py

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -187,25 +187,36 @@ def __init__(self, data_dir=None, config=None, version=None):
187187
logging.info("Load pre-computed datasetinfo (eg: splits) from bucket.")
188188
self.info.initialize_from_bucket()
189189

190-
def _pick_version(self, requested_version):
191-
"""Returns utils.Version instance, or raise AssertionError."""
190+
@utils.memoized_property
191+
def canonical_version(self):
192192
if self._builder_config:
193-
canonical_version = self._builder_config.version
194-
supported_versions = self._builder_config.supported_versions
193+
return self._builder_config.version
195194
else:
196-
canonical_version = self.VERSION
197-
supported_versions = self.SUPPORTED_VERSIONS
198-
versions = [
195+
return self.VERSION
196+
197+
@utils.memoized_property
198+
def supported_versions(self):
199+
if self._builder_config:
200+
return self._builder_config.supported_versions
201+
else:
202+
return self.SUPPORTED_VERSIONS
203+
204+
@utils.memoized_property
205+
def versions(self):
206+
"""Versions (canonical + availables), in preference order."""
207+
return [
199208
utils.Version(v) if isinstance(v, six.string_types) else v
200-
for v in [canonical_version] + supported_versions
209+
for v in [self.canonical_version] + self.supported_versions
201210
]
211+
212+
def _pick_version(self, requested_version):
213+
"""Returns utils.Version instance, or raise AssertionError."""
202214
if requested_version == "experimental_latest":
203-
return max(versions)
204-
for version in versions:
215+
return max(self.versions)
216+
for version in self.versions:
205217
if requested_version is None or version.match(requested_version):
206218
return version
207-
available_versions = [str(v)
208-
for v in [canonical_version] + supported_versions]
219+
available_versions = [str(v) for v in self.versions]
209220
msg = "Dataset {} cannot be loaded at version {}, only: {}.".format(
210221
self.name, requested_version, ", ".join(available_versions))
211222
raise AssertionError(msg)
@@ -253,9 +264,17 @@ def download_and_prepare(self, download_dir=None, download_config=None):
253264
logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
254265
return
255266

256-
dl_manager = self._make_download_manager(
257-
download_dir=download_dir,
258-
download_config=download_config)
267+
if self.version.tfds_version_to_prepare:
268+
available_to_prepare = ", ".join(str(v) for v in self.versions
269+
if not v.tfds_version_to_prepare)
270+
raise AssertionError(
271+
"The version of the dataset you are trying to use ({}:{}) can only "
272+
"be generated using TFDS code synced @ {} or earlier. Either sync to "
273+
"that version of TFDS to first prepare the data or use another "
274+
"version of the dataset (available for `download_and_prepare`: "
275+
"{}).".format(
276+
self.name, self.version, self.version.tfds_version_to_prepare,
277+
available_to_prepare))
259278

260279
# Currently it's not possible to overwrite the data because it would
261280
# conflict with versioning: If the last version has already been generated,
@@ -266,13 +285,18 @@ def download_and_prepare(self, download_dir=None, download_config=None):
266285
"the same version {} already exists. If the dataset has changed, "
267286
"please update the version number.".format(self.name, self._data_dir,
268287
self.version))
288+
269289
logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
270290
if not utils.has_sufficient_disk_space(
271291
self.info.size_in_bytes, directory=self._data_dir_root):
272292
raise IOError("Not enough disk space. Needed: %s" %
273293
units.size_str(self.info.size_in_bytes))
274294
self._log_download_bytes()
275295

296+
dl_manager = self._make_download_manager(
297+
download_dir=download_dir,
298+
download_config=download_config)
299+
276300
# Create a tmp dir and rename to self._data_dir on successful exit.
277301
with file_format_adapter.incomplete_dir(self._data_dir) as tmp_data_dir:
278302
# Temporarily assign _data_dir to tmp_data_dir to avoid having to forward

tensorflow_datasets/core/dataset_builder_test.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,18 @@ def test_previous_supported_version(self):
286286
older_builder = DummyDatasetSharedGenerator(version="0.0.*")
287287
self.assertEqual(str(older_builder.info.version), "0.0.9")
288288

289+
def test_non_preparable_version(self, *unused_mocks):
290+
expected = (
291+
"The version of the dataset you are trying to use ("
292+
"dummy_dataset_shared_generator:0.0.7) can only be generated using TFDS"
293+
" code synced @ v1.0.0 or earlier. Either sync to that version of TFDS "
294+
"to first prepare the data or use another version of the dataset "
295+
"(available for `download_and_prepare`: 1.0.0, 2.0.0, 0.0.9, 0.0.8).")
296+
builder = DummyDatasetSharedGenerator(version="0.0.7")
297+
self.assertIsNotNone(builder)
298+
with self.assertRaisesWithPredicateMatch(AssertionError, expected):
299+
builder.download_and_prepare()
300+
289301
def test_invalid_split_dataset(self):
290302
with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
291303
with self.assertRaisesWithPredicateMatch(ValueError, "ALL is a special"):

tensorflow_datasets/core/utils/version.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,22 @@ class Version(object):
6363
Experiment.S3: True,
6464
}
6565

66-
def __init__(self, version_str, description=None, experiments=None):
66+
def __init__(self, version_str, description=None, experiments=None,
67+
tfds_version_to_prepare=None):
68+
"""Version init.
69+
70+
Args:
71+
version_str: string. Eg: "1.2.3".
72+
description: string, a description of what is new in this version.
73+
experiments: dict of experiments. See Experiment.
74+
tfds_version_to_prepare: string, defaults to None. If set, indicates that
75+
current version of TFDS cannot be used to `download_and_prepare` the
76+
dataset, but that TFDS at version {tfds_version_to_prepare} should be
77+
used instead.
78+
"""
6779
self.description = description
6880
self._experiments = self._DEFAULT_EXPERIMENTS.copy()
81+
self.tfds_version_to_prepare = tfds_version_to_prepare
6982
if experiments:
7083
self._experiments.update(experiments)
7184
self.major, self.minor, self.patch = _str_to_version(version_str)

tensorflow_datasets/testing/test_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,7 @@ class DummyDatasetSharedGenerator(dataset_builder.GeneratorBasedBuilder):
361361
"2.0.0",
362362
"0.0.9",
363363
"0.0.8",
364+
utils.Version("0.0.7", tfds_version_to_prepare="v1.0.0"),
364365
]
365366

366367
def _info(self):

0 commit comments

Comments
 (0)