
Commit ec93f31

adarob authored and copybara-github committed
Download prepared dataset from GCS when available.
PiperOrigin-RevId: 278721408
1 parent: 19ed6ed

File tree

6 files changed: +86 lines, -54 lines

tensorflow_datasets/audio/nsynth.py

Lines changed: 0 additions & 5 deletions

```diff
@@ -31,11 +31,6 @@
 with a unique pitch, timbre, and envelope. Each note is annotated with three
 additional pieces of information based on a combination of human evaluation
 and heuristic algorithms: Source, Family, and Qualities.
-
-To access the dataset without needing to run the expensive preparation, you
-can load it with `try_gcs` enabled (e.g., `tfds.load('nsynth', try_gcs=True)`).
-You can also download the prepared dataset from GCS
-(`gs://tfs-data/datasets/nsynth`) and copy it to your local data directory.
 """

 _FULL_DESCRIPTION = """\
```

tensorflow_datasets/core/dataset_builder.py

Lines changed: 33 additions & 43 deletions

```diff
@@ -52,10 +52,10 @@
 REUSE_DATASET_IF_EXISTS = download.GenerateMode.REUSE_DATASET_IF_EXISTS

 GCS_HOSTED_MSG = """\
-Dataset {name} is hosted on GCS. You can skip download_and_prepare by setting
-data_dir=gs://tfds-data/datasets. If you find
-that read performance is slow, copy the data locally with gsutil:
-gsutil -m cp -R {gcs_path} {local_data_dir_no_version}
+Dataset %s is hosted on GCS. It will automatically be downloaded to your
+local data directory. If you'd instead prefer to read directly from our public
+GCS bucket (recommended if you're running on GCP), you can instead set
+data_dir=gs://tfds-data/datasets.
 """


@@ -253,10 +253,6 @@ def download_and_prepare(self, download_dir=None, download_config=None):
       logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
       return

-    # Data may exist on GCS
-    if not data_exists:
-      self._maybe_log_gcs_data_dir()
-
     dl_manager = self._make_download_manager(
         download_dir=download_dir,
         download_config=download_config)
@@ -282,29 +278,35 @@ def download_and_prepare(self, download_dir=None, download_config=None):
     # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
     # it to every sub function.
     with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
-      self._download_and_prepare(
-          dl_manager=dl_manager,
-          download_config=download_config)
-
-      # NOTE: If modifying the lines below to put additional information in
-      # DatasetInfo, you'll likely also want to update
-      # DatasetInfo.read_from_directory to possibly restore these attributes
-      # when reading from package data.
-
-      # Update the DatasetInfo metadata by computing statistics from the data.
-      if (download_config.compute_stats == download.ComputeStatsMode.SKIP or
-          download_config.compute_stats == download.ComputeStatsMode.AUTO and
-          bool(self.info.splits.total_num_examples)
-         ):
-        logging.info(
-            "Skipping computing stats for mode %s.",
-            download_config.compute_stats)
-      else:  # Mode is forced or stats do not exists yet
-        logging.info("Computing statistics.")
-        self.info.compute_dynamic_properties()
-      self.info.size_in_bytes = dl_manager.downloaded_size
-      # Write DatasetInfo to disk, even if we haven't computed the statistics.
-      self.info.write_to_directory(self._data_dir)
+      if (download_config.try_download_gcs and
+          gcs_utils.is_dataset_on_gcs(self.info.full_name)):
+        logging.warning(GCS_HOSTED_MSG, self.name)
+        gcs_utils.download_gcs_dataset(self.info.full_name, self._data_dir)
+        self.info.read_from_directory(self._data_dir)
+      else:
+        self._download_and_prepare(
+            dl_manager=dl_manager,
+            download_config=download_config)
+
+        # NOTE: If modifying the lines below to put additional information in
+        # DatasetInfo, you'll likely also want to update
+        # DatasetInfo.read_from_directory to possibly restore these attributes
+        # when reading from package data.
+
+        # Update DatasetInfo metadata by computing statistics from the data.
+        if (download_config.compute_stats == download.ComputeStatsMode.SKIP or
+            download_config.compute_stats == download.ComputeStatsMode.AUTO
+            and bool(self.info.splits.total_num_examples)
+           ):
+          logging.info(
+              "Skipping computing stats for mode %s.",
+              download_config.compute_stats)
+        else:  # Mode is forced or stats do not exists yet
+          logging.info("Computing statistics.")
+          self.info.compute_dynamic_properties()
+        self.info.size_in_bytes = dl_manager.downloaded_size
+        # Write DatasetInfo to disk, even if we haven't computed statistics.
+        self.info.write_to_directory(self._data_dir)
     self._log_download_done()

   @api_utils.disallow_positional_args
@@ -504,18 +506,6 @@ def _build_single_dataset(
       return tf.data.experimental.get_single_element(dataset)
     return dataset

-  def _maybe_log_gcs_data_dir(self):
-    """If data is on GCS, set _data_dir to GCS path."""
-    if not gcs_utils.is_dataset_on_gcs(self.info.full_name):
-      return
-
-    gcs_path = os.path.join(constants.GCS_DATA_DIR, self.info.full_name)
-    msg = GCS_HOSTED_MSG.format(
-        name=self.name,
-        gcs_path=gcs_path,
-        local_data_dir_no_version=os.path.split(self._data_dir)[0])
-    logging.info(msg)
-
   def _relative_data_dir(self, with_version=True):
     """Relative path of this dataset in data_dir."""
     builder_data_dir = self.name
```
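Two details worth noting: `GCS_HOSTED_MSG` switches from `str.format` placeholders to a single `%s` so it can be handed directly to `logging.warning` for lazy interpolation, and the whole GCS branch is gated on the new `DownloadConfig.try_download_gcs` flag. Below is a sketch of how a caller opts out and forces local generation; `tfds.builder` and `tfds.download.DownloadConfig` are the public entry points, but the snippet itself is illustrative, not from the commit:

```python
import tensorflow_datasets as tfds

builder = tfds.builder("mnist")

# Default behavior after this commit: prepared files are downloaded from
# GCS when the dataset is hosted there.
builder.download_and_prepare()

# Opting out: disable the GCS fast path and prepare from scratch.
config = tfds.download.DownloadConfig(try_download_gcs=False)
builder.download_and_prepare(download_config=config)
```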

tensorflow_datasets/core/dataset_builder_test.py

Lines changed: 28 additions & 0 deletions

```diff
@@ -145,6 +145,34 @@ def test_determinism(self):
         2],
     )

+  @testing.run_in_graph_and_eager_modes()
+  def test_load_from_gcs(self):
+    from tensorflow_datasets.image import mnist  # pylint:disable=g-import-not-at-top
+    with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
+      with absltest.mock.patch.object(
+          mnist.MNIST, "_download_and_prepare",
+          side_effect=NotImplementedError):
+        # Make sure the dataset cannot be generated.
+        with self.assertRaises(NotImplementedError):
+          registered.load(
+              name="mnist",
+              data_dir=tmp_dir)
+      # Enable GCS access so that dataset will be loaded from GCS.
+      with self.gcs_access():
+        _, info = registered.load(
+            name="mnist",
+            data_dir=tmp_dir,
+            with_info=True)
+      self.assertSetEqual(
+          set(["dataset_info.json", "image.image.json",
+               "mnist-test.counts.txt-00000-of-00001",
+               "mnist-test.tfrecord-00000-of-00001",
+               "mnist-train.counts.txt-00000-of-00001"] +
+              ["mnist-train.tfrecord-0000%d-of-00010" % i for i in range(10)]),
+          set(tf.io.gfile.listdir(os.path.join(tmp_dir, "mnist/1.0.0"))))
+
+    self.assertEqual(set(info.splits.keys()), set(["train", "test"]))
+
   @testing.run_in_graph_and_eager_modes()
   def test_multi_split(self):
     with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
```
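The test's key move is patching `_download_and_prepare` to raise, so a successful load can only have come from GCS. A standalone sketch of that `mock.patch.object` pattern, with purely illustrative names:

```python
from unittest import mock

class Builder:
  """Illustrative stand-in for a TFDS dataset builder."""

  def _generate(self):
    return "locally generated"

  def load(self):
    return self._generate()

b = Builder()
# side_effect=NotImplementedError makes any call to the patched method
# raise, proving which code path the caller took.
with mock.patch.object(Builder, "_generate", side_effect=NotImplementedError):
  try:
    b.load()
  except NotImplementedError:
    print("local generation was attempted and blocked")
```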

tensorflow_datasets/core/download/download_manager.py

Lines changed: 6 additions & 1 deletion

```diff
@@ -56,7 +56,8 @@ def __init__(self,
                max_examples_per_split=None,
                register_checksums=False,
                beam_runner=None,
-               beam_options=None):
+               beam_options=None,
+               try_download_gcs=True):
     """Constructs a `DownloadConfig`.

     Args:
@@ -78,6 +79,9 @@ def __init__(self,
         based on Beam for the generation.
       beam_options: `PipelineOptions` to pass to `beam.Pipeline`, only used for
         datasets based on Beam for the generation.
+      try_download_gcs: `bool`, defaults to True. If True, prepared dataset
+        will be downloaded from GCS, when available. If False, dataset will be
+        downloaded and prepared from scratch.
     """
     self.extract_dir = extract_dir
     self.manual_dir = manual_dir
@@ -89,6 +93,7 @@ def __init__(self,
     self.register_checksums = register_checksums
     self.beam_runner = beam_runner
     self.beam_options = beam_options
+    self.try_download_gcs = try_download_gcs


 class DownloadManager(object):
```
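For users going through `tfds.load` rather than a builder, the flag can be threaded through `download_and_prepare_kwargs` (an existing `tfds.load` parameter); a hedged sketch, not from this commit:

```python
import tensorflow_datasets as tfds

# Force from-scratch preparation even if the dataset is hosted on GCS.
config = tfds.download.DownloadConfig(try_download_gcs=False)
ds = tfds.load(
    "mnist",
    download_and_prepare_kwargs={"download_config": config})
```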

tensorflow_datasets/core/utils/gcs_utils.py

Lines changed: 19 additions & 0 deletions

```diff
@@ -18,6 +18,7 @@
 import posixpath
 from xml.etree import ElementTree

+import concurrent.futures
 import requests
 import tensorflow as tf

@@ -71,3 +72,21 @@ def is_dataset_on_gcs(dataset_name):
   dir_name = posixpath.join(GCS_DATASETS_DIR, dataset_name)
   return len(gcs_files(prefix_filter=dir_name)) > 2

+
+def download_gcs_dataset(
+    dataset_name, local_dataset_dir, max_simultaneous_downloads=50):
+  """Downloads prepared GCS dataset to local dataset directory."""
+  gcs_paths_to_dl = gcs_files(posixpath.join(GCS_DATASETS_DIR, dataset_name))
+  with utils.async_tqdm(
+      total=len(gcs_paths_to_dl), desc="Dl Completed...", unit=" file") as pbar:
+    def _copy_from_gcs(gcs_path):
+      local_path = posixpath.join(
+          local_dataset_dir, posixpath.basename(gcs_path))
+      download_gcs_file(gcs_path, local_path)
+      pbar.update(1)
+    with concurrent.futures.ThreadPoolExecutor(
+        max_workers=max_simultaneous_downloads) as executor:
+      futures = [
+          executor.submit(_copy_from_gcs, path) for path in gcs_paths_to_dl]
+      for future in concurrent.futures.as_completed(futures):
+        future.result()
```
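`download_gcs_dataset` fans the per-file copies out over a thread pool; iterating `as_completed` and calling `future.result()` is what re-raises a worker's exception on the main thread. The same pattern reduced to standard-library pieces, with a stub standing in for the real `download_gcs_file`:

```python
import concurrent.futures

def copy_one(path):
  # Stub for download_gcs_file(gcs_path, local_path).
  return path

paths = ["file-%05d" % i for i in range(10)]
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
  futures = [executor.submit(copy_one, p) for p in paths]
  for future in concurrent.futures.as_completed(futures):
    future.result()  # re-raises any exception from the worker thread
```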

tensorflow_datasets/text/wikipedia.py

Lines changed: 0 additions & 5 deletions

```diff
@@ -45,11 +45,6 @@
 (https://dumps.wikimedia.org/) with one split per language. Each example
 contains the content of one full Wikipedia article with cleaning to strip
 markdown and unwanted sections (references, etc.).
-
-To access the dataset without needing to run the expensive preparation, you can
-load it with `try_gcs` enabled (e.g., `tfds.load('wikipedia', try_gcs=True)`).
-You can also download the prepared dataset from GCS
-(`gs://tfs-data/datasets/wikipedia`) and copy it to your local data directory.
 """

 _LICENSE = (
```
