
Commit ec93f31

adarob authored and copybara-github committed
Download prepared dataset from GCS when available.
PiperOrigin-RevId: 278721408
1 parent: 19ed6ed

File tree

6 files changed: +86 lines, -54 lines

tensorflow_datasets/audio/nsynth.py

Lines changed: 0 additions & 5 deletions

```diff
@@ -31,11 +31,6 @@
 with a unique pitch, timbre, and envelope. Each note is annotated with three
 additional pieces of information based on a combination of human evaluation
 and heuristic algorithms: Source, Family, and Qualities.
-
-To access the dataset without needing to run the expensive preparation, you
-can load it with `try_gcs` enabled (e.g., `tfds.load('nsynth', try_gcs=True)`).
-You can also download the prepared dataset from GCS
-(`gs://tfs-data/datasets/nsynth`) and copy it to your local data directory.
 """

 _FULL_DESCRIPTION = """\
```

tensorflow_datasets/core/dataset_builder.py

Lines changed: 33 additions & 43 deletions

```diff
@@ -52,10 +52,10 @@
 REUSE_DATASET_IF_EXISTS = download.GenerateMode.REUSE_DATASET_IF_EXISTS

 GCS_HOSTED_MSG = """\
-Dataset {name} is hosted on GCS. You can skip download_and_prepare by setting
-data_dir=gs://tfds-data/datasets. If you find
-that read performance is slow, copy the data locally with gsutil:
-gsutil -m cp -R {gcs_path} {local_data_dir_no_version}
+Dataset %s is hosted on GCS. It will automatically be downloaded to your
+local data directory. If you'd instead prefer to read directly from our public
+GCS bucket (recommended if you're running on GCP), you can instead set
+data_dir=gs://tfds-data/datasets.
 """


@@ -253,10 +253,6 @@ def download_and_prepare(self, download_dir=None, download_config=None):
       logging.info("Reusing dataset %s (%s)", self.name, self._data_dir)
       return

-    # Data may exist on GCS
-    if not data_exists:
-      self._maybe_log_gcs_data_dir()
-
     dl_manager = self._make_download_manager(
         download_dir=download_dir,
         download_config=download_config)
@@ -282,29 +278,35 @@ def download_and_prepare(self, download_dir=None, download_config=None):
     # Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
     # it to every sub function.
     with utils.temporary_assignment(self, "_data_dir", tmp_data_dir):
-      self._download_and_prepare(
-          dl_manager=dl_manager,
-          download_config=download_config)
-
-      # NOTE: If modifying the lines below to put additional information in
-      # DatasetInfo, you'll likely also want to update
-      # DatasetInfo.read_from_directory to possibly restore these attributes
-      # when reading from package data.
-
-      # Update the DatasetInfo metadata by computing statistics from the data.
-      if (download_config.compute_stats == download.ComputeStatsMode.SKIP or
-          download_config.compute_stats == download.ComputeStatsMode.AUTO and
-          bool(self.info.splits.total_num_examples)
-         ):
-        logging.info(
-            "Skipping computing stats for mode %s.",
-            download_config.compute_stats)
-      else:  # Mode is forced or stats do not exists yet
-        logging.info("Computing statistics.")
-        self.info.compute_dynamic_properties()
-      self.info.size_in_bytes = dl_manager.downloaded_size
-      # Write DatasetInfo to disk, even if we haven't computed the statistics.
-      self.info.write_to_directory(self._data_dir)
+      if (download_config.try_download_gcs and
+          gcs_utils.is_dataset_on_gcs(self.info.full_name)):
+        logging.warning(GCS_HOSTED_MSG, self.name)
+        gcs_utils.download_gcs_dataset(self.info.full_name, self._data_dir)
+        self.info.read_from_directory(self._data_dir)
+      else:
+        self._download_and_prepare(
+            dl_manager=dl_manager,
+            download_config=download_config)
+
+        # NOTE: If modifying the lines below to put additional information in
+        # DatasetInfo, you'll likely also want to update
+        # DatasetInfo.read_from_directory to possibly restore these attributes
+        # when reading from package data.
+
+        # Update DatasetInfo metadata by computing statistics from the data.
+        if (download_config.compute_stats == download.ComputeStatsMode.SKIP or
+            download_config.compute_stats == download.ComputeStatsMode.AUTO
+            and bool(self.info.splits.total_num_examples)
+           ):
+          logging.info(
+              "Skipping computing stats for mode %s.",
+              download_config.compute_stats)
+        else:  # Mode is forced or stats do not exists yet
+          logging.info("Computing statistics.")
+          self.info.compute_dynamic_properties()
+        self.info.size_in_bytes = dl_manager.downloaded_size
+        # Write DatasetInfo to disk, even if we haven't computed statistics.
+        self.info.write_to_directory(self._data_dir)
     self._log_download_done()

   @api_utils.disallow_positional_args
@@ -504,18 +506,6 @@ def _build_single_dataset(
       return tf.data.experimental.get_single_element(dataset)
     return dataset

-  def _maybe_log_gcs_data_dir(self):
-    """If data is on GCS, set _data_dir to GCS path."""
-    if not gcs_utils.is_dataset_on_gcs(self.info.full_name):
-      return
-
-    gcs_path = os.path.join(constants.GCS_DATA_DIR, self.info.full_name)
-    msg = GCS_HOSTED_MSG.format(
-        name=self.name,
-        gcs_path=gcs_path,
-        local_data_dir_no_version=os.path.split(self._data_dir)[0])
-    logging.info(msg)
-
   def _relative_data_dir(self, with_version=True):
     """Relative path of this dataset in data_dir."""
     builder_data_dir = self.name
```
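Two details worth noting: `GCS_HOSTED_MSG` switches from `str.format` placeholders to a single `%s` so it can be handed directly to `logging.warning` for lazy interpolation, and the whole GCS branch is gated on the new `DownloadConfig.try_download_gcs` flag. Below is a sketch of how a caller opts out and forces local generation; `tfds.builder` and `tfds.download.DownloadConfig` are the public entry points, but the snippet itself is illustrative, not from the commit:

```python
import tensorflow_datasets as tfds

builder = tfds.builder("mnist")

# Default behavior after this commit: prepared files are downloaded from
# GCS when the dataset is hosted there.
builder.download_and_prepare()

# Opting out: disable the GCS fast path and prepare from scratch.
config = tfds.download.DownloadConfig(try_download_gcs=False)
builder.download_and_prepare(download_config=config)
```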

tensorflow_datasets/core/dataset_builder_test.py

Lines changed: 28 additions & 0 deletions

```diff
@@ -145,6 +145,34 @@ def test_determinism(self):
         2],
     )

+  @testing.run_in_graph_and_eager_modes()
+  def test_load_from_gcs(self):
+    from tensorflow_datasets.image import mnist  # pylint:disable=g-import-not-at-top
+    with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
+      with absltest.mock.patch.object(
+          mnist.MNIST, "_download_and_prepare",
+          side_effect=NotImplementedError):
+        # Make sure the dataset cannot be generated.
+        with self.assertRaises(NotImplementedError):
+          registered.load(
+              name="mnist",
+              data_dir=tmp_dir)
+      # Enable GCS access so that dataset will be loaded from GCS.
+      with self.gcs_access():
+        _, info = registered.load(
+            name="mnist",
+            data_dir=tmp_dir,
+            with_info=True)
+      self.assertSetEqual(
+          set(["dataset_info.json", "image.image.json",
+               "mnist-test.counts.txt-00000-of-00001",
+               "mnist-test.tfrecord-00000-of-00001",
+               "mnist-train.counts.txt-00000-of-00001"] +
+              ["mnist-train.tfrecord-0000%d-of-00010" % i for i in range(10)]),
+          set(tf.io.gfile.listdir(os.path.join(tmp_dir, "mnist/1.0.0"))))
+
+    self.assertEqual(set(info.splits.keys()), set(["train", "test"]))
+
   @testing.run_in_graph_and_eager_modes()
   def test_multi_split(self):
     with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
```
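The test's key move is patching `_download_and_prepare` to raise, so a successful load can only have come from GCS. A standalone sketch of that `mock.patch.object` pattern, with purely illustrative names:

```python
from unittest import mock

class Builder:
  """Illustrative stand-in for a TFDS dataset builder."""

  def _generate(self):
    return "locally generated"

  def load(self):
    return self._generate()

b = Builder()
# side_effect=NotImplementedError makes any call to the patched method
# raise, proving which code path the caller took.
with mock.patch.object(Builder, "_generate", side_effect=NotImplementedError):
  try:
    b.load()
  except NotImplementedError:
    print("local generation was attempted and blocked")
```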

tensorflow_datasets/core/download/download_manager.py

Lines changed: 6 additions & 1 deletion

```diff
@@ -56,7 +56,8 @@ def __init__(self,
                max_examples_per_split=None,
                register_checksums=False,
                beam_runner=None,
-               beam_options=None):
+               beam_options=None,
+               try_download_gcs=True):
     """Constructs a `DownloadConfig`.

     Args:
@@ -78,6 +79,9 @@ def __init__(self,
         based on Beam for the generation.
       beam_options: `PipelineOptions` to pass to `beam.Pipeline`, only used for
         datasets based on Beam for the generation.
+      try_download_gcs: `bool`, defaults to True. If True, prepared dataset
+        will be downloaded from GCS, when available. If False, dataset will be
+        downloaded and prepared from scratch.
     """
     self.extract_dir = extract_dir
     self.manual_dir = manual_dir
@@ -89,6 +93,7 @@ def __init__(self,
     self.register_checksums = register_checksums
     self.beam_runner = beam_runner
     self.beam_options = beam_options
+    self.try_download_gcs = try_download_gcs


 class DownloadManager(object):
```
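For users going through `tfds.load` rather than a builder, the flag can be threaded through `download_and_prepare_kwargs` (an existing `tfds.load` parameter); a hedged sketch, not from this commit:

```python
import tensorflow_datasets as tfds

# Force from-scratch preparation even if the dataset is hosted on GCS.
config = tfds.download.DownloadConfig(try_download_gcs=False)
ds = tfds.load(
    "mnist",
    download_and_prepare_kwargs={"download_config": config})
```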

tensorflow_datasets/core/utils/gcs_utils.py

Lines changed: 19 additions & 0 deletions

```diff
@@ -18,6 +18,7 @@
 import posixpath
 from xml.etree import ElementTree

+import concurrent.futures
 import requests
 import tensorflow as tf

@@ -71,3 +72,21 @@ def is_dataset_on_gcs(dataset_name):
   dir_name = posixpath.join(GCS_DATASETS_DIR, dataset_name)
   return len(gcs_files(prefix_filter=dir_name)) > 2

+
+def download_gcs_dataset(
+    dataset_name, local_dataset_dir, max_simultaneous_downloads=50):
+  """Downloads prepared GCS dataset to local dataset directory."""
+  gcs_paths_to_dl = gcs_files(posixpath.join(GCS_DATASETS_DIR, dataset_name))
+  with utils.async_tqdm(
+      total=len(gcs_paths_to_dl), desc="Dl Completed...", unit=" file") as pbar:
+    def _copy_from_gcs(gcs_path):
+      local_path = posixpath.join(
+          local_dataset_dir, posixpath.basename(gcs_path))
+      download_gcs_file(gcs_path, local_path)
+      pbar.update(1)
+    with concurrent.futures.ThreadPoolExecutor(
+        max_workers=max_simultaneous_downloads) as executor:
+      futures = [
+          executor.submit(_copy_from_gcs, path) for path in gcs_paths_to_dl]
+      for future in concurrent.futures.as_completed(futures):
+        future.result()
```
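`download_gcs_dataset` fans the per-file copies out over a thread pool; iterating `as_completed` and calling `future.result()` is what re-raises a worker's exception on the main thread. The same pattern reduced to standard-library pieces, with a stub standing in for the real `download_gcs_file`:

```python
import concurrent.futures

def copy_one(path):
  # Stub for download_gcs_file(gcs_path, local_path).
  return path

paths = ["file-%05d" % i for i in range(10)]
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
  futures = [executor.submit(copy_one, p) for p in paths]
  for future in concurrent.futures.as_completed(futures):
    future.result()  # re-raises any exception from the worker thread
```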

tensorflow_datasets/text/wikipedia.py

Lines changed: 0 additions & 5 deletions

```diff
@@ -45,11 +45,6 @@
 (https://dumps.wikimedia.org/) with one split per language. Each example
 contains the content of one full Wikipedia article with cleaning to strip
 markdown and unwanted sections (references, etc.).
-
-To access the dataset without needing to run the expensive preparation, you can
-load it with `try_gcs` enabled (e.g., `tfds.load('wikipedia', try_gcs=True)`).
-You can also download the prepared dataset from GCS
-(`gs://tfs-data/datasets/wikipedia`) and copy it to your local data directory.
 """

 _LICENSE = (
```
