52
52
REUSE_DATASET_IF_EXISTS = download .GenerateMode .REUSE_DATASET_IF_EXISTS
53
53
54
54
GCS_HOSTED_MSG = """\
55
- Dataset {name} is hosted on GCS. You can skip download_and_prepare by setting
56
- data_dir=gs://tfds-data/datasets. If you find
57
- that read performance is slow, copy the data locally with gsutil:
58
- gsutil -m cp -R {gcs_path} {local_data_dir_no_version}
55
+ Dataset %s is hosted on GCS. It will automatically be downloaded to your
56
+ local data directory. If you'd instead prefer to read directly from our public
57
+ GCS bucket (recommended if you're running on GCP), you can instead set
58
+ data_dir=gs://tfds-data/datasets.
59
59
"""
60
60
61
61
@@ -253,10 +253,6 @@ def download_and_prepare(self, download_dir=None, download_config=None):
253
253
logging .info ("Reusing dataset %s (%s)" , self .name , self ._data_dir )
254
254
return
255
255
256
- # Data may exist on GCS
257
- if not data_exists :
258
- self ._maybe_log_gcs_data_dir ()
259
-
260
256
dl_manager = self ._make_download_manager (
261
257
download_dir = download_dir ,
262
258
download_config = download_config )
@@ -282,29 +278,35 @@ def download_and_prepare(self, download_dir=None, download_config=None):
282
278
# Temporarily assign _data_dir to tmp_data_dir to avoid having to forward
283
279
# it to every sub function.
284
280
with utils .temporary_assignment (self , "_data_dir" , tmp_data_dir ):
285
- self ._download_and_prepare (
286
- dl_manager = dl_manager ,
287
- download_config = download_config )
288
-
289
- # NOTE: If modifying the lines below to put additional information in
290
- # DatasetInfo, you'll likely also want to update
291
- # DatasetInfo.read_from_directory to possibly restore these attributes
292
- # when reading from package data.
293
-
294
- # Update the DatasetInfo metadata by computing statistics from the data.
295
- if (download_config .compute_stats == download .ComputeStatsMode .SKIP or
296
- download_config .compute_stats == download .ComputeStatsMode .AUTO and
297
- bool (self .info .splits .total_num_examples )
298
- ):
299
- logging .info (
300
- "Skipping computing stats for mode %s." ,
301
- download_config .compute_stats )
302
- else : # Mode is forced or stats do not exist yet
303
- logging .info ("Computing statistics." )
304
- self .info .compute_dynamic_properties ()
305
- self .info .size_in_bytes = dl_manager .downloaded_size
306
- # Write DatasetInfo to disk, even if we haven't computed the statistics.
307
- self .info .write_to_directory (self ._data_dir )
281
+ if (download_config .try_download_gcs and
282
+ gcs_utils .is_dataset_on_gcs (self .info .full_name )):
283
+ logging .warning (GCS_HOSTED_MSG , self .name )
284
+ gcs_utils .download_gcs_dataset (self .info .full_name , self ._data_dir )
285
+ self .info .read_from_directory (self ._data_dir )
286
+ else :
287
+ self ._download_and_prepare (
288
+ dl_manager = dl_manager ,
289
+ download_config = download_config )
290
+
291
+ # NOTE: If modifying the lines below to put additional information in
292
+ # DatasetInfo, you'll likely also want to update
293
+ # DatasetInfo.read_from_directory to possibly restore these attributes
294
+ # when reading from package data.
295
+
296
+ # Update DatasetInfo metadata by computing statistics from the data.
297
+ if (download_config .compute_stats == download .ComputeStatsMode .SKIP or
298
+ download_config .compute_stats == download .ComputeStatsMode .AUTO
299
+ and bool (self .info .splits .total_num_examples )
300
+ ):
301
+ logging .info (
302
+ "Skipping computing stats for mode %s." ,
303
+ download_config .compute_stats )
304
+ else : # Mode is forced or stats do not exist yet
305
+ logging .info ("Computing statistics." )
306
+ self .info .compute_dynamic_properties ()
307
+ self .info .size_in_bytes = dl_manager .downloaded_size
308
+ # Write DatasetInfo to disk, even if we haven't computed statistics.
309
+ self .info .write_to_directory (self ._data_dir )
308
310
self ._log_download_done ()
309
311
310
312
@api_utils .disallow_positional_args
@@ -504,18 +506,6 @@ def _build_single_dataset(
504
506
return tf .data .experimental .get_single_element (dataset )
505
507
return dataset
506
508
507
- def _maybe_log_gcs_data_dir (self ):
508
- """If data is on GCS, set _data_dir to GCS path."""
509
- if not gcs_utils .is_dataset_on_gcs (self .info .full_name ):
510
- return
511
-
512
- gcs_path = os .path .join (constants .GCS_DATA_DIR , self .info .full_name )
513
- msg = GCS_HOSTED_MSG .format (
514
- name = self .name ,
515
- gcs_path = gcs_path ,
516
- local_data_dir_no_version = os .path .split (self ._data_dir )[0 ])
517
- logging .info (msg )
518
-
519
509
def _relative_data_dir (self , with_version = True ):
520
510
"""Relative path of this dataset in data_dir."""
521
511
builder_data_dir = self .name
0 commit comments