Commit 37fc133

cyfra authored and copybara-github committed
Adding a field with instructions on how to prepare files in manual_dir.
PiperOrigin-RevId: 282898527
1 parent 3f3e107 commit 37fc133

15 files changed (+114 additions, -6 deletions)
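
For orientation, here is a minimal sketch of how a builder outside this commit would use the new field. This is not part of the change; the MyManualDataset class, the archive.zip file name, and the feature spec are invented for the illustration.

import os

import tensorflow_datasets as tfds


class MyManualDataset(tfds.core.GeneratorBasedBuilder):
  """Hypothetical dataset that needs manually downloaded files."""

  VERSION = tfds.core.Version("1.0.0")

  # Rendered in the documentation and in the manual_dir error message.
  MANUAL_DOWNLOAD_INSTRUCTIONS = """\
  Download archive.zip from the dataset homepage and place it in manual_dir.
  """

  def _info(self):
    return tfds.core.DatasetInfo(
        builder=self,
        features=tfds.features.FeaturesDict({
            "image": tfds.features.Image(),
        }),
    )

  def _split_generators(self, dl_manager):
    # dl_manager.manual_dir is only usable because MANUAL_DOWNLOAD_INSTRUCTIONS
    # is set; otherwise the property now raises an AssertionError.
    archive_path = os.path.join(dl_manager.manual_dir, "archive.zip")
    return [
        tfds.core.SplitGenerator(
            name=tfds.Split.TRAIN,
            gen_kwargs={"archive_path": archive_path},
        ),
    ]

  def _generate_examples(self, archive_path):
    # Yield (key, example) pairs read from the manually placed archive.
    raise NotImplementedError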

tensorflow_datasets/core/dataset_builder.py

Lines changed: 19 additions & 3 deletions
@@ -151,6 +151,15 @@ class DatasetBuilder(object):
   # be available through tfds.{load, builder} or documented in overview.md.
   IN_DEVELOPMENT = False
 
+  # Must be set for datasets that use 'manual_dir' functionality - the ones
+  # that require users to do additional steps to download the data
+  # (this is usually due to some external regulations / rules).
+  #
+  # This field should contain a string with user instructions, including
+  # the list of files that should be present. It will be
+  # displayed in the dataset documentation.
+  MANUAL_DOWNLOAD_INSTRUCTIONS = None
+
 
   @api_utils.disallow_positional_args
   def __init__(self, data_dir=None, config=None, version=None):

@@ -649,19 +658,26 @@ def _as_dataset(self, split, decoders=None, shuffle_files=False):
     raise NotImplementedError
 
   def _make_download_manager(self, download_dir, download_config):
+    """Creates a new download manager object."""
     download_dir = download_dir or os.path.join(self._data_dir_root,
                                                 "downloads")
     extract_dir = (download_config.extract_dir or
                    os.path.join(download_dir, "extracted"))
-    manual_dir = (download_config.manual_dir or
-                  os.path.join(download_dir, "manual"))
-    manual_dir = os.path.join(manual_dir, self.name)
+
+    # Use manual_dir only if MANUAL_DOWNLOAD_INSTRUCTIONS are set.
+    if self.MANUAL_DOWNLOAD_INSTRUCTIONS:
+      manual_dir = (
+          download_config.manual_dir or os.path.join(download_dir, "manual"))
+      manual_dir = os.path.join(manual_dir, self.name)
+    else:
+      manual_dir = None
 
     return download.DownloadManager(
         dataset_name=self.name,
         download_dir=download_dir,
         extract_dir=extract_dir,
         manual_dir=manual_dir,
+        manual_dir_instructions=self.MANUAL_DOWNLOAD_INSTRUCTIONS,
         force_download=(download_config.download_mode == FORCE_REDOWNLOAD),
         force_extraction=(download_config.download_mode == FORCE_REDOWNLOAD),
         register_checksums=download_config.register_checksums,

tensorflow_datasets/core/download/download_manager.py

Lines changed: 10 additions & 1 deletion
@@ -149,6 +149,7 @@ def __init__(self,
                download_dir,
                extract_dir=None,
                manual_dir=None,
+               manual_dir_instructions=None,
                dataset_name=None,
                force_download=False,
                force_extraction=False,

@@ -159,6 +160,8 @@ def __init__(self,
       download_dir: `str`, path to directory where downloads are stored.
       extract_dir: `str`, path to directory where artifacts are extracted.
       manual_dir: `str`, path to manually downloaded/extracted data directory.
+      manual_dir_instructions: `str`, human readable instructions on how to
+        prepare contents of the manual_dir for this dataset.
       dataset_name: `str`, name of dataset this instance will be used for. If
         provided, downloads will contain which datasets they were used for.
       force_download: `bool`, default to False. If True, always [re]download.

@@ -171,6 +174,7 @@ def __init__(self,
     self._extract_dir = os.path.expanduser(
         extract_dir or os.path.join(download_dir, 'extracted'))
     self._manual_dir = manual_dir and os.path.expanduser(manual_dir)
+    self._manual_dir_instructions = manual_dir_instructions
     tf.io.gfile.makedirs(self._download_dir)
     tf.io.gfile.makedirs(self._extract_dir)
     self._force_download = force_download
@@ -372,10 +376,15 @@ def download_and_extract(self, url_or_urls):
   @property
   def manual_dir(self):
     """Returns the directory containing the manually extracted data."""
+    if not self._manual_dir:
+      raise AssertionError(
+          'Manual directory was not set. '
+          'Did you set MANUAL_DOWNLOAD_INSTRUCTIONS in your dataset?')
     if not tf.io.gfile.exists(self._manual_dir):
       raise AssertionError(
           'Manual directory {} does not exist. Create it and download/extract '
-          'dataset artifacts in there.'.format(self._manual_dir))
+          'dataset artifacts in there. Additional instructions: {}'.format(
+              self._manual_dir, self._manual_dir_instructions))
     return self._manual_dir
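
On the caller side this commit changes nothing about how the directory is supplied. A hedged sketch of the existing flow follows, with a placeholder path and imagenet2012 used purely as an example of a builder that sets MANUAL_DOWNLOAD_INSTRUCTIONS.

import tensorflow_datasets as tfds

# Point the download manager at the manually prepared files. The path is a
# placeholder; any builder that defines MANUAL_DOWNLOAD_INSTRUCTIONS works.
# As in _make_download_manager above, the dataset name is appended, so the
# files are expected under /path/to/manual_dir/imagenet2012.
download_config = tfds.download.DownloadConfig(manual_dir="/path/to/manual_dir")
builder = tfds.builder("imagenet2012")
builder.download_and_prepare(download_config=download_config)

# If that directory does not exist, the AssertionError raised by
# DownloadManager.manual_dir now also includes the builder's
# MANUAL_DOWNLOAD_INSTRUCTIONS text.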
tensorflow_datasets/image/abstract_reasoning.py

Lines changed: 6 additions & 0 deletions
@@ -149,6 +149,12 @@ def __init__(self, split_type="neutral", **kwargs):
 
 class AbstractReasoning(tfds.core.BeamBasedBuilder):
   """Abstract reasoning dataset."""
+  MANUAL_DOWNLOAD_INSTRUCTIONS = """\
+  Data can be downloaded from
+  https://console.cloud.google.com/storage/browser/ravens-matrices
+  Please put all the tar.gz files in manual_dir.
+  """
+
   BUILDER_CONFIGS = [
       AbstractReasoningConfig(
           name="neutral",

tensorflow_datasets/image/cbis_ddsm.py

Lines changed: 7 additions & 0 deletions
@@ -134,6 +134,13 @@ def __init__(self, image_size=None, patch_size=None, **kwargs):
 class CuratedBreastImagingDDSM(tfds.core.GeneratorBasedBuilder):
   """Curated Breast Imaging Subset of DDSM."""
 
+  MANUAL_DOWNLOAD_INSTRUCTIONS = """\
+  You can download the images from
+  https://wiki.cancerimagingarchive.net/display/Public/CBIS-DDSM
+  Please look at the source file (cbis_ddsm.py) to see the instructions
+  on how to convert them into png (using dcmj2pnm).
+  """
+
   BUILDER_CONFIGS = [
       CuratedBreastImagingDDSMConfig(
           name='patches',

tensorflow_datasets/image/celebahq.py

Lines changed: 7 additions & 0 deletions
@@ -77,6 +77,13 @@ def __init__(self, resolution, **kwargs):
 class CelebAHq(tfds.core.GeneratorBasedBuilder):
   """Celeba_HQ Dataset."""
 
+  MANUAL_DOWNLOAD_INSTRUCTIONS = """\
+  manual_dir should contain multiple tar files with images (data2x2.tar,
+  data4x4.tar .. data1024x1024.tar).
+  Detailed instructions are here:
+  https://github.com/tkarras/progressive_growing_of_gans#preparing-datasets-for-training
+  """
+
   VERSION = tfds.core.Version("0.1.0")
 
   BUILDER_CONFIGS = [

tensorflow_datasets/image/chexpert.py

Lines changed: 8 additions & 0 deletions
@@ -80,6 +80,14 @@ class Chexpert(tfds.core.GeneratorBasedBuilder):
           "3.0.0", "New split API (https://tensorflow.org/datasets/splits)"),
   ]
 
+  MANUAL_DOWNLOAD_INSTRUCTIONS = """\
+  You must register and agree to the user agreement on the dataset page:
+  https://stanfordmlgroup.github.io/competitions/chexpert/
+  Afterwards, you have to put the CheXpert-v1.0-small directory in the
+  manual_dir. It should contain subdirectories: train/ and valid/ with images
+  and also train.csv and valid.csv files.
+  """
+
   def _info(self):
     return tfds.core.DatasetInfo(
         builder=self,

tensorflow_datasets/image/diabetic_retinopathy_detection.py

Lines changed: 8 additions & 0 deletions
@@ -78,6 +78,14 @@ def target_pixels(self):
 class DiabeticRetinopathyDetection(tfds.core.GeneratorBasedBuilder):
   """Diabetic retinopathy detection."""
 
+  MANUAL_DOWNLOAD_INSTRUCTIONS = """\
+  You have to download this dataset from Kaggle.
+  https://www.kaggle.com/c/diabetic-retinopathy-detection/data
+  After downloading, unpack the test.zip file into the test/ directory in
+  manual_dir and sample.zip to sample/. Also unpack the sampleSubmissions.csv
+  and trainLabels.csv.
+  """
+
   BUILDER_CONFIGS = [
       DiabeticRetinopathyDetectionConfig(
           name="original",

tensorflow_datasets/image/image_folder.py

Lines changed: 2 additions & 0 deletions
@@ -74,6 +74,8 @@ class ImageLabelFolder(tfds.core.GeneratorBasedBuilder):
 
   """
 
+  MANUAL_DOWNLOAD_INSTRUCTIONS = "This is a 'template' dataset."
+
   VERSION = tfds.core.Version("1.0.0",
                               experiments={tfds.core.Experiment.S3: False})
   SUPPORTED_VERSIONS = [

tensorflow_datasets/image/imagenet.py

Lines changed: 7 additions & 0 deletions
@@ -103,6 +103,13 @@ class Imagenet2012(tfds.core.GeneratorBasedBuilder):
           '5.0.0', 'New split API (https://tensorflow.org/datasets/splits)'),
   ]
 
+  MANUAL_DOWNLOAD_INSTRUCTIONS = """\
+  manual_dir should contain two files: ILSVRC2012_img_train.tar and
+  ILSVRC2012_img_val.tar.
+  You need to register on http://www.image-net.org/download-images in order
+  to get the link to download the dataset.
+  """
+
   def _info(self):
     names_file = tfds.core.get_tfds_path(_LABELS_FNAME)
     return tfds.core.DatasetInfo(

tensorflow_datasets/image/resisc45.py

Lines changed: 6 additions & 0 deletions
@@ -71,6 +71,12 @@ class Resisc45(tfds.core.GeneratorBasedBuilder):
   # 2.0.0: S3 with new hashing function (different shuffle).
   # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
 
+  MANUAL_DOWNLOAD_INSTRUCTIONS = """\
+  Dataset can be downloaded from OneDrive:
+  https://1drv.ms/u/s!AmgKYzARBl5ca3HNaHIlzp_IXjs
+  After downloading the rar file, please extract it to the manual_dir.
+  """
+
   def _info(self):
     return tfds.core.DatasetInfo(
         builder=self,

0 commit comments
