Skip to content

Commit 077788c

Browse files
adarob authored and copybara-github committed
Make size_in_bytes reflect final dataset size instead of downloaded size.
PiperOrigin-RevId: 292190740
1 parent 30dfb0d commit 077788c

File tree

9 files changed

+135
-89
lines changed

9 files changed

+135
-89
lines changed

tensorflow_datasets/core/dataset_builder.py

Lines changed: 14 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -309,9 +309,15 @@ def download_and_prepare(self, download_dir=None, download_config=None):
309309

310310
logging.info("Generating dataset %s (%s)", self.name, self._data_dir)
311311
if not utils.has_sufficient_disk_space(
312-
self.info.size_in_bytes, directory=self._data_dir_root):
313-
raise IOError("Not enough disk space. Needed: %s" %
314-
units.size_str(self.info.size_in_bytes))
312+
self.info.size_in_bytes + self.info.download_size,
313+
directory=self._data_dir_root):
314+
raise IOError(
315+
"Not enough disk space. Needed: {} (download: {}, generated: {})"
316+
.format(
317+
units.size_str(self.info.size_in_bytes + self.info.download_size),
318+
units.size_str(self.info.download_size),
319+
units.size_str(self.info.size_in_bytes),
320+
))
315321
self._log_download_bytes()
316322

317323
dl_manager = self._make_download_manager(
@@ -352,7 +358,7 @@ def download_and_prepare(self, download_dir=None, download_config=None):
352358
else: # Mode is forced or stats do not exists yet
353359
logging.info("Computing statistics.")
354360
self.info.compute_dynamic_properties()
355-
self.info.size_in_bytes = dl_manager.downloaded_size
361+
self.info.downloaded_size = dl_manager.downloaded_size
356362
# Write DatasetInfo to disk, even if we haven't computed statistics.
357363
self.info.write_to_directory(self._data_dir)
358364
self._log_download_done()
@@ -1055,8 +1061,9 @@ def _prepare_split(self, split_generator, max_examples_per_split):
10551061
total=split_info.num_examples, leave=False):
10561062
example = self.info.features.encode_example(record)
10571063
writer.write(key, example)
1058-
shard_lengths = writer.finalize()
1064+
shard_lengths, total_size = writer.finalize()
10591065
split_generator.split_info.shard_lengths.extend(shard_lengths)
1066+
split_generator.split_info.num_bytes = total_size
10601067

10611068

10621069
class BeamBasedBuilder(FileAdapterBuilder):
@@ -1148,10 +1155,11 @@ def _download_and_prepare(self, dl_manager, download_config):
11481155
split_dict = self.info.splits
11491156
for split_name, beam_writer in self._beam_writers.items():
11501157
logging.info("Retrieving shard lengths for %s...", split_name)
1151-
shard_lengths = beam_writer.finalize()
1158+
shard_lengths, total_size = beam_writer.finalize()
11521159
split_info = split_dict[split_name]
11531160
split_info.shard_lengths.extend(shard_lengths)
11541161
split_info.num_shards = len(shard_lengths)
1162+
split_info.num_bytes = total_size
11551163
logging.info("Updating split info...")
11561164
self.info.update_splits_if_different(split_dict)
11571165

tensorflow_datasets/core/dataset_info.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -205,11 +205,17 @@ def citation(self):
205205

206206
@property
207207
def size_in_bytes(self):
208-
return self.as_proto.size_in_bytes
208+
size_in_bytes = sum(split.num_bytes for split in self.splits.values())
209+
# Fall back to deprecated proto field if `num_bytes` fields are empty.
210+
return size_in_bytes or self.as_proto.size_in_bytes
209211

210-
@size_in_bytes.setter
211-
def size_in_bytes(self, size):
212-
self.as_proto.size_in_bytes = size
212+
@property
213+
def download_size(self):
214+
return self.as_proto.download_size
215+
216+
@download_size.setter
217+
def download_size(self, size):
218+
self.as_proto.download_size = size
213219

214220
@property
215221
def features(self):

tensorflow_datasets/core/dataset_info_test.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,7 @@ def test_reading(self):
113113

114114
# Assert that this is computed correctly.
115115
self.assertEqual(40, info.splits.total_num_examples)
116+
self.assertEqual(11594722, info.size_in_bytes)
116117

117118
self.assertEqual("image", info.supervised_keys[0])
118119
self.assertEqual("label", info.supervised_keys[1])
@@ -169,7 +170,9 @@ def test_restore_after_modification(self):
169170
citation="some citation",
170171
redistribution_info={"license": "some license"}
171172
)
172-
info.size_in_bytes = 456
173+
info.download_size = 456
174+
info.as_proto.splits.add(name="train", num_bytes=512)
175+
info.as_proto.splits.add(name="validation", num_bytes=64)
173176
info.as_proto.schema.feature.add()
174177
info.as_proto.schema.feature.add() # Add dynamic statistics
175178
info.download_checksums = {
@@ -198,7 +201,8 @@ def test_restore_after_modification(self):
198201
citation="some citation (new)",
199202
redistribution_info={"license": "some license (new)"}
200203
)
201-
restored_info.size_in_bytes = 789
204+
restored_info.download_size = 789
205+
restored_info.as_proto.splits.add(name="validation", num_bytes=288)
202206
restored_info.as_proto.schema.feature.add()
203207
restored_info.as_proto.schema.feature.add()
204208
restored_info.as_proto.schema.feature.add()
@@ -219,7 +223,8 @@ def test_restore_after_modification(self):
219223
self.assertEqual(restored_info.citation, "some citation (new)")
220224
self.assertEqual(restored_info.redistribution_info.license,
221225
"some license (new)")
222-
self.assertEqual(restored_info.size_in_bytes, 789)
226+
self.assertEqual(restored_info.download_size, 789)
227+
self.assertEqual(restored_info.size_in_bytes, 576)
223228
self.assertEqual(len(restored_info.as_proto.schema.feature), 4)
224229
self.assertEqual(restored_info.download_checksums, {
225230
"url2": "some other checksum (new)",

tensorflow_datasets/core/proto/dataset_info.proto

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -28,11 +28,13 @@ message SplitInfo {
2828
int64 num_shards = 2;
2929
// The number of examples in each shard.
3030
repeated int64 shard_lengths = 4;
31+
// The number of bytes in the split.
32+
int64 num_bytes = 5;
3133

3234
// The concrete statistics about this split.
3335
tensorflow.metadata.v0.DatasetFeatureStatistics statistics = 3;
3436

35-
// Next available: 5.
37+
// Next available: 6.
3638
}
3739

3840
// This message indicates which feature in the dataset schema is the input and
@@ -59,8 +61,12 @@ message DatasetInfo {
5961
// A citation string if one exists for this dataset.
6062
string citation = 3;
6163

64+
// DEPRECATED
6265
// *Approximate* size in bytes of this dataset on disk.
63-
int64 size_in_bytes = 4;
66+
int64 size_in_bytes = 4 [deprecated=true];
67+
68+
// Size in bytes of downloaded files.
69+
int64 download_size = 12;
6470

6571
// Canonical location of the dataset.
6672
DatasetLocation location = 5;
@@ -81,5 +87,5 @@ message DatasetInfo {
8187

8288
RedistributionInfo redistribution_info = 11;
8389

84-
// Next available: 12
90+
// Next available: 13
8591
}

0 commit comments

Comments
 (0)